Repository: axolotl-ai-cloud/axolotl Branch: main Commit: b0294b3427da Files: 1070 Total size: 5.4 MB Directory structure: gitextract_1sp7sr39/ ├── .axolotl-complete.bash ├── .bandit ├── .coderabbit.yaml ├── .coveragerc ├── .editorconfig ├── .gitattributes ├── .github/ │ ├── CODE_OF_CONDUCT.md │ ├── CONTRIBUTING.md │ ├── FUNDING.yml │ ├── ISSUE_TEMPLATE/ │ │ ├── bug-report.yaml │ │ ├── config.yml │ │ ├── docs.yml │ │ └── feature-request.yaml │ ├── PULL_REQUEST_TEMPLATE.md │ ├── SECURITY.md │ ├── SUPPORT.md │ ├── release-drafter.yml │ └── workflows/ │ ├── base.yml │ ├── docs.yml │ ├── lint.yml │ ├── main.yml │ ├── multi-gpu-e2e.yml │ ├── nightlies.yml │ ├── precommit-autoupdate.yml │ ├── preview-docs.yml │ ├── pypi.yml │ ├── tests-nightly.yml │ └── tests.yml ├── .gitignore ├── .mypy.ini ├── .pre-commit-config.yaml ├── .runpod/ │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ ├── hub.json │ ├── requirements.txt │ ├── src/ │ │ ├── config/ │ │ │ └── config.yaml │ │ ├── handler.py │ │ ├── test_input.json │ │ ├── train.py │ │ └── utils.py │ ├── test-input.json │ └── tests.json ├── CITATION.cff ├── CNAME ├── FAQS.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── VERSION ├── _quarto.yml ├── benchmarks/ │ ├── bench_entropy.py │ ├── bench_scattermoe_lora.py │ └── bench_selective_logsoftmax.py ├── cicd/ │ ├── Dockerfile-uv.jinja │ ├── Dockerfile.jinja │ ├── __init__.py │ ├── cicd.sh │ ├── cleanup.py │ ├── cleanup.sh │ ├── e2e_tests.py │ ├── multigpu.py │ ├── multigpu.sh │ └── single_gpu.py ├── codecov.yml ├── deepspeed_configs/ │ ├── zero1.json │ ├── zero1_torch_compile.json │ ├── zero2.json │ ├── zero2_torch_compile.json │ ├── zero3.json │ ├── zero3_bf16.json │ ├── zero3_bf16_cpuoffload_all.json │ └── zero3_bf16_cpuoffload_params.json ├── devtools/ │ ├── README.md │ └── dev_chat_template.yml ├── docker/ │ ├── Dockerfile │ ├── Dockerfile-base │ ├── Dockerfile-base-next │ ├── Dockerfile-base-nightly │ ├── Dockerfile-cloud │ ├── Dockerfile-cloud-no-tmux │ ├── 
Dockerfile-cloud-uv │ ├── Dockerfile-tests │ ├── Dockerfile-uv │ └── Dockerfile-uv-base ├── docker-compose.yaml ├── docs/ │ ├── .gitignore │ ├── amd_hpc.qmd │ ├── attention.qmd │ ├── batch_vs_grad.qmd │ ├── checkpoint_saving.qmd │ ├── cli.qmd │ ├── custom_integrations.qmd │ ├── dataset-formats/ │ │ ├── conversation.qmd │ │ ├── index.qmd │ │ ├── inst_tune.qmd │ │ ├── pretraining.qmd │ │ ├── stepwise_supervised.qmd │ │ ├── template_free.qmd │ │ └── tokenized.qmd │ ├── dataset_loading.qmd │ ├── dataset_preprocessing.qmd │ ├── debugging.qmd │ ├── docker.qmd │ ├── expert_quantization.qmd │ ├── faq.qmd │ ├── fsdp_qlora.qmd │ ├── getting-started.qmd │ ├── gradient_checkpointing.qmd │ ├── inference.qmd │ ├── input_output.qmd │ ├── installation.qmd │ ├── lora_optims.qmd │ ├── lr_groups.qmd │ ├── mac.qmd │ ├── mixed_precision.qmd │ ├── multi-gpu.qmd │ ├── multi-node.qmd │ ├── multimodal.qmd │ ├── multipack.qmd │ ├── nccl.qmd │ ├── nd_parallelism.qmd │ ├── optimizations.qmd │ ├── optimizers.qmd │ ├── qat.qmd │ ├── quantize.qmd │ ├── ray-integration.qmd │ ├── reward_modelling.qmd │ ├── rlhf.qmd │ ├── scripts/ │ │ ├── examples-allowlist.yml │ │ ├── generate_config_docs.py │ │ └── generate_examples_docs.py │ ├── sequence_parallelism.qmd │ ├── streaming.qmd │ ├── telemetry.qmd │ ├── torchao.qmd │ └── unsloth.qmd ├── examples/ │ ├── LiquidAI/ │ │ ├── README.md │ │ ├── lfm2-350m-fft.yaml │ │ ├── lfm2-8b-a1b-lora.yaml │ │ └── lfm2-vl-lora.yaml │ ├── alst/ │ │ ├── README.md │ │ ├── llama3-8b-deepspeed-alst.yaml │ │ └── llama3-8b-fsdp2-alst.yaml │ ├── apertus/ │ │ ├── README.md │ │ └── apertus-8b-qlora.yaml │ ├── arcee/ │ │ ├── README.md │ │ └── afm-4.5b-qlora.yaml │ ├── archived/ │ │ ├── README.md │ │ ├── cerebras/ │ │ │ ├── btlm-ft.yml │ │ │ └── qlora.yml │ │ ├── code-llama/ │ │ │ ├── 13b/ │ │ │ │ ├── lora.yml │ │ │ │ └── qlora.yml │ │ │ ├── 34b/ │ │ │ │ ├── lora.yml │ │ │ │ └── qlora.yml │ │ │ ├── 7b/ │ │ │ │ ├── lora.yml │ │ │ │ └── qlora.yml │ │ │ └── README.md │ │ ├── dbrx/ │ │ 
│ ├── 16bit-lora.yaml │ │ │ ├── 8bit-lora.yaml │ │ │ ├── README.md │ │ │ └── fft-ds-zero3.yaml │ │ ├── deepcoder/ │ │ │ └── deepcoder-14B-preview-lora.yml │ │ ├── falcon/ │ │ │ ├── config-7b-lora.yml │ │ │ ├── config-7b-qlora.yml │ │ │ └── config-7b.yml │ │ ├── gemma/ │ │ │ └── qlora.yml │ │ ├── gptj/ │ │ │ └── qlora.yml │ │ ├── jeopardy-bot/ │ │ │ └── config.yml │ │ ├── mpt-7b/ │ │ │ ├── README.md │ │ │ └── config.yml │ │ ├── openllama-3b/ │ │ │ ├── README.md │ │ │ ├── config.yml │ │ │ ├── lora.yml │ │ │ └── qlora.yml │ │ ├── pythia/ │ │ │ └── lora.yml │ │ ├── pythia-12b/ │ │ │ ├── README.md │ │ │ └── config.yml │ │ ├── qwen/ │ │ │ ├── README.md │ │ │ ├── lora.yml │ │ │ ├── qlora.yml │ │ │ ├── qwen2-moe-lora.yaml │ │ │ └── qwen2-moe-qlora.yaml │ │ ├── redpajama/ │ │ │ ├── README.md │ │ │ └── config-3b.yml │ │ ├── replit-3b/ │ │ │ └── config-lora.yml │ │ ├── stablelm-2/ │ │ │ ├── 1.6b/ │ │ │ │ ├── fft.yml │ │ │ │ └── lora.yml │ │ │ └── README.md │ │ ├── starcoder2/ │ │ │ └── qlora.yml │ │ ├── tiny-llama/ │ │ │ ├── README.md │ │ │ ├── lora-mps.yml │ │ │ ├── lora.yml │ │ │ ├── pretrain.yml │ │ │ └── qlora.yml │ │ ├── xgen-7b/ │ │ │ └── xgen-7b-8k-qlora.yml │ │ └── yi-34B-chat/ │ │ ├── README.md │ │ └── qlora.yml │ ├── cloud/ │ │ ├── baseten.yaml │ │ └── modal.yaml │ ├── cohere/ │ │ └── command-r-7b-qlora.yml │ ├── colab-notebooks/ │ │ └── colab-axolotl-example.ipynb │ ├── deepcogito/ │ │ ├── cogito-v1-preview-llama-3B-lora.yml │ │ └── cogito-v1-preview-qwen-14B-lora.yml │ ├── deepseek-v2/ │ │ ├── fft-fsdp-16b.yaml │ │ └── qlora-fsdp-2_5.yaml │ ├── devstral/ │ │ ├── README.md │ │ └── devstral-small-qlora.yml │ ├── distributed-parallel/ │ │ ├── README.md │ │ ├── llama-3_1-8b-hsdp-tp.yaml │ │ └── qwen3-8b-fsdp-tp-cp.yaml │ ├── eaft/ │ │ └── eaft-example.yml │ ├── falcon-h1/ │ │ ├── falcon-h1-1b-deep-qlora.yaml │ │ ├── falcon-h1-1b-qlora.yaml │ │ ├── falcon-h1-34b-qlora.yaml │ │ ├── falcon-h1-3b-qlora.yaml │ │ ├── falcon-h1-500m-qlora.yaml │ │ └── falcon-h1-7b-qlora.yaml 
│ ├── gemma2/ │ │ ├── qlora.yml │ │ └── reward-model.yaml │ ├── gemma3/ │ │ ├── gemma-3-1b-qlora.yml │ │ ├── gemma-3-270m-qlora.yml │ │ ├── gemma-3-4b-qlora.yml │ │ └── gemma-3-4b-vision-qlora.yml │ ├── gemma3n/ │ │ ├── README.md │ │ ├── gemma-3n-e2b-qlora.yml │ │ ├── gemma-3n-e2b-vision-audio-qlora.yml │ │ └── gemma-3n-e2b-vision-qlora.yml │ ├── glm4/ │ │ └── qlora-32b.yaml │ ├── glm45/ │ │ ├── README.md │ │ └── glm-45-air-qlora.yaml │ ├── glm46v/ │ │ ├── README.md │ │ ├── glm-4-6v-flash-ddp.yaml │ │ └── glm-4-6v-flash-qlora.yaml │ ├── glm47-flash/ │ │ ├── README.md │ │ ├── lora.yaml │ │ ├── lora_fsdp.yaml │ │ ├── qlora.yaml │ │ └── qlora_fsdp.yaml │ ├── gpt-oss/ │ │ ├── README.md │ │ ├── gpt-oss-120b-fft-fsdp2-offload.yaml │ │ ├── gpt-oss-20b-fft-deepspeed-zero3.yaml │ │ ├── gpt-oss-20b-fft-fsdp2-offload.yaml │ │ ├── gpt-oss-20b-fft-fsdp2.yaml │ │ ├── gpt-oss-20b-sft-lora-singlegpu.yaml │ │ └── gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml │ ├── granite4/ │ │ ├── README.md │ │ └── granite-4.0-tiny-fft.yaml │ ├── hunyuan/ │ │ ├── README.md │ │ └── hunyuan-v1-dense-qlora.yaml │ ├── internvl3_5/ │ │ ├── README.md │ │ └── internvl3_5-8b-qlora.yml │ ├── jamba/ │ │ ├── README.md │ │ ├── qlora.yaml │ │ ├── qlora_deepspeed.yaml │ │ └── qlora_fsdp_large.yaml │ ├── kimi-linear/ │ │ ├── README.md │ │ └── kimi-48b-lora.yaml │ ├── llama-2/ │ │ ├── README.md │ │ ├── fft_optimized.yml │ │ ├── gptq-lora.yml │ │ ├── lisa.yml │ │ ├── loftq.yml │ │ ├── lora.yml │ │ ├── qlora-fsdp.yml │ │ ├── qlora.yml │ │ └── relora.yml │ ├── llama-3/ │ │ ├── 3b-fp8-fsdp2.yaml │ │ ├── 3b-qat-fsdp2.yaml │ │ ├── 3b-qat-mxfp4.yaml │ │ ├── 3b-qat-nvfp4.yaml │ │ ├── README.md │ │ ├── diffusion/ │ │ │ ├── pretrain-1b.yaml │ │ │ └── sft-1b.yaml │ │ ├── fft-8b-liger-fsdp.yaml │ │ ├── fft-8b.yaml │ │ ├── instruct-dpo-lora-8b.yml │ │ ├── instruct-lora-8b.yml │ │ ├── lora-1b-deduplicate-dpo.yml │ │ ├── lora-1b-deduplicate-sft.yml │ │ ├── lora-1b-kernels.yml │ │ ├── lora-1b-ray.yml │ │ ├── 
lora-1b-sample-packing-sequentially.yml │ │ ├── lora-1b.yml │ │ ├── lora-8b.yml │ │ ├── opentelemetry-qlora.yml │ │ ├── qlora-1b-gdpo.yaml │ │ ├── qlora-1b-kto.yaml │ │ ├── qlora-1b.yml │ │ ├── qlora-fsdp-405b.yaml │ │ ├── qlora-fsdp-70b.yaml │ │ ├── qlora.yml │ │ └── sparse-finetuning.yaml │ ├── llama-3-vision/ │ │ └── lora-11b.yaml │ ├── llama-4/ │ │ ├── README.md │ │ ├── do-no-use-fa2/ │ │ │ ├── maverick-qlora-fsdp1.yaml │ │ │ ├── scout-qlora-fsdp1.yaml │ │ │ ├── scout-qlora-single-h100.yaml │ │ │ └── scout-vision-qlora-fsdp.yaml │ │ ├── scout-qlora-flexattn-fsdp2.yaml │ │ ├── scout-qlora-single-h100-flex.yaml │ │ └── scout-vision-qlora-fsdp2-flex.yaml │ ├── llava/ │ │ └── lora-7b.yaml │ ├── magistral/ │ │ ├── README.md │ │ ├── magistral-small-fsdp-qlora.yaml │ │ ├── magistral-small-qlora.yaml │ │ ├── think/ │ │ │ ├── README.md │ │ │ └── magistral-small-think-qlora.yaml │ │ └── vision/ │ │ ├── README.md │ │ └── magistral-small-vision-24B-qlora.yml │ ├── mamba/ │ │ └── config.yml │ ├── mimo/ │ │ ├── README.md │ │ └── mimo-7b-qlora.yaml │ ├── ministral/ │ │ ├── README.md │ │ └── ministral-small-qlora.yaml │ ├── ministral3/ │ │ ├── README.md │ │ ├── ministral3-3b-qlora.yaml │ │ ├── think/ │ │ │ ├── README.md │ │ │ └── ministral3-3b-think-qlora.yaml │ │ └── vision/ │ │ ├── README.md │ │ └── ministral3-3b-vision-qlora.yml │ ├── mistral/ │ │ ├── README.md │ │ ├── bigstral/ │ │ │ └── bigstral-ds-zero3.yaml │ │ ├── config.yml │ │ ├── dpo/ │ │ │ └── mistral-dpo-qlora.yml │ │ ├── lora.yml │ │ ├── mistral-qlora-fsdp.yml │ │ ├── mixtral/ │ │ │ ├── mixtral-8x22b-qlora-fsdp.yml │ │ │ ├── mixtral-qlora-fsdp.yml │ │ │ ├── mixtral.yml │ │ │ └── mixtral_22.yml │ │ ├── mps/ │ │ │ └── lora-mps.yml │ │ ├── orpo/ │ │ │ └── mistral-qlora-orpo.yml │ │ └── qlora.yml │ ├── mistral-small/ │ │ ├── README.md │ │ └── mistral-small-3.1-24B-lora.yml │ ├── mistral4/ │ │ ├── README.md │ │ ├── fft-text.yml │ │ ├── fft-vision.yml │ │ ├── qlora-text.yml │ │ └── qlora-vision.yml │ ├── nemotron/ │ │ 
└── nemotron-mini-4b-qlora.yaml │ ├── olmo3/ │ │ ├── README.md │ │ └── olmo3-7b-qlora.yaml │ ├── orpheus/ │ │ ├── README.md │ │ └── finetune.yml │ ├── phi/ │ │ ├── README.md │ │ ├── lora-3.5.yaml │ │ ├── phi-ft.yml │ │ ├── phi-qlora.yml │ │ ├── phi2-ft.yml │ │ ├── phi3-ft-fsdp.yml │ │ └── phi3-ft.yml │ ├── pixtral/ │ │ └── lora-12b.yml │ ├── plano/ │ │ ├── README.md │ │ └── plano-4b-qlora.yaml │ ├── qat_nvfp4/ │ │ ├── Gemma3-12B_baseline.yml │ │ ├── Gemma3-12B_qat.yml │ │ ├── Math-Gemma3-12B_baseline.yml │ │ ├── Math-Gemma3-12B_qat.yml │ │ ├── Math-Gemma3-27B_baseline.yml │ │ ├── Math-Gemma3-27B_qat.yml │ │ ├── Math-Qwen2.5-72B_baseline.yml │ │ ├── Math-Qwen2.5-72B_qat.yml │ │ ├── Qwen2.5-72B_baseline.yml │ │ └── Qwen2.5-72B_qat.yml │ ├── qwen2/ │ │ ├── adamw-pretrain-fsdp2.yaml │ │ ├── dpo.yaml │ │ ├── muon-pretrain-fsdp2.yaml │ │ ├── prm.yaml │ │ ├── qlora-fsdp.yaml │ │ └── reward-model.yaml │ ├── qwen2-vl/ │ │ └── lora-7b.yaml │ ├── qwen2_5-vl/ │ │ └── lora-7b.yaml │ ├── qwen3/ │ │ ├── 32b-qlora.yaml │ │ ├── 8b-qat-fsdp2.yml │ │ ├── README.md │ │ ├── qlora-fsdp.yaml │ │ └── reward-model.yaml │ ├── qwen3-next/ │ │ ├── README.md │ │ └── qwen3-next-80b-a3b-qlora.yaml │ ├── qwen3.5/ │ │ ├── 122b-a10b-moe-qlora-fsdp.yaml │ │ ├── 122b-a10b-moe-qlora.yaml │ │ ├── 27b-fft.yaml │ │ ├── 27b-qlora-fsdp.yaml │ │ ├── 27b-qlora.yaml │ │ ├── 35b-a3b-moe-qlora-fsdp.yaml │ │ ├── 35b-a3b-moe-qlora.yaml │ │ ├── 9b-fft-vision.yaml │ │ ├── 9b-lora-vision.yaml │ │ └── README.md │ ├── seed-oss/ │ │ ├── README.md │ │ └── seed-oss-36b-qlora.yaml │ ├── slurm/ │ │ ├── README.md │ │ └── axolotl.slurm │ ├── smolvlm2/ │ │ ├── README.md │ │ └── smolvlm2-2B-lora.yaml │ ├── streaming/ │ │ ├── README.md │ │ ├── pretrain.yaml │ │ └── sft.yaml │ ├── swanlab/ │ │ ├── README.md │ │ ├── custom_trainer_profiling.py │ │ ├── dpo-swanlab-completions.yml │ │ ├── dpo-swanlab-full-featured.yml │ │ └── lora-swanlab-profiling.yml │ ├── trinity/ │ │ ├── README.md │ │ └── trinity-nano-preview-qlora.yaml │ └── 
voxtral/ │ ├── README.md │ ├── voxtral-mini-audio-qlora.yml │ └── voxtral-mini-qlora.yml ├── index.qmd ├── pyproject.toml ├── requirements-dev.txt ├── requirements-tests.txt ├── requirements.txt ├── scripts/ │ ├── chat_datasets.py │ ├── cloud-entrypoint-term.sh │ ├── cloud-entrypoint.sh │ ├── cutcrossentropy_install.py │ ├── motd │ └── unsloth_install.py ├── setup.py ├── src/ │ ├── axolotl/ │ │ ├── __init__.py │ │ ├── cli/ │ │ │ ├── __init__.py │ │ │ ├── args.py │ │ │ ├── art.py │ │ │ ├── checks.py │ │ │ ├── cloud/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── baseten/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── template/ │ │ │ │ │ ├── run.sh │ │ │ │ │ └── train_sft.py │ │ │ │ └── modal_.py │ │ │ ├── config.py │ │ │ ├── delinearize_llama4.py │ │ │ ├── evaluate.py │ │ │ ├── inference.py │ │ │ ├── main.py │ │ │ ├── merge_lora.py │ │ │ ├── merge_sharded_fsdp_weights.py │ │ │ ├── preprocess.py │ │ │ ├── quantize.py │ │ │ ├── train.py │ │ │ ├── utils/ │ │ │ │ ├── __init__.py │ │ │ │ ├── args.py │ │ │ │ ├── diffusion.py │ │ │ │ ├── fetch.py │ │ │ │ ├── load.py │ │ │ │ ├── sweeps.py │ │ │ │ └── train.py │ │ │ └── vllm_serve.py │ │ ├── common/ │ │ │ ├── __init__.py │ │ │ ├── architectures.py │ │ │ ├── const.py │ │ │ └── datasets.py │ │ ├── convert.py │ │ ├── core/ │ │ │ ├── __init__.py │ │ │ ├── attention/ │ │ │ │ └── __init__.py │ │ │ ├── builders/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── causal.py │ │ │ │ └── rl.py │ │ │ ├── chat/ │ │ │ │ ├── __init__.py │ │ │ │ ├── format/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── chatml.py │ │ │ │ │ ├── llama3x.py │ │ │ │ │ └── shared.py │ │ │ │ └── messages.py │ │ │ ├── datasets/ │ │ │ │ ├── __init__.py │ │ │ │ ├── chat.py │ │ │ │ └── transforms/ │ │ │ │ ├── __init__.py │ │ │ │ └── chat_builder.py │ │ │ ├── trainers/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── dpo/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── args.py │ │ │ │ │ └── trainer.py │ │ │ │ ├── grpo/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── args.py │ 
│ │ │ │ ├── async_trainer.py │ │ │ │ │ ├── fast_async_trainer.py │ │ │ │ │ ├── replay_buffer.py │ │ │ │ │ ├── sampler.py │ │ │ │ │ └── trainer.py │ │ │ │ ├── mamba.py │ │ │ │ ├── mixins/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── activation_checkpointing.py │ │ │ │ │ ├── checkpoints.py │ │ │ │ │ ├── distributed_parallel.py │ │ │ │ │ ├── optimizer.py │ │ │ │ │ ├── packing.py │ │ │ │ │ ├── rng_state_loader.py │ │ │ │ │ └── scheduler.py │ │ │ │ ├── trl.py │ │ │ │ └── utils.py │ │ │ ├── training_args.py │ │ │ └── training_args_base.py │ │ ├── datasets.py │ │ ├── evaluate.py │ │ ├── integrations/ │ │ │ ├── LICENSE.md │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── config.py │ │ │ ├── cut_cross_entropy/ │ │ │ │ ├── ACKNOWLEDGEMENTS.md │ │ │ │ ├── LICENSE │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ └── args.py │ │ │ ├── densemixer/ │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── args.py │ │ │ │ └── plugin.py │ │ │ ├── diffusion/ │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── args.py │ │ │ │ ├── callbacks.py │ │ │ │ ├── generation.py │ │ │ │ ├── plugin.py │ │ │ │ ├── trainer.py │ │ │ │ └── utils.py │ │ │ ├── grokfast/ │ │ │ │ ├── LICENSE │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── args.py │ │ │ │ └── optimizer.py │ │ │ ├── kd/ │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── args.py │ │ │ │ ├── callbacks.py │ │ │ │ ├── chat_template.py │ │ │ │ ├── collator.py │ │ │ │ ├── collator_online_teacher.py │ │ │ │ ├── kernels/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── liger.py │ │ │ │ │ └── models.py │ │ │ │ ├── topk_logprob/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── forward_kl.py │ │ │ │ ├── trainer.py │ │ │ │ └── utils.py │ │ │ ├── kernels/ │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── args.py │ │ │ │ ├── autotune_callback.py │ │ │ │ ├── autotune_collector.py │ │ │ │ ├── constants.py │ │ │ │ ├── libs/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── scattermoe_lora/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── kernels/ │ │ │ │ │ │ 
├── __init__.py │ │ │ │ │ │ ├── lora_ops.py │ │ │ │ │ │ ├── ops.py │ │ │ │ │ │ └── single.py │ │ │ │ │ ├── layers.py │ │ │ │ │ ├── lora_ops.py │ │ │ │ │ ├── parallel_experts.py │ │ │ │ │ ├── parallel_linear_lora.py │ │ │ │ │ ├── selective_dequant.py │ │ │ │ │ └── selective_dequant_kernel.py │ │ │ │ ├── plugin.py │ │ │ │ └── sonicmoe/ │ │ │ │ ├── __init__.py │ │ │ │ ├── patch.py │ │ │ │ ├── routing.py │ │ │ │ └── weight_converter.py │ │ │ ├── liger/ │ │ │ │ ├── LICENSE │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── args.py │ │ │ │ ├── models/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── deepseekv2.py │ │ │ │ │ ├── jamba.py │ │ │ │ │ ├── llama4.py │ │ │ │ │ ├── qwen3.py │ │ │ │ │ └── qwen3_moe.py │ │ │ │ ├── plugin.py │ │ │ │ └── utils.py │ │ │ ├── llm_compressor/ │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── args.py │ │ │ │ ├── plugin.py │ │ │ │ └── utils.py │ │ │ ├── lm_eval/ │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── args.py │ │ │ │ └── cli.py │ │ │ ├── spectrum/ │ │ │ │ ├── LICENSE │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── args.py │ │ │ │ └── model_snr_results/ │ │ │ │ ├── snr_results_Qwen-Qwen2.5-1.5B-Instruct.json │ │ │ │ ├── snr_results_Qwen-Qwen2.5-1.5B.json │ │ │ │ ├── snr_results_Qwen-Qwen2.5-3B-Instruct.json │ │ │ │ ├── snr_results_Qwen-Qwen2.5-3B.json │ │ │ │ ├── snr_results_Qwen-Qwen2.5-7B-Instruct.json │ │ │ │ ├── snr_results_Qwen-Qwen2.5-7B.json │ │ │ │ ├── snr_results_google-gemma-2-2b.json │ │ │ │ ├── snr_results_meta-llama-Llama-3.2-1B-Instruct.json │ │ │ │ ├── snr_results_meta-llama-Llama-3.2-1B.json │ │ │ │ ├── snr_results_meta-llama-Llama-3.2-3B-Instruct.json │ │ │ │ └── snr_results_meta-llama-Llama-3.2-3B.json │ │ │ └── swanlab/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── args.py │ │ │ ├── callbacks.py │ │ │ ├── completion_logger.py │ │ │ ├── plugins.py │ │ │ └── profiling.py │ │ ├── kernels/ │ │ │ ├── __init__.py │ │ │ ├── geglu.py │ │ │ ├── lora.py │ │ │ ├── 
quantize.py │ │ │ ├── swiglu.py │ │ │ └── utils.py │ │ ├── loaders/ │ │ │ ├── __init__.py │ │ │ ├── adapter.py │ │ │ ├── adapters/ │ │ │ │ └── __init__.py │ │ │ ├── constants.py │ │ │ ├── model.py │ │ │ ├── patch_manager.py │ │ │ ├── processor.py │ │ │ ├── tokenizer.py │ │ │ └── utils.py │ │ ├── logging_config.py │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ └── mamba/ │ │ │ ├── __init__.py │ │ │ ├── configuration_mamba.py │ │ │ └── modeling_mamba.py │ │ ├── monkeypatch/ │ │ │ ├── __init__.py │ │ │ ├── accelerate/ │ │ │ │ ├── __init__.py │ │ │ │ ├── fsdp2.py │ │ │ │ └── parallelism_config.py │ │ │ ├── attention/ │ │ │ │ ├── __init__.py │ │ │ │ ├── flash_attn_4.py │ │ │ │ ├── flex_attn.py │ │ │ │ ├── sage_attn.py │ │ │ │ └── xformers.py │ │ │ ├── btlm_attn_hijack_flash.py │ │ │ ├── data/ │ │ │ │ ├── __init__.py │ │ │ │ └── batch_dataset_fetcher.py │ │ │ ├── deepspeed_utils.py │ │ │ ├── fsdp2_qlora.py │ │ │ ├── gradient_checkpointing/ │ │ │ │ ├── __init__.py │ │ │ │ ├── offload_cpu.py │ │ │ │ └── offload_disk.py │ │ │ ├── llama_attn_hijack_flash.py │ │ │ ├── llama_attn_hijack_xformers.py │ │ │ ├── lora_kernels.py │ │ │ ├── loss/ │ │ │ │ ├── __init__.py │ │ │ │ ├── chunked.py │ │ │ │ └── eaft.py │ │ │ ├── mistral_attn_hijack_flash.py │ │ │ ├── mixtral/ │ │ │ │ └── __init__.py │ │ │ ├── models/ │ │ │ │ ├── __init__.py │ │ │ │ ├── apertus/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── activation.py │ │ │ │ ├── kimi_linear/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── configuration_kimi.py │ │ │ │ │ ├── modeling_kimi.py │ │ │ │ │ ├── patch_kimi_linear.py │ │ │ │ │ └── tokenization_kimi.py │ │ │ │ ├── llama4/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── modeling.py │ │ │ │ ├── mistral3/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── mistral_common_tokenizer.py │ │ │ │ ├── pixtral/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── modeling_flash_attention_utils.py │ │ │ │ ├── qwen3_5/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── modeling.py │ │ │ │ ├── qwen3_next/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── 
modeling.py │ │ │ │ └── voxtral/ │ │ │ │ ├── __init__.py │ │ │ │ └── modeling.py │ │ │ ├── moe_quant.py │ │ │ ├── multipack.py │ │ │ ├── peft/ │ │ │ │ ├── __init__.py │ │ │ │ └── utils.py │ │ │ ├── relora.py │ │ │ ├── ring_attn/ │ │ │ │ ├── __init__.py │ │ │ │ ├── adapters/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── batch.py │ │ │ │ └── patch.py │ │ │ ├── scaled_softmax_attn.py │ │ │ ├── stablelm_attn_hijack_flash.py │ │ │ ├── tiled_mlp/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ └── patch.py │ │ │ ├── trainer/ │ │ │ │ ├── __init__.py │ │ │ │ ├── lr.py │ │ │ │ ├── trl.py │ │ │ │ ├── trl_vllm.py │ │ │ │ └── utils.py │ │ │ ├── trainer_accelerator_args.py │ │ │ ├── trainer_fsdp_optim.py │ │ │ ├── transformers/ │ │ │ │ ├── __init__.py │ │ │ │ ├── trainer_context_parallel.py │ │ │ │ └── trainer_loss_calc.py │ │ │ ├── transformers_fa_utils.py │ │ │ ├── unsloth_.py │ │ │ ├── utils.py │ │ │ └── xformers_/ │ │ │ └── __init__.py │ │ ├── processing_strategies.py │ │ ├── prompt_strategies/ │ │ │ ├── __init__.py │ │ │ ├── alpaca_chat.py │ │ │ ├── alpaca_instruct.py │ │ │ ├── alpaca_w_system.py │ │ │ ├── base.py │ │ │ ├── bradley_terry/ │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── chat_template.py │ │ │ │ └── llama3.py │ │ │ ├── chat_template.py │ │ │ ├── completion.py │ │ │ ├── context_qa.py │ │ │ ├── creative_acr.py │ │ │ ├── dpo/ │ │ │ │ ├── __init__.py │ │ │ │ ├── chat_template.py │ │ │ │ ├── chatml.py │ │ │ │ ├── llama3.py │ │ │ │ ├── passthrough.py │ │ │ │ ├── user_defined.py │ │ │ │ └── zephyr.py │ │ │ ├── input_output.py │ │ │ ├── jinja_template_analyzer.py │ │ │ ├── kto/ │ │ │ │ ├── __init__.py │ │ │ │ ├── chatml.py │ │ │ │ ├── llama3.py │ │ │ │ └── user_defined.py │ │ │ ├── llama2_chat.py │ │ │ ├── messages/ │ │ │ │ ├── __init__.py │ │ │ │ └── chat.py │ │ │ ├── metharme.py │ │ │ ├── orcamini.py │ │ │ ├── orpo/ │ │ │ │ ├── __init__.py │ │ │ │ └── chat_template.py │ │ │ ├── pretrain.py │ │ │ ├── pygmalion.py │ │ │ ├── stepwise_supervised.py │ │ │ └── 
user_defined.py │ │ ├── prompt_tokenizers.py │ │ ├── prompters.py │ │ ├── scripts/ │ │ │ ├── __init__.py │ │ │ ├── vllm_serve_lora.py │ │ │ └── vllm_worker_ext.py │ │ ├── telemetry/ │ │ │ ├── __init__.py │ │ │ ├── callbacks.py │ │ │ ├── errors.py │ │ │ ├── manager.py │ │ │ ├── runtime_metrics.py │ │ │ └── whitelist.yaml │ │ ├── train.py │ │ └── utils/ │ │ ├── __init__.py │ │ ├── bench.py │ │ ├── callbacks/ │ │ │ ├── __init__.py │ │ │ ├── comet_.py │ │ │ ├── dynamic_checkpoint.py │ │ │ ├── generation.py │ │ │ ├── lisa.py │ │ │ ├── mlflow_.py │ │ │ ├── models.py │ │ │ ├── opentelemetry.py │ │ │ ├── perplexity.py │ │ │ ├── profiler.py │ │ │ ├── qat.py │ │ │ ├── swanlab.py │ │ │ ├── tokens_per_second.py │ │ │ └── trackio_.py │ │ ├── chat_templates/ │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── templates/ │ │ │ ├── alpaca.jinja │ │ │ ├── aya.jinja │ │ │ ├── chatml.jinja │ │ │ ├── cohere.jinja │ │ │ ├── command_a.jinja │ │ │ ├── command_a_rag.jinja │ │ │ ├── command_a_tool_use.jinja │ │ │ ├── deepseek_v2.jinja │ │ │ ├── deepseek_v3.jinja │ │ │ ├── exaone.jinja │ │ │ ├── exaone4.jinja │ │ │ ├── falcon_h1.jinja │ │ │ ├── gemma.jinja │ │ │ ├── gemma3.jinja │ │ │ ├── gemma3n.jinja │ │ │ ├── jamba.jinja │ │ │ ├── llama3.jinja │ │ │ ├── llama3_2_vision.jinja │ │ │ ├── llama4.jinja │ │ │ ├── llava.jinja │ │ │ ├── metharme.jinja │ │ │ ├── mistral_v1.jinja │ │ │ ├── mistral_v2v3.jinja │ │ │ ├── mistral_v3_tekken.jinja │ │ │ ├── mistral_v7_tekken.jinja │ │ │ ├── phi_3.jinja │ │ │ ├── phi_35.jinja │ │ │ ├── phi_4.jinja │ │ │ ├── pixtral.jinja │ │ │ ├── qwen2_vl.jinja │ │ │ ├── qwen3.jinja │ │ │ ├── qwen3_5.jinja │ │ │ └── qwen_25.jinja │ │ ├── collators/ │ │ │ ├── __init__.py │ │ │ ├── batching.py │ │ │ ├── core.py │ │ │ ├── mamba.py │ │ │ └── mm_chat.py │ │ ├── comet_.py │ │ ├── config/ │ │ │ ├── __init__.py │ │ │ └── models/ │ │ │ └── __init__.py │ │ ├── ctx_managers/ │ │ │ ├── __init__.py │ │ │ └── sequence_parallel.py │ │ ├── data/ │ │ │ ├── __init__.py │ │ │ ├── lock.py │ 
│ │ ├── rl.py │ │ │ ├── sft.py │ │ │ ├── shared.py │ │ │ ├── streaming.py │ │ │ ├── utils.py │ │ │ └── wrappers.py │ │ ├── datasets.py │ │ ├── dict.py │ │ ├── distributed.py │ │ ├── environment.py │ │ ├── freeze.py │ │ ├── generation/ │ │ │ ├── __init__.py │ │ │ └── sft.py │ │ ├── import_helper.py │ │ ├── logging.py │ │ ├── lora.py │ │ ├── mistral/ │ │ │ ├── __init__.py │ │ │ ├── mistral3_processor.py │ │ │ └── mistral_tokenizer.py │ │ ├── mlflow_.py │ │ ├── model_shard_quant.py │ │ ├── optimizers/ │ │ │ ├── __init__.py │ │ │ └── adopt.py │ │ ├── quantization.py │ │ ├── samplers/ │ │ │ ├── __init__.py │ │ │ ├── multipack.py │ │ │ └── utils.py │ │ ├── schedulers.py │ │ ├── schemas/ │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── datasets.py │ │ │ ├── deprecated.py │ │ │ ├── dynamic_checkpoint.py │ │ │ ├── enums.py │ │ │ ├── fsdp.py │ │ │ ├── integrations.py │ │ │ ├── internal/ │ │ │ │ └── __init__.py │ │ │ ├── model.py │ │ │ ├── multimodal.py │ │ │ ├── peft.py │ │ │ ├── quantization.py │ │ │ ├── training.py │ │ │ ├── trl.py │ │ │ ├── utils.py │ │ │ ├── validation.py │ │ │ └── vllm.py │ │ ├── tee.py │ │ ├── tokenization.py │ │ ├── trackio_.py │ │ ├── train.py │ │ ├── trainer.py │ │ └── wandb_.py │ └── setuptools_axolotl_dynamic_dependencies.py ├── styles.css └── tests/ ├── __init__.py ├── cli/ │ ├── __init__.py │ ├── conftest.py │ ├── test_cli_base.py │ ├── test_cli_evaluate.py │ ├── test_cli_fetch.py │ ├── test_cli_inference.py │ ├── test_cli_interface.py │ ├── test_cli_merge_lora.py │ ├── test_cli_merge_sharded_fsdp_weights.py │ ├── test_cli_preprocess.py │ ├── test_cli_sweeps.py │ ├── test_cli_train.py │ ├── test_cli_version.py │ ├── test_nested_options.py │ └── test_utils.py ├── conftest.py ├── constants.py ├── core/ │ ├── chat/ │ │ ├── __init__.py │ │ ├── format/ │ │ │ └── __init__.py │ │ └── test_messages.py │ ├── test_async_grpo.py │ └── test_builders.py ├── e2e/ │ ├── .gitignore │ ├── __init__.py │ ├── integrations/ │ │ ├── test_cut_cross_entropy.py │ │ 
├── test_fp8.py │ │ ├── test_hooks.py │ │ ├── test_kd.py │ │ ├── test_liger.py │ │ ├── test_llm_compressor.py │ │ ├── test_scattermoe_lora_kernels.py │ │ ├── test_scattermoe_lora_olmoe.py │ │ └── test_sonicmoe.py │ ├── kernels/ │ │ ├── test_geglu.py │ │ ├── test_lora.py │ │ ├── test_quantize.py │ │ └── test_swiglu.py │ ├── multigpu/ │ │ ├── __init__.py │ │ ├── patched/ │ │ │ ├── __init__.py │ │ │ └── test_sp.py │ │ ├── solo/ │ │ │ ├── __init__.py │ │ │ ├── test_flex.py │ │ │ ├── test_gdpo.py │ │ │ └── test_grpo.py │ │ ├── test_dist_muon_fsdp2.py │ │ ├── test_eval.py │ │ ├── test_fp8_fsdp2.py │ │ ├── test_fsdp1.py │ │ ├── test_fsdp2.py │ │ ├── test_gemma3.py │ │ ├── test_llama.py │ │ ├── test_locking.py │ │ ├── test_ray.py │ │ └── test_tp.py │ ├── patched/ │ │ ├── __init__.py │ │ ├── lora_kernels/ │ │ │ ├── __init__.py │ │ │ └── test_lora_kernel_patching.py │ │ ├── test_4d_multipack_llama.py │ │ ├── test_activation_checkpointing.py │ │ ├── test_cli_integrations.py │ │ ├── test_fa_xentropy.py │ │ ├── test_falcon_samplepack.py │ │ ├── test_flattening.py │ │ ├── test_fsdp2_qlora.py │ │ ├── test_fused_llama.py │ │ ├── test_llama_s2_attention.py │ │ ├── test_lora_llama_multipack.py │ │ ├── test_mistral_samplepack.py │ │ ├── test_mixtral_samplepack.py │ │ ├── test_model_patches.py │ │ ├── test_peft_embeddings.py │ │ ├── test_phi_multipack.py │ │ ├── test_resume.py │ │ ├── test_unsloth_integration.py │ │ └── test_unsloth_qlora.py │ ├── solo/ │ │ ├── __init__.py │ │ ├── test_flex.py │ │ └── test_relora_llama.py │ ├── test_activation_offloading.py │ ├── test_deepseekv3.py │ ├── test_diffusion.py │ ├── test_dpo.py │ ├── test_embeddings_lr.py │ ├── test_evaluate.py │ ├── test_falcon.py │ ├── test_gemma2.py │ ├── test_gemma3_text.py │ ├── test_imports.py │ ├── test_llama.py │ ├── test_llama_pretrain.py │ ├── test_llama_vision.py │ ├── test_load_model.py │ ├── test_lora_llama.py │ ├── test_mamba.py │ ├── test_mistral.py │ ├── test_mixtral.py │ ├── test_optimizers.py │ ├── 
test_packing_loss.py │ ├── test_phi.py │ ├── test_preprocess.py │ ├── test_process_reward_model_smollm2.py │ ├── test_profiler.py │ ├── test_qat.py │ ├── test_quantization.py │ ├── test_qwen.py │ ├── test_reward_model_smollm2.py │ ├── test_save_first_step.py │ ├── test_schedulers.py │ ├── test_streaming.py │ ├── test_tokenizer.py │ └── utils.py ├── fixtures/ │ ├── alpaca/ │ │ └── alpaca.json │ ├── conversation.json │ ├── conversation.missingturns.json │ ├── conversation.tokenized.json │ └── conversation.tokenized_llama2chat.json ├── hf_offline_utils.py ├── integrations/ │ ├── __init__.py │ ├── test_diffusion.py │ ├── test_diffusion_callback.py │ ├── test_kd_chat_template.py │ ├── test_liger.py │ ├── test_routing_parity.py │ ├── test_scattermoe_autotune_telemetry.py │ ├── test_scattermoe_lora.py │ ├── test_scattermoe_lora_kernels.py │ ├── test_sonicmoe.py │ ├── test_sonicmoe_gradients.py │ └── test_swanlab.py ├── monkeypatch/ │ ├── test_llama_attn_hijack_flash.py │ ├── test_pixtral_flash_attention_patch.py │ ├── test_qwen3_next_modeling_patch.py │ ├── test_trainer_accelerator_args.py │ ├── test_trainer_context_parallel_patch.py │ ├── test_trainer_loss_calc.py │ ├── test_trl_vllm.py │ └── test_voxtral_modeling_patch.py ├── patched/ │ └── test_validation.py ├── prompt_strategies/ │ ├── __init__.py │ ├── conftest.py │ ├── messages/ │ │ ├── __init__.py │ │ └── test_chat.py │ ├── test_alpaca.py │ ├── test_chat_template_ds_schema_unification.py │ ├── test_chat_template_utils.py │ ├── test_chat_templates.py │ ├── test_chat_templates_advanced.py │ ├── test_chat_templates_mistral.py │ ├── test_chat_templates_thinking.py │ ├── test_chat_templates_tool_call_string_arguments.py │ ├── test_dpo_chat_templates.py │ ├── test_dpo_chatml.py │ ├── test_jinja_template_analyzer.py │ ├── test_raw_io.py │ └── test_stepwise.py ├── telemetry/ │ ├── __init__.py │ ├── conftest.py │ ├── test_callbacks.py │ ├── test_errors.py │ ├── test_manager.py │ └── test_runtime_metrics.py ├── 
test_chunked_xentropy.py ├── test_context_parallel_batch_size.py ├── test_convert.py ├── test_data.py ├── test_datasets.py ├── test_dict.py ├── test_exact_deduplication.py ├── test_freeze.py ├── test_loaders.py ├── test_logging_config_file_capture.py ├── test_lora.py ├── test_normalize_config.py ├── test_opentelemetry_callback.py ├── test_packed_batch_sampler.py ├── test_packed_dataset.py ├── test_packed_pretraining.py ├── test_perplexity.py ├── test_prompt_tokenizers.py ├── test_prompters.py ├── test_revision_parameter.py ├── test_save_deduplicated.py ├── test_schedulers.py ├── test_streaming.py ├── test_tensor_parallel_batch_size.py ├── test_tokenizers.py ├── test_train.py ├── test_triton_kernels.py ├── test_utils_tee.py ├── test_validation_dataset.py └── utils/ ├── callbacks/ │ └── test_dynamic_checkpoint.py ├── data/ │ └── test_utils.py ├── lora/ │ ├── test_config_validation_lora.py │ ├── test_freeze_lora.py │ └── test_merge_lora.py ├── schemas/ │ └── validation/ │ ├── test_activation_offloading.py │ ├── test_default_values.py │ ├── test_fsdp.py │ └── test_moe_quant.py ├── test_grpo_rw_fnc.py ├── test_import_helper.py ├── test_mistral3_processor.py └── test_train.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .axolotl-complete.bash ================================================
#!/bin/bash

# Bash programmable completion for the `axolotl` CLI.
#
# Completes the subcommand name in the first argument position, and
# directories / YAML files directly after a subcommand listed in
# `yaml_commands`. Every other position yields no candidates.
#
# Reads:   COMP_WORDS, COMP_CWORD (set by bash while completing)
# Writes:  COMPREPLY
# Returns: always 0
_axolotl_completions() {
    # Declare every scratch variable local: this function runs inside the
    # user's interactive shell, so an undeclared loop variable would leak into
    # (or overwrite) a global of the same name.
    local cur prev cmd item
    local -a candidates=()
    COMPREPLY=()
    cur="${COMP_WORDS[COMP_CWORD]}"
    prev="${COMP_WORDS[COMP_CWORD-1]}"

    # If we're completing the first argument (the command)
    if [[ $COMP_CWORD -eq 1 ]]; then
        mapfile -t COMPREPLY < <(compgen -W "delinearize-llama4 fetch lm-eval merge-sharded-fsdp-weights quantize vllm-serve evaluate inference merge-lora preprocess train" -- "$cur")
        return 0
    fi

    # Commands that should complete with directories and YAML files
    local -a yaml_commands=("merge-sharded-fsdp-weights" "quantize" "vllm-serve" "evaluate" "inference" "merge-lora" "preprocess" "train")

    # Check whether the previous word is one of those commands. Compare the
    # strings exactly instead of splicing $prev into a regex, where a word
    # such as ".*" would be interpreted as a pattern and match spuriously.
    for cmd in "${yaml_commands[@]}"; do
        [[ "$cmd" == "$prev" ]] || continue

        # Mark the candidates as filenames for readline. compopt fails when
        # the function is invoked outside an active completion (e.g. when
        # called by hand), which must not abort or spam stderr.
        compopt -o filenames 2>/dev/null || true
        mapfile -t candidates < <(compgen -f -- "$cur")

        # Keep only directories and YAML files
        for item in "${candidates[@]}"; do
            if [[ -d "$item" || "$item" == *.yaml || "$item" == *.yml ]]; then
                COMPREPLY+=("$item")
            fi
        done

        return 0
    done

    # Default: no completion
    return 0
}

# Registered without `-o nospace`; the function itself switches on
# `-o filenames` when it offers paths.
complete -F _axolotl_completions axolotl
================================================ FILE: .bandit ================================================ [bandit] exclude = tests skips = B101,B615,B102,B110 ================================================ FILE: .coderabbit.yaml ================================================ # yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json language: "en-US" early_access: false reviews: profile: "chill" request_changes_workflow: false high_level_summary: true review_status: true collapse_walkthrough: true poem: false sequence_diagrams: false auto_review: enabled: true drafts: false auto_incremental_review: false chat: auto_reply: true ================================================ FILE: .coveragerc ================================================ [run] source = axolotl omit = */tests/* setup.py [report] exclude_lines = pragma: no cover def __repr__ raise NotImplementedError if __name__ == .__main__.: pass raise ImportError ================================================ FILE: .editorconfig ================================================ root = true [*] end_of_line = lf insert_final_newline = true trim_trailing_whitespace = true [*.py] indent_style = space indent_size = 4 [**.yml] indent_style = space indent_size = 2 ================================================ FILE: 
.gitattributes ================================================ data/*.jsonl filter=lfs diff=lfs merge=lfs -text ================================================ FILE: .github/CODE_OF_CONDUCT.md ================================================ # Contributor Covenant Code of Conduct ## Our Pledge We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. ## Our Standards Examples of behavior that contributes to a positive environment for our community include: * Demonstrating empathy and kindness toward other people * Being respectful of differing opinions, viewpoints, and experiences * Giving and gracefully accepting constructive feedback * Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience * Focusing on what is best not just for us as individuals, but for the overall community Examples of unacceptable behavior include: * The use of sexualized language or imagery, and sexual attention or advances of any kind * Trolling, insulting or derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or email address, without their explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Enforcement Responsibilities Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, 
threatening, offensive, or harmful. Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. ## Scope This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement on Discord at https://discord.gg/QYF8QrtEUm All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the reporter of any incident. ## Enforcement Guidelines Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: ### 1. Correction **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. ### 2. Warning **Community Impact**: A violation through a single incident or series of actions. **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. 
Violating these terms may lead to a temporary or permanent ban. ### 3. Temporary Ban **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. ### 4. Permanent Ban **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. **Consequence**: A permanent ban from any sort of public interaction within the community. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.0, available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see the FAQ at https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. ================================================ FILE: .github/CONTRIBUTING.md ================================================ # Contributing to axolotl First of all, thank you for your interest in contributing to axolotl! We appreciate the time and effort you're willing to invest in making our project better. This document provides guidelines and information to make the contribution process as smooth as possible. 
## Table of Contents - [Code of Conduct](#code-of-conduct) - [Getting Started](#getting-started) - [How to Contribute](#how-to-contribute) - [Reporting Bugs](#reporting-bugs) - [Suggesting Enhancements](#suggesting-enhancements) - [Submitting Pull Requests](#submitting-pull-requests) - [Style Guidelines](#style-guidelines) - [Code Style](#code-style) - [Commit Messages](#commit-messages) - [Additional Resources](#additional-resources) ## Code of Conduct All contributors are expected to adhere to our [Code of Conduct](CODE_OF_CONDUCT.md). Please read it before participating in the axolotl community. ## Getting Started Bugs? Please check for an existing open issue; if there is none, create a new [Issue](https://github.com/axolotl-ai-cloud/axolotl/issues/new). PRs are **greatly welcome**! 1. Fork the repository and clone it to your local machine. 2. Set up the development environment by following the instructions in the [README.md](https://github.com/axolotl-ai-cloud/axolotl/tree/main/README.md) file. 3. Explore the codebase, run tests, and verify that everything works as expected. Run the commands below to set up your environment: ```bash pip3 install -r requirements-dev.txt -r requirements-tests.txt pre-commit install # test pytest tests/ ``` ## How to Contribute ### Reporting Bugs If you encounter a bug or issue while using axolotl, please open a new issue on the [GitHub Issues](https://github.com/axolotl-ai-cloud/axolotl/issues) page. Provide a clear and concise description of the problem, steps to reproduce it, and any relevant error messages or logs. ### Suggesting Enhancements We welcome ideas for improvements and new features. To suggest an enhancement, open a new issue on the [GitHub Issues](https://github.com/axolotl-ai-cloud/axolotl/issues) page. Describe the enhancement in detail, explain the use case, and outline the benefits it would bring to the project. ### Submitting Pull Requests 1. Create a new branch for your feature or bugfix.
Use a descriptive name like `feature/your-feature-name` or `fix/your-bugfix-name`. 2. Make your changes, following the [Style Guidelines](#style-guidelines) below. 3. Test your changes and ensure that they don't introduce new issues or break existing functionality. 4. Commit your changes, following the [commit message guidelines](#commit-messages). 5. Push your branch to your fork on GitHub. 6. Open a new pull request against the `main` branch of the axolotl repository. Include a clear and concise description of your changes, referencing any related issues. #### Skipping CI Checks You can skip certain CI checks by including specific keywords in your commit messages: - `[skip ci]` or `skip ci` - Skips all CI checks for that commit - `[skip-e2e]` or `skip-e2e` - Skips only end-to-end tests while running other CI checks. You may also include this in the title of your PR to disable end-to-end tests for the entire PR. ## Style Guidelines ### Code Style axolotl uses [Ruff](https://docs.astral.sh/ruff/) as its code style guide. Please ensure that your code follows these guidelines. Use the pre-commit linter to ensure that your code is formatted consistently. ```bash pre-commit run --all-files ``` ### Commit Messages Write clear and concise commit messages that briefly describe the changes made in each commit. Use the imperative mood and start with a capitalized verb, e.g., "Add new feature" or "Fix bug in function". ## Additional Resources - [GitHub Help](https://help.github.com/) - [GitHub Pull Request Documentation](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests) - [Ruff](https://docs.astral.sh/ruff/) Thank you once again for your interest in contributing to axolotl. We look forward to collaborating with you and creating an even better project together! 
================================================ FILE: .github/FUNDING.yml ================================================ # These are supported funding model platforms github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] patreon: # Replace with a single Patreon username open_collective: # Replace with a single Open Collective username ko_fi: # Replace with a single Ko-fi username tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry liberapay: # Replace with a single Liberapay username issuehunt: # Replace with a single IssueHunt username otechie: # Replace with a single Otechie username lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] ================================================ FILE: .github/ISSUE_TEMPLATE/bug-report.yaml ================================================ name: Bug Report description: File a bug report labels: ["bug", "needs triage"] body: - type: markdown attributes: value: | ## Before you start Please **make sure you are on the latest version.** If you encountered the issue after you installed, updated, or reloaded, **please try restarting before reporting the bug**. - type: checkboxes id: no-duplicate-issues attributes: label: "Please check that this issue hasn't been reported before." description: "The **Label filters** may help make your search more focussed." options: - label: "I searched previous [Bug Reports](https://github.com/axolotl-ai-cloud/axolotl/labels/bug) and didn't find any similar reports." required: true - type: textarea id: expected attributes: label: Expected Behavior description: Tell us what **should** happen.
validations: required: true - type: textarea id: what-happened attributes: label: Current behaviour description: | Tell us what happens instead of the expected behavior. Provide stacktrace and/or screenshots. validations: required: true - type: textarea id: reproduce attributes: label: Steps to reproduce description: | Which exact steps can a developer take to reproduce the issue? The more detail you provide, the easier it will be to narrow down and fix the bug. Please paste in tasks and/or queries **as text, not screenshots**. placeholder: | Example of the level of detail needed to reproduce any bugs efficiently and reliably. 1. Go to the '...' page. 2. Click on the '...' button. 3. Scroll down to '...'. 4. Observe the error. validations: required: true - type: textarea id: config attributes: label: Config yaml description: | Please attach the config yaml! render: yaml - type: textarea id: possible-solution attributes: label: Possible solution description: | Not obligatory, but please suggest a fix or reason for the bug, if you have an idea. - type: checkboxes id: operating-systems attributes: label: Which Operating Systems are you using? description: You may select more than one. options: - label: Linux - label: macOS - label: Windows - type: input id: Python-version attributes: label: Python Version description: Which Python version are you using? placeholder: 3.10 / please change accordingly validations: required: true - type: input id: axolotl-branch-commit attributes: label: axolotl branch-commit description: On which branch/commit are you? placeholder: main/4d6490b validations: required: true - type: checkboxes id: acknowledgements attributes: label: 'Acknowledgements' description: 'Please confirm the following:' options: - label: 'My issue title is concise, descriptive, and in title casing.' required: true - label: 'I have searched the existing issues to make sure this bug has not been reported yet.'
required: true - label: 'I am using the latest version of axolotl.' required: true - label: 'I have provided enough information for the maintainers to reproduce and diagnose the issue.' required: true ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ blank_issues_enabled: false contact_links: - name: Ask a question url: https://github.com/axolotl-ai-cloud/axolotl/discussions/categories/q-a about: Ask questions and discuss with other community members - name: Discuss the Project in Discord url: https://discord.gg/HhrNrHJPRb ================================================ FILE: .github/ISSUE_TEMPLATE/docs.yml ================================================ name: Documentation Improvement / Clarity description: Make a suggestion to improve the project documentation. labels: ['needs triage', 'docs'] body: - type: markdown attributes: value: '## :book: Documentation :book:' - type: markdown attributes: value: | * Ask questions in [Discord](https://discord.gg/HhrNrHJPRb). * Before you file an issue read the [Contributing guide](./CONTRIBUTING.md). * Check to make sure someone hasn't already opened a [similar issue](https://github.com/axolotl-ai-cloud/axolotl/issues). - type: textarea attributes: label: What piece of documentation is affected? description: Please link to the article you'd like to see updated. validations: required: true - type: textarea attributes: label: What part(s) of the article would you like to see updated? description: | - Give as much detail as you can to help us understand the change you want to see. - Why should the docs be changed? What use cases does it support? - What is the expected outcome? validations: required: true - type: textarea attributes: label: Additional Information description: Add any other context or screenshots about the feature request here. 
validations: required: false - type: checkboxes id: acknowledgements attributes: label: 'Acknowledgements' description: 'Please confirm the following:' options: - label: 'My issue title is concise, descriptive, and in title casing.' required: true - label: 'I have searched the existing issues to make sure this feature has not been requested yet.' required: true - label: 'I have provided enough information for the maintainers to understand and evaluate this request.' required: true ================================================ FILE: .github/ISSUE_TEMPLATE/feature-request.yaml ================================================ name: Feature Request / Enhancement description: Suggest a new feature or feature enhancement for the project labels: ["enhancement", "needs triage"] body: - type: checkboxes id: no-duplicate-issues attributes: label: "⚠️ Please check that this feature request hasn't been suggested before." description: "There are two locations for previous feature requests. Please search in both. Thank you. The **Label filters** may help make your search more focussed." options: - label: "I searched previous [Ideas in Discussions](https://github.com/axolotl-ai-cloud/axolotl/discussions/categories/ideas) and didn't find any similar feature requests." required: true - label: "I searched previous [Issues](https://github.com/axolotl-ai-cloud/axolotl/labels/enhancement) and didn't find any similar feature requests." required: true - type: textarea id: feature-description validations: required: true attributes: label: "🔖 Feature description" description: "A clear and concise description of what the feature request is." placeholder: "You should add ..." - type: textarea id: solution validations: required: true attributes: label: "✔️ Solution" description: "A clear and concise description of what you want to happen, and why." placeholder: "In my use-case, ..."
- type: textarea id: alternatives validations: required: false attributes: label: "❓ Alternatives" description: "A clear and concise description of any alternative solutions or features you've considered." placeholder: "I have considered ..." - type: textarea id: additional-context validations: required: false attributes: label: "📝 Additional Context" description: "Add any other context or screenshots about the feature request here." placeholder: "..." - type: checkboxes id: acknowledgements attributes: label: 'Acknowledgements' description: 'Please confirm the following:' options: - label: 'My issue title is concise, descriptive, and in title casing.' required: true - label: 'I have searched the existing issues to make sure this feature has not been requested yet.' required: true - label: 'I have provided enough information for the maintainers to understand and evaluate this request.' required: true ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ # Description ## Motivation and Context ## How has this been tested? ## AI Usage Disclaimer ## Screenshots (if appropriate) ## Types of changes ## Social Handles (Optional) ================================================ FILE: .github/SECURITY.md ================================================ # Security Policy ## Supported Versions Due to the nature of the fast development that is happening in this project, only the latest released version can be supported. ## Reporting a Vulnerability If you find a vulnerability, please contact us on [Discord](https://discord.gg/xcu3ECkH9a) rather than creating a GitHub issue to allow us some time to fix it before it is a known vulnerability to others. ================================================ FILE: .github/SUPPORT.md ================================================ # Support If you need help with this project or have questions, please: 1. Check the documentation. 2. 
Search the existing issues and pull requests. 3. Create a new issue if your question is not answered or your problem is not solved. 4. Have a look in the [Discord server](https://discord.gg/HhrNrHJPRb) Please note that this project is maintained by volunteers who have limited availability. We'll do our best to address your questions and concerns in a timely manner. ================================================ FILE: .github/release-drafter.yml ================================================ name-template: 'v$RESOLVED_VERSION' tag-template: 'v$RESOLVED_VERSION' categories: - title: '🚀 Features' labels: - 'feature' - 'enhancement' - title: '🐛 Bug Fixes' labels: - 'fix' - 'bugfix' - 'bug' - title: '🧰 Maintenance' label: 'chore' change-template: '- $TITLE @$AUTHOR (#$NUMBER)' change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks. version-resolver: major: labels: - 'major' minor: labels: - 'minor' patch: labels: - 'patch' default: patch template: | ## What’s Changed $CHANGES ================================================ FILE: .github/workflows/base.yml ================================================ name: ci-cd-base on: push: branches: - "main" paths: - 'docker/Dockerfile-base' - 'docker/Dockerfile-uv-base' - '.github/workflows/base.yml' pull_request: paths: - 'docker/Dockerfile-base' - 'docker/Dockerfile-uv-base' - '.github/workflows/base.yml' workflow_dispatch: permissions: contents: read jobs: build-base: if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }} timeout-minutes: 480 # this job needs to be run on self-hosted GPU runners... 
runs-on: ubuntu-latest-m env: HAS_DOCKERHUB_CREDS: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_TOKEN != '' }} strategy: fail-fast: false matrix: include: - cuda: "128" cuda_version: 12.8.1 cudnn_version: "" python_version: "3.11" pytorch: 2.8.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" dockerfile: "Dockerfile-base" platforms: "linux/amd64" - cuda: "128" cuda_version: 12.8.1 cudnn_version: "" python_version: "3.11" pytorch: 2.9.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" dockerfile: "Dockerfile-base" platforms: "linux/amd64,linux/arm64" - cuda: "128" cuda_version: 12.8.1 cudnn_version: "" python_version: "3.11" pytorch: 2.9.1 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" dockerfile: "Dockerfile-base" platforms: "linux/amd64,linux/arm64" - cuda: "128" cuda_version: 12.8.1 cudnn_version: "" python_version: "3.11" pytorch: 2.10.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" dockerfile: "Dockerfile-base" platforms: "linux/amd64,linux/arm64" - cuda: "128" cuda_version: 12.8.1 cudnn_version: "" python_version: "3.12" pytorch: 2.10.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" dockerfile: "Dockerfile-base" platforms: "linux/amd64,linux/arm64" # - cuda: "129" # cuda_version: 12.9.1 # cudnn_version: "" # python_version: "3.12" # pytorch: 2.9.1 # torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" # dockerfile: "Dockerfile-base" # platforms: "linux/amd64,linux/arm64" - cuda: "130" cuda_version: 13.0.0 cudnn_version: "" python_version: "3.11" pytorch: 2.9.1 torch_cuda_arch_list: "9.0+PTX" dockerfile: "Dockerfile-base" platforms: "linux/amd64,linux/arm64" - cuda: "130" cuda_version: 13.0.0 cudnn_version: "" python_version: "3.12" pytorch: 2.9.1 torch_cuda_arch_list: "9.0+PTX" dockerfile: "Dockerfile-base" platforms: "linux/amd64,linux/arm64" - cuda: "130" cuda_version: 13.0.0 cudnn_version: "" python_version: "3.12" pytorch: 2.10.0 torch_cuda_arch_list: "9.0+PTX" dockerfile: "Dockerfile-base" 
platforms: "linux/amd64,linux/arm64" # - cuda: "128" # cuda_version: 12.8.1 # cudnn_version: "" # python_version: "3.11" # pytorch: nightly # torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" # dockerfile: "Dockerfile-base-nightly" # # "next" is for release candidates of pytorch # - cuda: "128" # cuda_version: 12.8.1 # cudnn_version: "" # python_version: "3.11" # pytorch: next # torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" # dockerfile: "Dockerfile-base-next" steps: - name: Checkout uses: actions/checkout@v4 - name: Docker metadata id: metadata uses: docker/metadata-action@v5 with: images: | axolotlai/axolotl-base - name: Login to Docker Hub uses: docker/login-action@v3 if: ${{ github.event_name != 'pull_request' && env.HAS_DOCKERHUB_CREDS == 'true' }} with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Build uses: docker/build-push-action@v5 with: context: . file: ./docker/${{ matrix.dockerfile }} platforms: ${{ matrix.platforms }} push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} labels: ${{ steps.metadata.outputs.labels }} build-args: | CUDA_VERSION=${{ matrix.cuda_version }} CUDNN_VERSION=${{ matrix.cudnn_version }} CUDA=${{ matrix.cuda }} PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch }} TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }} build-base-uv: if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }} timeout-minutes: 480 runs-on: ubuntu-latest-m env: HAS_DOCKERHUB_CREDS: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_TOKEN != '' }} strategy: fail-fast: false matrix: include: - cuda: "128" cuda_version: 12.8.1 
cudnn_version: "" python_version: "3.11" pytorch: 2.8.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" dockerfile: "Dockerfile-uv-base" platforms: "linux/amd64" - cuda: "128" cuda_version: 12.8.1 cudnn_version: "" python_version: "3.11" pytorch: 2.9.1 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" dockerfile: "Dockerfile-uv-base" platforms: "linux/amd64,linux/arm64" - cuda: "128" cuda_version: 12.8.1 cudnn_version: "" python_version: "3.12" pytorch: 2.9.1 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" dockerfile: "Dockerfile-uv-base" platforms: "linux/amd64,linux/arm64" - cuda: "128" cuda_version: 12.8.1 cudnn_version: "" python_version: "3.11" pytorch: 2.9.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" dockerfile: "Dockerfile-uv-base" platforms: "linux/amd64,linux/arm64" - cuda: "128" cuda_version: 12.8.1 cudnn_version: "" python_version: "3.11" pytorch: 2.10.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" dockerfile: "Dockerfile-uv-base" platforms: "linux/amd64,linux/arm64" - cuda: "128" cuda_version: 12.8.1 cudnn_version: "" python_version: "3.12" pytorch: 2.10.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" dockerfile: "Dockerfile-uv-base" platforms: "linux/amd64,linux/arm64" # - cuda: "129" # cuda_version: 12.9.1 # cudnn_version: "" # python_version: "3.12" # pytorch: 2.9.1 # torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" # dockerfile: "Dockerfile-uv-base" # platforms: "linux/amd64,linux/arm64" - cuda: "130" cuda_version: 13.0.0 cudnn_version: "" python_version: "3.11" pytorch: 2.9.1 torch_cuda_arch_list: "9.0+PTX" dockerfile: "Dockerfile-uv-base" platforms: "linux/amd64,linux/arm64" - cuda: "130" cuda_version: 13.0.0 cudnn_version: "" python_version: "3.12" pytorch: 2.9.1 torch_cuda_arch_list: "9.0+PTX" dockerfile: "Dockerfile-uv-base" platforms: "linux/amd64,linux/arm64" - cuda: "130" cuda_version: 13.0.0 cudnn_version: "" python_version: "3.12" pytorch: 2.10.0 torch_cuda_arch_list: "9.0+PTX" 
dockerfile: "Dockerfile-uv-base" platforms: "linux/amd64,linux/arm64" steps: - name: Checkout uses: actions/checkout@v4 - name: Docker metadata id: metadata uses: docker/metadata-action@v5 with: images: | axolotlai/axolotl-base-uv - name: Login to Docker Hub uses: docker/login-action@v3 if: ${{ github.event_name != 'pull_request' && env.HAS_DOCKERHUB_CREDS == 'true' }} with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Build uses: docker/build-push-action@v5 with: context: . file: ./docker/${{ matrix.dockerfile }} platforms: ${{ matrix.platforms }} push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} labels: ${{ steps.metadata.outputs.labels }} build-args: | CUDA_VERSION=${{ matrix.cuda_version }} CUDNN_VERSION=${{ matrix.cudnn_version }} CUDA=${{ matrix.cuda }} PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch }} TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }} ================================================ FILE: .github/workflows/docs.yml ================================================ name: Publish Docs on: push: branches: - main permissions: contents: write pages: write jobs: build-deploy: runs-on: ubuntu-latest steps: - name: cleanup node run: | sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL - name: Check out repository uses: actions/checkout@v4 - name: Set up Quarto uses: quarto-dev/quarto-actions/setup@v2 - name: Setup Python uses: actions/setup-python@v5 with: python-version: '3.11' - name: Install dependencies run: | python3 -m pip install jupyter quartodoc python3 -m pip install -e . 
- name: Build autodoc run: quartodoc build - name: Publish to GitHub Pages (and render) uses: quarto-dev/quarto-actions/publish@v2 with: target: gh-pages env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ================================================ FILE: .github/workflows/lint.yml ================================================ name: lint on: # check on PRs, and manual triggers merge_group: pull_request: types: [opened, synchronize, reopened, ready_for_review] paths: - '**.py' - 'requirements.txt' - '.github/workflows/*.yml' - "*.[q]md" - "examples/**/*.y[a]?ml" - ".pre-commit-config.yaml" workflow_dispatch: permissions: contents: read jobs: pre-commit: name: pre-commit runs-on: ubuntu-latest if: ${{ !github.event.pull_request.draft }} steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: python-version: "3.11" cache: 'pip' # caching pip dependencies - uses: pre-commit/action@v3.0.1 ================================================ FILE: .github/workflows/main.yml ================================================ name: ci-cd on: push: branches: - "main" tags: - "v*" workflow_dispatch: permissions: contents: read jobs: build-axolotl: if: ${{ ! 
contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }} strategy: fail-fast: false matrix: include: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.8.0 axolotl_extras: platforms: "linux/amd64" - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.9.0 axolotl_extras: platforms: "linux/amd64,linux/arm64" - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.9.1 axolotl_extras: platforms: "linux/amd64,linux/arm64" is_latest: true - cuda: 128 cuda_version: 12.8.1 python_version: "3.12" pytorch: 2.10.0 axolotl_extras: platforms: "linux/amd64,linux/arm64" # - cuda: 129 # cuda_version: 12.9.1 # python_version: "3.12" # pytorch: 2.9.1 # axolotl_extras: # platforms: "linux/amd64,linux/arm64" - cuda: 130 cuda_version: 13.0.0 python_version: "3.11" pytorch: 2.9.1 axolotl_extras: platforms: "linux/amd64,linux/arm64" - cuda: 130 cuda_version: 13.0.0 python_version: "3.12" pytorch: 2.10.0 axolotl_extras: platforms: "linux/amd64,linux/arm64" runs-on: axolotl-gpu-runner steps: - name: Checkout uses: actions/checkout@v4 - name: Docker metadata id: metadata uses: docker/metadata-action@v5 with: images: | axolotlai/axolotl tags: | type=ref,event=branch type=pep440,pattern={{version}} - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Login to Docker Hub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} # guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/ - name: Build and export to Docker uses: docker/build-push-action@v5 with: context: . 
platforms: ${{ matrix.platforms }} build-args: | BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} CUDA=${{ matrix.cuda }} PYTORCH_VERSION=${{ matrix.pytorch }} AXOLOTL_ARGS=${{ matrix.axolotl_args }} AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}} file: ./docker/Dockerfile push: ${{ github.event_name != 'pull_request' }} tags: | ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }} labels: ${{ steps.metadata.outputs.labels }} build-axolotl-uv: if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }} strategy: fail-fast: false matrix: include: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.9.1 axolotl_extras: platforms: "linux/amd64,linux/arm64" - cuda: 128 cuda_version: 12.8.1 python_version: "3.12" pytorch: 2.9.1 axolotl_extras: platforms: "linux/amd64,linux/arm64" is_latest: true - cuda: 128 cuda_version: 12.8.1 python_version: "3.12" pytorch: 2.10.0 axolotl_extras: platforms: "linux/amd64,linux/arm64" - cuda: 130 cuda_version: 13.0.0 python_version: "3.11" pytorch: 2.9.1 axolotl_extras: platforms: "linux/amd64,linux/arm64" - cuda: 130 cuda_version: 13.0.0 python_version: "3.12" pytorch: 2.10.0 axolotl_extras: platforms: "linux/amd64,linux/arm64" runs-on: axolotl-gpu-runner steps: - name: Checkout uses: actions/checkout@v4 - name: Docker metadata id: metadata uses: docker/metadata-action@v5 with: images: | axolotlai/axolotl-uv tags: | type=ref,event=branch type=pep440,pattern={{version}} - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Login to 
Docker Hub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} # guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/ - name: Build and export to Docker uses: docker/build-push-action@v5 with: context: . platforms: ${{ matrix.platforms }} build-args: | BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} CUDA=${{ matrix.cuda }} PYTORCH_VERSION=${{ matrix.pytorch }} AXOLOTL_ARGS=${{ matrix.axolotl_args }} AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}} file: ./docker/Dockerfile-uv push: ${{ github.event_name != 'pull_request' }} tags: | ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }} labels: ${{ steps.metadata.outputs.labels }} build-axolotl-cloud: needs: build-axolotl if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }} # this job needs to be run on self-hosted GPU runners... 
strategy: fail-fast: false matrix: include: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.8.0 axolotl_extras: platforms: "linux/amd64" - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.9.0 axolotl_extras: platforms: "linux/amd64,linux/arm64" - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.9.1 axolotl_extras: is_latest: true platforms: "linux/amd64,linux/arm64" - cuda: 128 cuda_version: 12.8.1 python_version: "3.12" pytorch: 2.10.0 axolotl_extras: platforms: "linux/amd64,linux/arm64" # - cuda: 129 # cuda_version: 12.9.1 # python_version: "3.12" # pytorch: 2.9.1 # axolotl_extras: # platforms: "linux/amd64,linux/arm64" - cuda: 130 cuda_version: 13.0.0 python_version: "3.11" pytorch: 2.9.1 axolotl_extras: platforms: "linux/amd64,linux/arm64" - cuda: 130 cuda_version: 13.0.0 python_version: "3.12" pytorch: 2.10.0 axolotl_extras: platforms: "linux/amd64,linux/arm64" runs-on: axolotl-gpu-runner steps: - name: Checkout uses: actions/checkout@v4 - name: Docker metadata id: metadata uses: docker/metadata-action@v5 with: images: | axolotlai/axolotl-cloud tags: | type=ref,event=branch type=pep440,pattern={{version}} - name: Login to Docker Hub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Build uses: docker/build-push-action@v5 with: context: . 
platforms: ${{ matrix.platforms }} build-args: | BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} CUDA=${{ matrix.cuda }} file: ./docker/Dockerfile-cloud push: ${{ github.event_name != 'pull_request' }} tags: | ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }} labels: ${{ steps.metadata.outputs.labels }} build-axolotl-cloud-uv: needs: build-axolotl-uv if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }} # this job needs to be run on self-hosted GPU runners... strategy: fail-fast: false matrix: include: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.9.1 axolotl_extras: platforms: "linux/amd64,linux/arm64" - cuda: 128 cuda_version: 12.8.1 python_version: "3.12" pytorch: 2.9.1 axolotl_extras: is_latest: true platforms: "linux/amd64,linux/arm64" - cuda: 128 cuda_version: 12.8.1 python_version: "3.12" pytorch: 2.10.0 axolotl_extras: platforms: "linux/amd64,linux/arm64" - cuda: 130 cuda_version: 13.0.0 python_version: "3.11" pytorch: 2.9.1 axolotl_extras: platforms: "linux/amd64,linux/arm64" - cuda: 130 cuda_version: 13.0.0 python_version: "3.12" pytorch: 2.10.0 axolotl_extras: platforms: "linux/amd64,linux/arm64" runs-on: axolotl-gpu-runner steps: - name: Checkout uses: actions/checkout@v4 - name: Docker metadata id: metadata uses: docker/metadata-action@v5 with: images: | axolotlai/axolotl-cloud-uv tags: | type=ref,event=branch type=pep440,pattern={{version}} - name: Login to Docker Hub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN 
}} - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Build uses: docker/build-push-action@v5 with: context: . platforms: ${{ matrix.platforms }} build-args: | BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} CUDA=${{ matrix.cuda }} file: ./docker/Dockerfile-cloud-uv push: ${{ github.event_name != 'pull_request' }} tags: | ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }} labels: ${{ steps.metadata.outputs.labels }} build-axolotl-cloud-no-tmux: needs: build-axolotl if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }} # this job needs to be run on self-hosted GPU runners... strategy: fail-fast: false matrix: include: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.9.1 axolotl_extras: is_latest: true - cuda: 130 cuda_version: 13.0.0 python_version: "3.11" pytorch: 2.9.1 axolotl_extras: is_latest: runs-on: axolotl-gpu-runner steps: - name: Checkout uses: actions/checkout@v4 - name: Docker metadata id: metadata uses: docker/metadata-action@v5 with: images: | axolotlai/axolotl-cloud-term tags: | type=ref,event=branch type=pep440,pattern={{version}} - name: Login to Docker Hub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Build uses: docker/build-push-action@v5 with: context: . 
platforms: linux/amd64,linux/arm64 build-args: | BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} CUDA=${{ matrix.cuda }} file: ./docker/Dockerfile-cloud-no-tmux push: ${{ github.event_name != 'pull_request' }} tags: | ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }} labels: ${{ steps.metadata.outputs.labels }} ================================================ FILE: .github/workflows/multi-gpu-e2e.yml ================================================ name: docker-multigpu-tests-biweekly on: pull_request: paths: - 'tests/e2e/multigpu/**.py' - 'requirements.txt' - 'setup.py' - 'pyproject.toml' - '.github/workflows/multi-gpu-e2e.yml' - 'scripts/cutcrossentropy_install.py' - 'src/axolotl/core/trainers/mixins/sequence_parallel.py' - 'src/axolotl/utils/distributed.py' workflow_dispatch: schedule: - cron: '0 0 * * 1,4' # Runs at 00:00 UTC every monday & thursday # Cancel jobs on the same ref if a new one is triggered concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} permissions: contents: read env: MODAL_IMAGE_BUILDER_VERSION: "2025.06" jobs: test-axolotl-multigpu: if: ${{ ! 
contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }} strategy: fail-fast: false matrix: include: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.8.0 axolotl_extras: fbgemm-gpu num_gpus: 2 # - cuda: 129 # cuda_version: 12.9.1 # python_version: "3.12" # pytorch: 2.9.1 # axolotl_extras: "fbgemm-gpu" # num_gpus: 2 # dockerfile: "Dockerfile-uv.jinja" - cuda: 130 cuda_version: 13.0.0 python_version: "3.11" pytorch: 2.9.1 axolotl_extras: # axolotl_extras: fbgemm-gpu num_gpus: 2 - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.10.0 axolotl_extras: "fbgemm-gpu" num_gpus: 2 dockerfile: "Dockerfile-uv.jinja" runs-on: [self-hosted, modal] timeout-minutes: 120 steps: - name: Checkout uses: actions/checkout@v4 - name: Install Python uses: actions/setup-python@v5 with: python-version: "3.11" - name: Install Modal run: | python -m pip install --upgrade pip pip install modal==1.3.0.post1 jinja2 - name: Update env vars run: | echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV - name: Run tests job on Modal env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} run: | modal run -m cicd.multigpu ================================================ FILE: .github/workflows/nightlies.yml ================================================ name: docker-nightlies on: workflow_dispatch: schedule: - cron: '0 0 * * *' # Runs at 00:00 UTC every day permissions: contents: read jobs: build-axolotl: if: ${{ ! 
contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }} strategy: fail-fast: false matrix: include: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.8.0 axolotl_extras: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.9.1 axolotl_extras: runs-on: axolotl-gpu-runner steps: - name: Checkout uses: actions/checkout@v4 - name: Docker metadata id: metadata uses: docker/metadata-action@v5 with: images: | axolotlai/axolotl tags: | type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }} - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Login to Docker Hub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} # guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/ - name: Build and export to Docker uses: docker/build-push-action@v5 with: context: . build-args: | BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} CUDA=${{ matrix.cuda }} PYTORCH_VERSION=${{ matrix.pytorch }} AXOLOTL_ARGS=${{ matrix.axolotl_args }} file: ./docker/Dockerfile push: ${{ github.event_name != 'pull_request' }} tags: | ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} labels: ${{ steps.metadata.outputs.labels }} build-axolotl-cloud: needs: build-axolotl if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }} # this job needs to be run on self-hosted GPU runners... 
strategy: matrix: include: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.8.0 axolotl_extras: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.9.1 axolotl_extras: runs-on: axolotl-gpu-runner steps: - name: Checkout uses: actions/checkout@v4 - name: Docker metadata id: metadata uses: docker/metadata-action@v5 with: images: | axolotlai/axolotl-cloud tags: | type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }} - name: Login to Docker Hub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Build uses: docker/build-push-action@v5 with: context: . build-args: | BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} CUDA=${{ matrix.cuda }} file: ./docker/Dockerfile-cloud push: ${{ github.event_name != 'pull_request' }} tags: | ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} labels: ${{ steps.metadata.outputs.labels }} ================================================ FILE: .github/workflows/precommit-autoupdate.yml ================================================ name: Pre-commit auto-update on: schedule: - cron: '0 0 1 * *' # Run monthly workflow_dispatch: # Manual kickoff permissions: {} jobs: auto-update: runs-on: ubuntu-latest permissions: contents: write pull-requests: write steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: python-version: '3.11' - name: Update pre-commit hooks id: update run: | pip install pre-commit pre-commit autoupdate if [[ -n $(git status --porcelain) ]]; then echo "changes=true" >> $GITHUB_OUTPUT fi - name: Create Pull Request if: steps.update.outputs.changes == 'true' uses: 
peter-evans/create-pull-request@v6 with: token: ${{ secrets.GITHUB_TOKEN }} branch: update/pre-commit-hooks delete-branch: true title: "chore: update pre-commit hooks" commit-message: "chore: update pre-commit hooks" body: | Automated PR to update pre-commit hooks to their latest versions. ================================================ FILE: .github/workflows/preview-docs.yml ================================================ name: Preview on: workflow_dispatch: pull_request: types: [opened, synchronize, reopened, ready_for_review] # Run the workflow only when one of these files changes paths: - '**/*.md' # any Markdown file - '**/*.qmd' # any Quarto file - '_quarto.yml' - docs/scripts/generate_config_docs.py - src/axolotl/utils/schemas/**.py - .github/workflows/preview-docs.yml permissions: contents: read pull-requests: write jobs: preview: runs-on: ubuntu-latest if: ${{ !github.event.pull_request.draft }} steps: - name: cleanup node run: | sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL - name: Check out repository uses: actions/checkout@v4 with: ref: ${{ github.event.pull_request.head.sha }} - name: Set up Quarto uses: quarto-dev/quarto-actions/setup@v2 - name: Setup Python uses: actions/setup-python@v5 with: python-version: '3.11' - name: Install dependencies run: | python3 -m pip install jupyter quartodoc python3 -m pip install -e . 
- name: Build autodoc run: quartodoc build - name: Quarto render run: quarto render - name: Netlify Publish uses: nwtgck/actions-netlify@v3.0 if: ${{ github.event.pull_request.head.repo.full_name == github.repository }} id: netlify with: publish-dir: './_site' enable-pull-request-comment: false enable-github-deployment: false github-token: ${{ secrets.GITHUB_TOKEN }} deploy-message: "Deployed On Netlify" github-deployment-environment: 'preview' github-deployment-description: 'Preview Deployment' env: NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }} NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }} - name: Update PR with preview link if: ${{ steps.netlify.outcome == 'success' }} uses: marocchino/sticky-pull-request-comment@v2 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} message: | 📖 **Documentation Preview**: ${{ steps.netlify.outputs.deploy-url }} Deployed on Netlify from commit ${{ github.event.pull_request.head.sha }} ================================================ FILE: .github/workflows/pypi.yml ================================================ name: publish pypi on: push: tags: - "v*" workflow_dispatch: permissions: {} jobs: setup_release: name: Create Release runs-on: ubuntu-latest permissions: contents: write steps: - name: Checkout code uses: actions/checkout@v4 - name: Create release env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: gh release create "$GITHUB_REF_NAME" --generate-notes pypi-publish: name: Upload release to PyPI runs-on: ubuntu-latest needs: [setup_release] environment: name: pypi url: https://pypi.org/p/axolotl permissions: contents: read id-token: write # IMPORTANT: this permission is mandatory for trusted publishing steps: - name: Check out repository code uses: actions/checkout@v4 - name: Setup Python uses: actions/setup-python@v5 with: python-version: "3.11" - name: Install dependencies run: | pip3 install wheel packaging==26.0 pip3 install --no-build-isolation -e . 
pip3 install -r requirements-dev.txt -r requirements-tests.txt - name: Extract tag name id: tag run: echo "TAG_NAME=$(echo $GITHUB_REF | cut -d / -f 3)" >> "$GITHUB_OUTPUT" - name: Update version in VERSION file run: | echo "${{ steps.tag.outputs.TAG_NAME }}" | sed 's/^v//' > VERSION - name: Build a source dist run: | python setup.py sdist - name: Publish package distributions to PyPI uses: pypa/gh-action-pypi-publish@release/v1 ================================================ FILE: .github/workflows/tests-nightly.yml ================================================ name: Tests Nightly against upstream main on: workflow_dispatch: schedule: - cron: '0 0 * * *' # Runs at 00:00 UTC every day pull_request: types: [opened, synchronize, reopened, ready_for_review] paths: - '.github/workflows/tests-nightly.yml' permissions: contents: read jobs: pre-commit: name: pre-commit runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: python-version: "3.11" cache: 'pip' # caching pip dependencies - uses: pre-commit/action@v3.0.1 env: SKIP: no-commit-to-branch prime-cdn-s3-cache: name: Prefetch S3 once to prime the CDN cache runs-on: ubuntu-latest if: ${{ !github.event.pull_request.draft }} timeout-minutes: 10 steps: - name: Restore Cache from S3 id: hf-cache-restore-s3 run: | curl -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null pytest: name: PyTest runs-on: ubuntu-latest needs: [prime-cdn-s3-cache] strategy: fail-fast: false matrix: python_version: ["3.12"] # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged pytorch_version: ["2.8.0", "2.9.1", "2.10.0"] timeout-minutes: 20 steps: - name: Check out repository code uses: actions/checkout@v4 - name: Restore Cache from S3 id: hf-cache-restore-s3 run: | mkdir -p /home/runner/.cache/huggingface/hub curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ 
--use-compress-program unzstd - name: Setup Python uses: actions/setup-python@v5 with: python-version: ${{ matrix.python_version }} cache: 'pip' # caching pip dependencies - name: upgrade pip run: | pip3 install --upgrade pip pip3 install --upgrade packaging==26.0 setuptools==78.1.1 wheel - name: Install PyTorch run: | pip3 install torch==${{ matrix.pytorch_version }} torchvision - name: Update requirements.txt run: | sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt - name: Install dependencies run: | pip3 show torch pip3 install --no-build-isolation -U -e . python scripts/unsloth_install.py | sh python scripts/cutcrossentropy_install.py | sh pip3 install -r requirements-dev.txt -r requirements-tests.txt - name: Make sure PyTorch version wasn't clobbered run: | python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__" - name: Ensure axolotl CLI was installed run: | axolotl --help - name: Run tests run: | pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/ pytest -v --durations=10 tests/patched/ pytest -v --durations=10 tests/cli/ - name: cleanup pip cache run: | find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \; docker-e2e-tests: if: github.repository_owner == 'axolotl-ai-cloud' # this job needs to be run on self-hosted GPU runners... 
runs-on: [self-hosted, modal] timeout-minutes: 120 needs: [pre-commit, pytest] strategy: fail-fast: false matrix: include: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.9.1 num_gpus: 1 axolotl_extras: nightly_build: "true" - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.10.0 num_gpus: 1 axolotl_extras: - cuda: 130 cuda_version: 13.0.0 python_version: "3.12" pytorch: 2.9.1 num_gpus: 1 axolotl_extras: dockerfile: "Dockerfile-uv.jinja" nightly_build: "true" steps: - name: Checkout uses: actions/checkout@v4 - name: Install Python uses: actions/setup-python@v5 with: python-version: "3.11" - name: Install Modal run: | python -m pip install --upgrade pip pip install modal==1.3.0.post1 jinja2 - name: Update env vars run: | echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV - name: Run tests job on Modal env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} run: | modal run cicd.e2e_tests docker-e2e-multigpu-tests: if: github.repository_owner == 'axolotl-ai-cloud' # this job needs to be run on self-hosted GPU runners... 
runs-on: [self-hosted, modal] timeout-minutes: 120 needs: [pre-commit, pytest, docker-e2e-tests] strategy: fail-fast: false matrix: include: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.9.1 num_gpus: 2 axolotl_extras: nightly_build: "true" steps: - name: Checkout uses: actions/checkout@v4 - name: Install Python uses: actions/setup-python@v5 with: python-version: "3.11" - name: Install Modal run: | python -m pip install --upgrade pip pip install modal==1.3.0.post1 jinja2 - name: Update env vars run: | echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV - name: Run tests job on Modal env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} run: | modal run cicd.multigpu ================================================ FILE: .github/workflows/tests.yml ================================================ name: Tests on: # check on push/merge to main, PRs, and manual triggers merge_group: push: branches: - "main" paths: - '**.py' - 'requirements.txt' - '.github/workflows/*.yml' - 'requirements-tests.txt' - 'cicd/cicd.sh' - 'cicd/Dockerfile.jinja' pull_request: types: [opened, synchronize, reopened, ready_for_review] paths: - '**.py' - 'requirements.txt' - '.github/workflows/*.yml' - 'requirements-tests.txt' - 'cicd/cicd.sh' - 'cicd/Dockerfile.jinja' workflow_dispatch: # Cancel jobs on the same ref if a new one is triggered concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} permissions: contents: read env: TRANSFORMERS_IS_CI: "yes" jobs: pre-commit: name: pre-commit runs-on: ubuntu-latest 
if: ${{ !github.event.pull_request.draft }} steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: python-version: "3.11" cache: 'pip' # caching pip dependencies - uses: pre-commit/action@v3.0.1 env: SKIP: no-commit-to-branch prime-cdn-s3-cache: name: Prefetch S3 once to prime the CDN cache runs-on: ubuntu-latest if: ${{ !github.event.pull_request.draft }} timeout-minutes: 10 steps: - name: Restore Cache from S3 id: hf-cache-restore-s3 run: | curl -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null pytest: name: PyTest runs-on: ubuntu-latest if: ${{ !github.event.pull_request.draft }} needs: [prime-cdn-s3-cache] strategy: fail-fast: false matrix: python_version: ["3.12"] # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged pytorch_version: ["2.8.0", "2.9.1", "2.10.0"] # exclude: # - python_version: "3.14" # pytorch_version: "2.8.0" # - python_version: "3.14" # pytorch_version: "2.9.1" timeout-minutes: 20 steps: - name: cleanup node run: | sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL - name: Check out repository code uses: actions/checkout@v4 - name: Restore Cache from S3 id: hf-cache-restore-s3 run: | mkdir -p ~/.cache/huggingface/hub curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd --strip-components=1 ls -ltr ~/.cache/huggingface/hub/ - name: Setup Python uses: actions/setup-python@v5 with: python-version: ${{ matrix.python_version }} cache: 'pip' # caching pip dependencies - name: upgrade pip run: | pip3 install --upgrade pip pip3 install --upgrade packaging==26.0 setuptools==75.8.0 wheel - name: Install PyTorch run: | pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision - name: Install dependencies run: | pip3 show torch pip3 install --no-cache-dir --no-build-isolation -U -e . 
python scripts/unsloth_install.py | sh python scripts/cutcrossentropy_install.py | sh pip3 install -r requirements-dev.txt -r requirements-tests.txt - name: cleanup pip cache run: | find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \; - name: Make sure PyTorch version wasn't clobbered run: | python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__" - name: Ensure axolotl CLI was installed run: | axolotl --help - name: Pre-Download dataset fixture run: | hf download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures - name: Show HF cache run: hf cache ls - name: Run tests run: | df -h pytest -v --durations=10 -n4 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml df -h pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml df -h pytest -v --durations=10 tests/patched/ --cov=axolotl --cov-append --cov-report=xml df -h pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml - name: Show HF cache run: hf cache ls - name: Upload coverage to Codecov uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} files: ./coverage.xml flags: unittests,pytorch-${{ matrix.pytorch_version }} fail_ci_if_error: false pytest-sdist: name: PyTest from Source Dist runs-on: ubuntu-latest if: ${{ !github.event.pull_request.draft }} needs: [prime-cdn-s3-cache] strategy: fail-fast: false matrix: python_version: ["3.12"] # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged pytorch_version: ["2.8.0", "2.9.1", "2.10.0"] # exclude: # - python_version: "3.14" # pytorch_version: "2.8.0" # - python_version: "3.14" # pytorch_version: "2.9.1" timeout-minutes: 30 steps: - name: cleanup node run: | sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL - name: Check out repository code uses: 
actions/checkout@v4 - name: Restore Cache from S3 id: hf-cache-restore-s3 run: | mkdir -p ~/.cache/huggingface/hub curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd --strip-components=1 ls -ltr ~/.cache/huggingface/hub/ - name: Setup Python uses: actions/setup-python@v5 with: python-version: ${{ matrix.python_version }} cache: 'pip' # caching pip dependencies - name: upgrade pip run: | pip3 install --upgrade pip pip3 install --upgrade packaging==26.0 setuptools==75.8.0 setuptools_scm build wheel psutil - name: Install PyTorch run: | pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision - name: Install dependencies run: | pip3 show torch python -m build --no-isolation --sdist pip3 install --no-cache-dir --no-build-isolation dist/axolotl*.tar.gz python scripts/unsloth_install.py | sh python scripts/cutcrossentropy_install.py | sh pip3 install -r requirements-dev.txt -r requirements-tests.txt - name: cleanup pip cache run: | find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \; - name: Make sure PyTorch version wasn't clobbered run: | python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__" - name: Ensure axolotl CLI was installed run: | axolotl --help - name: Show HF cache run: hf cache ls - name: Run tests run: | pytest -v --durations=10 -n4 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml pytest -v --durations=10 tests/cli/ - name: Show HF cache run: hf cache ls gate-skip-e2e: needs: [pre-commit] runs-on: ubuntu-latest outputs: skip: ${{ steps.compute.outputs.skip }} steps: - uses: actions/github-script@v7 id: compute with: script: | const token = /\[skip-e2e\]/i; let msg = ''; if (context.eventName === 'push') { msg = 
context.payload.head_commit?.message || ''; } else if (context.eventName === 'pull_request') { const { owner, repo } = context.repo; const prNumber = context.payload.pull_request.number; const commits = await github.paginate( github.rest.pulls.listCommits, { owner, repo, pull_number: prNumber, per_page: 100 } ); msg = commits.at(-1)?.commit?.message || ''; } const title = context.payload.pull_request?.title || ''; const body = context.payload.pull_request?.body || ''; const skip = token.test(msg) || token.test(title) || token.test(body); core.setOutput('skip', String(skip)); docker-e2e-tests-1st: # Run this job first as a gate for running the remainder of the test matrix if: > github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) && needs.gate-skip-e2e.outputs.skip != 'true' # this job needs to be run on self-hosted GPU runners... runs-on: [self-hosted, modal] timeout-minutes: 120 needs: [pre-commit, pytest, gate-skip-e2e] strategy: fail-fast: false matrix: include: - cuda: 130 cuda_version: 13.0.0 python_version: "3.12" pytorch: 2.9.1 num_gpus: 1 axolotl_extras: dockerfile: "Dockerfile-uv.jinja" steps: - name: Checkout uses: actions/checkout@v4 - name: Install Python uses: actions/setup-python@v5 with: python-version: "3.11" - name: Install Modal run: | python -m pip install --upgrade pip pip install modal==1.3.0.post1 jinja2 - name: Update env vars run: | echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV - name: Run tests job on
Modal env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} run: | modal run cicd.e2e_tests docker-e2e-tests: if: > github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) && needs.gate-skip-e2e.outputs.skip != 'true' # this job needs to be run on self-hosted GPU runners... runs-on: [self-hosted, modal] timeout-minutes: 120 # Only run the remainder of the matrix if the first e2e check passed; # this is to save on wasted compute costs for known failures that get caught in the first run needs: [pre-commit, pytest, gate-skip-e2e, docker-e2e-tests-1st] strategy: fail-fast: false matrix: include: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.8.0 num_gpus: 1 gpu_type: "B200" axolotl_extras: fbgemm-gpu - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.9.1 num_gpus: 1 axolotl_extras: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.10.0 num_gpus: 1 axolotl_extras: - cuda: 130 cuda_version: 13.0.0 python_version: "3.11" pytorch: 2.9.1 num_gpus: 1 axolotl_extras: steps: - name: Checkout uses: actions/checkout@v4 - name: Install Python uses: actions/setup-python@v5 with: python-version: "3.11" - name: Install Modal run: | python -m pip install --upgrade pip pip install modal==1.3.0.post1 jinja2 - name: Update env vars run: | echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV echo "GPU_TYPE=${{ matrix.gpu_type || 'L40S'}}" >> $GITHUB_ENV echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV - name: Run tests job on Modal env: 
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} run: | modal run cicd.e2e_tests docker-e2e-cleanup: runs-on: [self-hosted, modal] timeout-minutes: 90 needs: [docker-e2e-tests] if: ${{ !github.event.pull_request.draft }} strategy: fail-fast: false matrix: include: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.9.1 num_gpus: 1 axolotl_extras: steps: - name: Checkout uses: actions/checkout@v4 - name: Install Python uses: actions/setup-python@v5 with: python-version: "3.11" - name: Install Modal run: | python -m pip install --upgrade pip pip install modal==1.3.0.post1 jinja2 - name: Update env vars run: | echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV - name: Run tests job on Modal run: | modal run cicd.cleanup ================================================ FILE: .gitignore ================================================ **/axolotl.egg-info configs last_run_prepared/ outputs .vscode _site/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/#use-with-ide .pdm.toml # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ venv3.10/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. .idea/ # WandB # wandb creates a folder to store logs for training runs wandb # Runs lora-out/* qlora-out/* mlruns/* /.quarto/ prepared-datasets/ submit.sh *.out* # Quartodoc generated files objects.json site_libs/ typings/ out/ # vim *.swp # scm auto-versioning src/axolotl/_version.py ================================================ FILE: .mypy.ini ================================================ [mypy] plugins = pydantic.mypy exclude = venv [mypy-alpaca_lora_4bit.*] ignore_missing_imports = True [mypy-axolotl.monkeypatch.*] ignore_errors = True [mypy-axolotl.models.mixtral.*] ignore_errors = True [mypy-axolotl.integrations.liger.models.*] ignore_errors = True [mypy-axolotl.models.phi.*] ignore_errors = True [mypy-flash_attn.*] ignore_missing_imports = True [mypy-huggingface_hub] ignore_missing_imports = True [mypy-transformers.*] ignore_missing_imports = True [mypy-peft] ignore_missing_imports = True [mypy-wandb] ignore_missing_imports = True [mypy-bitsandbytes] ignore_missing_imports = True [mypy-requests] ignore_missing_imports = True [mypy-datasets] ignore_missing_imports = True [mypy-fire] ignore_missing_imports 
= True [mypy-setuptools] ignore_missing_imports = True [mypy-addict] ignore_missing_imports = True [mypy-xformers.*] ignore_missing_imports = True ================================================ FILE: .pre-commit-config.yaml ================================================ default_language_version: python: python3 repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 hooks: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace - id: no-commit-to-branch args: ['--branch', 'main'] - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.15.4 hooks: - id: ruff args: [--fix] - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.19.1 hooks: - id: mypy additional_dependencies: [ 'types-PyYAML', 'pydantic>=2.5.3', ] - repo: https://github.com/PyCQA/bandit rev: 1.9.4 hooks: - id: bandit args: [ '--ini', '.bandit', ] ================================================ FILE: .runpod/.gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/#use-with-ide .pdm.toml # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ pod/scripts/config.yaml ================================================ FILE: .runpod/Dockerfile ================================================ FROM axolotlai/axolotl-cloud:main-py3.11-cu124-2.6.0 COPY .runpod/requirements.txt /requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install --upgrade pip && \ python3 -m pip install --upgrade -r /requirements.txt # Environment settings ARG BASE_VOLUME="/runpod-volume" ENV BASE_VOLUME=$BASE_VOLUME ENV HF_DATASETS_CACHE="${BASE_VOLUME}/huggingface-cache/datasets" ENV HUGGINGFACE_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub" ENV HF_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub" ENV TRANSFORMERS_CACHE="${BASE_VOLUME}/huggingface-cache/hub" COPY .runpod/src /src WORKDIR /src CMD ["python3", "/src/handler.py"] ================================================ FILE: .runpod/README.md ================================================

LLM Post-Training: full fine-tune, LoRA, QLoRA, etc. Llama/Mistral/Gemma and more

# Configuration Options This document outlines all available configuration options for training models. The configuration can be provided as a JSON request. ## Usage You can use these configuration options: 1. As a JSON request body: ```json { "input": { "user_id": "user", "model_id": "model-name", "run_id": "run-id", "credentials": { "wandb_api_key": "", # add your Weights & Biases key. TODO: you will be able to set this in environment variables. "hf_token": "", # add your HF_token. TODO: you will be able to set this in environment variables. }, "args": { "base_model": "NousResearch/Llama-3.2-1B", // ... other options } } } ``` ## Configuration Options ### Model Configuration | Option | Description | Default | | ------------------- | --------------------------------------------------------------------------------------------- | -------------------- | | `base_model` | Path to the base model (local or HuggingFace) | Required | | `base_model_config` | Configuration path for the base model | Same as base_model | | `revision_of_model` | Specific model revision from HuggingFace hub | Latest | | `tokenizer_config` | Custom tokenizer configuration path | Optional | | `model_type` | Type of model to load | AutoModelForCausalLM | | `tokenizer_type` | Type of tokenizer to use | AutoTokenizer | | `hub_model_id` | Repository ID where the model will be pushed on Hugging Face Hub (format: username/repo-name) | Optional | ## Model Family Identification | Option | Default | Description | | -------------------------- | ------- | ------------------------------ | | `is_falcon_derived_model` | `false` | Whether model is Falcon-based | | `is_llama_derived_model` | `false` | Whether model is LLaMA-based | | `is_qwen_derived_model` | `false` | Whether model is Qwen-based | | `is_mistral_derived_model` | `false` | Whether model is Mistral-based | ## Model Configuration Overrides | Option | Default | Description | | ----------------------------------------------- | ---------- |
---------------------------------- | | `overrides_of_model_config.rope_scaling.type` | `"linear"` | RoPE scaling type (linear/dynamic) | | `overrides_of_model_config.rope_scaling.factor` | `1.0` | RoPE scaling factor | ### Model Loading Options | Option | Description | Default | | -------------- | ----------------------------- | ------- | | `load_in_8bit` | Load model in 8-bit precision | false | | `load_in_4bit` | Load model in 4-bit precision | false | | `bf16` | Use bfloat16 precision | false | | `fp16` | Use float16 precision | false | | `tf32` | Use tensor float 32 precision | false | ## Memory and Device Settings | Option | Default | Description | | ------------------ | --------- | ----------------------- | | `gpu_memory_limit` | `"20GiB"` | GPU memory limit | | `lora_on_cpu` | `false` | Load LoRA on CPU | | `device_map` | `"auto"` | Device mapping strategy | | `max_memory` | `null` | Max memory per device | ## Training Hyperparameters | Option | Default | Description | | ----------------------------- | --------- | --------------------------- | | `gradient_accumulation_steps` | `1` | Gradient accumulation steps | | `micro_batch_size` | `2` | Batch size per GPU | | `eval_batch_size` | `null` | Evaluation batch size | | `num_epochs` | `4` | Number of training epochs | | `warmup_steps` | `100` | Warmup steps | | `warmup_ratio` | `0.05` | Warmup ratio | | `learning_rate` | `0.00003` | Learning rate | | `lr_quadratic_warmup` | `false` | Quadratic warmup | | `logging_steps` | `null` | Logging frequency | | `eval_steps` | `null` | Evaluation frequency | | `evals_per_epoch` | `null` | Evaluations per epoch | | `save_strategy` | `"epoch"` | Checkpoint saving strategy | | `save_steps` | `null` | Saving frequency | | `saves_per_epoch` | `null` | Saves per epoch | | `save_total_limit` | `null` | Maximum checkpoints to keep | | `max_steps` | `null` | Maximum training steps | ### Dataset Configuration ```yaml datasets: - path: vicgalle/alpaca-gpt4 # HuggingFace dataset or 
TODO: You will be able to add the local path. type: alpaca # Format type (alpaca, gpteacher, oasst, etc.) ds_type: json # Dataset type data_files: path/to/data # Source data files train_on_split: train # Dataset split to use ``` ## Chat Template Settings | Option | Default | Description | | ------------------------ | -------------------------------- | ---------------------- | | `chat_template` | `"tokenizer_default"` | Chat template type | | `chat_template_jinja` | `null` | Custom Jinja template | | `default_system_message` | `"You are a helpful assistant."` | Default system message | ## Dataset Processing | Option | Default | Description | | --------------------------------- | -------------------------- | ----------------------------------- | | `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset | | `push_dataset_to_hub` | `""` | Push dataset to HF hub | | `dataset_num_proc` | `4` | Number of preprocessing processes | | `dataset_keep_in_memory` | `false` | Keep dataset in memory | | `shuffle_merged_datasets` | `true` | Shuffle merged datasets | | `shuffle_before_merging_datasets` | `false` | Shuffle each dataset before merging | | `dataset_exact_deduplication` | `true` | Deduplicate datasets | ## LoRA Configuration | Option | Default | Description | | -------------------------- | ---------------------- | ------------------------------ | | `adapter` | `"lora"` | Adapter type (lora/qlora) | | `lora_model_dir` | `""` | Directory with pretrained LoRA | | `lora_r` | `8` | LoRA attention dimension | | `lora_alpha` | `16` | LoRA alpha parameter | | `lora_dropout` | `0.05` | LoRA dropout | | `lora_target_modules` | `["q_proj", "v_proj"]` | Modules to apply LoRA | | `lora_target_linear` | `false` | Target all linear modules | | `peft_layers_to_transform` | `[]` | Layers to transform | | `lora_modules_to_save` | `[]` | Modules to save | | `lora_fan_in_fan_out` | `false` | Fan in/out structure | ## Optimization Settings | Option | Default | 
Description | | ------------------------- | ------- | -------------------------- | | `train_on_inputs` | `false` | Train on input prompts | | `group_by_length` | `false` | Group by sequence length | | `gradient_checkpointing` | `false` | Use gradient checkpointing | | `early_stopping_patience` | `3` | Early stopping patience | ## Learning Rate Scheduling | Option | Default | Description | | -------------------------- | ---------- | -------------------- | | `lr_scheduler` | `"cosine"` | Scheduler type | | `lr_scheduler_kwargs` | `{}` | Scheduler parameters | | `cosine_min_lr_ratio` | `null` | Minimum LR ratio | | `cosine_constant_lr_ratio` | `null` | Constant LR ratio | | `lr_div_factor` | `null` | LR division factor | ## Optimizer Settings | Option | Default | Description | | ---------------------- | ------------ | ------------------- | | `optimizer` | `"adamw_hf"` | Optimizer choice | | `optim_args` | `{}` | Optimizer arguments | | `optim_target_modules` | `[]` | Target modules | | `weight_decay` | `null` | Weight decay | | `adam_beta1` | `null` | Adam beta1 | | `adam_beta2` | `null` | Adam beta2 | | `adam_epsilon` | `null` | Adam epsilon | | `max_grad_norm` | `null` | Gradient clipping | ## Attention Implementations | Option | Default | Description | | -------------------------- | ------- | ----------------------------- | | `flash_optimum` | `false` | Use better transformers | | `xformers_attention` | `false` | Use xformers | | `flash_attention` | `false` | Use flash attention | | `flash_attn_cross_entropy` | `false` | Flash attention cross entropy | | `flash_attn_rms_norm` | `false` | Flash attention RMS norm | | `flash_attn_fuse_mlp` | `false` | Fuse MLP operations | | `sdp_attention` | `false` | Use scaled dot product | | `s2_attention` | `false` | Use shifted sparse attention | ## Tokenizer Modifications | Option | Default | Description | | ---------------- | ------- | ---------------------------- | | `special_tokens` | - | Special tokens to add/modify | | 
`tokens` | `[]` | Additional tokens | ## Distributed Training | Option | Default | Description | | ----------------------- | ------- | --------------------- | | `fsdp` | `null` | FSDP configuration | | `fsdp_config` | `null` | FSDP config options | | `deepspeed` | `null` | Deepspeed config path | | `ddp_timeout` | `null` | DDP timeout | | `ddp_bucket_cap_mb` | `null` | DDP bucket capacity | | `ddp_broadcast_buffers` | `null` | DDP broadcast buffers |

Example Configuration Request:

Here's a complete example for fine-tuning a LLaMA model using LoRA: ```json { "input": { "user_id": "user", "model_id": "llama-test", "run_id": "test-run", "credentials": { "wandb_api_key": "", "hf_token": "" }, "args": { "base_model": "NousResearch/Llama-3.2-1B", "load_in_8bit": false, "load_in_4bit": false, "strict": false, "datasets": [ { "path": "teknium/GPT4-LLM-Cleaned", "type": "alpaca" } ], "dataset_prepared_path": "last_run_prepared", "val_set_size": 0.1, "output_dir": "./outputs/lora-out", "adapter": "lora", "sequence_len": 2048, "sample_packing": true, "eval_sample_packing": true, "pad_to_sequence_len": true, "lora_r": 16, "lora_alpha": 32, "lora_dropout": 0.05, "lora_target_modules": [ "gate_proj", "down_proj", "up_proj", "q_proj", "v_proj", "k_proj", "o_proj" ], "gradient_accumulation_steps": 2, "micro_batch_size": 2, "num_epochs": 1, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", "learning_rate": 0.0002, "train_on_inputs": false, "group_by_length": false, "bf16": "auto", "tf32": false, "gradient_checkpointing": true, "logging_steps": 1, "flash_attention": true, "loss_watchdog_threshold": 5, "loss_watchdog_patience": 3, "warmup_steps": 10, "evals_per_epoch": 4, "saves_per_epoch": 1, "weight_decay": 0, "hub_model_id": "runpod/llama-fr-lora", "wandb_name": "test-run-1", "wandb_project": "test-run-1", "wandb_entity": "axo-test", "special_tokens": { "pad_token": "<|end_of_text|>" } } } } ```
### Advanced Features #### Wandb Integration - `wandb_project`: Project name for Weights & Biases - `wandb_entity`: Team name in W&B - `wandb_watch`: Monitor model with W&B - `wandb_name`: Name of the W&B run - `wandb_run_id`: ID for the W&B run #### Performance Optimization - `sample_packing`: Enable efficient sequence packing - `eval_sample_packing`: Use sequence packing during evaluation - `torch_compile`: Enable PyTorch 2.0 compilation - `flash_attention`: Use Flash Attention implementation - `xformers_attention`: Use xFormers attention implementation ### Available Optimizers The following optimizers are supported: - `adamw_hf`: HuggingFace's AdamW implementation - `adamw_torch`: PyTorch's AdamW - `adamw_torch_fused`: Fused AdamW implementation - `adamw_torch_xla`: XLA-optimized AdamW - `adamw_apex_fused`: NVIDIA Apex fused AdamW - `adafactor`: Adafactor optimizer - `adamw_anyprecision`: Anyprecision AdamW - `adamw_bnb_8bit`: 8-bit AdamW from bitsandbytes - `lion_8bit`: 8-bit Lion optimizer - `lion_32bit`: 32-bit Lion optimizer - `sgd`: Stochastic Gradient Descent - `adagrad`: Adagrad optimizer ## Notes - Set `load_in_8bit: true` or `load_in_4bit: true` for memory-efficient training - Enable `flash_attention: true` for faster training on modern GPUs - Use `gradient_checkpointing: true` to reduce memory usage - Adjust `micro_batch_size` and `gradient_accumulation_steps` based on your GPU memory For more detailed information, please refer to the [documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/config-reference.html). ### Errors: - If you face any issues with Flash Attention 2, delete your worker and restart it. ================================================ FILE: .runpod/hub.json ================================================ { "title": "Axolotl Fine-Tuning", "description": "Serverless fine-tuning of open-source LLMs with Axolotl.
Supports LoRA, QLoRA, DPO, and more using Hugging Face models and datasets.", "type": "serverless", "category": "language", "iconUrl": "https://avatars.githubusercontent.com/u/167502477", "config": { "runsOn": "GPU", "containerDiskInGb": 200, "gpuCount": 1, "allowedCudaVersions": [ "12.8", "12.7", "12.6", "12.5", "12.4" ], "presets": [], "env": [ { "key": "TOKENIZER", "input": { "name": "Tokenizer", "type": "string", "description": "Name or path of the Hugging Face tokenizer to use.", "default": "", "advanced": true } }, { "key": "MAX_NUM_SEQS", "input": { "name": "Max Num Seqs", "type": "number", "description": "Maximum number of sequences per iteration.", "default": 256, "advanced": true } }, { "key": "DISABLE_LOG_STATS", "input": { "name": "Disable Log Stats", "type": "boolean", "description": "Disable logging statistics.", "default": false, "trueValue": "true", "falseValue": "false" } }, { "key": "LOAD_FORMAT", "input": { "name": "Load Format", "type": "string", "description": "The format of the model weights to load.", "default": "auto", "options": [ { "label": "auto", "value": "auto" }, { "label": "pt", "value": "pt" }, { "label": "safetensors", "value": "safetensors" }, { "label": "npcache", "value": "npcache" }, { "label": "dummy", "value": "dummy" }, { "label": "tensorizer", "value": "tensorizer" }, { "label": "bitsandbytes", "value": "bitsandbytes" } ], "advanced": true } } ] } } ================================================ FILE: .runpod/requirements.txt ================================================ # Required Python packages get listed here, one per line. # Recommended to lock the version number to avoid unexpected changes.
# You can also install packages from a git repository, e.g.: # git+https://github.com/runpod/runpod-python.git # To learn more, see https://pip.pypa.io/en/stable/reference/requirements-file-format/ runpod~=1.7.0 ================================================ FILE: .runpod/src/config/config.yaml ================================================ # # This is the huggingface model that contains *.pt, *.safetensors, or *.bin files # # This can also be a relative path to a model on disk # base_model: ./llama-7b-hf # # You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc) # base_model_ignore_patterns: # # If the base_model repo on hf hub doesn't include configuration .json files, # # You can set that here, or leave this empty to default to base_model # base_model_config: ./llama-7b-hf # # You can specify to choose a specific model revision from huggingface hub # model_revision: # # Optional tokenizer configuration override in case you want to use a different tokenizer # # than the one defined in the base model # tokenizer_config: # # If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too # model_type: AutoModelForCausalLM # # Corresponding tokenizer for the model AutoTokenizer is a good choice # tokenizer_type: AutoTokenizer # # Trust remote code for untrusted source # trust_remote_code: # # use_fast option for tokenizer loading from_pretrained, default to True # tokenizer_use_fast: # # Whether to use the legacy tokenizer setting, defaults to True # tokenizer_legacy: # # Resize the model embeddings when new tokens are added to multiples of 32 # # This is reported to improve training speed on some models # resize_token_embeddings_to_32x: # # Used to identify which the model is based on # is_falcon_derived_model: # is_llama_derived_model: # # Please note that if you set this to true, `padding_side` will be set to "left" by default # is_mistral_derived_model: # is_qwen_derived_model: # # optional 
overrides to the base model configuration # model_config: # # RoPE Scaling https://github.com/huggingface/transformers/pull/24653 # rope_scaling: # type: # linear | dynamic # factor: # float # # Whether you are training a 4-bit GPTQ quantized model # gptq: true # gptq_groupsize: 128 # group size # gptq_model_v1: false # v1 or v2 # # This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer # load_in_8bit: true # # Use bitsandbytes 4 bit # load_in_4bit: # # Use CUDA bf16 # bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere # # Use CUDA fp16 # fp16: true # # Use CUDA tf32 # tf32: true # require >=ampere # # No AMP (automatic mixed precision) # bfloat16: true # require >=ampere # float16: true # # A list of one or more datasets to finetune the model with # datasets: # # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files # - path: vicgalle/alpaca-gpt4 # # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection] # type: alpaca # format | format: (chat/instruct) | .load_ # ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file # data_files: # Optional[str] path to source data files # shards: # Optional[int] number of shards to split data into # name: # Optional[str] name of dataset configuration to load # train_on_split: train # Optional[str] name of dataset split to load from # # Optional[str] fastchat conversation type, only used with type: sharegpt # conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py # field_human: # Optional[str]. Human key to use for conversation. # field_model: # Optional[str]. Assistant key to use for conversation. # # Custom user prompt # - path: repo # type: # # The below are defaults. only set what's needed. 
# system_prompt: "" # system_format: "{system}" # field_system: system # field_instruction: instruction # field_input: input # field_output: output # # Customizable to be single line or multi-line # # 'format' can include {input} # format: |- # User: {instruction} {input} # Assistant: # # 'no_input_format' cannot include {input} # no_input_format: "{instruction} " # # For `completion` datasets only, uses the provided field instead of `text` column # field: # # Axolotl attempts to save the dataset as an arrow after packing the data together so # # subsequent training attempts load faster, relative path # dataset_prepared_path: data/last_run_prepared # # Push prepared dataset to hub # push_dataset_to_hub: # repo path # # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()` # # if not set. # dataset_num_proc: # defaults to os.cpu_count() if not set # # push checkpoints to hub # hub_model_id: # repo path to push finetuned model # # how to push checkpoints to hub # # https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy # hub_strategy: # # Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets # # Required to be true when used in combination with `push_dataset_to_hub` # hf_use_auth_token: # boolean # # How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval. 
# val_set_size: 0.04 # # Num shards for whole dataset # dataset_shard_num: # # Index of shard to use for whole dataset # dataset_shard_idx: # # The maximum length of an input to train with, this should typically be less than 2048 # # as most models have a token/context limit of 2048 # sequence_len: 2048 # # Pad inputs so each step uses constant sized buffers # # This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently # pad_to_sequence_len: # # Max sequence length to concatenate training samples together up to # # Inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning # # FutureWarning: This will soon be DEPRECATED # max_packed_sequence_len: 1024 # # Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true' # sample_packing: # # Set to 'false' if getting errors during eval with sample_packing on. # eval_sample_packing: # # You can set these packing optimizations AFTER starting a training at least once. # # The trainer will provide recommended values for these values. # sample_packing_eff_est: # total_num_tokens: # # If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model # adapter: lora # # If you already have a lora model trained that you want to load, put that here. # # This means after training, if you want to test the model, you should set this to the value of `lora_out_dir`. 
# lora_model_dir: # # LoRA hyperparameters # # For more details about the following options, see: # # https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2 # lora_r: 8 # lora_alpha: 16 # lora_dropout: 0.05 # lora_target_modules: # - q_proj # - v_proj # # - k_proj # # - o_proj # # - gate_proj # # - down_proj # # - up_proj # lora_target_linear: # If true, will target all linear layers # # If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens. # # For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models. # # `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities. # # https://github.com/huggingface/peft/issues/334#issuecomment-1561727994 # lora_modules_to_save: # # - embed_tokens # # - lm_head # # Once you complete training, the model will be saved to the following directory. # # If you merge the adapter to the base model, a subdirectory `merged` will be created under this directory. # # Make sure `lora_model_dir` points to this directory if you want to use the trained model. 
# lora_out_dir: # lora_fan_in_fan_out: false # # ReLoRA configuration # # Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed # relora_steps: # Number of steps per ReLoRA restart # relora_warmup_steps: # Number of per-restart warmup steps # relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings # # wandb configuration if you're using it # wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb # wandb_project: # Your wandb project name # wandb_entity: # A wandb Team name if using a Team # wandb_watch: # wandb_run_id: # Set the name of your wandb run # wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training # # Where to save the full-finetuned model to # output_dir: ./completed-model # # Whether to use torch.compile and which backend to use # torch_compile: # bool # torch_compile_backend: # Optional[str] # # Training hyperparameters # # If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps. # gradient_accumulation_steps: 1 # # The number of samples to include in each batch. This is the number of samples sent to each GPU. # micro_batch_size: 2 # eval_batch_size: # num_epochs: 4 # warmup_steps: 100 # cannot use with warmup_ratio # warmup_ratio: 0.05 # cannot use with warmup_steps # learning_rate: 0.00003 # lr_quadratic_warmup: # logging_steps: # save_strategy: # Set to `no` to skip checkpoint saves # save_steps: # Leave empty to save at each epoch # eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps # save_total_limit: # Checkpoints saved at a time # # Maximum number of iterations to train for. It precedes num_epochs which means that # # if both are set, num_epochs will not be guaranteed. 
# # e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps # max_steps: # eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0 # eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128 # # Whether to mask out or include the human's prompt from the training labels # train_on_inputs: false # # Group similarly sized data to minimize padding. # # May be slower to start, as it must download and sort the entire dataset. # # Note that training loss may have an oscillating pattern with this enabled. # group_by_length: false # # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing # gradient_checkpointing: false # # Stop training after this many evaluation losses have increased in a row # # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback # early_stopping_patience: 3 # # Specify a scheduler and kwargs to use with the optimizer # lr_scheduler: # 'one_cycle' | empty for cosine # lr_scheduler_kwargs: # # For one_cycle optim # lr_div_factor: # Learning rate div factor # # Specify optimizer # # Valid values are driven by the Transformers OptimizerNames class, see: # # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134 # # # # Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of # # torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used # # in the examples/ for your model and fine-tuning use case. 
# # # # Valid values for 'optimizer' include: # # - adamw_hf # # - adamw_torch # # - adamw_torch_fused # # - adamw_torch_xla # # - adamw_apex_fused # # - adafactor # # - adamw_anyprecision # # - sgd # # - adagrad # # - adamw_bnb_8bit # # - lion_8bit # # - lion_32bit # # - paged_adamw_32bit # # - paged_adamw_8bit # # - paged_lion_32bit # # - paged_lion_8bit # optimizer: # # Specify weight decay # weight_decay: # # adamw hyperparams # adam_beta1: # adam_beta2: # adam_epsilon: # # Gradient clipping max norm # max_grad_norm: # # Augmentation techniques # # NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings # # currently only supported on Llama and Mistral # noisy_embedding_alpha: # # Whether to bettertransformers # flash_optimum: # # Whether to use xformers attention patch https://github.com/facebookresearch/xformers: # xformers_attention: # # Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention: # flash_attention: # flash_attn_cross_entropy: # Whether to use flash-attention cross entropy implementation - advanced use only # flash_attn_rms_norm: # Whether to use flash-attention rms norm implementation - advanced use only # flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation # # Whether to use scaled-dot-product attention # # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html # sdp_attention: # # Landmark attention (only llama) # landmark_attention: # # xpos RoPE see https://github.com/kaiokendev/cutoff-len-is-context-len/blob/main/util/xpos_rope_llama_monkey_patch.py # # LLaMA only # xpos_rope: # # Resume from a specific checkpoint dir # resume_from_checkpoint: # # If resume_from_checkpoint isn't set and you simply want it to start where it left off. # # Be careful with this being turned on between different models. 
# auto_resume_from_checkpoints: false # # Don't mess with this, it's here for accelerate and torchrun # local_rank: # # Add or change special tokens. # # If you add tokens here, you don't need to add them to the `tokens` list. # special_tokens: # # bos_token: "" # # eos_token: "" # # unk_token: "" # # Add extra tokens. # tokens: # # FSDP # fsdp: # fsdp_config: # # Deepspeed config path. e.g., deepspeed/zero3.json # deepspeed: # # Advanced DDP Arguments # ddp_timeout: # ddp_bucket_cap_mb: # ddp_broadcast_buffers: # # Path to torch distx for optim 'adamw_anyprecision' # torchdistx_path: # # Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize # pretraining_dataset: # # Debug mode # debug: # # Seed # seed: # # Allow overwrite yml config using from cli # strict: base_model: ${BASE_MODEL} base_model_ignore_patterns: ${BASE_MODEL_IGNORE_PATTERNS} base_model_config: ${BASE_MODEL_CONFIG} revision_of_model: ${REVISION_OF_MODEL} tokenizer_config: ${TOKENIZER_CONFIG} model_type: ${MODEL_TYPE} tokenizer_type: ${TOKENIZER_TYPE} trust_remote_code: ${TRUST_REMOTE_CODE} tokenizer_use_fast: ${TOKENIZER_USE_FAST} tokenizer_legacy: ${TOKENIZER_LEGACY} resize_token_embeddings_to_32x: ${RESIZE_TOKEN_EMBEDDINGS_TO_32X} is_falcon_derived_model: ${IS_FALCON_DERIVED_MODEL} is_llama_derived_model: ${IS_LLAMA_DERIVED_MODEL} is_qwen_derived_model: ${IS_QWEN_DERIVED_MODEL} is_mistral_derived_model: ${IS_MISTRAL_DERIVED_MODEL} overrides_of_model_config: rope_scaling: type: ${ROPE_SCALING_TYPE} factor: ${ROPE_SCALING_FACTOR} bnb_config_kwargs: llm_int8_has_fp16_weight: ${BNB_LLM_INT8_HAS_FP16_WEIGHT} bnb_4bit_quant_type: ${BNB_4BIT_QUANT_TYPE} bnb_4bit_use_double_quant: ${BNB_4BIT_USE_DOUBLE_QUANT} gptq: ${GPTQ} load_in_8bit: ${LOAD_IN_8BIT} load_in_4bit: ${LOAD_IN_4BIT} bf16: ${BF16} fp16: ${FP16} tf32: ${TF32} bfloat16: ${BFLOAT16} float16: ${FLOAT16} gpu_memory_limit: ${GPU_MEMORY_LIMIT} lora_on_cpu: ${LORA_ON_CPU} datasets: - path: ${DATASET_PATH} type: 
${DATASET_TYPE} ds_type: ${DATASET_DS_TYPE} data_files: ${DATASET_DATA_FILES} shards: ${DATASET_SHARDS} name: ${DATASET_NAME} train_on_split: ${DATASET_TRAIN_ON_SPLIT} revision: ${DATASET_REVISION} trust_remote_code: ${DATASET_TRUST_REMOTE_CODE} rl: ${RL} dpo_use_weighting: ${DPO_USE_WEIGHTING} chat_template: ${CHAT_TEMPLATE} chat_template_jinja: ${CHAT_TEMPLATE_JINJA} default_system_message: ${DEFAULT_SYSTEM_MESSAGE} dataset_prepared_path: ${DATASET_PREPARED_PATH} push_dataset_to_hub: ${PUSH_DATASET_TO_HUB} dataset_num_proc: ${DATASET_NUM_PROC} dataset_keep_in_memory: ${DATASET_KEEP_IN_MEMORY} hub_model_id: ${HUB_MODEL_ID} hub_strategy: ${HUB_STRATEGY} hf_use_auth_token: ${HF_USE_AUTH_TOKEN} val_set_size: ${VAL_SET_SIZE} dataset_shard_num: ${DATASET_SHARD_NUM} dataset_shard_idx: ${DATASET_SHARD_IDX} sequence_len: ${SEQUENCE_LEN} pad_to_sequence_len: ${PAD_TO_SEQUENCE_LEN} sample_packing: ${SAMPLE_PACKING} eval_sample_packing: ${EVAL_SAMPLE_PACKING} sample_packing_eff_est: ${SAMPLE_PACKING_EFF_EST} total_num_tokens: ${TOTAL_NUM_TOKENS} sample_packing_group_size: ${SAMPLE_PACKING_GROUP_SIZE} sample_packing_bin_size: ${SAMPLE_PACKING_BIN_SIZE} batch_flattening: ${BATCH_FLATTENING} device_map: ${DEVICE_MAP} max_memory: ${MAX_MEMORY} adapter: ${ADAPTER} lora_model_dir: ${LORA_MODEL_DIR} lora_r: ${LORA_R} lora_alpha: ${LORA_ALPHA} lora_dropout: ${LORA_DROPOUT} lora_target_modules: - ${LORA_TARGET_MODULES} lora_target_linear: ${LORA_TARGET_LINEAR} peft_layers_to_transform: ${PEFT_LAYERS_TO_TRANSFORM} lora_modules_to_save: ${LORA_MODULES_TO_SAVE} lora_fan_in_fan_out: ${LORA_FAN_IN_FAN_OUT} loraplus_lr_ratio: ${LORAPLUS_LR_RATIO} loraplus_lr_embedding: ${LORAPLUS_LR_EMBEDDING} peft: loftq_config: loftq_bits: ${LOFTQ_BITS} relora_steps: ${RELORA_STEPS} relora_warmup_steps: ${RELORA_WARMUP_STEPS} relora_anneal_steps: ${RELORA_ANNEAL_STEPS} relora_prune_ratio: ${RELORA_PRUNE_RATIO} relora_cpu_offload: ${RELORA_CPU_OFFLOAD} wandb_mode: ${WANDB_MODE} wandb_project: 
${WANDB_PROJECT} wandb_entity: ${WANDB_ENTITY} wandb_watch: ${WANDB_WATCH} wandb_name: ${WANDB_NAME} wandb_run_id: ${WANDB_RUN_ID} wandb_log_model: ${WANDB_LOG_MODEL} mlflow_tracking_uri: ${MLFLOW_TRACKING_URI} mlflow_experiment_name: ${MLFLOW_EXPERIMENT_NAME} mlflow_run_name: ${MLFLOW_RUN_NAME} hf_mlflow_log_artifacts: ${HF_MLFLOW_LOG_ARTIFACTS} use_comet: ${USE_COMET} comet_api_key: ${COMET_API_KEY} comet_workspace: ${COMET_WORKSPACE} comet_project_name: ${COMET_PROJECT_NAME} comet_experiment_key: ${COMET_EXPERIMENT_KEY} comet_mode: ${COMET_MODE} comet_online: ${COMET_ONLINE} comet_experiment_config: ${COMET_EXPERIMENT_CONFIG} output_dir: ${OUTPUT_DIR} torch_compile: ${TORCH_COMPILE} torch_compile_backend: ${TORCH_COMPILE_BACKEND} gradient_accumulation_steps: ${GRADIENT_ACCUMULATION_STEPS} micro_batch_size: ${MICRO_BATCH_SIZE} eval_batch_size: ${EVAL_BATCH_SIZE} num_epochs: ${NUM_EPOCHS} warmup_steps: ${WARMUP_STEPS} warmup_ratio: ${WARMUP_RATIO} learning_rate: ${LEARNING_RATE} lr_quadratic_warmup: ${LR_QUADRATIC_WARMUP} logging_steps: ${LOGGING_STEPS} eval_steps: ${EVAL_STEPS} evals_per_epoch: ${EVALS_PER_EPOCH} save_strategy: ${SAVE_STRATEGY} save_steps: ${SAVE_STEPS} saves_per_epoch: ${SAVES_PER_EPOCH} save_total_limit: ${SAVE_TOTAL_LIMIT} max_steps: ${MAX_STEPS} eval_table_size: ${EVAL_TABLE_SIZE} eval_max_new_tokens: ${EVAL_MAX_NEW_TOKENS} eval_causal_lm_metrics: ${EVAL_CAUSAL_LM_METRICS} profiler_steps: ${PROFILER_STEPS} loss_watchdog_threshold: ${LOSS_WATCHDOG_THRESHOLD} loss_watchdog_patience: ${LOSS_WATCHDOG_PATIENCE} train_on_inputs: ${TRAIN_ON_INPUTS} group_by_length: ${GROUP_BY_LENGTH} gradient_checkpointing: ${GRADIENT_CHECKPOINTING} early_stopping_patience: ${EARLY_STOPPING_PATIENCE} lr_scheduler: ${LR_SCHEDULER} lr_scheduler_kwargs: ${LR_SCHEDULER_KWARGS} cosine_min_lr_ratio: ${COSINE_MIN_LR_RATIO} cosine_constant_lr_ratio: ${COSINE_CONSTANT_LR_RATIO} lr_div_factor: ${LR_DIV_FACTOR} optimizer: ${OPTIMIZER} optim_args: ${OPTIM_ARGS} 
BASE_VOLUME = os.environ.get("BASE_VOLUME", "/runpod-volume")
# exist_ok makes this safe when the directory already exists or is created
# concurrently, replacing the racy exists()-then-makedirs() pair.
os.makedirs(BASE_VOLUME, exist_ok=True)

logger = runpod.RunPodLogger()

# Maps a key of the request's "credentials" object to the environment
# variable it populates for the duration of one job.
_CREDENTIAL_ENV_VARS = {
    "wandb_api_key": "WANDB_API_KEY",
    "hf_token": "HF_TOKEN",
}


async def handler(job):
    """
    Run one fine-tuning job described by a Runpod serverless request.

    The request's ``input.args`` mapping is written out as a YAML config
    (with ``output_dir``, ``run_name`` and ``runpod_job_id`` filled in) and
    handed to ``train``; every message ``train`` yields is logged.

    :param job: Runpod job dict; reads ``job["id"]`` and ``job["input"]``
        (keys ``run_id``, ``args`` and ``credentials`` are all optional)
    :return: None
    """
    runpod_job_id = job["id"]
    inputs = job["input"]
    run_id = inputs.get("run_id", "default_run_id")

    # Work on a copy so the caller's payload is not mutated, and tolerate an
    # explicit null for "args".
    args = dict(inputs.get("args") or {})

    # Set output directory, run name and job id before saving the config
    args["output_dir"] = os.path.join(BASE_VOLUME, get_output_dir(run_id))
    args["run_name"] = run_id
    args["runpod_job_id"] = runpod_job_id

    # NOTE(review): fixed path, so two jobs running in the same worker would
    # overwrite each other's config -- confirm one job per worker.
    config_path = "/workspace/test_config.yaml"
    with open(config_path, "w", encoding="utf-8") as file:
        yaml.dump(args, file, default_flow_style=False)

    # Handle credentials. Remember what was set before this job so the
    # cleanup below restores the environment instead of wiping tokens that
    # were configured on the container itself.
    credentials = inputs.get("credentials") or {}
    previous_env = {
        env_name: os.environ.get(env_name)
        for env_name in _CREDENTIAL_ENV_VARS.values()
    }

    try:
        for field, env_name in _CREDENTIAL_ENV_VARS.items():
            value = credentials.get(field)
            # Skip empty values: a blank credential in the request must not
            # clobber a real token already present in the environment.
            if value:
                os.environ[env_name] = value

        if os.environ.get("HF_TOKEN"):
            login(token=os.environ["HF_TOKEN"])
        else:
            logger.info("No HF_TOKEN provided. Skipping login.")

        logger.info("Starting Training.")
        async for result in train(config_path):  # Pass the config path instead of args
            logger.info(result)
        logger.info("Training Complete.")
    finally:
        # Cleanup runs even when training raises, so job credentials never
        # linger in the worker's environment.
        for env_name, previous in previous_env.items():
            if previous is None:
                os.environ.pop(env_name, None)
            else:
                os.environ[env_name] = previous


# NOTE(review): "return_aggregate_stream" is enabled, but handler neither
# yields nor returns a value -- confirm whether streamed output was intended.
runpod.serverless.start({"handler": handler, "return_aggregate_stream": True})
async def _stream_lines(process, prefix: str):
    """Yield each output line of *process*, stripped and tagged with *prefix*."""
    if process.stdout is not None:
        async for line in process.stdout:
            # errors="replace" keeps odd bytes in tool output from aborting the job.
            yield f"{prefix}: {line.decode(errors='replace').strip()}"


async def train(config_path: str, gpu_id: str = "0", preprocess: bool = True):
    """
    Run preprocessing (if enabled) and training with the given config file

    Both steps run the ``axolotl`` CLI as a child process and yield its
    combined stdout/stderr line by line, prefixed with ``Preprocessing: `` or
    ``Training: ``.

    :param config_path: Path to the YAML config file (passed as one argument)
    :param gpu_id: GPU ID to use for preprocessing (default: "0")
    :param preprocess: Whether to run preprocessing (default: True)
    :raises RuntimeError: if either axolotl command exits with a non-zero code
    :raises OSError: if the ``axolotl`` executable cannot be started
    """
    import os  # local import: only needed to build the child environment

    # First check if preprocessing is needed
    if preprocess:
        # Arguments are passed as a list rather than an interpolated shell
        # string, so shell metacharacters in config_path/gpu_id are inert.
        process = await asyncio.create_subprocess_exec(
            "axolotl",
            "preprocess",
            config_path,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.STDOUT,
            env={**os.environ, "CUDA_VISIBLE_DEVICES": str(gpu_id)},
        )
        async for message in _stream_lines(process, "Preprocessing"):
            yield message
        returncode = await process.wait()
        if returncode != 0:
            # Do not report success (or start training) after a failed preprocess.
            raise RuntimeError(f"axolotl preprocess exited with code {returncode}")
        yield "Preprocessing completed."
    else:
        yield "Skipping preprocessing step."

    # Training command.
    # NOTE(review): CUDA_VISIBLE_DEVICES is only set for preprocessing, so
    # training sees whatever devices the parent environment exposes --
    # confirm this is intended.
    process = await asyncio.create_subprocess_exec(
        "axolotl",
        "train",
        config_path,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.STDOUT,
    )
    async for message in _stream_lines(process, "Training"):
        yield message
    returncode = await process.wait()
    if returncode != 0:
        raise RuntimeError(f"axolotl train exited with code {returncode}")
def get_output_dir(run_id):
    """Return the relative output path ``fine-tuning/<run_id>`` for a run."""
    return f"fine-tuning/{run_id}"


def make_valid_config(input_args):
    """
    Creates and saves updated config file, returns the path to the new config

    The defaults are read from ``config/config.yaml`` and the result is
    written to ``config/updated_config.yaml``; both paths are relative to the
    current working directory.

    :param input_args: dict of input args
    :return: str, path to the updated config file
    """
    # Load default config; an empty file parses to None, so fall back to {}
    # instead of failing on .update() below.
    with open("config/config.yaml", "r", encoding="utf-8") as fin:
        all_args = yaml.safe_load(fin) or {}

    if not input_args:
        print("No args provided, using defaults")
    else:
        all_args.update(input_args)

    # Create updated config path
    updated_config_path = "config/updated_config.yaml"

    # Save updated config to new file
    with open(updated_config_path, "w", encoding="utf-8") as f:
        yaml.dump(all_args, f)

    return updated_config_path


# Environment variables starting with any of these prefixes are removed
# before new values are exported, so stale values do not persist.
# NOTE(review): "WANDB_" also matches WANDB_API_KEY -- confirm credentials
# are set after this runs, not before.
_CLEARED_ENV_PREFIXES = (
    "BASE_MODEL",
    "MODEL_TYPE",
    "TOKENIZER_TYPE",
    "DATASET",
    "LORA_",
    "WANDB_",
)


def _env_value(value):
    """Convert a Python value to the string stored in an environment variable."""
    if value is None:
        return ""
    if isinstance(value, bool):
        # Lower-case booleans ("true"/"false").
        return str(value).lower()
    return str(value)


def _export_env_vars(data, prefix=""):
    """Recursively set environment variables from a (possibly nested) dict."""
    for key, value in data.items():
        # str() keeps non-string keys (e.g. ints) from crashing on .upper().
        env_key = prefix + str(key).upper()

        if isinstance(value, dict):
            # Nested dictionaries (like special_tokens) become KEY_SUBKEY.
            _export_env_vars(value, f"{env_key}_")
        elif (
            isinstance(value, list)
            and value
            and all(isinstance(item, dict) for item in value)
        ):
            # Lists of dictionaries (like datasets) become KEY_<index>_SUBKEY.
            # Requiring every item to be a dict avoids crashing on a mixed
            # list whose first element merely happens to be one.
            for index, item in enumerate(value):
                _export_env_vars(item, f"{env_key}_{index}_")
        else:
            # Scalars and simple lists (like lora_target_modules).
            os.environ[env_key] = _env_value(value)


def set_config_env_vars(args: dict):
    """
    Convert API arguments into environment variables.
    Handles nested dictionaries, lists, and special values.

    Keys are upper-cased; nested keys are joined with ``_`` and items of a
    list of dicts are numbered (``datasets[0]["path"]`` -> ``DATASETS_0_PATH``).
    ``None`` becomes an empty string and booleans become ``true``/``false``.

    NOTE(review): the placeholders in config/config.yaml (e.g.
    ``${DATASET_PATH}``, ``${SPECIAL_TOKEN_BOS}``) do not match the names
    generated here (``DATASETS_0_PATH``, ``SPECIAL_TOKENS_BOS_TOKEN``) --
    confirm which side is authoritative.

    Args:
        args (dict): The arguments dictionary from the API request; a falsy
            value is treated as an empty dict.
    """
    # Clear any existing related environment variables
    # This prevents old values from persisting
    for key in list(os.environ.keys()):
        if key.startswith(_CLEARED_ENV_PREFIXES):
            del os.environ[key]

    # Set new environment variables
    _export_env_vars(args or {})
true, "adapter": "qlora", "lora_r": 32, "lora_alpha": 64, "lora_dropout": 0.05, "lora_target_linear": true, "lora_modules_to_save": [ "embed_tokens", "lm_head" ], "gradient_accumulation_steps": 2, "micro_batch_size": 1, "num_epochs": 1, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "learning_rate": 0.0002, "train_on_inputs": false, "group_by_length": false, "bf16": "auto", "tf32": true, "gradient_checkpointing": true, "logging_steps": 1, "flash_attention": true, "warmup_steps": 1, "evals_per_epoch": 1, "eval_max_new_tokens": 128, "saves_per_epoch": 1, "weight_decay": 0.0, "special_tokens": { "pad_token": "<|endoftext|>" }, "max_steps": 20 }, "timeout": 100000 }, "config": { "gpuTypeId": "NVIDIA GeForce RTX 4090", "gpuCount": 1, "containerDiskInGb": 200, "env": [ { "key": "TOKENIZER", "value": "" }, { "key": "DISABLE_LOG_STATS", "value": "true" } ], "allowedCudaVersions": [ "12.8", "12.7", "12.6", "12.5", "12.4" ] } } ================================================ FILE: .runpod/tests.json ================================================ { "tests": [ { "name": "quick_smoke_test_sft", "input": { "user_id": "user", "model_id": "llama-test", "run_id": "llama-test", "credentials": { "wandb_api_key": "", "hf_token": "" }, "args": { "base_model": "HuggingFaceTB/SmolLM2-135M", "model_type": "AutoModelForCausalLM", "tokenizer_type": "AutoTokenizer", "load_in_4bit": true, "strict": false, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", "split": "train[:10%]" } ], "val_set_size": 0.02, "output_dir": "./outputs/lora-out", "sequence_len": 4096, "sample_packing": true, "eval_sample_packing": false, "pad_to_sequence_len": true, "adapter": "qlora", "lora_r": 32, "lora_alpha": 64, "lora_dropout": 0.05, "lora_target_linear": true, "lora_modules_to_save": [ "embed_tokens", "lm_head" ], "gradient_accumulation_steps": 2, "micro_batch_size": 1, "num_epochs": 1, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "learning_rate": 0.0002, 
"train_on_inputs": false, "group_by_length": false, "bf16": "auto", "tf32": true, "gradient_checkpointing": true, "logging_steps": 1, "flash_attention": true, "warmup_steps": 1, "evals_per_epoch": 1, "eval_max_new_tokens": 128, "saves_per_epoch": 1, "weight_decay": 0.0, "special_tokens": { "pad_token": "<|endoftext|>" }, "max_steps": 20 } }, "timeout": 100000 } ], "config": { "gpuTypeId": "NVIDIA GeForce RTX 4090", "gpuCount": 1, "containerDiskInGb": 200, "env": [ { "key": "TOKENIZER", "value": "" }, { "key": "DISABLE_LOG_STATS", "value": "true" } ], "allowedCudaVersions": [ "12.8", "12.7", "12.6", "12.5", "12.4" ] } } ================================================ FILE: CITATION.cff ================================================ cff-version: 1.2.0 type: software title: "Axolotl: Open Source LLM Post-Training" message: "If you use this software, please cite it as below." authors: - name: "Axolotl maintainers and contributors" repository-code: "https://github.com/axolotl-ai-cloud/axolotl" url: "https://axolotl.ai/" license: Apache-2.0 date-released: "2023-05-30" ================================================ FILE: CNAME ================================================ docs.axolotl.ai ================================================ FILE: FAQS.md ================================================ # FAQs - Can you train StableLM with this? Yes, but only with a single GPU atm. Multi GPU support is coming soon! Just waiting on this [PR](https://github.com/huggingface/transformers/pull/22874) - Will this work with Deepspeed? That's still a WIP, but setting `export ACCELERATE_USE_DEEPSPEED=true` should work in some cases - `Error invalid argument at line 359 in file /workspace/bitsandbytes/csrc/pythonInterface.c` `/arrow/cpp/src/arrow/filesystem/s3fs.cc:2598: arrow::fs::FinalizeS3 was not called even though S3 was initialized.` This could lead to a segmentation fault at exit. Try reinstalling bitsandbytes and transformers from source. 
================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. 
For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: MANIFEST.in ================================================ include requirements.txt include README.md include LICENSE include src/setuptools_axolotl_dynamic_dependencies.py include src/axolotl/utils/chat_templates/templates/*.jinja recursive-include axolotl *.py ================================================ FILE: README.md ================================================

Axolotl

A Free and Open Source LLM Fine-tuning Framework

GitHub License tests codecov Releases
contributors GitHub Repo stars
discord twitter google-colab
tests-nightly multigpu-semi-weekly tests

## 🎉 Latest Updates - 2026/03: - New model support has been added in Axolotl for [Mistral Small 4](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/mistral4), [Qwen3.5, Qwen3.5 MoE](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen3.5), [GLM-4.7-Flash](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm47-flash), [GLM-4.6V](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm46v), and [GLM-4.5-Air](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm45). - [MoE expert quantization](https://docs.axolotl.ai/docs/expert_quantization.html) support (via `quantize_moe_experts: true`) greatly reduces VRAM when training MoE models (FSDP2 compatible). - 2026/02: - [ScatterMoE LoRA](https://github.com/axolotl-ai-cloud/axolotl/pull/3410) support. LoRA fine-tuning directly on MoE expert weights using custom Triton kernels. - Axolotl now has support for [SageAttention](https://github.com/axolotl-ai-cloud/axolotl/pull/2823) and [GDPO](https://github.com/axolotl-ai-cloud/axolotl/pull/3353) (Generalized DPO). - 2026/01: - New integrations for [EAFT](https://github.com/axolotl-ai-cloud/axolotl/pull/3366) (Entropy-Aware Focal Training), which weights the loss by the entropy of the top-k logit distribution, and [Scalable Softmax](https://github.com/axolotl-ai-cloud/axolotl/pull/3338), which improves long-context attention. - 2025/12: - Axolotl now includes support for [Kimi-Linear](https://docs.axolotl.ai/docs/models/kimi-linear.html), [Plano-Orchestrator](https://docs.axolotl.ai/docs/models/plano.html), [MiMo](https://docs.axolotl.ai/docs/models/mimo.html), [InternVL 3.5](https://docs.axolotl.ai/docs/models/internvl3_5.html), [Olmo3](https://docs.axolotl.ai/docs/models/olmo3.html), [Trinity](https://docs.axolotl.ai/docs/models/trinity.html), and [Ministral3](https://docs.axolotl.ai/docs/models/ministral3.html). 
- [Distributed Muon Optimizer](https://github.com/axolotl-ai-cloud/axolotl/pull/3264) support has been added for FSDP2 pretraining. - 2025/10: New model support has been added in Axolotl for: [Qwen3 Next](https://docs.axolotl.ai/docs/models/qwen3-next.html), [Qwen2.5-vl, Qwen3-vl](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen2_5-vl), [Qwen3, Qwen3MoE](https://docs.axolotl.ai/docs/models/qwen3.html), [Granite 4](https://docs.axolotl.ai/docs/models/granite4.html), [HunYuan](https://docs.axolotl.ai/docs/models/hunyuan.html), [Magistral 2509](https://docs.axolotl.ai/docs/models/magistral/vision.html), [Apertus](https://docs.axolotl.ai/docs/models/apertus.html), and [Seed-OSS](https://docs.axolotl.ai/docs/models/seed-oss.html).
Expand older updates - 2025/09: Axolotl now has text diffusion training. Read more [here](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations/diffusion). - 2025/08: QAT has been updated to include NVFP4 support. See [PR](https://github.com/axolotl-ai-cloud/axolotl/pull/3107). - 2025/07: - ND Parallelism support has been added into Axolotl. Compose Context Parallelism (CP), Tensor Parallelism (TP), and Fully Sharded Data Parallelism (FSDP) within a single node and across multiple nodes. Check out the [blog post](https://huggingface.co/blog/accelerate-nd-parallel) for more info. - Axolotl adds more models: [GPT-OSS](https://docs.axolotl.ai/docs/models/gpt-oss.html), [Gemma 3n](https://docs.axolotl.ai/docs/models/gemma3n.html), [Liquid Foundation Model 2 (LFM2)](https://docs.axolotl.ai/docs/models/LiquidAI.html), and [Arcee Foundation Models (AFM)](https://docs.axolotl.ai/docs/models/arcee.html). - FP8 finetuning with fp8 gather op is now possible in Axolotl via `torchao`. Get started [here](https://docs.axolotl.ai/docs/mixed_precision.html#sec-fp8)! - [Voxtral](https://docs.axolotl.ai/docs/models/voxtral.html), [Magistral 1.1](https://docs.axolotl.ai/docs/models/magistral.html), and [Devstral](https://docs.axolotl.ai/docs/models/devstral.html) with mistral-common tokenizer support have been integrated in Axolotl! - TiledMLP support for single-GPU to multi-GPU training with DDP, DeepSpeed and FSDP support has been added to support Arctic Long Sequence Training (ALST). See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst) for using ALST with Axolotl! - 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [docs](https://docs.axolotl.ai/docs/models/magistral.html) to start training your own Magistral models with Axolotl! - 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more! 
- 2025/04: Llama 4 support has been added in Axolotl. See [docs](https://docs.axolotl.ai/docs/models/llama-4.html) to start training your own Llama 4 models with Axolotl's linearized version! - 2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the [blog](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl) and [docs](https://docs.axolotl.ai/docs/sequence_parallelism.html) to learn how to scale your context length when fine-tuning. - 2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the [docs](https://docs.axolotl.ai/docs/multimodal.html) to fine-tune your own! - 2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the [docs](https://docs.axolotl.ai/docs/lora_optims.html) to give it a try. - 2025/02: Axolotl has added GRPO support. Dive into our [blog](https://huggingface.co/blog/axolotl-ai-co/training-llms-w-interpreter-feedback-wasm) and [GRPO example](https://github.com/axolotl-ai-cloud/grpo_code) and have some fun! - 2025/01: Axolotl has added Reward Modelling / Process Reward Modelling fine-tuning support. See [docs](https://docs.axolotl.ai/docs/reward_modelling.html).
## ✨ Overview Axolotl is a free and open-source tool designed to streamline post-training and fine-tuning for the latest large language models (LLMs). Features: - **Multiple Model Support**: Train various models like GPT-OSS, LLaMA, Mistral, Mixtral, Pythia, and many more models available on the Hugging Face Hub. - **Multimodal Training**: Fine-tune vision-language models (VLMs) including LLaMA-Vision, Qwen2-VL, Pixtral, LLaVA, SmolVLM2, GLM-4.6V, InternVL 3.5, Gemma 3n, and audio models like Voxtral with image, video, and audio support. - **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO, GDPO), and Reward Modelling (RM) / Process Reward Modelling (PRM). - **Easy Configuration**: Re-use a single YAML configuration file across the full fine-tuning pipeline: dataset preprocessing, training, evaluation, quantization, and inference. - **Performance Optimizations**: [Multipacking](https://docs.axolotl.ai/docs/multipack.html), [Flash Attention 2/3/4](https://docs.axolotl.ai/docs/attention.html#flash-attention), [Xformers](https://docs.axolotl.ai/docs/attention.html#xformers), [Flex Attention](https://docs.axolotl.ai/docs/attention.html#flex-attention), [SageAttention](https://docs.axolotl.ai/docs/attention.html#sageattention), [Liger Kernel](https://docs.axolotl.ai/docs/custom_integrations.html#liger-kernels), [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy), [ScatterMoE](https://docs.axolotl.ai/docs/custom_integrations.html#kernels-integration), [Sequence Parallelism (SP)](https://docs.axolotl.ai/docs/sequence_parallelism.html), [LoRA optimizations](https://docs.axolotl.ai/docs/lora_optims.html), [Multi-GPU training (FSDP1, FSDP2, DeepSpeed)](https://docs.axolotl.ai/docs/multi-gpu.html), [Multi-node training (Torchrun, Ray)](https://docs.axolotl.ai/docs/multi-node.html), and many more! 
- **Flexible Dataset Handling**: Load from local, HuggingFace, and cloud (S3, Azure, GCP, OCI) datasets. - **Cloud Ready**: We ship [Docker images](https://hub.docker.com/u/axolotlai) and also [PyPI packages](https://pypi.org/project/axolotl/) for use on cloud platforms and local hardware. ## 🚀 Quick Start - LLM Fine-tuning in Minutes **Requirements**: - NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU - Python 3.11 - PyTorch ≥2.8.0 ### Google Colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/axolotl-ai-cloud/axolotl/blob/main/examples/colab-notebooks/colab-axolotl-example.ipynb#scrollTo=msOCO4NRmRLa) ### Installation #### Using pip ```bash pip3 install -U packaging==26.0 setuptools==75.8.0 wheel ninja pip3 install --no-build-isolation axolotl[flash-attn,deepspeed] # Download example axolotl configs, deepspeed configs axolotl fetch examples axolotl fetch deepspeed_configs # OPTIONAL ``` #### Using Docker Installing with Docker can be less error prone than installing in your own environment. ```bash docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest ``` Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html). #### Cloud Providers
- [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz) - [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=github&utm_medium=developer_community&utm_campaign=template_launch_axolotl&utm_content=readme) - [PRIME Intellect](https://app.primeintellect.ai/dashboard/create-cluster?image=axolotl&location=Cheapest&security=Cheapest&show_spot=true) - [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl) - [Novita](https://novita.ai/gpus-console?templateId=311) - [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl) - [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
### Your First Fine-tune ```bash # Fetch axolotl examples axolotl fetch examples # Or, specify a custom path axolotl fetch examples --dest path/to/folder # Train a model using LoRA axolotl train examples/llama-3/lora-1b.yml ``` That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough. ## 📚 Documentation - [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments - [Configuration Guide](https://docs.axolotl.ai/docs/config-reference.html) - Full configuration options and examples - [Dataset Loading](https://docs.axolotl.ai/docs/dataset_loading.html) - Loading datasets from various sources - [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them - [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) - [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) - [Multipacking](https://docs.axolotl.ai/docs/multipack.html) - [API Reference](https://docs.axolotl.ai/docs/api/) - Auto-generated code documentation - [FAQ](https://docs.axolotl.ai/docs/faq.html) - Frequently asked questions ## 🤝 Getting Help - Join our [Discord community](https://discord.gg/HhrNrHJPRb) for support - Check out our [Examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/) directory - Read our [Debugging Guide](https://docs.axolotl.ai/docs/debugging.html) - Need dedicated support? Please contact [✉️wing@axolotl.ai](mailto:wing@axolotl.ai) for options ## 🌟 Contributing Contributions are welcome! Please see our [Contributing Guide](https://github.com/axolotl-ai-cloud/axolotl/blob/main/.github/CONTRIBUTING.md) for details. ## 📈 Telemetry Axolotl has opt-out telemetry that helps us understand how the project is being used and prioritize improvements. We collect basic system information, model types, and error rates—never personal data or file paths. 
Telemetry is enabled by default. To disable it, set AXOLOTL_DO_NOT_TRACK=1. For more details, see our [telemetry documentation](https://docs.axolotl.ai/docs/telemetry.html). ## ❤️ Sponsors Interested in sponsoring? Contact us at [wing@axolotl.ai](mailto:wing@axolotl.ai) ## 📝 Citing Axolotl If you use Axolotl in your research or projects, please cite it as follows: ```bibtex @software{axolotl, title = {Axolotl: Open Source LLM Post-Training}, author = {{Axolotl maintainers and contributors}}, url = {https://github.com/axolotl-ai-cloud/axolotl}, license = {Apache-2.0}, year = {2023} } ``` ## 📜 License This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details. ================================================ FILE: VERSION ================================================ 0.16.0.dev0 ================================================ FILE: _quarto.yml ================================================ project: type: website pre-render: - docs/scripts/generate_config_docs.py - docs/scripts/generate_examples_docs.py quartodoc: dir: docs/api package: axolotl title: API Reference parser: google sections: - title: Core desc: Core functionality for training contents: - train - evaluate - datasets - convert - prompt_tokenizers - logging_config - core.builders.base - core.builders.causal - core.builders.rl - core.training_args - core.chat.messages - core.chat.format.chatml - core.chat.format.llama3x - core.chat.format.shared - core.datasets.chat - core.datasets.transforms.chat_builder - title: CLI desc: Command-line interface contents: - cli.main - cli.train - cli.evaluate - cli.args - cli.art - cli.checks - cli.config - cli.delinearize_llama4 - cli.inference - cli.merge_lora - cli.merge_sharded_fsdp_weights - cli.preprocess - cli.quantize - cli.vllm_serve - cli.cloud.base - cli.cloud.modal_ - cli.utils - cli.utils.args - cli.utils.fetch - cli.utils.load - cli.utils.sweeps - cli.utils.train - title: Trainers desc: Training implementations 
contents: - core.trainers.base - core.trainers.trl - core.trainers.mamba - core.trainers.dpo.trainer - core.trainers.grpo.trainer - core.trainers.grpo.sampler - core.trainers.utils - title: Model Loading desc: Functionality for loading and patching models, tokenizers, etc. contents: - loaders.model - loaders.tokenizer - loaders.processor - loaders.adapter - loaders.patch_manager - loaders.constants - title: Mixins desc: Mixin classes for augmenting trainers contents: - core.trainers.mixins.optimizer - core.trainers.mixins.rng_state_loader - core.trainers.mixins.scheduler - title: Context Managers desc: Context managers for altering trainer behaviors contents: - utils.ctx_managers.sequence_parallel - title: Prompt Strategies desc: Prompt formatting strategies contents: - prompt_strategies.base - prompt_strategies.chat_template - prompt_strategies.alpaca_chat - prompt_strategies.alpaca_instruct - prompt_strategies.alpaca_w_system - prompt_strategies.user_defined - prompt_strategies.llama2_chat - prompt_strategies.completion - prompt_strategies.input_output - prompt_strategies.stepwise_supervised - prompt_strategies.metharme - prompt_strategies.orcamini - prompt_strategies.pygmalion - prompt_strategies.messages.chat - prompt_strategies.dpo.chat_template - prompt_strategies.dpo.llama3 - prompt_strategies.dpo.chatml - prompt_strategies.dpo.zephyr - prompt_strategies.dpo.user_defined - prompt_strategies.dpo.passthrough - prompt_strategies.kto.llama3 - prompt_strategies.kto.chatml - prompt_strategies.kto.user_defined - prompt_strategies.orpo.chat_template - prompt_strategies.bradley_terry.llama3 - title: Kernels desc: Low-level performance optimizations contents: - kernels.lora - kernels.geglu - kernels.swiglu - kernels.quantize - kernels.utils - title: Monkey Patches desc: Runtime patches for model optimizations contents: - monkeypatch.llama_attn_hijack_flash - monkeypatch.llama_attn_hijack_xformers - monkeypatch.mistral_attn_hijack_flash - monkeypatch.multipack - 
monkeypatch.relora - monkeypatch.lora_kernels - monkeypatch.utils - monkeypatch.btlm_attn_hijack_flash - monkeypatch.stablelm_attn_hijack_flash - monkeypatch.trainer_fsdp_optim - monkeypatch.transformers_fa_utils - monkeypatch.unsloth_ - monkeypatch.data.batch_dataset_fetcher - monkeypatch.mixtral - monkeypatch.gradient_checkpointing.offload_cpu - monkeypatch.gradient_checkpointing.offload_disk - title: Utils desc: Utility functions contents: - utils.tokenization - utils.chat_templates - utils.lora - utils.model_shard_quant - utils.bench - utils.freeze - utils.trainer - utils.schedulers - utils.distributed - utils.dict - utils.optimizers.adopt - utils.data.streaming - utils.data.sft - utils.quantization - title: Schemas desc: Pydantic data models for Axolotl config contents: - utils.schemas.config - utils.schemas.model - utils.schemas.training - utils.schemas.datasets - utils.schemas.peft - utils.schemas.trl - utils.schemas.multimodal - utils.schemas.integrations - utils.schemas.enums - utils.schemas.utils - title: Integrations desc: Third-party integrations and extensions contents: - integrations.base - integrations.cut_cross_entropy.args - integrations.grokfast.optimizer - integrations.kd.trainer - integrations.liger.args - integrations.lm_eval.args - integrations.spectrum.args - title: Common desc: Common utilities and shared functionality contents: - common.architectures - common.const - common.datasets - title: Models desc: Custom model implementations contents: - models.mamba.modeling_mamba - title: Data Processing desc: Data processing utilities contents: - utils.collators.core - utils.collators.batching - utils.collators.mamba - utils.collators.mm_chat - utils.samplers.multipack - title: Callbacks desc: Training callbacks contents: - utils.callbacks.perplexity - utils.callbacks.profiler - utils.callbacks.lisa - utils.callbacks.mlflow_ - utils.callbacks.comet_ - utils.callbacks.qat website: title: "Axolotl" description: "We make fine-tuning accessible, 
scalable, and fun" favicon: favicon.jpg google-analytics: "G-9KYCVJBNMQ" navbar: logo: image/axolotl_logo_digital_white.svg title: false background: dark pinned: false collapse: false tools: - icon: twitter href: https://twitter.com/axolotl_ai - icon: github href: https://github.com/axolotl-ai-cloud/axolotl/ - icon: discord href: https://discord.gg/7m9sfhzaf3 sidebar: pinned: true collapse-level: 2 style: docked contents: - text: Home href: index.qmd - section: "Getting Started" contents: - docs/getting-started.qmd - docs/installation.qmd - docs/inference.qmd - section: "Model Guides" contents: - docs/models/kimi-linear.qmd - docs/models/plano.qmd - docs/models/mimo.qmd - docs/models/internvl3_5.qmd - docs/models/olmo3.qmd - docs/models/trinity.qmd - docs/models/arcee.qmd - section: "Ministral3" contents: - docs/models/ministral3.qmd - docs/models/ministral3/think.qmd - docs/models/ministral3/vision.qmd - section: "Magistral" contents: - docs/models/magistral.qmd - docs/models/magistral/think.qmd - docs/models/magistral/vision.qmd - docs/models/ministral.qmd - docs/models/mistral-small.qmd - docs/models/voxtral.qmd - docs/models/devstral.qmd - docs/models/mistral.qmd - docs/models/llama-4.qmd - docs/models/llama-2.qmd - docs/models/qwen3-next.qmd - docs/models/qwen3.qmd - docs/models/gemma3n.qmd - docs/models/apertus.qmd - docs/models/gpt-oss.qmd - docs/models/seed-oss.qmd - docs/models/phi.qmd - docs/models/smolvlm2.qmd - docs/models/granite4.qmd - docs/models/LiquidAI.qmd - docs/models/hunyuan.qmd - docs/models/jamba.qmd - docs/models/orpheus.qmd - docs/cli.qmd - docs/telemetry.qmd - docs/config-reference.qmd - text: "API Reference" href: docs/api - section: "Dataset Formats" contents: docs/dataset-formats/* - section: "Deployments" contents: - docs/docker.qmd - docs/multi-gpu.qmd - docs/multi-node.qmd - docs/ray-integration.qmd - docs/amd_hpc.qmd - docs/mac.qmd - section: "How To Guides" contents: - docs/multimodal.qmd - docs/rlhf.qmd - docs/reward_modelling.qmd 
- docs/lr_groups.qmd - docs/lora_optims.qmd - docs/dataset_loading.qmd - docs/qat.qmd - docs/quantize.qmd - docs/optimizations.qmd - section: "Core Concepts" contents: - docs/batch_vs_grad.qmd - docs/dataset_preprocessing.qmd - docs/streaming.qmd - docs/multipack.qmd - docs/mixed_precision.qmd - docs/optimizers.qmd - docs/attention.qmd - section: "Advanced Features" contents: - docs/fsdp_qlora.qmd - docs/unsloth.qmd - docs/torchao.qmd - docs/custom_integrations.qmd - docs/sequence_parallelism.qmd - docs/gradient_checkpointing.qmd - docs/nd_parallelism.qmd - docs/expert_quantization.qmd - section: "Troubleshooting" contents: - docs/faq.qmd - docs/debugging.qmd - docs/nccl.qmd format: html: theme: darkly css: styles.css toc: true # Enable better handling of line breaks in markdown preserve-tabs: true html-math-method: mathjax # Improved markdown processing options md-extensions: - markdown_it - def_list - attr_list - fenced_divs - tables - html_admonition - lineblocks - fancy_lists # Control whitespace handling whitespace: preserve # Process newlines in paragraphs wrap: preserve # Better line break handling preserve-linebreaks: true ================================================ FILE: benchmarks/bench_entropy.py ================================================ """Benchmark for entropy_from_logits Triton kernel vs original chunked implementation. 
V = 151936  # Qwen vocab
WARMUP = 5
BENCH_ITERS = 20
MEM_ITERS = 10
# Benchmark cases whose backing tensor would exceed this many GB are skipped.
MAX_LOGITS_GB = 28


def entropy_from_logits_original(logits: torch.Tensor, chunk_size: int = 128):
    """Original chunked implementation (reference).

    Flattens every leading dimension into rows, computes the softmax entropy
    of each row ``chunk_size`` rows at a time, and restores the leading shape.

    Args:
        logits: Tensor whose last dimension holds the class logits.
        chunk_size: Number of rows processed per ``log_softmax`` call.

    Returns:
        Tensor of entropies with shape ``logits.shape[:-1]``.
    """
    original_shape = logits.shape[:-1]
    num_classes = logits.shape[-1]
    flat_logits = logits.reshape(-1, num_classes)

    entropies = []
    for chunk in flat_logits.split(chunk_size, dim=0):
        logps = F.log_softmax(chunk, dim=-1)
        # H = -sum(p * log p), with p recovered from the log-probabilities.
        chunk_entropy = -(torch.exp(logps) * logps).sum(-1)
        entropies.append(chunk_entropy)
    return torch.cat(entropies, dim=0).reshape(original_shape)


def _clean_gpu():
    """Collect garbage, empty the CUDA cache, reset memory stats, and sync."""
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.reset_accumulated_memory_stats()
    torch.cuda.synchronize()


def profile_time(fn, logits, n_iters=BENCH_ITERS):
    """Time ``fn(logits, chunk_size=128)`` with CUDA events.

    Args:
        fn: Callable accepting ``(logits, chunk_size=...)``.
        logits: Input passed to ``fn`` on every call.
        n_iters: Number of timed iterations (after ``WARMUP`` untimed ones).

    Returns:
        List of per-iteration elapsed times in milliseconds.
    """
    for _ in range(WARMUP):
        out = fn(logits, chunk_size=128)
        del out
    torch.cuda.synchronize()

    times = []
    for _ in range(n_iters):
        s = torch.cuda.Event(enable_timing=True)
        e = torch.cuda.Event(enable_timing=True)
        s.record()
        out = fn(logits, chunk_size=128)
        e.record()
        # Both events must have completed before elapsed_time can be read.
        torch.cuda.synchronize()
        times.append(s.elapsed_time(e))
        del out
    return times


def profile_memory(fn, logits, n_iters=MEM_ITERS):
    """Measure the peak extra CUDA memory of ``fn(logits, chunk_size=128)``.

    Args:
        fn: Callable accepting ``(logits, chunk_size=...)``.
        logits: Input passed to ``fn`` on every call.
        n_iters: Number of measured iterations (after ``WARMUP`` unmeasured ones).

    Returns:
        List of per-iteration peak overheads, in units of 1e6 bytes.
    """
    for _ in range(WARMUP):
        out = fn(logits, chunk_size=128)
        del out
    torch.cuda.synchronize()

    peaks = []
    for _ in range(n_iters):
        # Resetting the peak stats makes ``base`` the pre-call high-water mark.
        _clean_gpu()
        base = torch.cuda.max_memory_allocated()
        out = fn(logits, chunk_size=128)
        torch.cuda.synchronize()
        peaks.append(torch.cuda.max_memory_allocated() - base)
        del out
    return [p / 1e6 for p in peaks]


def fmt(values, unit=""):
    """Format samples as ``mean ± std unit [min=…, max=…]``.

    The standard deviation is reported as 0.0 when only one sample is given.
    """
    mean = statistics.mean(values)
    std = statistics.stdev(values) if len(values) > 1 else 0.0
    return f"{mean:8.2f} ± {std:5.2f} {unit} [min={min(values):.2f}, max={max(values):.2f}]"


def benchmark_contiguous():
    """Benchmark the reference vs Triton entropy on contiguous bf16 logits."""
    print("=" * 60)
    print(
        f"CONTIGUOUS BENCHMARK (warmup={WARMUP}, time={BENCH_ITERS}, mem={MEM_ITERS})"
    )
    print("=" * 60)

    configs = [
        (1, 2048),
        (1, 8192),
        (1, 16384),
        (4, 4096),
        (8, 2048),
        (16, 2048),
        (16, 4096),
    ]

    for B, L in configs:
        # 2 bytes per bf16 element.
        mem_gb = B * L * V * 2 / 1e9
        if mem_gb > MAX_LOGITS_GB:
            print(f"\n skip B={B}, L={L} ({mem_gb:.1f} GB)")
            continue

        N = B * L
        print(f"\n{'─' * 60}")
        print(f"B={B:2d}, L={L:5d} ({N:6d} rows, logits {mem_gb:.2f} GB)")
        print(f"{'─' * 60}")

        torch.manual_seed(42)
        logits = torch.randn(B, L, V, device="cuda", dtype=torch.bfloat16)

        t_orig = profile_time(entropy_from_logits_original, logits)
        t_triton = profile_time(entropy_from_logits, logits)
        orig_mean = statistics.mean(t_orig)
        triton_mean = statistics.mean(t_triton)

        print(" TIME (ms):")
        print(f" original: {fmt(t_orig, 'ms')}")
        print(f" triton: {fmt(t_triton, 'ms')}")
        print(f" speedup: {orig_mean / triton_mean:.2f}x")

        m_orig = profile_memory(entropy_from_logits_original, logits)
        m_triton = profile_memory(entropy_from_logits, logits)
        orig_peak = statistics.mean(m_orig)
        triton_peak = statistics.mean(m_triton)

        print(" MEMORY (peak overhead):")
        print(f" original: {fmt(m_orig, 'MB')}")
        print(f" triton: {fmt(m_triton, 'MB')}")
        print(f" saved: {orig_peak - triton_peak:.1f} MB")

        del logits
        _clean_gpu()


def benchmark_noncontiguous():
    """Benchmark strided Triton entropy vs reference-with-copy on non-contiguous logits.

    Each case builds a non-contiguous view (a transpose, or every second batch
    row) of a freshly allocated bf16 tensor. Cases whose backing tensor would
    exceed ``MAX_LOGITS_GB`` are skipped before anything is allocated.
    """
    print("\n" + "=" * 60)
    print(
        f"NON-CONTIGUOUS BENCHMARK (warmup={WARMUP}, time={BENCH_ITERS}, mem={MEM_ITERS})"
    )
    print("=" * 60)

    configs = [
        (4, 2048, "transpose"),
        (4, 8192, "transpose"),
        (8, 2048, "transpose"),
        (4, 4096, "slice_batch"),
    ]

    def original_with_copy(logits, chunk_size=128):
        # The reference path pays for a contiguous copy of the strided view.
        return entropy_from_logits_original(
            logits.contiguous(), chunk_size=chunk_size
        )

    for B, L, method in configs:
        if method == "transpose":
            raw_shape = (L, B, V)
        elif method == "slice_batch":
            raw_shape = (B * 2, L, V)
        else:
            continue

        # Size check happens BEFORE allocation so an oversized case is skipped
        # instead of running out of memory while building the tensor.
        raw_gb = raw_shape[0] * raw_shape[1] * raw_shape[2] * 2 / 1e9
        if raw_gb > MAX_LOGITS_GB:
            print(f"\n skip B={B}, L={L}, {method} ({raw_gb:.1f} GB)")
            continue

        torch.manual_seed(42)
        raw = torch.randn(*raw_shape, device="cuda", dtype=torch.bfloat16)
        logits_nc = raw.transpose(0, 1) if method == "transpose" else raw[::2]

        N = B * L
        print(f"\n{'─' * 60}")
        print(f"B={B}, L={L} {method} ({N} rows, raw {raw_gb:.2f} GB)")
        print(f"{'─' * 60}")

        t_orig = profile_time(original_with_copy, logits_nc)
        t_triton = profile_time(entropy_from_logits, logits_nc)
        orig_mean = statistics.mean(t_orig)
        triton_mean = statistics.mean(t_triton)

        print(" TIME (ms):")
        print(f" orig+copy: {fmt(t_orig, 'ms')}")
        print(f" triton-strided:{fmt(t_triton, 'ms')}")
        print(f" speedup: {orig_mean / triton_mean:.2f}x")

        m_orig = profile_memory(original_with_copy, logits_nc)
        m_triton = profile_memory(entropy_from_logits, logits_nc)
        orig_peak = statistics.mean(m_orig)
        triton_peak = statistics.mean(m_triton)

        print(" MEMORY (peak overhead):")
        print(f" orig+copy: {fmt(m_orig, 'MB')}")
        print(f" triton-strided:{fmt(m_triton, 'MB')}")
        print(f" saved: {orig_peak - triton_peak:.1f} MB")

        del raw, logits_nc
        _clean_gpu()


if __name__ == "__main__":
    benchmark_contiguous()
    benchmark_noncontiguous()
Usage: CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_scattermoe_lora.py CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_scattermoe_lora.py --ranks 16 64 CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_scattermoe_lora.py --models Qwen/Qwen3.5-35B-A3B """ import argparse import gc import time from functools import partial import torch from axolotl.integrations.kernels.libs.scattermoe_lora.kernels import ( lora_ops, ops as base_ops, ) from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_experts import ( flatten_sort_count, ) from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_linear_lora import ( ScatterMoELoRA, ) DEVICE = "cuda" DTYPE = torch.bfloat16 WARMUP = 5 ITERS = 20 # ─── Model configs ────────────────────────────────────────────────────────── BUILTIN_CONFIGS = { "Qwen3.5-35B-A3B": (256, 2048, 512, 8), # E, H, I, k "Qwen3-30B-A3B": (128, 2048, 768, 8), "OLMoE-1B-7B": (64, 2048, 1024, 8), "Mixtral-8x7B": (8, 4096, 14336, 2), } def _resolve_config(spec): """Resolve a model spec to (E, H, I, k). 
Accepts builtin names or HF IDs.""" key = spec.lower().replace("/", "-") for name, cfg in BUILTIN_CONFIGS.items(): if key in name.lower() or name.lower() in key: return name, cfg from transformers import AutoConfig hf_cfg = AutoConfig.from_pretrained(spec, trust_remote_code=True) if callable(getattr(hf_cfg, "get_text_config", None)): tc = hf_cfg.get_text_config() if hasattr(tc, "model_type") and tc.model_type != hf_cfg.model_type: hf_cfg = tc hidden = hf_cfg.hidden_size inter = getattr(hf_cfg, "moe_intermediate_size", None) or hf_cfg.intermediate_size experts = ( getattr(hf_cfg, "num_experts", None) or getattr(hf_cfg, "num_local_experts", None) or getattr(hf_cfg, "n_routed_experts", None) ) top_k = ( getattr(hf_cfg, "num_experts_per_tok", None) or getattr(hf_cfg, "num_experts_per_token", None) or 2 ) name = spec.split("/")[-1] return name, (experts, hidden, inter, top_k) # ─── Benchmark helpers ────────────────────────────────────────────────────── def _clean(): gc.collect() torch.cuda.empty_cache() torch.cuda.synchronize() def _bench(fn, warmup=WARMUP, iters=ITERS): for _ in range(warmup): fn() torch.cuda.synchronize() times = [] for _ in range(iters): torch.cuda.synchronize() t0 = time.perf_counter() fn() torch.cuda.synchronize() times.append((time.perf_counter() - t0) * 1000) times.sort() return times[len(times) // 2] def _setup(num_experts, K, N, T, top_k, R): torch.manual_seed(42) x = torch.randn(T, K, device=DEVICE, dtype=DTYPE) W = torch.randn(num_experts, K, N, device=DEVICE, dtype=DTYPE) * 0.02 lora_A = torch.randn(R * num_experts, K, device=DEVICE, dtype=DTYPE) * 0.01 lora_B = torch.randn(N, R * num_experts, device=DEVICE, dtype=DTYPE) * 0.01 logits = torch.randn(T, num_experts, device=DEVICE) _, top_idx = torch.topk(torch.softmax(logits, dim=-1), top_k, dim=-1) sei, ssi, eo = flatten_sort_count(top_idx, num_experts) gx = base_ops.group(x, ssi, fan_out=top_k) dy = torch.randn(gx.size(0), N, device=DEVICE, dtype=DTYPE) return x, W, lora_A, lora_B, sei, 
def main() -> None:
    """Run the ScatterMoE LoRA kernel benchmark from the command line.

    For every selected model config, LoRA rank and projection
    (``gate_up`` / ``down``) this times the individual kernels (LoRA forward,
    base forward, dX, LoRA backward) and then a full autograd forward+backward
    through ``ScatterMoELoRA``, printing median timings and the peak memory
    delta of one autograd step.
    """
    parser = argparse.ArgumentParser(description="ScatterMoE LoRA kernel benchmark")
    parser.add_argument(
        "--models",
        "-m",
        nargs="+",
        help="Model names or HF IDs (default: all builtins)",
    )
    parser.add_argument("--ranks", "-r", nargs="+", type=int, default=[16, 32, 64])
    parser.add_argument("--seq-len", "-T", type=int, default=2048)
    args = parser.parse_args()

    T = args.seq_len

    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"T={T}, ranks={args.ranks}\n")

    # Each entry is (display_name, (E, H, I, k)).
    if args.models:
        configs = [_resolve_config(m) for m in args.models]
    else:
        configs = list(BUILTIN_CONFIGS.items())

    for model_name, (num_experts, hidden, inter, top_k) in configs:
        print(f"{'=' * 70}")
        print(f" {model_name}: E={num_experts}, H={hidden}, I={inter}, k={top_k}")
        print(f"{'=' * 70}")

        for R in args.ranks:
            # gate_up maps hidden -> 2 * inter, down maps inter -> hidden.
            for proj, K, N in [("gate_up", hidden, 2 * inter), ("down", inter, hidden)]:
                _clean()
                x, W, lA, lB, sei, ssi, eo, gx, dy = _setup(
                    num_experts, K, N, T, top_k, R
                )

                # Forward with LoRA (auto-dispatched: fused or split)
                # This string is only a label for the report; presumably it
                # mirrors the rule lora_ops applies internally -- TODO confirm.
                dispatch = (
                    "split"
                    if (
                        num_experts <= lora_ops._SPLIT_LORA_FWD_MAX_EXPERTS
                        and K * N >= lora_ops._SPLIT_LORA_FWD_THRESHOLD
                    )
                    else "fused"
                )
                # partial() binds the current tensors so the timed callables
                # do not capture loop variables by reference.
                t_fwd = _bench(partial(_call_fwd, x, W, sei, ssi, top_k, lA, lB))
                t_base = _bench(partial(_call_base, x, W, sei, ssi, top_k))
                t_dx = _bench(partial(_call_dx, dy, W, sei, ssi, lA, lB))
                t_bwd = _bench(partial(_call_bwd, dy, gx, lA, lB, eo, num_experts))

                total = t_fwd + t_dx + t_bwd
                # Relative cost of the LoRA forward over the base forward.
                overhead = t_fwd / t_base - 1 if t_base > 0 else 0

                print(
                    f" R={R:>2} {proj:<8} "
                    f"fwd={t_fwd:>6.2f}ms [{dispatch}] "
                    f"base={t_base:>6.2f}ms "
                    f"(+{overhead * 100:.0f}%) "
                    f"dx={t_dx:>6.2f}ms bwd={t_bwd:>6.2f}ms "
                    f"total={total:>6.2f}ms"
                )

                # Full autograd fwd+bwd with memory measurement
                x_ag = x.clone().requires_grad_(True)
                lA_ag = lA.clone().requires_grad_(True)
                lB_ag = lB.clone().requires_grad_(True)

                # Loop values are bound as default arguments so the closure
                # does not late-bind the loop variables.
                def _run_autograd(
                    _x=x_ag,
                    _W=W,
                    _k=top_k,
                    _sei=sei,
                    _ssi=ssi,
                    _eo=eo,
                    _lA=lA_ag,
                    _lB=lB_ag,
                ):
                    # NOTE(review): the trailing positional arguments
                    # (2.0, None, None, False, False, True, False) follow
                    # ScatterMoELoRA's signature, which is not visible here;
                    # 2.0 presumably is the LoRA scaling -- confirm.
                    out = ScatterMoELoRA.apply(
                        _x,
                        _W,
                        _k,
                        _sei,
                        _ssi,
                        _eo,
                        _lA,
                        _lB,
                        2.0,
                        None,
                        None,
                        False,
                        False,
                        True,
                        False,
                    )
                    out.sum().backward()
                    # Drop gradients so they do not accumulate across iterations.
                    _x.grad = None
                    _lA.grad = None
                    _lB.grad = None

                t_full = _bench(_run_autograd)

                # Peak memory of a single step, relative to what is already
                # allocated after cleanup.
                _clean()
                torch.cuda.reset_peak_memory_stats()
                mem_before = torch.cuda.memory_allocated()
                _run_autograd()
                torch.cuda.synchronize()
                mem_peak = torch.cuda.max_memory_allocated() - mem_before
                print(
                    f" full_fwd_bwd={t_full:>6.2f}ms "
                    f"peak_delta={mem_peak / 1e6:>6.1f}MB"
                )

        print()
def fmt(values, unit=""):
    """Summarise a list of measurements as one report line.

    The line shows the mean, the sample standard deviation (``0.0`` when
    fewer than two values are given), the unit, and the min/max of ``values``.
    """
    center = statistics.mean(values)
    if len(values) > 1:
        spread = statistics.stdev(values)
    else:
        spread = 0.0
    lowest = min(values)
    highest = max(values)
    return f"{center:8.2f} ± {spread:5.2f} {unit} [min={lowest:.2f}, max={highest:.2f}]"
def benchmark_backward():
    """Benchmark forward+backward of the original and Triton selective log-softmax.

    For each ``(B, L)`` shape whose bf16 logits fit the memory budget, times
    and memory-profiles one forward+backward step of both implementations on
    identical inputs and prints the comparison.
    """
    banner = "=" * 60
    print("\n" + banner)
    print(f"FWD+BWD BENCHMARK (warmup={WARMUP}, time={BENCH_ITERS}, mem={MEM_ITERS})")
    print(banner)

    shapes = ((1, 2048), (1, 8192), (4, 4096), (8, 2048), (16, 2048), (16, 4096))

    def _make_step(kernel):
        """Build a fwd+bwd step that clears the old gradient before running."""

        def _step(logits, index):
            logits.grad = None
            out = kernel(logits, index)
            out.sum().backward()

        return _step

    fwd_bwd_original = _make_step(selective_log_softmax_original)
    fwd_bwd_triton = _make_step(selective_log_softmax)

    rule = "─" * 60
    for B, L in shapes:
        # 2 bytes per bf16 element.
        logits_gb = B * L * V * 2 / 1e9
        if logits_gb > 20:
            print(f"\n skip B={B}, L={L} ({logits_gb:.1f} GB, need room for grads)")
            continue

        rows = B * L
        print(f"\n{rule}")
        print(f"B={B:2d}, L={L:5d} ({rows:6d} rows, logits {logits_gb:.2f} GB)")
        print(f"{rule}")

        # Same seed per shape; both implementations see identical logits.
        torch.manual_seed(42)
        logits_orig = torch.randn(
            B, L, V, device="cuda", dtype=torch.bfloat16, requires_grad=True
        )
        logits_tri = logits_orig.detach().clone().requires_grad_(True)
        index = torch.randint(0, V, (B, L), device="cuda")

        times_orig = profile_time(fwd_bwd_original, (logits_orig, index))
        times_tri = profile_time(fwd_bwd_triton, (logits_tri, index))
        speedup = statistics.mean(times_orig) / statistics.mean(times_tri)

        print(" FWD+BWD TIME (ms):")
        print(f" original: {fmt(times_orig, 'ms')}")
        print(f" triton: {fmt(times_tri, 'ms')}")
        print(f" speedup: {speedup:.2f}x")

        peaks_orig = profile_memory(fwd_bwd_original, (logits_orig, index))
        peaks_tri = profile_memory(fwd_bwd_triton, (logits_tri, index))
        saved = statistics.mean(peaks_orig) - statistics.mean(peaks_tri)

        print(" FWD+BWD MEMORY (peak overhead):")
        print(f" original: {fmt(peaks_orig, 'MB')}")
        print(f" triton: {fmt(peaks_tri, 'MB')}")
        print(f" saved: {saved:.1f} MB")

        # Release the large tensors before the next shape is allocated.
        del logits_orig, logits_tri, index
        _clean_gpu()
# CI image template for the pip-based axolotl test image.
# The {{ ... }} placeholders are filled in by the Jinja render in the cicd/*.py scripts.
FROM axolotlai/axolotl-base:{{ BASE_TAG }}

ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
ENV CUDA="{{ CUDA }}"
ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
ENV GITHUB_REF="{{ GITHUB_REF }}"
ENV GITHUB_SHA="{{ GITHUB_SHA }}"
ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
ENV HF_HOME="{{ HF_HOME }}"
ENV AXOLOTL_DATASET_NUM_PROC="8"

RUN apt-get update && \
    apt-get install -y --allow-change-held-packages vim curl nano zstd libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm

WORKDIR /workspace

RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git

WORKDIR /workspace/axolotl

# Check out the exact ref under test on top of the shallow clone
RUN git fetch origin +$GITHUB_REF && \
    git checkout FETCH_HEAD

# Nightly builds: point the core Hugging Face dependencies at their main branches
RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
    fi

RUN pip install packaging==26.0 setuptools==78.1.1 psutil
RUN pip uninstall -y causal_conv1d
# If AXOLOTL_EXTRAS is set, append it to the default extras inside the brackets
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
    fi

# These scripts print install commands, which are piped to sh for execution
RUN python scripts/unsloth_install.py | sh
RUN python scripts/cutcrossentropy_install.py | sh

# So we can test the Docker image
RUN pip install -r requirements-dev.txt -r requirements-tests.txt

# fix so that git fetch/pull from remote works
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
    git config --get remote.origin.fetch

# helper for huggingface-login cli
RUN git config --global credential.helper store
"microsoft/Phi-3.5-mini-instruct" # hf download "microsoft/Phi-3-medium-128k-instruct" # Run unit tests with initial coverage report pytest -v --durations=10 -n8 \ --ignore=tests/e2e/ \ --ignore=tests/patched/ \ --ignore=tests/cli \ /workspace/axolotl/tests/ \ --cov=axolotl # Run lora kernels tests with coverage append pytest -v --durations=10 \ /workspace/axolotl/tests/e2e/patched/lora_kernels \ --cov=axolotl \ --cov-append # Run patched tests excluding lora kernels with coverage append pytest --full-trace -vvv --durations=10 \ --ignore=tests/e2e/patched/lora_kernels \ /workspace/axolotl/tests/e2e/patched \ --cov=axolotl \ --cov-append # Run solo tests with coverage append pytest -v --durations=10 -n1 \ /workspace/axolotl/tests/e2e/solo/ \ --cov=axolotl \ --cov-append # Run integration tests with coverage append pytest -v --durations=10 \ /workspace/axolotl/tests/e2e/integrations/ \ --cov=axolotl \ --cov-append pytest -v --durations=10 /workspace/axolotl/tests/cli \ --cov=axolotl \ --cov-append # Run remaining e2e tests with coverage append and final report pytest -v --durations=10 \ --ignore=tests/e2e/solo/ \ --ignore=tests/e2e/patched/ \ --ignore=tests/e2e/multigpu/ \ --ignore=tests/e2e/integrations/ \ --ignore=tests/cli \ /workspace/axolotl/tests/e2e/ \ --cov=axolotl \ --cov-append \ --cov-report=xml:e2e-coverage.xml codecov upload-process -t $CODECOV_TOKEN -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION} || true ================================================ FILE: cicd/cleanup.py ================================================ """Modal app to run axolotl GPU cleanup""" from .single_gpu import VOLUME_CONFIG, app, cicd_image, run_cmd @app.function( image=cicd_image, timeout=60 * 60, cpu=8.0, memory=131072, volumes=VOLUME_CONFIG, ) def cleanup(): run_cmd("./cicd/cleanup.sh", "/workspace/axolotl") @app.local_entrypoint() def main(): cleanup.remote() ================================================ FILE: cicd/cleanup.sh 
@app.function(
    image=cicd_image,
    gpu=GPU_CONFIG,
    timeout=120 * 60,  # 120 min, assuming the timeout is expressed in seconds
    cpu=8.0,
    memory=131072,
    volumes=VOLUME_CONFIG,
)
def cicd_pytest():
    """Run the single-GPU test script ``./cicd/cicd.sh`` from ``/workspace/axolotl``."""
    run_cmd("./cicd/cicd.sh", "/workspace/axolotl")
def run_cmd(cmd: str, run_folder: str):
    """Run ``cmd`` in ``run_folder`` and exit with its status if it fails.

    Args:
        cmd: Command line to run. It is tokenised with shell-like quoting
            rules (no shell is spawned), so quoted arguments that contain
            spaces stay intact.
        run_folder: Working directory for the child process.

    Raises:
        SystemExit: With the child's exit code when that code is non-zero.
    """
    import shlex
    import subprocess  # nosec
    import sys

    # shlex.split honours quotes; str.split() would cut quoted arguments apart.
    exit_code = subprocess.call(shlex.split(cmd), cwd=run_folder)  # nosec

    # Propagate errors from subprocess. sys.exit is used rather than the
    # ``exit`` builtin, which only exists when the ``site`` module is loaded.
    if exit_code:
        sys.exit(exit_code)
"$CODECOV_TOKEN" ]; then codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true fi ================================================ FILE: cicd/single_gpu.py ================================================ """Modal app to run axolotl GPU tests""" import os import pathlib import tempfile import jinja2 import modal import modal.experimental from jinja2 import select_autoescape from modal import App cicd_path = pathlib.Path(__file__).parent.resolve() template_loader = jinja2.FileSystemLoader(searchpath=cicd_path) template_env = jinja2.Environment( loader=template_loader, autoescape=select_autoescape() ) dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja") df_template = template_env.get_template(dockerfile) df_args = { "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""), "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""), "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"), "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"), "CUDA": os.environ.get("CUDA", "126"), "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"), "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""), "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""), "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""), "HF_HOME": "/workspace/data/huggingface-cache/hub", "PYTHONUNBUFFERED": os.environ.get("PYTHONUNBUFFERED", "1"), "DEEPSPEED_LOG_LEVEL": os.environ.get("DEEPSPEED_LOG_LEVEL", "WARNING"), } dockerfile_contents = df_template.render(**df_args) temp_dir = tempfile.mkdtemp() with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f: f.write(dockerfile_contents) cicd_image = modal.experimental.raw_dockerfile_image( pathlib.Path(temp_dir) / "Dockerfile", # context_mount=None, force_build=True, # gpu="A10G", ).env(df_args) app = App("Axolotl CI/CD", secrets=[]) hf_cache_volume = modal.Volume.from_name( "axolotl-ci-hf-hub-cache", create_if_missing=True ) VOLUME_CONFIG 
def run_cmd(cmd: str, run_folder: str):
    """Run ``cmd`` in ``run_folder`` and raise if it exits non-zero.

    The child inherits the current environment with
    ``AXOLOTL_DATASET_NUM_PROC`` forced to ``"8"``.

    Args:
        cmd: Command line to run. It is tokenised with shell-like quoting
            rules (no shell is spawned), so quoted arguments that contain
            spaces stay intact.
        run_folder: Working directory for the child process.

    Raises:
        RuntimeError: If the command exits with a non-zero status.
    """
    import shlex
    import subprocess  # nosec

    sp_env = os.environ.copy()
    sp_env["AXOLOTL_DATASET_NUM_PROC"] = "8"

    # shlex.split honours quotes; str.split() would cut quoted arguments apart.
    argv = shlex.split(cmd)

    # Propagate errors from subprocess.
    exit_code = subprocess.call(argv, cwd=run_folder, env=sp_env)  # nosec
    if exit_code:
        raise RuntimeError(f"Command '{cmd}' failed with exit code {exit_code}")
================================================ FILE: deepspeed_configs/zero1_torch_compile.json ================================================ { "zero_optimization": { "stage": 1, "overlap_comm": true }, "bf16": { "enabled": "auto" }, "fp16": { "enabled": "auto", "auto_cast": false, "loss_scale": 0, "initial_scale_power": 32, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1 }, "compile": { "disable": false, "backend": "inductor" }, "gradient_accumulation_steps": "auto", "gradient_clipping": "auto", "train_batch_size": "auto", "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false } ================================================ FILE: deepspeed_configs/zero2.json ================================================ { "zero_optimization": { "stage": 2, "offload_optimizer": { "device": "cpu" }, "contiguous_gradients": true, "overlap_comm": true }, "bf16": { "enabled": "auto" }, "fp16": { "enabled": "auto", "auto_cast": false, "loss_scale": 0, "initial_scale_power": 32, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1 }, "gradient_accumulation_steps": "auto", "gradient_clipping": "auto", "train_batch_size": "auto", "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false } ================================================ FILE: deepspeed_configs/zero2_torch_compile.json ================================================ { "compile": { "disable": false, "backend": "inductor" }, "zero_optimization": { "stage": 2, "offload_optimizer": { "device": "cpu" }, "contiguous_gradients": true, "overlap_comm": true }, "bf16": { "enabled": "auto" }, "fp16": { "enabled": "auto", "auto_cast": false, "loss_scale": 0, "initial_scale_power": 32, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1 }, "gradient_accumulation_steps": "auto", "gradient_clipping": "auto", "train_batch_size": "auto", "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false } ================================================ 
FILE: deepspeed_configs/zero3.json ================================================ { "zero_optimization": { "stage": 3, "overlap_comm": true, "contiguous_gradients": true, "sub_group_size": 0, "reduce_bucket_size": "auto", "stage3_prefetch_bucket_size": "auto", "stage3_param_persistence_threshold": "auto", "max_live_parameters": 0, "max_reuse_distance": 0, "gather_16bit_weights_on_model_save": true }, "bf16": { "enabled": "auto" }, "fp16": { "enabled": "auto", "auto_cast": false, "loss_scale": 0, "initial_scale_power": 32, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1 }, "gradient_accumulation_steps": "auto", "gradient_clipping": "auto", "train_batch_size": "auto", "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false } ================================================ FILE: deepspeed_configs/zero3_bf16.json ================================================ { "zero_optimization": { "stage": 3, "overlap_comm": true, "contiguous_gradients": true, "sub_group_size": 0, "reduce_bucket_size": "auto", "stage3_prefetch_bucket_size": "auto", "stage3_param_persistence_threshold": "auto", "max_live_parameters": 0, "max_reuse_distance": 0, "gather_16bit_weights_on_model_save": true }, "bf16": { "enabled": true }, "gradient_accumulation_steps": "auto", "gradient_clipping": "auto", "train_batch_size": "auto", "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false } ================================================ FILE: deepspeed_configs/zero3_bf16_cpuoffload_all.json ================================================ { "zero_force_ds_cpu_optimizer": false, "zero_allow_untested_optimizer": true, "zero_optimization": { "stage": 3, "offload_optimizer": { "device": "cpu", "pin_memory": true }, "offload_param": { "device": "cpu", "pin_memory": true }, "overlap_comm": true, "contiguous_gradients": true, "sub_group_size": 0, "reduce_bucket_size": "auto", "stage3_prefetch_bucket_size": "auto", "stage3_param_persistence_threshold": 
"auto", "max_live_parameters": 0, "max_reuse_distance": 0, "gather_16bit_weights_on_model_save": true }, "bf16": { "enabled": true }, "gradient_accumulation_steps": "auto", "gradient_clipping": "auto", "train_batch_size": "auto", "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false } ================================================ FILE: deepspeed_configs/zero3_bf16_cpuoffload_params.json ================================================ { "zero_force_ds_cpu_optimizer": false, "zero_allow_untested_optimizer": true, "zero_optimization": { "stage": 3, "offload_param": { "device": "cpu", "pin_memory": true }, "overlap_comm": true, "contiguous_gradients": true, "sub_group_size": 0, "reduce_bucket_size": "auto", "stage3_prefetch_bucket_size": "auto", "stage3_param_persistence_threshold": "auto", "max_live_parameters": 0, "max_reuse_distance": 0, "gather_16bit_weights_on_model_save": true }, "bf16": { "enabled": true }, "gradient_accumulation_steps": "auto", "gradient_clipping": "auto", "train_batch_size": "auto", "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false } ================================================ FILE: devtools/README.md ================================================ This directory contains example config files that might be useful for debugging. Please see [docs/debugging.qmd](../docs/debugging.qmd) for more information. 
================================================ FILE: devtools/dev_chat_template.yml ================================================ # Example config for debugging the chat_template prompt format base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer load_in_8bit: true load_in_4bit: false datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template shards: 10 val_set_size: 0 output_dir: temp_debug/axolotl_outputs/model dataset_prepared_path: temp_debug/axolotl_outputs/data dataset_num_proc: 1 sequence_len: 4096 sample_packing: false pad_to_sequence_len: true adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_fan_in_fan_out: micro_batch_size: 1 num_epochs: 1 max_steps: 10 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 train_on_inputs: false group_by_length: false bf16: false fp16: true tf32: false gradient_checkpointing: true logging_steps: 1 flash_attention: true warmup_steps: 10 weight_decay: 0.0 ================================================ FILE: docker/Dockerfile ================================================ ARG BASE_TAG=main-base FROM axolotlai/axolotl-base:$BASE_TAG ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX" ARG AXOLOTL_EXTRAS="" ARG AXOLOTL_ARGS="" ARG CUDA="118" ARG PYTORCH_VERSION="2.1.2" ARG TARGETARCH ENV PYTORCH_VERSION=$PYTORCH_VERSION RUN apt-get update && \ apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs && \ rm -rf /var/cache/apt/archives && \ rm -rf /var/lib/apt/lists/* WORKDIR /workspace RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git WORKDIR /workspace/axolotl # If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64 RUN pip uninstall -y causal_conv1d RUN if [ "$TARGETARCH" = "arm64" ]; then \ BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \ else \ 
BASE_EXTRAS="deepspeed,flash-attn,ring-flash-attn,optimizers,ray"; \ fi && \ if [ "$AXOLOTL_EXTRAS" != "" ]; then \ pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \ else \ pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \ fi && \ python scripts/unsloth_install.py | sh && \ python scripts/cutcrossentropy_install.py | sh && \ pip install pytest && \ pip cache purge # fix so that git fetch/pull from remote works with shallow clone RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \ git config --get remote.origin.fetch && \ git config --global credential.helper store COPY .axolotl-complete.bash /root/.axolotl-complete.bash RUN chmod +x /root/.axolotl-complete.bash && \ echo 'source /root/.axolotl-complete.bash' >> ~/.bashrc ================================================ FILE: docker/Dockerfile-base ================================================ ARG CUDA_VERSION="11.8.0" ARG CUDNN_VERSION="8" ARG UBUNTU_VERSION="22.04" ARG MAX_JOBS=4 ARG TARGETARCH FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder ENV PATH="/root/miniconda3/bin:${PATH}" ARG TARGETARCH ARG PYTHON_VERSION="3.11" ARG PYTORCH_VERSION="2.1.2" ARG CUDA="128" ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX" ENV PYTHON_VERSION=$PYTHON_VERSION ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST RUN apt-get update \ && apt-get install -y --no-install-recommends \ wget git build-essential ninja-build git-lfs libaio-dev pkg-config \ ibverbs-providers ibverbs-utils infiniband-diags \ librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm \ && rm -rf /var/cache/apt/archives \ && rm -rf /var/lib/apt/lists/* \ && if [ "$TARGETARCH" = "amd64" ]; then \ MINICONDA_ARCH="x86_64"; \ elif [ "$TARGETARCH" = "arm64" ]; then \ MINICONDA_ARCH="aarch64"; \ else \ echo "Unsupported architecture: $TARGETARCH"; exit 1; \ fi \ && wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh \ 
&& mkdir /root/.conda \ && bash Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh -b \ && rm -f Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh \ && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \ && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \ && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}" ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}" WORKDIR /workspace RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==26.0 setuptools==75.8.0 wheel psutil && \ python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \ python3 -m pip cache purge RUN if [ "$CUDA" != "130" ] ; then \ CAUSAL_CONV1D_FORCE_CXX11_ABI=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@v1.5.4"; \ python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"; \ python3 -m pip cache purge; \ fi RUN git lfs install --skip-repo && \ pip3 install awscli && \ # The base image ships with `pydantic==1.8.2` which is not working pip3 install -U --no-cache-dir pydantic==1.10.10 && \ pip3 cache purge # Map Python version (e.g., 3.12 -> cp312) RUN PYTHON_CP="cp$(echo $PYTHON_VERSION | tr -d '.')" && \ # Map PyTorch version (e.g., 2.9.1 -> torch2.9, 2.10.0 -> torch2.10) TORCH_TAG="torch$(echo $PYTORCH_VERSION | grep -oP '^\d+\.\d+')" && \ # Map architecture case "$TARGETARCH" in \ amd64) ARCH_TAG="x86_64" ;; \ arm64) ARCH_TAG="aarch64" ;; \ *) echo "Unsupported architecture: $TARGETARCH"; exit 1 ;; \ esac && \ WHL_VERSION="v0.7.16" && \ WHL_FILE="flash_attn-2.8.3+cu${CUDA}${TORCH_TAG}-${PYTHON_CP}-${PYTHON_CP}-linux_${ARCH_TAG}.whl" && \ wget -nv "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}" && \ pip3 install 
--no-cache-dir "${WHL_FILE}" && \ rm "${WHL_FILE}" ================================================ FILE: docker/Dockerfile-base-next ================================================ ARG CUDA_VERSION="12.8.1" ARG CUDNN_VERSION="8" ARG UBUNTU_VERSION="22.04" ARG MAX_JOBS=4 FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder ENV PATH="/root/miniconda3/bin:${PATH}" ARG PYTHON_VERSION="3.11" ARG PYTORCH_VERSION="next" ARG CUDA="128" ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX" ENV PYTHON_VERSION=$PYTHON_VERSION ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST RUN apt-get update \ && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \ && wget \ https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ && mkdir /root/.conda \ && bash Miniconda3-latest-Linux-x86_64.sh -b \ && rm -f Miniconda3-latest-Linux-x86_64.sh \ && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}" ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}" WORKDIR /workspace RUN python3 -m pip install --upgrade pip && pip3 install packaging && \ python3 -m pip install --no-cache-dir -U torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \ python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \ python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" RUN git lfs install --skip-repo && \ pip3 install awscli && \ pip3 install -U --no-cache-dir pydantic==2.10.6 ================================================ FILE: docker/Dockerfile-base-nightly ================================================ ARG CUDA_VERSION="12.8.1" ARG CUDNN_VERSION="8" ARG UBUNTU_VERSION="22.04" ARG MAX_JOBS=4 FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder ENV PATH="/root/miniconda3/bin:${PATH}" ARG 
PYTHON_VERSION="3.11" ARG PYTORCH_VERSION="nightly" ARG CUDA="128" ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX" ENV PYTHON_VERSION=$PYTHON_VERSION ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST RUN apt-get update \ && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \ && wget \ https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ && mkdir /root/.conda \ && bash Miniconda3-latest-Linux-x86_64.sh -b \ && rm -f Miniconda3-latest-Linux-x86_64.sh \ && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \ && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \ && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}" ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}" WORKDIR /workspace RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==26.0 setuptools==75.8.0 wheel && \ python3 -m pip install --no-cache-dir -U torch --extra-index-url https://download.pytorch.org/whl/nightly/cu$CUDA && \ python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \ python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \ python3 -m pip cache purge RUN git lfs install --skip-repo && \ pip3 install awscli && \ # The base image ships with `pydantic==1.8.2` which is not working pip3 install -U --no-cache-dir pydantic==1.10.10 && \ pip3 cache purge ================================================ FILE: docker/Dockerfile-cloud ================================================ ARG BASE_TAG=main FROM axolotlai/axolotl:$BASE_TAG ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets" ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub" ENV HF_HOME="/workspace/data/huggingface-cache/hub" ENV HF_HUB_ENABLE_HF_TRANSFER="1" EXPOSE 8888 EXPOSE 22 COPY scripts/cloud-entrypoint.sh 
/root/cloud-entrypoint.sh COPY scripts/motd /etc/motd RUN pip install jupyterlab notebook ipywidgets && \ jupyter lab clean RUN apt update && \ apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \ rm -rf /var/cache/apt/archives && \ rm -rf /var/lib/apt/lists/* && \ mkdir -p ~/.ssh && \ chmod 700 ~/.ssh && \ printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \ printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \ chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \ chmod +x /root/cloud-entrypoint.sh && \ echo 'set-option -g history-limit 5000' >> ~/.tmux.conf ENTRYPOINT ["/root/cloud-entrypoint.sh"] CMD ["sleep", "infinity"] ================================================ FILE: docker/Dockerfile-cloud-no-tmux ================================================ ARG BASE_TAG=main FROM axolotlai/axolotl:$BASE_TAG ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets" ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub" ENV HF_HOME="/workspace/data/huggingface-cache/hub" ENV HF_HUB_ENABLE_HF_TRANSFER="1" EXPOSE 8888 EXPOSE 22 COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh COPY scripts/motd /etc/motd RUN pip install jupyterlab notebook ipywidgets && \ jupyter lab clean RUN apt update && \ apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm && \ rm -rf /var/cache/apt/archives && \ rm -rf /var/lib/apt/lists/* && \ mkdir -p ~/.ssh && \ chmod 700 ~/.ssh && \ printf "[ ! 
-z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \ chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \ chmod +x /root/cloud-entrypoint.sh ENTRYPOINT ["/root/cloud-entrypoint.sh"] CMD ["sleep", "infinity"] ================================================ FILE: docker/Dockerfile-cloud-uv ================================================ ARG BASE_TAG=main FROM axolotlai/axolotl-uv:$BASE_TAG ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets" ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub" ENV HF_HOME="/workspace/data/huggingface-cache/hub" ENV HF_HUB_ENABLE_HF_TRANSFER="1" EXPOSE 8888 EXPOSE 22 COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh COPY scripts/motd /etc/motd RUN uv pip install jupyterlab notebook ipywidgets && \ jupyter lab clean RUN apt update && \ apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \ rm -rf /var/cache/apt/archives && \ rm -rf /var/lib/apt/lists/* && \ mkdir -p ~/.ssh && \ chmod 700 ~/.ssh && \ printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \ printf "[ ! 
-z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \ chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \ chmod +x /root/cloud-entrypoint.sh && \ echo 'set-option -g history-limit 5000' >> ~/.tmux.conf ENTRYPOINT ["/root/cloud-entrypoint.sh"] CMD ["sleep", "infinity"] ================================================ FILE: docker/Dockerfile-tests ================================================ ARG BASE_TAG=main-base FROM axolotlai/axolotl-base:$BASE_TAG ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX" ARG AXOLOTL_EXTRAS="" ARG AXOLOTL_ARGS="" ARG CUDA="118" ARG PYTORCH_VERSION="2.1.2" ARG GITHUB_REF="main" ENV PYTORCH_VERSION=$PYTORCH_VERSION RUN apt-get update && \ apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev WORKDIR /workspace RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git WORKDIR /workspace/axolotl RUN git fetch origin +$GITHUB_REF && \ git checkout FETCH_HEAD # If AXOLOTL_EXTRAS is set, append it in brackets RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \ else \ pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \ fi # So we can test the Docker image RUN pip install pytest # fix so that git fetch/pull from remote works RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \ git config --get remote.origin.fetch # helper for huggingface-login cli RUN git config --global credential.helper store ================================================ FILE: docker/Dockerfile-uv ================================================ ARG BASE_TAG=main-base FROM axolotlai/axolotl-base-uv:$BASE_TAG ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX" ARG AXOLOTL_EXTRAS="" ARG AXOLOTL_ARGS="" ARG CUDA="118" ARG PYTORCH_VERSION="2.1.2" ARG TARGETARCH ENV PYTORCH_VERSION=$PYTORCH_VERSION RUN apt-get update && \ apt-get install -y 
--allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs && \ rm -rf /var/cache/apt/archives && \ rm -rf /var/lib/apt/lists/* WORKDIR /workspace RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git WORKDIR /workspace/axolotl # If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64 RUN uv pip uninstall causal_conv1d RUN if [ "$TARGETARCH" = "arm64" ]; then \ BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \ else \ BASE_EXTRAS="deepspeed,flash-attn,ring-flash-attn,optimizers,ray"; \ fi && \ if [ "$AXOLOTL_EXTRAS" != "" ]; then \ uv pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \ else \ uv pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \ fi && \ python scripts/unsloth_install.py --uv | sh && \ python scripts/cutcrossentropy_install.py --uv | sh && \ uv pip install pytest && \ uv cache clean # fix so that git fetch/pull from remote works with shallow clone RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \ git config --get remote.origin.fetch && \ git config --global credential.helper store COPY .axolotl-complete.bash /root/.axolotl-complete.bash RUN chmod +x /root/.axolotl-complete.bash && \ echo 'source /root/.axolotl-complete.bash' >> ~/.bashrc ================================================ FILE: docker/Dockerfile-uv-base ================================================ ARG CUDA_VERSION="12.6.3" ARG CUDNN_VERSION="" ARG UBUNTU_VERSION="22.04" ARG MAX_JOBS=4 ARG TARGETARCH FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder ARG TARGETARCH ARG PYTHON_VERSION="3.11" ARG PYTORCH_VERSION="2.6.0" ARG CUDA="126" ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX" ENV PYTHON_VERSION=$PYTHON_VERSION ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST ENV UV_TORCH_BACKEND="cu${CUDA}" RUN apt-get update \ && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev 
pkg-config curl && rm -rf /var/lib/apt/lists/* \ && git lfs install --skip-repo \ && curl -LsSf https://astral.sh/uv/install.sh | sh ENV PATH="/root/.local/bin:${PATH}" RUN uv python install ${PYTHON_VERSION} WORKDIR /workspace RUN uv venv --no-project --relocatable axolotl-venv ENV PATH="/workspace/axolotl-venv/bin:${PATH}" RUN uv pip install packaging setuptools wheel psutil \ && uv pip install torch==${PYTORCH_VERSION} torchvision \ && uv pip install awscli pydantic RUN if [ "$TARGETARCH" = "amd64" ]; then \ uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main"; \ uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"; \ fi # Map Python version (e.g., 3.12 -> cp312) RUN PYTHON_CP="cp$(echo $PYTHON_VERSION | tr -d '.')" && \ # Map PyTorch version (e.g., 2.9.1 -> torch2.9, 2.10.0 -> torch2.10) TORCH_TAG="torch$(echo $PYTORCH_VERSION | grep -oP '^\d+\.\d+')" && \ # Map architecture case "$TARGETARCH" in \ amd64) ARCH_TAG="x86_64" ;; \ arm64) ARCH_TAG="aarch64" ;; \ *) echo "Unsupported architecture: $TARGETARCH"; exit 1 ;; \ esac && \ WHL_VERSION="v0.7.16" && \ WHL_FILE="flash_attn-2.8.3+cu${CUDA}${TORCH_TAG}-${PYTHON_CP}-${PYTHON_CP}-linux_${ARCH_TAG}.whl" && \ wget -nv "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}" && \ uv pip install --no-cache-dir "${WHL_FILE}" && \ rm "${WHL_FILE}" ================================================ FILE: docker-compose.yaml ================================================ # version: '3.8' services: axolotl: build: context: . 
dockerfile: ./docker/Dockerfile volumes: - .:/workspace/axolotl - ~/.cache/huggingface/:/root/.cache/huggingface/ # set environment variables environment: # Set environment variables - GIT_AUTHOR_NAME=${GIT_AUTHOR_NAME} - GIT_AUTHOR_EMAIL=${GIT_AUTHOR_EMAIL} - GIT_COMMITTER_NAME=${GIT_COMMITTER_NAME} - GIT_COMMITTER_EMAIL=${GIT_COMMITTER_EMAIL} - WANDB_API_KEY=${WANDB_API_KEY} deploy: resources: reservations: devices: - driver: nvidia # count: 1 capabilities: [gpu] command: tail -f /dev/null ================================================ FILE: docs/.gitignore ================================================ /.quarto/ _site/ /api/*.qmd /api/*.html config-reference.qmd models/**/*.qmd models/**/*.html ================================================ FILE: docs/amd_hpc.qmd ================================================ --- title: AMD GPUs on HPC Systems description: A comprehensive guide for using Axolotl on distributed systems with AMD GPUs --- This guide provides step-by-step instructions for installing and configuring Axolotl on a High-Performance Computing (HPC) environment equipped with AMD GPUs. ## Setup ### 1. Install Python We recommend using Miniforge, a minimal conda-based Python distribution: ```bash curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" bash Miniforge3-$(uname)-$(uname -m).sh ``` ### 2. Configure Python Environment Add Python to your PATH and ensure it's available at login: ```bash echo 'export PATH=~/miniforge3/bin:$PATH' >> ~/.bashrc echo 'if [ -f ~/.bashrc ]; then . ~/.bashrc; fi' >> ~/.bash_profile ``` ### 3. Load AMD GPU Software Load the ROCm module: ```bash module load rocm/5.7.1 ``` Note: The specific module name and version may vary depending on your HPC system. Consult your system documentation for the correct module name. ### 4. 
Install PyTorch Install PyTorch with ROCm support: ```bash pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.7 --force-reinstall ``` ### 5. Install Flash Attention Clone and install the Flash Attention repository: ```bash git clone --recursive https://github.com/ROCmSoftwarePlatform/flash-attention.git export GPU_ARCHS="gfx90a" cd flash-attention export PYTHON_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])') patch "${PYTHON_SITE_PACKAGES}/torch/utils/hipify/hipify_python.py" hipify_patch.patch pip install --no-build-isolation . ``` ### 6. Install Axolotl Clone and install Axolotl: ```bash git clone https://github.com/axolotl-ai-cloud/axolotl cd axolotl pip install packaging ninja pip install --no-build-isolation -e . ``` ### 7. Apply xformers Workaround xformers appears to be incompatible with ROCm. Apply the following workarounds: - Edit $HOME/packages/axolotl/src/axolotl/monkeypatch/llama_attn_hijack_flash.py modifying the code to always return `False` for SwiGLU availability from xformers. - Edit $HOME/miniforge3/lib/python3.10/site-packages/xformers/ops/swiglu_op.py replacing the "SwiGLU" function with a pass statement. ### 8. Prepare Job Submission Script Create a script for job submission using your HPC's particular software (e.g. Slurm, PBS). Include necessary environment setup and the command to run Axolotl training. If the compute node(s) do(es) not have internet access, it is recommended to include ```bash export TRANSFORMERS_OFFLINE=1 export HF_DATASETS_OFFLINE=1 ``` ### 9. Download Base Model Download a base model using the Hugging Face CLI: ```bash hf download meta-llama/Meta-Llama-3.1-8B --local-dir ~/hfdata/llama3.1-8B ``` ### 10. Create Axolotl Configuration Create an Axolotl configuration file (YAML format) tailored to your specific training requirements and dataset. Use FSDP for multi-node training. Note: Deepspeed did not work at the time of testing. 
However, if anyone managed to get it working, please let us know. ### 11. Preprocess Data Run preprocessing on the login node: ```bash CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess /path/to/your/config.yaml ``` ### 12. Train You are now ready to submit your previously prepared job script. 🚂 ================================================ FILE: docs/attention.qmd ================================================ --- title: Attention description: Supported attention modules in Axolotl --- ## SDP Attention This is the default built-in attention in PyTorch. ```yaml sdp_attention: true ``` For more details: [PyTorch docs](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) ## Flash Attention Axolotl supports Flash Attention 2, 3, and 4. The best available version is used automatically based on your installed packages and GPU. ```yaml flash_attention: true ``` For more details: [Flash Attention](https://github.com/Dao-AILab/flash-attention/) ### Flash Attention 2 Requirements: Ampere, Ada, or Hopper GPUs (Turing or lower not supported) ```bash pip install flash-attn --no-build-isolation ``` ::: {.callout-tip} If you get `undefined symbol` while training, ensure you installed PyTorch prior to Axolotl. Alternatively, try reinstalling or downgrading a version. ::: ### Flash Attention 3 Requirements: Hopper only and CUDA 12.8 (recommended) ```bash git clone https://github.com/Dao-AILab/flash-attention.git cd flash-attention/hopper python setup.py install ``` ### Flash Attention 4 Requirements: Hopper or Blackwell GPUs ```bash pip install flash-attn-4 ``` Or from source: ```bash git clone https://github.com/Dao-AILab/flash-attention.git cd flash-attention/flash_attn/cute pip install -e . # FA2's flash_attn package includes a cute/ stub that shadows FA4.
# Remove it so Python can find the real FA4 module: rm -r $(python -c "import flash_attn; print(flash_attn.__path__[0])")/cute ``` ::: {.callout-note} **Hopper (SM90) users**: The backward kernel is not yet included in the pip package. To use FA4 for training on Hopper, install from source using the instructions above. ::: ::: {.callout-warning} FA4 only supports head dimensions up to 128 (`d ≤ 128`). The DeepSeek shape `(192, 128)` is also supported but only on Blackwell. Axolotl automatically detects incompatible head dimensions and falls back to FA2/3. ::: For more details: [flash-attention/flash_attn/cute](https://github.com/Dao-AILab/flash-attention/tree/main/flash_attn/cute) ### AMD Requirements: ROCm 6.0 and above. See [Flash Attention AMD docs](https://github.com/Dao-AILab/flash-attention/tree/main?tab=readme-ov-file#amd-rocm-support). ## Flex Attention A flexible PyTorch API for attention used in combination with `torch.compile`. ```yaml flex_attention: true # recommended torch_compile: true ``` ::: {.callout-note} We recommend using the latest stable version of PyTorch for best performance. ::: For more details: [PyTorch docs](https://pytorch.org/blog/flexattention/) ## SageAttention Attention kernels with QK Int8 and PV FP16 accumulator. ```yaml sage_attention: true ``` Requirements: Ampere, Ada, or Hopper GPUs ```bash pip install sageattention==2.2.0 --no-build-isolation ``` ::: {.callout-warning} Only LoRA/QLoRA is recommended at the moment. We found that the loss drops to 0 for full finetuning. See [GitHub Issue](https://github.com/thu-ml/SageAttention/issues/198). ::: For more details: [Sage Attention](https://github.com/thu-ml/SageAttention) ::: {.callout-note} We do not support SageAttention 3 at the moment. If you are interested in adding this or improving the SageAttention implementation, please open an issue. ::: ## xFormers ```yaml xformers_attention: true ``` ::: {.callout-tip} We recommend using this with Turing GPUs or below (such as on Colab).
::: For more details: [xFormers](https://github.com/facebookresearch/xformers) ## Shifted Sparse Attention ::: {.callout-warning} We plan to deprecate this! If you use this feature, we recommend switching to methods above. ::: Requirements: LLaMA model architecture ```yaml flash_attention: true s2_attention: true ``` ::: {.callout-tip} No sample packing support! ::: ================================================ FILE: docs/batch_vs_grad.qmd ================================================ --- title: Batch size vs Gradient accumulation description: Understanding of batch size and gradient accumulation steps --- Gradient accumulation means accumulating gradients over several mini-batches and updating the model weights afterward. When the samples in each batch are diverse, this technique doesn't significantly impact learning. This method allows for effective training with larger effective batch sizes without needing proportionally larger memory. Here's why: 1. **Memory Consumption with Batch Size**: The primary reason increasing the batch size impacts memory is due to the storage requirements for intermediate activations. When you forward propagate a batch through a network, you have to store the activations at each layer for each sample in the batch, because these activations are used during backpropagation to compute gradients. Therefore, larger batches mean more activations, leading to greater GPU memory consumption. 2. **Gradient Accumulation**: With gradient accumulation, you're effectively simulating a larger batch size by accumulating gradients over several smaller batches (or micro-batches). However, at any given time, you're only forward and backward propagating a micro-batch. This means you only store activations for the micro-batch, not the full accumulated batch. As a result, you can simulate the effect of a larger batch size without the memory cost of storing activations for a large batch. 
**Example 1:** Micro batch size: 3 Gradient accumulation steps: 2 Number of GPUs: 3 Total batch size = 3 * 2 * 3 = 18 ``` | GPU 1 | GPU 2 | GPU 3 | |----------------|----------------|----------------| | S1, S2, S3 | S4, S5, S6 | S7, S8, S9 | | e1, e2, e3 | e4, e5, e6 | e7, e8, e9 | |----------------|----------------|----------------| | → (accumulate) | → (accumulate) | → (accumulate) | |----------------|----------------|----------------| | S10, S11, S12 | S13, S14, S15 | S16, S17, S18 | | e10, e11, e12 | e13, e14, e15 | e16, e17, e18 | |----------------|----------------|----------------| | → (apply) | → (apply) | → (apply) | Accumulated gradient for the weight w1 after the second iteration (considering all GPUs): Total gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6 + e7 + e8 + e9 + e10 + e11 + e12 + e13 + e14 + e15 + e16 + e17 + e18 Weight update for w1: w1_new = w1_old - learning rate x (Total gradient for w1 / 18) ``` **Example 2:** Micro batch size: 2 Gradient accumulation steps: 1 Number of GPUs: 3 Total batch size = 2 * 1 * 3 = 6 ``` | GPU 1 | GPU 2 | GPU 3 | |-----------|-----------|-----------| | S1, S2 | S3, S4 | S5, S6 | | e1, e2 | e3, e4 | e5, e6 | |-----------|-----------|-----------| | → (apply) | → (apply) | → (apply) | Accumulated gradient for the weight w1 (considering all GPUs): Total gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6 Weight update for w1: w1_new = w1_old - learning rate × (Total gradient for w1 / 6) ``` ================================================ FILE: docs/checkpoint_saving.qmd ================================================ --- title: "Checkpoint Saving" format: html: toc: true toc-depth: 2 number-sections: true execute: enabled: false --- ## Overview Axolotl supports on-demand checkpoint saving during training. You can trigger checkpoints via file-based triggers (for programmatic control) or Control+C (for interactive use). 
## File-Based Checkpoint Trigger ### Configuration Enable in your config: ```yaml dynamic_checkpoint: enabled: true check_interval: 100 # Optional: check every N steps (default: 100) trigger_file_path: "axolotl_checkpoint.save" # Optional: custom filename ``` **Options:** - `enabled`: `true` to enable (required) - `check_interval`: Steps between file checks. Default: 100. Lower = faster response, higher I/O overhead. - `trigger_file_path`: Custom trigger filename. Default: `axolotl_checkpoint.save` ### How It Works 1. Rank 0 checks for trigger file every `check_interval` steps in `output_dir` 2. When detected, file is deleted and checkpoint is saved 3. In distributed training, rank 0 broadcasts to synchronize all ranks ### Usage **Command line:** ```bash touch /path/to/output_dir/axolotl_checkpoint.save ``` **Programmatic:** ```python from pathlib import Path Path("/path/to/output_dir/axolotl_checkpoint.save").touch() ``` Checkpoint saves within the next `check_interval` steps. The trigger file is auto-deleted after detection, so you can create it multiple times. **Custom filename:** ```yaml dynamic_checkpoint: enabled: true trigger_file_path: "my_trigger.save" ``` ```bash touch /path/to/output_dir/my_trigger.save ``` ## Control+C (SIGINT) Checkpoint Pressing `Ctrl+C` during training saves the model state and exits gracefully. **Note:** This saves only the model weights, not optimizer state. For resumable checkpoints, use the file-based trigger. 
## Best Practices - **Check interval**: Lower values (10-50) for fast training, default 100 for slower training - **Distributed training**: Create the trigger file once; rank 0 handles synchronization - **Resume**: Dynamic checkpoints can be resumed like regular checkpoints via `resume_from_checkpoint` ## Example ```yaml output_dir: ./outputs/lora-out save_steps: 500 # Scheduled checkpoints dynamic_checkpoint: enabled: true check_interval: 50 ``` This enables scheduled checkpoints every 500 steps plus on-demand saves via file trigger (checked every 50 steps). ================================================ FILE: docs/cli.qmd ================================================ --- title: "Command Line Interface (CLI)" format: html: toc: true toc-expand: 2 toc-depth: 3 execute: enabled: false --- The Axolotl CLI provides a streamlined interface for training and fine-tuning large language models. This guide covers the CLI commands, their usage, and common examples. ## Basic Commands All Axolotl commands follow this general structure: ```bash axolotl <command> [config.yml] [options] ``` The config file can be local or a URL to a raw YAML file. ### Launcher Arguments For commands that support multi-GPU (`train`, `evaluate`, ...), you can pass launcher-specific arguments using the `--` separator: ```bash # Pass torchrun arguments axolotl train config.yml --launcher torchrun -- --nproc_per_node=2 --nnodes=1 # Pass accelerate arguments axolotl train config.yml --launcher accelerate -- --config_file=accelerate_config.yml --num_processes=4 ``` Arguments after `--` are passed directly to the launcher (torchrun, accelerate launch, etc.). ## Command Reference ### fetch Downloads example configurations and deepspeed configs to your local machine.
```bash # Get example YAML files axolotl fetch examples # Get deepspeed config files axolotl fetch deepspeed_configs # Specify custom destination axolotl fetch examples --dest path/to/folder ``` ### preprocess Preprocesses and tokenizes your dataset before training. This is recommended for large datasets. ```bash # Basic preprocessing axolotl preprocess config.yml # Preprocessing with one GPU CUDA_VISIBLE_DEVICES="0" axolotl preprocess config.yml # Debug mode to see processed examples axolotl preprocess config.yml --debug # Debug with limited examples axolotl preprocess config.yml --debug --debug-num-examples 5 ``` Configuration options: ```yaml dataset_prepared_path: Local folder for saving preprocessed data push_dataset_to_hub: HuggingFace repo to push preprocessed data (optional) ``` ### train Trains or fine-tunes a model using the configuration specified in your YAML file. ```bash # Basic training axolotl train config.yml # Train and set/override specific options axolotl train config.yml \ --learning-rate 1e-4 \ --micro-batch-size 2 \ --num-epochs 3 # Training without accelerate axolotl train config.yml --launcher python # Pass launcher-specific arguments using -- separator axolotl train config.yml --launcher torchrun -- --nproc_per_node=2 --nnodes=1 axolotl train config.yml --launcher accelerate -- --config_file=accelerate_config.yml # Resume training from checkpoint axolotl train config.yml --resume-from-checkpoint path/to/checkpoint ``` It is possible to run sweeps over multiple hyperparameters by passing in a sweeps config. 
```bash # Basic training with sweeps axolotl train config.yml --sweep path/to/sweep.yaml ``` Example sweep config: ```yaml _: # This section is for dependent variables we need to fix - load_in_8bit: false load_in_4bit: false adapter: lora - load_in_8bit: true load_in_4bit: false adapter: lora # These are independent variables learning_rate: [0.0003, 0.0006] lora_r: - 16 - 32 lora_alpha: - 16 - 32 - 64 ``` ### inference Runs inference using your trained model in either CLI or Gradio interface mode. ```bash # CLI inference with LoRA axolotl inference config.yml --lora-model-dir="./outputs/lora-out" # CLI inference with full model axolotl inference config.yml --base-model="./completed-model" # Gradio web interface axolotl inference config.yml --gradio \ --lora-model-dir="./outputs/lora-out" # Inference with input from file cat prompt.txt | axolotl inference config.yml \ --base-model="./completed-model" ``` ### merge-lora Merges trained LoRA adapters into the base model. ```bash # Basic merge axolotl merge-lora config.yml # Specify LoRA directory (usually used with checkpoints) axolotl merge-lora config.yml --lora-model-dir="./lora-output/checkpoint-100" # Merge using CPU (if out of GPU memory) CUDA_VISIBLE_DEVICES="" axolotl merge-lora config.yml ``` Configuration options: ```yaml gpu_memory_limit: Limit GPU memory usage lora_on_cpu: Load LoRA weights on CPU ``` ### merge-sharded-fsdp-weights Merges sharded FSDP model checkpoints into a single combined checkpoint. ```bash # Basic merge axolotl merge-sharded-fsdp-weights config.yml ``` ### evaluate Evaluates a model's performance (loss etc) on the train and eval datasets. ```bash # Basic evaluation axolotl evaluate config.yml # Evaluation with launcher arguments axolotl evaluate config.yml --launcher torchrun -- --nproc_per_node=2 ``` ### lm-eval Runs LM Evaluation Harness on your model. 
```bash # Basic evaluation axolotl lm-eval config.yml ``` Configuration options: ```yaml lm_eval_model: # model to evaluate (local or hf path) # List of tasks to evaluate lm_eval_tasks: - arc_challenge - hellaswag lm_eval_batch_size: # Batch size for evaluation output_dir: # Directory to save evaluation results ``` See [LM Eval Harness integration docs](https://docs.axolotl.ai/docs/custom_integrations.html#language-model-evaluation-harness-lm-eval) for full configuration details. ### delinearize-llama4 Delinearizes a Llama 4 linearized model into a regular HuggingFace Llama 4 model. This only works with the non-quantized linearized model. ```bash axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir ``` This would be necessary to use with other frameworks. If you have an adapter, merge it with the non-quantized linearized model before delinearizing. ### quantize Quantizes a model using the quantization configuration specified in your YAML file. ```bash axolotl quantize config.yml ``` See [Quantization](./quantize.qmd) for more details. ## Legacy CLI Usage While the new Click-based CLI is preferred, Axolotl still supports the legacy module-based CLI: ```bash # Preprocess python -m axolotl.cli.preprocess config.yml # Train accelerate launch -m axolotl.cli.train config.yml # Inference accelerate launch -m axolotl.cli.inference config.yml \ --lora_model_dir="./outputs/lora-out" # Gradio interface accelerate launch -m axolotl.cli.inference config.yml \ --lora_model_dir="./outputs/lora-out" --gradio ``` ::: {.callout-important} When overriding CLI parameters in the legacy CLI, use same notation as in yaml file (e.g., `--lora_model_dir`). **Note:** This differs from the new Click-based CLI, which uses dash notation (e.g., `--lora-model-dir`). Keep this in mind if you're referencing newer documentation or switching between CLI versions. 
::: ## Remote Compute with Modal Cloud Axolotl supports running training and inference workloads on Modal cloud infrastructure. This is configured using a cloud YAML file alongside your regular Axolotl config. ### Cloud Configuration Create a cloud config YAML with your Modal settings: ```yaml # cloud_config.yml provider: modal gpu: a100 # Supported: l40s, a100-40gb, a100-80gb, a10g, h100, t4, l4 gpu_count: 1 # Number of GPUs to use timeout: 86400 # Maximum runtime in seconds (24 hours) branch: main # Git branch to use (optional) volumes: # Persistent storage volumes - name: axolotl-cache mount: /workspace/cache - name: axolotl-data mount: /workspace/data - name: axolotl-artifacts mount: /workspace/artifacts secrets: # Secrets to inject - WANDB_API_KEY - HF_TOKEN ``` ### Running on Modal Cloud Commands that support the --cloud flag: ```bash # Preprocess on cloud axolotl preprocess config.yml --cloud cloud_config.yml # Train on cloud axolotl train config.yml --cloud cloud_config.yml # Run lm-eval on cloud axolotl lm-eval config.yml --cloud cloud_config.yml ``` ### Cloud Configuration Options ```yaml provider: # compute provider, currently only `modal` is supported gpu: # GPU type to use gpu_count: # Number of GPUs (default: 1) memory: # RAM in GB (default: 128) timeout: # Maximum runtime in seconds timeout_preprocess: # Preprocessing timeout branch: # Git branch to use docker_tag: # Custom Docker image tag volumes: # List of persistent storage volumes # Environment variables to pass. Can be specified in two ways: # 1. As a string: Will load the value from the host computer's environment variables # 2. As a key-value pair: Will use the specified value directly # Example: # env: # - CUSTOM_VAR # Loads from host's $CUSTOM_VAR # - {CUSTOM_VAR: "value"} # Uses "value" directly env: # Secrets to inject. Same input format as `env` but for sensitive data. 
secrets: # - HF_TOKEN # - WANDB_API_KEY ``` ================================================ FILE: docs/custom_integrations.qmd ================================================ --- title: Custom Integrations toc: true toc-depth: 3 --- ```{python} #| echo: false import os import re def process_readme(integration_name): try: path = f'../src/axolotl/integrations/{integration_name}/README.md' with open(path, 'r') as f: txt = f.read() # Remove h1 headings txt = re.sub(r'^# .*\n?', '', txt, flags=re.MULTILINE) # Convert h2 to h3 txt = re.sub(r'^## ', '### ', txt, flags=re.MULTILINE) return txt except FileNotFoundError: return None def print_section(name, folder_name): output = f"\n## {name}\n" content = process_readme(folder_name) if content: output += content output += f"\nPlease see reference [here](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations/{folder_name})\n" return output ``` ```{python} #| output: asis #| echo: false # Introduction text print(""" Axolotl adds custom features through `integrations`. They are located within the `src/axolotl/integrations` directory. To enable them, please check the respective documentations. 
""") # Sections sections = [ ("Cut Cross Entropy", "cut_cross_entropy"), ("Grokfast", "grokfast"), ("Knowledge Distillation (KD)", "kd"), ("Liger Kernels", "liger"), ("Language Model Evaluation Harness (LM Eval)", "lm_eval"), ("Spectrum", "spectrum"), ("LLMCompressor", "llm_compressor") ] for folder_name in os.listdir("../src/axolotl/integrations/"): if folder_name in [path for name, path in sections]: # skip if already in sections continue if os.path.exists(f"../src/axolotl/integrations/{folder_name}/README.md"): # grab the first heading in README.md as the section name with open(f"../src/axolotl/integrations/{folder_name}/README.md", "r") as f: txt = f.read() matches = re.search(r'^# (.*)\n?', txt, flags=re.MULTILINE) if matches: name = matches.group(1) else: continue sections.append((name, folder_name)) # sort sections by name sections = sorted(sections, key=lambda x: x[0]) for section_name, folder_name in sections: print(print_section(section_name, folder_name)) ``` ## Adding a new integration Plugins can be used to customize the behavior of the training pipeline through [hooks](https://en.wikipedia.org/wiki/Hooking). See [`axolotl.integrations.BasePlugin`](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/integrations/base.py) for the possible hooks. To add a new integration, please follow these steps: 1. Create a new folder in the `src/axolotl/integrations` directory. 2. Add any relevant files (`LICENSE`, `README.md`, `ACKNOWLEDGEMENTS.md`, etc.) to the new folder. 3. Add `__init__.py` and `args.py` files to the new folder. - `__init__.py` should import the integration and hook into the appropriate functions. - `args.py` should define the arguments for the integration. 4. (If applicable) Add CPU tests under `tests/integrations` or GPU tests under `tests/e2e/integrations`. 
::: {.callout-tip} See [src/axolotl/integrations/cut_cross_entropy](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations/cut_cross_entropy) for a minimal integration example. ::: ::: {.callout-warning} If you could not load your integration, please ensure you are pip installing in editable mode. ```bash pip install -e . ``` and correctly spelled the integration name in the config file. ```yaml plugins: - axolotl.integrations.your_integration_name.YourIntegrationPlugin ``` ::: ::: {.callout-note} It is not necessary to place your integration in the `integrations` folder. It can be in any location, so long as it's installed in a package in your python env. See this repo for an example: [https://github.com/axolotl-ai-cloud/diff-transformer](https://github.com/axolotl-ai-cloud/diff-transformer) ::: ================================================ FILE: docs/dataset-formats/conversation.qmd ================================================ --- title: Conversation description: Conversation format for supervised fine-tuning. order: 3 --- ## chat_template Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. Support using tokenizer's template, a supported template, or custom jinja2. ```{.json filename="data.jsonl"} {"messages": [{"role": "...", "content": "..."}, {"role": "...", "content": "..."}, ...]} ``` See [configs](../config-reference.qmd) for full configs and supported templates. ### Migrating from sharegpt Most configs can be adapted as follows: ```yaml # old chat_template: chatml datasets: - path: ... type: sharegpt conversation: chatml # new (if using tokenizer's chat_template) datasets: - path: ... type: chat_template field_messages: conversations message_property_mappings: role: from content: value # new (if setting a new chat_template like chatml, gemma, etc) chat_template: chatml datasets: - path: ... 
type: chat_template field_messages: conversations message_property_mappings: role: from content: value ``` We recommend checking the below examples for other usecases. ### Examples #### Training on last message (Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message. ```yaml datasets: - path: ... type: chat_template roles_to_train: train_on_eos: ``` ::: {.callout-tip} If you receive an error like "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null.", it means the tokenizer does not have a default `chat_template`. Follow the examples below instead to set a custom `chat_template`. ::: #### Overriding default chat template Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages. ```yaml chat_template: gemma # this overwrites the tokenizer's chat_template datasets: - path: ... type: chat_template roles_to_train: ["assistant"] # default value ``` ::: {.callout-note} If you want to use built-in chat_template, use `chat_template: tokenizer_default` (this is set by default). ::: #### Using default chat template with fallback Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages. ```yaml chat_template: tokenizer_default_fallback_chatml # this overwrites the tokenizer's chat_template datasets: - path: ... type: chat_template ``` #### Custom Jinja template Using a custom jinja template on OpenAI messages format, training on all assistant messages. 
```yaml # chat_template: jinja # `jinja` will be implied if the `chat_template_jinja` is set and this field is empty chat_template_jinja: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" datasets: - path: ... type: chat_template ``` ::: {.callout-important} Please make sure that your `tokenizer.eos_token` is same as EOS (End-of-Sequence) token in template. Otherwise, set `eos_token` under `special_tokens: `. ::: #### Using template with different token for EOT and EOS - If you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the `eot_tokens: ` config. The handling of EOT tokens follows `train_on_eos: ` which defaults to turn. ```yaml eot_tokens: - "[/INST]" # - "[/SYSTEM_PROMPT]" datasets: - path: ... type: chat_template # optional train_on_eot: turn # defaults read from train_on_eos (which defaults to turn) ``` ::: {.callout-tip} See [config documentation](../config-reference.qmd) for detailed explanations of "turn", "last", and "all" options for training on tokens. ::: ::: {.callout-note} Using `eot_tokens` requires each token that exists in `chat_template` to be a single token in the tokenizer. Otherwise, the tokenizer will split the token and cause unexpected behavior. You can add those tokens as new tokens under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `. See [config](../config-reference.qmd) for more details. ::: - Continuing from the previous example, if you want to train on all EOT token trainable turns but only last EOS token, set `train_on_eos: last`. ```yaml eot_tokens: - "[/INST]" # ... 
datasets: - path: ... type: chat_template train_on_eos: last train_on_eot: turn ``` ::: {.callout-tip} If EOS token only appears at the end of a prompt, `train_on_eos: last` is equivalent to `train_on_eos: turn`. Therefore, generally, you can leave them to their defaults and omit them. ::: #### Using tool use Instead of passing `tools` via the system prompt, an alternative method would be to have the `tools` in a separate column and loaded via `chat_template` to let the template dynamically build it. ```json { "tools": [ { "type": "...", "function": { "name": "...", "description": "...", "parameters": { "type": "...", "properties": { // ... }, "required": ["..."], }, }, }, ], "messages": [ // ... { "role": "assistant", // call the function via assistant "tool_calls": [ { "id": "...", // required only for mistral "type": "function", "function": { "name": "...", "arguments": { "...": "...", } } } ] }, { "role": "tool", "tool_call_id": "...", // required only for mistral "name": "...", "content": "..." }, ], } ``` ::: {.callout-note} Tools need to follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step). ::: ::: {.callout-warning} If you have tool arguments with same name but different dtypes (like `"time": string` and `"time": number`), please save `arguments: ` as JSON string to prevent `datasets` from having casting issues. ``` "arguments": "{\"...\": \"...\"}" ``` The same is applicable for tool parameters. ``` "parameters": "{\"...\": \"...\"}" ``` ::: Example config for Llama4: ```yaml chat_template: llama4 datasets: - path: Nanobit/text-tools-2k-test type: chat_template # field_tools: tools # default is `tools` ``` ::: {.callout-tip} Look into the `chat_template` you are using to see if it supports `tools` and what the expected role is for the tool answer. In the example above, the tool answer is expected to be in the `tool` or `ipython` role for `llama4` template. 
::: #### Using fine-grained control over token masking (Advanced) Using fine-grained control over tokens and turns to train in a conversation. For a data sample that looks like: ```{.json filename="data.jsonl"} { "conversations": [ {"from": "system", "value": "You are an AI assistant.", "train": false}, {"from": "human", "value": "Hello", "train": false}, {"from": "assistant", "value": "Hello", "train": true}, {"from": "human", "value": "How are you?", "train": true}, { "from": "assistant", "value": "I'm doing very well, thank you!", "train_detail": [ {"begin_offset": 0, "end_offset": 8, "train": false}, {"begin_offset": 9, "end_offset": 18, "train": true}, {"begin_offset": 19, "end_offset": 30, "train": false}, ], }, { "from": "human", "value": "I'm doing very well, thank you!", "train": true, }, {"from": "assistant", "value": "Hi there!", "train": true} ] } ``` The configuration would look like: ```yaml datasets: - path: ... type: chat_template chat_template: tokenizer_default field_messages: conversations message_property_mappings: role: from content: value roles_to_train: [] train_on_eos: turn message_field_training: train message_field_training_detail: train_detail ``` ::: {.callout-tip} It is not necessary to set both `message_field_training` and `message_field_training_detail` at once. ::: #### Reasoning split (For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template. ```yaml datasets: - path: ... type: chat_template chat_template: qwen3 split_thinking: true ``` For example, a content can look like: ```json { "content": "<think>Some thinking outputs</think>Output after thinking." } ``` After split, it will look like: ```json { "reasoning_content": "Some thinking outputs", "content": "Output after thinking..." } ``` ## sharegpt ::: {.callout-important} ShareGPT is deprecated! Please see the [chat_template](#chat_template) section. 
::: ## pygmalion ```{.json filename="data.jsonl"} {"conversations": [{"role": "...", "value": "..."}]} ``` ================================================ FILE: docs/dataset-formats/index.qmd ================================================ --- title: Dataset Formats description: Guide to Dataset Formats in Axolotl back-to-top-navigation: true toc: true toc-depth: 5 --- Axolotl is a training framework that aims to make the process convenient yet flexible to users by simply passing a config yaml file. As there are a lot of available options in Axolotl, this guide aims to simplify the process of choosing the right ones. Axolotl supports 3 kinds of training methods: pre-training, supervised fine-tuning, and preference-based post-training (e.g. DPO, ORPO, PRMs). Each method has its own dataset format, which is described below. ::: {.callout-tip} This guide will mainly use JSONL as an introduction. Please refer to the [dataset loading docs](../dataset_loading.qmd) to understand how to load datasets from other sources. For `pretraining_dataset:` specifically, please refer to the [Pre-training section](#pre-training). ::: ## Pre-training When aiming to train on large corpora of text datasets, pre-training is your go-to choice. Due to the size of these datasets, downloading the entire datasets before beginning training would be prohibitively time-consuming. Axolotl supports [streaming](https://huggingface.co/docs/datasets/en/stream) to only load batches into memory at a time. A sample format for a pre-training dataset is as follows: ```json {"text": "first row"} {"text": "second row"} ... ``` It is typically recommended to save your dataset as `.jsonl` due to its flexibility and simplicity. Axolotl supports loading from a Hugging Face hub repo or from local files. 
### Pre-training from Hugging Face hub datasets As an example, to train using a Hugging Face dataset `hf_org/name`, you can pass the following config: ```yaml pretraining_dataset: hf_org/name ``` ### Pre-training from local dataset files Given a few corpus files: `A.jsonl`, `B.jsonl`, and `C.jsonl`, your config will look like the below: ```yaml pretraining_dataset: - path: json data_files: - A.jsonl - B.jsonl - C.jsonl ``` While we recommend `.jsonl`, you can also use the other formats (`csv`, `parquet`, `arrow`, `SQL`, `Webdataset`) that are supported by [`Dataset.load_dataset`](https://huggingface.co/docs/datasets/loading#local-and-remote-files) ### Pre-training without streaming In the case that the dataset is small and can be loaded entirely into memory, another approach to running pre-training is to use the `completion` format. This would mean that the entire dataset is pre-tokenized instead of on-demand in streaming. One benefit of this is that the tokenization can be performed separately on a CPU-only machine, and then transferred to a GPU machine for training to save costs. From Hugging Face: ```yaml datasets: - path: hf_org/name type: completion ``` From local files: ```yaml datasets: - path: A.jsonl type: completion - path: B.jsonl type: completion ``` ::: {.callout-important} For `completion` only, Axolotl would split texts if it exceeds the context length into multiple smaller prompts. If you are interested in having this for `pretraining_dataset` too, please let us know or help make a PR! ::: ### Pre-training dataset configuration tips #### Setting max_steps When using streaming for large datasets, Axolotl does not know in advance how large the dataset is and does not know when to stop. Therefore, it is necessary to set `max_steps: int` in your config for pre-training to run, so that Axolotl knows when to stop training. One step is equal to `sequence_len * micro_batch_size * gradient_accumulation_steps * total_num_gpus` tokens. 
#### Group_by_length It is recommended to leave this off if downloading from Hugging Face hub as it would download the entire dataset which can be very large. ### Reference Please see docs [here](pretraining.qmd). ## Supervised fine-tuning (SFT) Supervised fine-tuning is the process of training models to respond to an instruction or chat input. As there are a wide variety of dataset formats, Axolotl tries to support a majority of the formats available in public datasets. Axolotl provides four approaches for loading datasets, however, it's easier to work backwards from the dataset you have available to figure out which approach to use. A flow chart is as follows: 1. Do you already have the dataset tokenized? If yes, check [Pre-Tokenized Dataset](#pre-tokenized-dataset). 2. Do you want to format the dataset yourself and manually choose each section to mask? If yes, check [Template Free Dataset](#template-free-dataset) 3. Is your dataset in a "conversation" format, containing a `list[messages]`? If yes, check [Conversation Dataset](#conversation-dataset) 4. Is your dataset in an "instruct" format, containing `{ instruction, response }`? If yes, check [Instruction Dataset](#instruction-dataset) If you went through the flow chart and did not find one that matches, it is recommended to preprocess your dataset into one of the above or create a thread on Github Discussion. ::: {.callout-tip} You can mix and match within each approach or across approaches to train a model on a variety of datasets. ::: ### Pre-Tokenized Dataset We suggest this approach when you want to bring your own tokenized dataset. Axolotl expects the dataset to have three keys: - `input_ids`: from tokenizing formatted prompt - `attention_mask`: for masking padding. If you don't add padding, it would be equal to `len(input_ids) * [1]` - `labels`: this is the same as `input_ids`, however, if you want to mask certain tokens, you would set those indices to `-100`. 
::: {.callout-tip} Make sure to add BOS/EOS tokens to your prompt and mask them appropriately. ::: A config for this would look like: ```yaml datasets: - path: A.jsonl type: ``` ::: {.callout-note} `type: ` is empty! ::: Reference: [Pre-Tokenized Dataset Documentation](tokenized.qmd). ### Template Free Dataset We recommend this approach when you want granular control over the prompt formatting, special tokens, and masking, whilst letting Axolotl handle the tokenization. This is very useful if your dataset has unique prompts that differ across samples and where one single general template wouldn't suffice. In the example below, you can see that there is no proper structure. At the same time, it's very flexible as there are no constraints on how your prompt can look. ```json { "segments": [ { "label": true, "text": "Hello\n" }, { "label": true, "text": "hi there!. " }, { "label": false, "text": "goodbye " }, { "label": true, "text": "farewell" } ] } ``` Each prompt must have a key called `segments` which is a list of `{ text, label }`. ```yaml datasets: - path: A.jsonl type: input_output ``` Reference: [Template Free Documentation](template_free.qmd). ### Conversation Dataset `conversation` messages are a list of messages which usually contain a `role` and `content` key. ::: {.callout-tip} Fun fact: Axolotl synonymously refers to "chat" messages as `conversation` messages due to how FastChat initially used this term to build a widely used [fastchat conversation](https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py) method for formatting chat messages prior to the creation of `chat_templates`. ::: #### What are `chat_templates`? The current most popular and convenient method for inference is to use `chat_templates` for formatting prompts. Axolotl supports using `chat_templates` for training to ensure that the model performs in the same environment as in inference. 
Here's a quick rundown on `chat_template`: A `chat_template` is a Jinja2 template which formats a list of messages into a prompt. An example of a prompt formatted into a popular template called ChatML can be seen below: Single prompt (pretty-printed): ```json { "messages": [ { "role": "user", "content": "Hi" }, { "role": "assistant", "content": "How can I help you?" }, { "role": "user", "content": "Can you add 3+5?" }, { "role": "assistant", "content": "The answer is 8." } ] } ``` The ChatML template is as follows: ```jinja2 {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %} ``` The above prompt formatted into this template will result in: ``` <|im_start|>user Hi<|im_end|> <|im_start|>assistant How can I help you?<|im_end|> <|im_start|>user Can you add 3+5?<|im_end|> <|im_start|>assistant The answer is 8.<|im_end|> ``` By using delimiters (`<|im_start|>` and `<|im_end|>`), a prompt separates different speakers which helps the model identify which portion belongs to whom. #### Common Conversation Dataset formats Older conversation datasets with the following format are colloquially called `sharegpt` datasets. ```json {"conversations": [{"from": "...", "value": "..."}]} ``` Newer conversation datasets usually follow the OpenAI format. ```json {"messages": [{"role": "...", "content": "..."}]} ``` Axolotl supports both as well as allowing customization of any kind of key. #### Chat Template Usage To properly use this method, it is important to identify three things: 1. Which `chat_template` would you use? 2. What are the keys in your dataset, and what are the possible roles? 
For example, in OpenAI format, the keys would be `messages`, `role`, and `content`, respectively, whereas the possible roles are `system`, `user`, and `assistant`. 3. What do you want to mask? For instance, only assistant messages, only last message, or nothing. ##### Choosing a `chat_template` There are a lot of `chat_templates` out there. Axolotl supports the common ones: [supported chat templates](https://github.com/axolotl-ai-cloud/axolotl/blob/860609392184cf62a7e0ca676658b170e059ce6c/src/axolotl/utils/chat_templates.py#L17). For example, to use ChatML, it would be `chat_template: chatml`. However, it is also possible to use the already configured template within the tokenizer by specifying `chat_template: tokenizer_default`. If you want a fallback (in case some tokenizer does not have it pre-configured), you can do `chat_template: tokenizer_default_fallback_chatml` to fallback to the ChatML template if a tokenizer template was not found. One last but powerful approach is to bring your own template. This can be set via: ```yaml chat_template_jinja: # your template ``` ##### Setting `chat_template` dataset keys We currently default to OpenAI format for dataset keys, so if that's your current dataset format, there's nothing to do here. If your dataset format is different, here are the keys you should check (with their defaults): ```yaml datasets: ... field_messages: messages # this should point to the key containing the list of conversations message_property_mappings: # this is a mapping from keys in your dataset to keys in chat_template role: role content: content ``` In some `chat_templates` (e.g. [Gemma](https://huggingface.co/google/gemma-2b-it/blob/main/tokenizer_config.json#L1507)), the roles are hardcoded to `user` and `assistant`. Consequently, you may find it necessary to map the roles in your dataset to these above. 
We currently have some defaults that should work for common datasets, but if you get a `KeyError`, it would be necessary to add a mapping for your roles. Here is an example of what it would look like: ```yaml datasets: ... roles: assistant: - gpt - model user: - human ``` In the example above, all `gpt` and `model` values are converted to `assistant`. All `human` values are converted to `user`. ##### Handling masking The common use case for `chat_template` is chat messages; therefore, it is common to mask all non-assistant messages. Assistant messages refer to the bot messages that you want the model to learn on. To train on all `assistant` messages, you would set the following configs. ```yaml datasets: ... roles_to_train: ["assistant"] train_on_eos: "turn" ``` The `train_on_eos` config means that it would mask all EOS tokens for turns that aren't assistant-turns. The other options are: `all` and `last` to choose which EOS to train on. If you want to train on both the `assistant` and `narrator` roles, you can simply add `narrator` to the list of `roles_to_train`. You would also need to add it to the mapping of `roles` above. ```yaml datasets: ... roles_to_train: ["assistant", "narrator"] roles: assistant: - gpt - model user: - human narrator: ["narrator"] ``` ::: {.callout-tip} As chat_templates may use hardcoded EOS/EOT tokens that are different from the tokenizer's EOS, it is highly recommended to set them. For example, `ChatML` uses `<|im_end|>` to end turns. ```yaml special_tokens: eos_token: <|im_end|> ``` ::: ##### Applying `chat_template` Once all the above steps are completed, you could combine all these configs together to form a bespoke configuration for your custom dataset. 
```yaml datasets: - path: A.jsonl type: chat_template # step 1 chat_template: chatml # step 2 field_messages: messages message_property_mappings: role: role content: content roles: assistant: - gpt - model - assistant user: - human - user # step 3 roles_to_train: ["assistant"] train_on_eos: "turn" special_tokens: eos_token: <|im_end|> ``` If this config were to be applied to the sample dataset above, the output would look as such (which can be retrieved via `axolotl preprocess config.yaml --debug`): ``` <|im_start|>(-100, 128256) user(-100, 882) (-100, 198) Hi(-100, 13347) <|im_end|>(-100, 128257) (-100, 198) <|im_start|>(-100, 128256) assistant(-100, 78191) (-100, 198) How(4438, 4438) can(649, 649) I(358, 358) help(1520, 1520) you(499, 499) ?(30, 30) <|im_end|>(128257, 128257) (-100, 198) <|im_start|>(-100, 128256) user(-100, 882) (-100, 198) Can(-100, 6854) you(-100, 499) add(-100, 923) (-100, 220) 3(-100, 18) +(-100, 10) 5(-100, 20) ?(-100, 30) <|im_end|>(-100, 128257) (-100, 198) <|im_start|>(-100, 128256) assistant(-100, 78191) (-100, 198) The(791, 791) answer(4320, 4320) is(374, 374) (220, 220) 8(23, 23) .(13, 13) <|im_end|>(128257, 128257) (-100, 198) ``` The first number refers to the label, the second refers to the `token_id`. For example, `-100` labels appear on non-assistant portions, meaning that they are masked during training. For assistant portions, the label is the same as the `token_id`. ::: {.callout-note} If during `preprocess`, there are a lot of warnings of `Could not find content __ boundary`, please check the FAQ section for [chat_templates](../faq.qmd#chat-templates). ::: #### Reference Please see docs [here](conversation.qmd). ### Instruction Dataset Instruction datasets are used to train instruction-following models and comprise a prompt, containing an instruction, and a single response. In contrast to chat datasets which may be multi-turn, instruct datasets are typically single-turn. 
An example of a common format is Alpaca: ```json {"instruction": "...", "input": "...", "output": "..."} ``` Using those keys, a prompt can be built as follows. ``` Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. ### Instruction: {instruction} ### Input: {input} ### Response: {output} ``` This can be configured as such: ```yaml datasets: - path: A.jsonl type: alpaca ``` Axolotl supports many kinds of instruction datasets. All of them can be found in the [Instruction Dataset Documentation](inst_tune.qmd) with their respective type and sample row format. #### Custom Instruct Prompt Format Due to the myriad possibilities of instruction formats, Axolotl allows customizing your own instruction format without having to dive into the code directly. In the example below, a sample row is used to output in `mistral_v1` format. ```json {"input": "...", "output": "..."} ``` ```yaml datasets: - path: repo type: system_prompt: "" field_system: field_instruction: input field_input: field_output: output # multi-line example with input format: |- [INST] {instruction} {input} [/INST] # single-line example without input no_input_format: "[INST] {instruction} [/INST]" ``` The config sets that the `field_instruction` is actually named `input`, and the `field_input` is empty as we don't have an `input` in this sample. Generally, `instruction` can be thought of as the question to the model, and `input` as the additional information with `output` being the response. It is not necessary to have an `input` or a `system`. In the end, the most important part is to understand what format you want it to look like and how you can customize this to your use case. Reference: [Custom Instruct Prompt Format Documentation](inst_tune.qmd#how-to-add-custom-prompt-format). 
## Reinforcement Learning from Human Feedback (RLHF) There are multiple RLHF methods, each with its own dataset requirements. Please see the [RLHF documentation](../rlhf.qmd) for more detail. ================================================ FILE: docs/dataset-formats/inst_tune.qmd ================================================ --- title: Instruction Tuning description: Instruction tuning formats for supervised fine-tuning. order: 2 --- ## alpaca instruction; input(optional) ```{.json filename="data.jsonl"} {"instruction": "...", "input": "...", "output": "..."} ``` ## jeopardy question and answer ```{.json filename="data.jsonl"} {"question": "...", "category": "...", "answer": "..."} ``` ## oasst instruction ```{.json filename="data.jsonl"} {"INSTRUCTION": "...", "RESPONSE": "..."} ``` ## gpteacher instruction; input(optional) ```{.json filename="data.jsonl"} {"instruction": "...", "input": "...", "response": "..."} ``` ## reflection instruction with reflect; input(optional) ```{.json filename="data.jsonl"} {"instruction": "...", "input": "...", "output": "...", "reflection": "...", "corrected": "..."} ``` ## explainchoice question, choices, (solution OR explanation) ```{.json filename="data.jsonl"} {"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."} ``` ## concisechoice question, choices, (solution OR explanation) ```{.json filename="data.jsonl"} {"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."} ``` ## summarizetldr article and summary ```{.json filename="data.jsonl"} {"article": "...", "summary": "..."} ``` ## alpaca_chat basic instruct for alpaca chat ```{.json filename="data.jsonl"} {"instruction": "...", "input": "...", "response": "..."} ``` ## alpaca_chat.load_qa question and answer for alpaca chat ```{.json filename="data.jsonl"} {"question": "...", "answer": "..."} ``` ## alpaca_chat.load_concise question and answer for alpaca chat, for concise answers ```{.json filename="data.jsonl"} {"instruction":
"...", "input": "...", "response": "..."} ``` ## alpaca_chat.load_camel_ai question and answer for alpaca chat, for load_camel_ai ```{.json filename="data.jsonl"} {"message_1": "...", "message_2": "..."} ``` ## alpaca_w_system.load_open_orca support for open orca datasets with included system prompts, instruct ```{.json filename="data.jsonl"} {"system_prompt": "...", "question": "...", "response": "..."} ``` ## context_qa in context question answering from an article ```{.json filename="data.jsonl"} {"article": "...", "question": "...", "answer": "..."} ``` ## context_qa.load_v2 in context question answering (alternate) ```{.json filename="data.jsonl"} {"context": "...", "question": "...", "answer": "..."} ``` ## context_qa.load_404 in context question answering from an article, with default response for no answer from context ```{.json filename="data.jsonl"} {"article": "...", "unanswerable_question": "..."} ``` ## creative_acr.load_answer instruction and revision ```{.json filename="data.jsonl"} {"instruction": "...", "revision": "..."} ``` ## creative_acr.load_critique critique ```{.json filename="data.jsonl"} {"scores": "...", "critiques": "...", "instruction": "...", "answer": "..."} ``` ## creative_acr.load_revise critique and revise ```{.json filename="data.jsonl"} {"scores": "...", "critiques": "...", "instruction": "...", "answer": "...", "revision": "..."} ``` ## metharme instruction, adds additional eos tokens ```{.json filename="data.jsonl"} {"prompt": "...", "generation": "..."} ``` ## How to add custom prompt format For a dataset that is preprocessed for instruction purposes: ```{.json filename="data.jsonl"} {"input": "...", "output": "..."} ``` You can use this example in your YAML config: ```{.yaml filename="config.yaml"} datasets: - path: repo type: system_prompt: "" field_system: system field_instruction: input field_output: output format: "[INST] {instruction} [/INST]" no_input_format: "[INST] {instruction} [/INST]" ``` See full config options 
under [here](../config-reference.qmd). ================================================ FILE: docs/dataset-formats/pretraining.qmd ================================================ --- title: Pre-training description: Data format for a pre-training completion task. order: 1 --- For pretraining, there is no prompt template or roles. The only required field is `text`: ```{.json filename="data.jsonl"} {"text": "first row"} {"text": "second row"} ... ``` :::{.callout-note} ### Streaming is recommended for large datasets Axolotl usually loads the entire dataset into memory. This will be challenging for large datasets. Use the following config to enable streaming: ```{.yaml filename="config.yaml"} pretraining_dataset: - name: path: split: text_column: # column in dataset with the data, usually `text` type: pretrain trust_remote_code: skip: # number of rows of data to skip over from the beginning ``` ::: ================================================ FILE: docs/dataset-formats/stepwise_supervised.qmd ================================================ --- title: Stepwise Supervised Format description: Format for datasets with stepwise completions and labels order: 3 --- ## Stepwise Supervised The stepwise supervised format is designed for chain-of-thought (COT) reasoning datasets where each example contains multiple completion steps and a preference label for each step. ### Example Here's a simple example of a stepwise supervised dataset entry: ```json { "prompt": "Which number is larger, 9.8 or 9.11?", "completions": [ "The fractional part of 9.8 is 0.8, while the fractional part of 9.11 is 0.11.", "Since 0.11 is greater than 0.8, the number 9.11 is larger than 9.8." ], "labels": [true, false] } ``` ================================================ FILE: docs/dataset-formats/template_free.qmd ================================================ --- title: Template-Free description: Construct prompts without a template. 
toc: true toc-depth: 3 order: 4 --- ## Background {#sec-background} ### Masking Inputs {#masking-inputs} One of the most popular features of [axolotl](https://github.com/axolotl-ai-cloud/axolotl) is setting the following configuration value: ```yaml train_on_inputs: false ``` If you declare a [dataset formats](https://github.com/axolotl-ai-cloud/axolotl?tab=readme-ov-file#dataset) such as `alpaca` or `chatml`, axolotl knows what is an input (i.e. human) vs. an output (i.e. the assistant) and masks the input labels so that your model can focus on predicting the outputs only. ### You may not want prompt templates {#sec-you-may-not-want-prompt-templates} However, there are many situations where you don't want to use one of these formats or templates. This is because they can: - Add unnecessary boilerplate to your prompts. - Create artifacts like special delimiters `<|im_start|>` that can quickly become footguns if you don't include them correctly at inference time. - Enforce a *chat* interface when you do not want one. Sometimes you just want to fine-tune a model to a very specific task and do NOT want multi-turn conversations, roles, etc. - Limit you to only certain roles that the template allows. ### The `input_output` format {#sec-the-inputoutput-format} You can construct your prompts without a template by using the `input_output` format, by setting `type: input_output` in your configuration file like this: **config.yml** ```yaml train_on_inputs: false # Mask segments of your data datasets: - path: output.jsonl type: input_output # use template free prompt construction ``` Unlike `type: completion`, which is also template-free, `type: input_output` allows you to mask segments of your text. More details on how this works are described below. ## Usage {#sec-usage} This is how you can use the `input_output` format: ### 1. 
Prepare Data {#sec-1-prepare-data} To use the `input_output` format, collect your data in the following format into a jsonl file (below is the first row from the file `output.jsonl` pretty printed): ```bash $ head -n1 output.jsonl | python -m json.tool ``` :::{.cell-output .cell-output-stdout} { "segments": [ { "label": true, "text": "<s>Hello\n" }, { "label": true, "text": "hi there!. " }, { "label": false, "text": "goodbye " }, { "label": true, "text": "farewell</s>" } ] } ::: Set `label:false` when you want to mask a segment of text so that the model isn't trained on it. Some things to keep in mind: > [!IMPORTANT] > 1. **EOS, BOS, spaces, newlines etc. are entirely up to you. Axolotl concatenates all the segments as-is.** The tokenizer doesn't add anything additional. Notice how I added spaces, newlines, `<s>` (BOS), and `</s>` (EOS) myself. > 2. Make sure you check the materialized output to validate that the prompt is getting assembled how you like. ### 2. Use `type: input_output` {#sec-2-use-type-inputoutput} Let's materialize data with our `output.jsonl` file by setting `type: input_output` in our axolotl config: ```yaml # training_config.yaml base_model: mistralai/Mistral-7B-v0.1 data_seed: 49 seed: 49 datasets: - path: output.jsonl type: input_output val_set_size: 0.1 sequence_len: 896 sample_packing: false micro_batch_size: 2 gradient_accumulation_steps: 3 eval_batch_size: 2 num_epochs: 1 learning_rate: 0.0002 train_on_inputs: false special_tokens: bos_token: "<s>" eos_token: "</s>" unk_token: "<unk>" ``` You can use the following command to materialize your data. The `--debug` flag will print the tokens, along with the labels so you can verify that the correct items are being ignored: ```bash axolotl preprocess training_config.yaml --debug ...
[2024-03-05 23:36:46,969] [INFO] [axolotl.check_example_labels:35] [PID:607731] [RANK:0] <s>(1, 1) Hello(22557, 22557) (13, 13) hi(12014, 12014) there(736, 736) !(28808, 28808) .(28723, 28723) (28705, 28705) good(-100, 1179) bye(-100, 17664) (-100, 28705) fare(19111, 19111) well(5458, 5458) </s>(2, 2) ``` The format is `decoded_token`(`label`, `token_id`), for example, `<s>(1, 1)` means that the token is `<s>`, the label is `1` and the token_id is `1`. When the label is `-100` then that token is ignored for training. ### 3. Check the prompts {#sec-3-check-the-prompts} Here is another way to check the materialized output: ```python from transformers import AutoTokenizer from datasets import load_from_disk import yaml directory = !ls last_run_prepared/ with open('training_config.yaml', 'r') as f: cfg = yaml.safe_load(f) model_id = cfg['base_model'] tok = AutoTokenizer.from_pretrained(model_id) ds = load_from_disk(f'last_run_prepared/{directory[0]}/') ``` ```python >>> row = ds[0] >>> print(tok.decode(row['input_ids'])) <s> Hello hi there!. goodbye farewell</s> ``` We can check that the right tokens are ignored by comparing the labels to each token: ```python import pandas as pd pd.DataFrame([{'token': tok.decode(i), 'label': l, 'id':i} for i,l in zip(row['input_ids'], row['labels'])]) ``` | index | token | label | |-------|-------|-------| | 0 | \<s\> | 1 | | 1 | Hello | 22557 | | 2 | \\n | 13 | | 3 | hi | 12014 | | 4 | there | 736 | | 5 | ! | 28808 | | 6 | . | 28723 | | 7 | | 28705 | | 8 | good | -100 | | 9 | bye | -100 | | 10 | | -100 | | 11 | fare | 19111 | | 12 | well | 5458 | | 13 | \</s\> | 2 | If we look at the input data, the above table seems correct! (The jsonl version is repeated below for reference): ```bash $ head -n1 output.jsonl | python -m json.tool ``` :::{.cell-output .cell-output-stdout} { "segments": [ { "label": true, "text": "<s>Hello\n" }, { "label": true, "text": "hi there!.
" }, { "label": false, "text": "goodbye " }, { "label": true, "text": "farewell</s>" } ] } ::: ================================================ FILE: docs/dataset-formats/tokenized.qmd ================================================ --- title: Custom Pre-Tokenized Dataset description: How to use a custom pre-tokenized dataset. order: 5 --- - Pass an empty `type:` in your axolotl config. - Columns in Dataset must be exactly `input_ids`, `attention_mask`, `labels` - To indicate that a token should be ignored during training, set its corresponding label to `-100`. - You must add BOS and EOS, and make sure that you are training on EOS by not setting its label to -100. - For pretraining, do not truncate/pad documents to the context window length. - For instruction training, documents must be truncated/padded as desired. Sample config: ```{.yaml filename="config.yml"} datasets: - path: /path/to/your/file.jsonl ds_type: json type: ``` Sample jsonl: ```jsonl {"input_ids":[271,299,99],"attention_mask":[1,1,1],"labels":[271,-100,99]} {"input_ids":[87,227,8383,12],"attention_mask":[1,1,1,1],"labels":[87,227,8383,12]} ``` ================================================ FILE: docs/dataset_loading.qmd ================================================ --- title: Dataset Loading description: Understanding how to load datasets from different sources back-to-top-navigation: true toc: true toc-depth: 5 --- ## Overview Datasets can be loaded in a number of different ways depending on how they are saved (the file extension) and where they are stored. ## Loading Datasets We use the `datasets` library to load datasets and a mix of `load_dataset` and `load_from_disk` to load them. You may recognize the similarly named configs between `load_dataset` and the `datasets` section of the config file. ```yaml datasets: - path: name: data_files: split: revision: trust_remote_code: ``` ::: {.callout-tip} Do not feel overwhelmed by the number of options here. A lot of them are optional.
In fact, the most common config to use would be `path` and sometimes `data_files`. ::: This matches the API of [`datasets.load_dataset`](https://github.com/huggingface/datasets/blob/0b5998ac62f08e358f8dcc17ec6e2f2a5e9450b6/src/datasets/load.py#L1838-L1858), so if you're familiar with that, you will feel right at home. For HuggingFace's guide to load different dataset types, see [here](https://huggingface.co/docs/datasets/loading). For full details on the config, see [config-reference.qmd](config-reference.qmd). ::: {.callout-note} You can set multiple datasets in the config file by adding more than one entry under `datasets`. ```yaml datasets: - path: /path/to/your/dataset - path: /path/to/your/other/dataset ``` ::: ### Local dataset #### Files To load a JSON file, you would do something like this: ```python from datasets import load_dataset dataset = load_dataset("json", data_files="data.json") ``` Which translates to the following config: ```yaml datasets: - path: data.json ds_type: json ``` In the example above, it can be seen that we can just point the `path` to the file or directory along with the `ds_type` to load the dataset. This works for CSV, JSON, Parquet, and Arrow files. ::: {.callout-tip} If `path` points to a file and `ds_type` is not specified, we will automatically infer the dataset type from the file extension, so you could omit `ds_type` if you'd like. ::: #### Directory If you're loading a directory, you can point the `path` to the directory. Then, you have two options: ##### Loading entire directory You do not need any additional configs. We will attempt to load in the following order: - datasets saved with `datasets.save_to_disk` - loading entire directory of files (such as with parquet/arrow files) ```yaml datasets: - path: /path/to/your/directory ``` ##### Loading specific files in directory Provide `data_files` with a list of files to load.
```yaml datasets: # single file - path: /path/to/your/directory ds_type: csv data_files: file1.csv # multiple files - path: /path/to/your/directory ds_type: json data_files: - file1.jsonl - file2.jsonl # multiple files for parquet - path: /path/to/your/directory ds_type: parquet data_files: - file1.parquet - file2.parquet ``` ### HuggingFace Hub The method you use to load the dataset depends on how the dataset was created, whether a folder was uploaded directly or a HuggingFace Dataset was pushed. ::: {.callout-note} If you're using a private dataset, you will need to enable the `hf_use_auth_token` flag in the root-level of the config file. ::: #### Folder uploaded This would mean that the dataset is a single file or file(s) uploaded to the Hub. ```yaml datasets: - path: org/dataset-name data_files: - file1.jsonl - file2.jsonl ``` #### HuggingFace Dataset This means that the dataset is created as a HuggingFace Dataset and pushed to the Hub via `datasets.push_to_hub`. ```yaml datasets: - path: org/dataset-name ``` ::: {.callout-note} There are some other configs which may be required like `name`, `split`, `revision`, `trust_remote_code`, etc depending on the dataset. ::: ### Remote Filesystems Via the `storage_options` config under `load_dataset`, you can load datasets from remote filesystems like S3, GCS, Azure, and OCI. ::: {.callout-warning} This is currently experimental. Please let us know if you run into any issues! ::: The only difference between the providers is that you need to prepend the path with the respective protocols. ```yaml datasets: # Single file - path: s3://bucket-name/path/to/your/file.jsonl # Directory - path: s3://bucket-name/path/to/your/directory ``` For directory, we load via `load_from_disk`. #### S3 Prepend the path with `s3://`. 
The credentials are pulled in the following order: - `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_SESSION_TOKEN` environment variables - from the `~/.aws/credentials` file - for nodes on EC2, the IAM metadata provider ::: {.callout-note} We assume you have credentials set up and are not using anonymous access. If you want to use anonymous access, let us know! We may have to open a config option for this. ::: Other environment variables that can be set can be found in [boto3 docs](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html#using-environment-variables) #### GCS Prepend the path with `gs://` or `gcs://`. The credentials are loaded in the following order: - gcloud credentials - for nodes on GCP, the google metadata service - anonymous access #### Azure ##### Gen 1 Prepend the path with `adl://`. Ensure you have the following environment variables set: - `AZURE_STORAGE_TENANT_ID` - `AZURE_STORAGE_CLIENT_ID` - `AZURE_STORAGE_CLIENT_SECRET` ##### Gen 2 Prepend the path with `abfs://` or `az://`. Ensure you have the following environment variables set: - `AZURE_STORAGE_ACCOUNT_NAME` - `AZURE_STORAGE_ACCOUNT_KEY` Other environment variables that can be set can be found in [adlfs docs](https://github.com/fsspec/adlfs?tab=readme-ov-file#setting-credentials) #### OCI Prepend the path with `oci://`. It would attempt to read in the following order: - `OCIFS_IAM_TYPE`, `OCIFS_CONFIG_LOCATION`, and `OCIFS_CONFIG_PROFILE` environment variables - when on OCI resource, resource principal Other environment variables: - `OCI_REGION_METADATA` Please see the [ocifs docs](https://ocifs.readthedocs.io/en/latest/getting-connected.html#Using-Environment-Variables). ### HTTPS The path should start with `https://`. ```yaml datasets: - path: https://path/to/your/dataset/file.jsonl ``` This must be publicly accessible.
## Next steps Now that you know how to load datasets, you can learn more on how to load your specific dataset format into your target output format [dataset formats docs](dataset-formats). ================================================ FILE: docs/dataset_preprocessing.qmd ================================================ --- title: Dataset Preprocessing description: How datasets are processed --- ## Overview Dataset pre-processing is the step where Axolotl takes each dataset you've configured alongside the [dataset format](dataset-formats) and prompt strategies to: - parse the dataset based on the *dataset format* - transform the dataset to how you would interact with the model based on the *prompt strategy* - tokenize the dataset based on the configured model & tokenizer - shuffle and merge multiple datasets together if using more than one The processing of the datasets can happen one of two ways: 1. Before kicking off training by calling `axolotl preprocess config.yaml --debug` 2. When training is started ### What are the benefits of pre-processing? When training interactively or for sweeps (e.g. you are restarting the trainer often), processing the datasets can oftentimes be frustratingly slow. Pre-processing will cache the tokenized/formatted datasets according to a hash of dependent training parameters so that it will intelligently pull from its cache when possible. The path of the cache is controlled by `dataset_prepared_path:` and is often left blank in example YAMLs as this leads to a more robust solution that prevents unexpectedly reusing cached data. If `dataset_prepared_path:` is left empty, when training, the processed dataset will be cached in a default path of `./last_run_prepared/`, but will ignore anything already cached there. By explicitly setting `dataset_prepared_path: ./last_run_prepared`, the trainer will use whatever pre-processed data is in the cache. ### What are the edge cases? 
Let's say you are writing a custom prompt strategy or using a user-defined prompt template. Because the trainer cannot readily detect these changes, we cannot change the calculated hash value for the pre-processed dataset. If you have `dataset_prepared_path: ...` set and change your prompt templating logic, it may not pick up the changes you made and you will be training over the old prompt. ================================================ FILE: docs/debugging.qmd ================================================ --- title: Debugging description: How to debug Axolotl --- This document provides some tips and tricks for debugging Axolotl. It also provides an example configuration for debugging with VSCode. A good debugging setup is essential to understanding how Axolotl code works behind the scenes. ## Table of Contents - [General Tips](#general-tips) - [Debugging with VSCode](#debugging-with-vscode) - [Background](#background) - [Configuration](#configuration) - [Customizing your debugger](#customizing-your-debugger) - [Video Tutorial](#video-tutorial) - [Debugging With Docker](#debugging-with-docker) - [Setup](#setup) - [Attach To Container](#attach-to-container) - [Video - Attaching To Docker On Remote Host](#video---attaching-to-docker-on-remote-host) ## General Tips While debugging it's helpful to simplify your test scenario as much as possible. Here are some tips for doing so: > [!Important] > All of these tips are incorporated into the [example configuration](#configuration) for debugging with VSCode below. 1. **Make sure you are using the latest version of axolotl**: This project changes often and bugs get fixed fast. Check your git branch and make sure you have pulled the latest changes from `main`. 1. **Eliminate concurrency**: Restrict the number of processes to 1 for both training and data preprocessing: - Set `CUDA_VISIBLE_DEVICES` to a single GPU, ex: `export CUDA_VISIBLE_DEVICES=0`. 
- Set `dataset_num_proc: 1` in your axolotl config or run the training command with `--dataset_num_proc=1`. 2. **Use a small dataset**: Construct or use a small dataset from HF Hub. When using a small dataset, you will often have to make sure `sample_packing: False` and `eval_sample_packing: False` to avoid errors. If you are in a pinch and don't have time to construct a small dataset but want to use from the HF Hub, you can shard the data (this will still tokenize the entire dataset, but will only use a fraction of the data for training. For example, to shard the dataset into 20 pieces, add the following to your axolotl config): ```yaml datasets: ... shards: 20 ``` 3. **Use a small model**: A good example of a small model is [TinyLlama/TinyLlama-1.1B-Chat-v1.0](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0). 4. **Minimize iteration time**: Make sure the training loop finishes as fast as possible, with these settings. - `micro_batch_size: 1` - `max_steps: 1` - `val_set_size: 0` 5. **Clear Caches:** Axolotl caches certain steps and so does the underlying HuggingFace trainer. You may want to clear some of these caches when debugging. - Data preprocessing: When debugging data preprocessing, which includes prompt template formation, you may want to delete the directory set in `dataset_prepared_path:` in your axolotl config. If you didn't set this value, the default is `last_run_prepared`. - HF Hub: If you are debugging data preprocessing, you should clear the relevant HF cache [HuggingFace cache](https://huggingface.co/docs/datasets/cache), by deleting the appropriate `~/.cache/huggingface/datasets/...` folder(s). - **The recommended approach is to redirect all outputs and caches to a temporary folder and delete selected subfolders before each run. This is demonstrated in the example configuration below.** ## Debugging with VSCode ### Background The below example shows how to configure VSCode to debug data preprocessing of the `chat_template` format. 
This is the format used when you have the following in your axolotl config: ```yaml datasets: - path: # example on HF Hub: fozziethebeat/alpaca_messages_2k_test type: chat_template ``` >[!Important] > If you are already familiar with advanced VSCode debugging, you can skip the below explanation and look at the files [.vscode/launch.json](../.vscode/launch.json) and [.vscode/tasks.json](../.vscode/tasks.json) for an example configuration. >[!Tip] > If you prefer to watch a video, rather than read, you can skip to the [video tutorial](#video-tutorial) below (but doing both is recommended). ### Setup Make sure you have an [editable install](https://setuptools.pypa.io/en/latest/userguide/development_mode.html) of Axolotl, which ensures that changes you make to the code are reflected at runtime. Run the following commands from the root of this project: ```bash pip3 install packaging pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]' ``` #### Remote Hosts If you are developing on a remote host, you can easily use VSCode to debug remotely. To do so, you will need to follow this [remote - SSH guide](https://code.visualstudio.com/docs/remote/ssh). You can also see the video below on [Docker and Remote SSH debugging](#video---attaching-to-docker-on-remote-host). ### Configuration The easiest way to get started is to modify the [.vscode/launch.json](../.vscode/launch.json) file in this project. This is just an example configuration, so you may need to modify or copy it to suit your needs. For example, to mimic the command `cd devtools && CUDA_VISIBLE_DEVICES=0 accelerate launch -m axolotl.cli.train dev_chat_template.yml`, you would use the below configuration[^1]. Note that we add additional flags that override the axolotl config and incorporate the tips above (see the comments). We also set the working directory to `devtools` and set the `env` variable `HF_HOME` to a temporary folder that is later partially deleted.
This is because we want to delete the HF dataset cache before each run in order to ensure that the data preprocessing code is run from scratch. ```json // .vscode/launch.json { "version": "0.2.0", "configurations": [ { "name": "Debug axolotl prompt - chat_template", "type": "python", "module": "accelerate.commands.launch", "request": "launch", "args": [ "-m", "axolotl.cli.train", "dev_chat_template.yml", // The flags below simplify debugging by overriding the axolotl config // with the debugging tips above. Modify as needed. "--dataset_num_proc=1", // limits data preprocessing to one process "--max_steps=1", // limits training to just one step "--batch_size=1", // minimizes batch size "--micro_batch_size=1", // minimizes batch size "--val_set_size=0", // disables validation "--sample_packing=False", // disables sample packing which is necessary for small datasets "--eval_sample_packing=False",// disables sample packing on eval set "--dataset_prepared_path=temp_debug/axolotl_outputs/data", // send data outputs to a temp folder "--output_dir=temp_debug/axolotl_outputs/model" // send model outputs to a temp folder ], "console": "integratedTerminal", // show output in the integrated terminal "cwd": "${workspaceFolder}/devtools", // set working directory to devtools from the root of the project "justMyCode": true, // step through only axolotl code "env": {"CUDA_VISIBLE_DEVICES": "0", // Since we aren't doing distributed training, we need to limit to one GPU "HF_HOME": "${workspaceFolder}/devtools/temp_debug/.hf-cache"}, // send HF cache to a temp folder "preLaunchTask": "cleanup-for-dataprep", // delete temp folders (see below) } ] } ``` **Additional notes about this configuration:** - The argument `justMyCode` is set to `true` such that you step through only the axolotl code. If you want to step into dependencies, set this to `false`. 
- The `preLaunchTask`: `cleanup-for-dataprep` is defined in [.vscode/tasks.json](../.vscode/tasks.json) and is used to delete the following folders before debugging, which is essential to ensure that the data pre-processing code is run from scratch: - `./devtools/temp_debug/axolotl_outputs` - `./devtools/temp_debug/.hf-cache/datasets` >[!Tip] > You may not want to delete these folders. For example, if you are debugging model training instead of data pre-processing, you may NOT want to delete the cache or output folders. You may also need to add additional tasks to the `tasks.json` file depending on your use case. Below is the [.vscode/tasks.json](../.vscode/tasks.json) file that defines the `cleanup-for-dataprep` task. This task is run before each debugging session when you use the above configuration. Note how there are two tasks that delete the two folders mentioned above. The third task `cleanup-for-dataprep` is a composite task that combines the two tasks. A composite task is necessary because VSCode does not allow you to specify multiple tasks in the `preLaunchTask` argument of the `launch.json` file.
```json // .vscode/tasks.json // this file is used by launch.json { "version": "2.0.0", "tasks": [ // this task changes into the devtools directory and deletes the temp_debug/axolotl_outputs folder { "label": "delete-outputs", "type": "shell", "command": "rm -rf temp_debug/axolotl_outputs", "options":{ "cwd": "${workspaceFolder}/devtools"}, "problemMatcher": [] }, // this task changes into the devtools directory and deletes the `temp_debug/.hf-cache/datasets` folder { "label": "delete-temp-hf-dataset-cache", "type": "shell", "command": "rm -rf temp_debug/.hf-cache/datasets", "options":{ "cwd": "${workspaceFolder}/devtools"}, "problemMatcher": [] }, // this task combines the two tasks above { "label": "cleanup-for-dataprep", "dependsOn": ["delete-outputs", "delete-temp-hf-dataset-cache"], } ] } ``` ### Customizing your debugger Your debugging use case may differ from the example above. The easiest thing to do is to put your own axolotl config in the `devtools` folder and modify the `launch.json` file to use your config. You may also want to modify the `preLaunchTask` to delete different folders or not delete anything at all. ### Video Tutorial The following video tutorial walks through the above configuration and demonstrates how to debug with VSCode, (click the image below to watch):
## Debugging With Docker Using [official Axolotl Docker images](https://hub.docker.com/r/axolotlai/axolotl/tags) is a great way to debug your code, and is a very popular way to use Axolotl. Attaching VSCode to Docker takes a few more steps. ### Setup On the host that is running axolotl (ex: if you are using a remote host), clone the axolotl repo and change your current directory to the root: ```bash git clone https://github.com/axolotl-ai-cloud/axolotl cd axolotl ``` >[!Tip] > If you already have axolotl cloned on your host, make sure you have the latest changes and change into the root of the project. Next, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:[^2] ```bash docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-py3.10-cu118-2.0.1 ``` >[!Tip] > To understand which containers are available, see the [Docker section of the README](../README.md#docker) and the [DockerHub repo](https://hub.docker.com/r/axolotlai/axolotl/tags). For details of how the Docker containers are built, see axolotl's [Docker CI builds](../.github/workflows/main.yml). You will now be in the container. Next, perform an editable install of Axolotl: ```bash pip3 install packaging pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]' ``` ### Attach To Container Next, if you are using a remote host, [Remote into this host with VSCode](https://code.visualstudio.com/docs/remote/ssh). If you are using a local host, you can skip this step. Next, select `Dev Containers: Attach to Running Container...` using the command palette (`CMD + SHIFT + P`) in VSCode. You will be prompted to select a container to attach to. Select the container you just created. 
You will now be in the container with a working directory that is at the root of the project. Any changes you make to the code will be reflected both in the container and on the host. Now you are ready to debug as described above (see [Debugging with VSCode](#debugging-with-vscode)). ### Video - Attaching To Docker On Remote Host Here is a short video that demonstrates how to attach to a Docker container on a remote host:
[^1]: The config actually mimics the command `CUDA_VISIBLE_DEVICES=0 python -m accelerate.commands.launch -m axolotl.cli.train devtools/chat_template.yml`, but this is the same thing. [^2]: Many of the below flags are recommended best practices by Nvidia when using nvidia-container-toolkit. You can read more about these flags [here](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html). ================================================ FILE: docs/docker.qmd ================================================ --- title: "Docker" format: html: toc: true toc-depth: 4 --- This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai). ::: {.callout-important} For Blackwell GPUs, please use the tags with PyTorch 2.7.1 and CUDA 12.8. ::: ## Base The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image. It includes python, torch, git, git-lfs, awscli, pydantic, and more. #### Image ``` axolotlai/axolotl-base ``` Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-base) #### Tags format ```bash main-base-py{python_version}-cu{cuda_version}-{pytorch_version} ``` Tags examples: - `main-base-py3.11-cu128-2.8.0` - `main-base-py3.11-cu128-2.9.1` ## Main The main image is the image that is used to run Axolotl. It is based on the `axolotlai/axolotl-base` image and includes the Axolotl codebase, dependencies, and more. 
#### Image ``` axolotlai/axolotl ``` Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl) #### Tags format {#sec-main-tags} ```bash # on push to main main-py{python_version}-cu{cuda_version}-{pytorch_version} # latest main (currently torch 2.6.0, python 3.11, cuda 12.4) main-latest # nightly build {branch}-{date_in_YYYYMMDD}-py{python_version}-cu{cuda_version}-{pytorch_version} # tagged release {version} ``` :::{.callout-tip} There may be some extra tags appended to the image, such as `-vllm`, which indicate that the corresponding extra packages are installed. ::: Tags examples: - `main-py3.11-cu128-2.8.0` - `main-py3.11-cu128-2.9.1` - `main-latest` - `main-20250303-py3.11-cu124-2.6.0` - `main-20250303-py3.11-cu126-2.6.0` - `0.12.0` ## Cloud {#sec-cloud} The cloud image is the image that is used to run Axolotl in the cloud. It is based on the `axolotlai/axolotl` image and sets ENV variables like HuggingFace cache directories for volume mounts, tmux, and more for different cloud providers. :::{.callout-tip} Jupyter lab is run by default. Set `JUPYTER_DISABLE=1` in the environment variables to disable it. ::: #### Image ``` axolotlai/axolotl-cloud ``` Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-cloud) #### Tags format {#sec-cloud-tags} This uses the same tags as the [`main` image](#sec-main-tags). #### Environment variables - `JUPYTER_DISABLE`: Disable Jupyter lab. - `JUPYTER_PASSWORD`: Set a password for the Jupyter lab. - `PUBLIC_KEY` / `SSH_KEY`: Add a public key for the SSH service. #### Volume mounts :::{.callout-tip} We recommend mounting volumes to `/workspace/data` for data persistence. `/workspace/axolotl` contains the source code and is ephemeral. ::: - `/workspace/data/axolotl-artifacts`: Directory to store Axolotl artifacts. - `/workspace/data/huggingface-cache`: Directory to store HuggingFace cache. ## Cloud-no-tmux This is the same as the [`cloud` image](#sec-cloud) but without tmux.
#### Image ``` axolotlai/axolotl-cloud-term ``` Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-cloud-term) :::{.callout-note} The naming may be a bit confusing as it has `-term` appended to the end. ::: #### Tags format This uses the same tags as the [`cloud` image](#sec-cloud-tags). ================================================ FILE: docs/expert_quantization.qmd ================================================ --- title: "MoE Expert Quantization" description: "Reduce VRAM usage when training MoE model adapters by quantizing expert weights on load" --- Transformers v5 changed MoE expert layers from `nn.Linear` to fused `nn.Parameter` (3D+ tensors). This means `bitsandbytes` can no longer quantize them during model loading, resulting in all expert weights being loaded in full bf16 precision and causing massive VRAM usage. `quantize_moe_experts` solves this by quantizing expert weights during model loading. It intercepts the weight loading process, quantizes each expert tensor on the fly, and immediately frees the original bf16 tensor from VRAM. This dramatically reduces peak memory. For example, GLM-4.7-Flash QLoRA drops from ~127GiB to ~23GiB reserved memory. ## Usage Enable expert quantization in your Axolotl config: ```yaml quantize_moe_experts: true ``` This works with both 4-bit (QLoRA) and 8-bit (LoRA) quantization. ### Expert LoRA targeting You can optionally apply LoRA adapters directly to expert weights using `lora_target_parameters`: ```yaml lora_target_parameters: - mlp.experts.gate_up_proj - mlp.experts.down_proj # - mlp.gate.weight # router ``` ::: {.callout-note} `lora_dropout` must be `0` when using `lora_target_parameters`. ::: ## Requirements - Requires (`adapter: lora` and `load_in_8bit: true`) or (`adapter: qlora` and `load_in_4bit: true`) - CUDA GPUs only (not tested with ROCm or other backends) - FSDP2 compatible for distributed training ## Limitations - `lora_target_linear` is not compatible with `quantize_moe_experts`. 
See [Expert LoRA targeting](#expert-lora-targeting) instead. - `cpu_ram_efficient_loading` hangs or takes a long time with FSDP2 + QLoRA. - Total model parameter count may display incorrectly (trainable param count is correct). - FSDP LoRA (8-bit) may have a large initial VRAM spike at the first 1-2 steps, which then drops. QLoRA does not exhibit this. - FSDP2 may use more VRAM per GPU than single GPU training due to not all layers being properly sharded across ranks. - Model loading takes longer due to on-demand quantization, even on consecutive runs. - DeepSpeed has not been tested. ## Implementation details The quantization is applied by patching transformers to intercept weight loading. When a 3D+ CUDA tensor with "expert" in its name is detected: - **4-bit mode:** Uses bitsandbytes NF4 parametrization (configurable via `bnb_4bit_quant_type`). - **8-bit mode:** Uses a custom row-wise int8 parametrization with bitsandbytes dequantization. The original bf16 tensor is freed immediately after quantization. Multiple sub-patches are applied to transformers, PEFT and accelerate FSDP2 to support these parametrized expert modules. For full implementation details, see [PR #3439](https://github.com/axolotl-ai-cloud/axolotl/pull/3439). ================================================ FILE: docs/faq.qmd ================================================ --- title: FAQ description: Frequently asked questions --- ### General **Q: The trainer stopped and hasn't progressed in several minutes.** > A: Usually an issue with the GPUs communicating with each other. See the [NCCL doc](nccl.qmd). **Q: exitcode: -9** > A: This usually happens when you run out of system RAM. **Q: exitcode: -7 while using deepspeed** > A: Try upgrading deepspeed with: `pip install -U deepspeed` **Q: AttributeError: 'DummyOptim' object has no attribute 'step'** **Q: ModuleNotFoundError: No module named 'mpi4py' using single GPU with deepspeed** > A: You may be using deepspeed with a single GPU.
Please remove the `deepspeed:` section in the yaml file or `--deepspeed` CLI flag. **Q: The code is stuck on saving preprocessed datasets.** > A: This is usually an issue with the GPU. This can be resolved through setting the os environment variable `CUDA_VISIBLE_DEVICES=0`. If you are on runpod, this is usually a pod issue. Starting a new pod should take care of it. **Q: Received mismatch error on merge adapters / loading adapters between torch.Size of checkpoint and model.** > A: This is likely due to vocab size mismatch. By default, Axolotl expands the model's embeddings if the tokenizer has more tokens than the model. Please use the `axolotl merge-lora` command to merge the adapters instead of using your own scripts. > On the other hand, if the model has more tokens than the tokenizer, Axolotl does not shrink the model's embeddings unless `shrink_embeddings: true` is set in the config. **Q: How to call Axolotl via custom python scripts?** > A: Since Axolotl is just Python, please see `src/axolotl/cli/main.py` on how each command is called. **Q: How to know the value to use for `fsdp_transformer_layer_cls_to_wrap`?** > A: This is the class name of the transformer layer to wrap with FSDP. For example, for `LlamaForCausalLM`, the value is `LlamaDecoderLayer`. To find this for a specific model, check the model's `PreTrainedModel` definition and look for the `_no_split_modules` variable in the `modeling_<model_name>.py` file within the `transformers` library. **Q: ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as pad_token** > A: This is because the tokenizer does not have a padding token. Please add a padding token to the tokenizer via: > ```yaml > special_tokens: > # str. If you're not sure, set to same as `eos_token`. > pad_token: "..."
> ``` **Q: `IterableDataset error` or `KeyError: 'input_ids'` when using `preprocess` CLI** > A: This is because you may be using `preprocess` CLI with `pretraining_dataset:` or `skip_prepare_dataset: true` respectively. Please use `axolotl train` CLI directly instead as these datasets are prepared on demand. **Q: vLLM is not working with Axolotl** > A: We currently recommend torch 2.6.0 for use with `vllm`. Please ensure you use the right version. For Docker, please use the `main-py3.11-cu124-2.6.0` tag. **Q: FA2 2.8.0 `undefined symbol` runtime error on CUDA 12.4** > A: There seems to be a wheel issue with FA2 2.8.0 on CUDA 12.4. Try CUDA 12.6 instead or downgrade to FA2 2.7.4. Please refer to the upstream issue: https://github.com/Dao-AILab/flash-attention/issues/1717. **Q: Can we mix text and text+image datasets for VLM training?** > A: Yes, you can for newer VLM arch. The ones that would not work are LLaVA / Pixtral arch. If you notice one not working, please let us know! **Q: Why is `memory/max_*` different from `nvidia-smi`?** > A: We use `torch` APIs to retrieve this information. You can see https://docs.pytorch.org/docs/stable/notes/cuda.html#cuda-memory-management for more information. ### Chat templates **Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`** > A: This means that the property mapping for the stated attribute does not exist when building `chat_template` prompt. For example, if `no attribute 'content'`, please check you have added the correct mapping for `content` under `message_property_mappings`. **Q: `Empty template generated for turn ___`** > A: The `content` is empty for that turn. **Q: `Could not find content start/end boundary for turn __`** > A: The specific turn's start/end could not be detected. Please ensure you have set the `eos_token` following your `chat_template`. Otherwise, this could be a `chat_template` which doesn't use proper boundaries for each turn (like system). 
On the rare occurrence, make sure your content is not `[[dummy_message]]`. Please let us know about this. **Q: `Content end boundary is before start boundary for turn ___`** > A: This is an edge case which should not occur. Please create an Issue if this happens. **Q: `Content end boundary is the same as start boundary for turn ___. This is likely an empty turn.`** > A: This is likely an empty turn. **Q: The EOS token is incorrectly being masked or not being masked / `EOS token __ not found in chat template`.** > A: There can be two reasons: > 1. This is because of the mismatch between `tokenizer.eos_token` and EOS token in template. Please make sure to set `eos_token: ` under `special_tokens: ` to the same EOS token as in template. > 2. The EOS token is not in the template. Please check if your template is correct. As an example, `phi_35` template does not use its dedicated EOS token `<|endoftext|>` at the end. **Q: "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null. Please add a `chat_template` in tokenizer config"** > A: This is because the tokenizer does not have a chat template. Please add a chat template in the tokenizer config. See [chat_template](dataset-formats/conversation.qmd#chat-template) for more details. **Q: The EOT token(s) are incorrectly being masked or not being masked / `EOT token __ not found in chat template`.** > A: There can be two reasons: > 1. The EOT token is different from the EOS token and was not specified under `eot_tokens: `. Please set `eot_tokens: ` to the same EOT token(s) as in template. > 2. There is more than one EOT token per turn in the template. Please raise an issue with examples as we recognize this as an edge case. **Q: `EOT token encoding failed. Please check if the token is valid and can be encoded.`** > A: There could be some issue with the tokenizer or unicode encoding. Please raise an issue with examples with the EOT token & tokenizer causing the issue. 
**Q: `EOT token __ is encoded as multiple tokens.`** > A: This is because the EOT token is encoded as multiple tokens which can cause unexpected behavior. Please add it under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `. **Q: `Conflict between train_on_eos and train_on_eot. eos_token is in eot_tokens and train_on_eos != train_on_eot`** > A: This is because the EOS token is listed in `eot_tokens: ` while `train_on_eos: ` and `train_on_eot: ` are set to different values. This will cause one to override the other. Please ensure that `train_on_eos: ` and `train_on_eot: ` are the same or remove the EOS token from `eot_tokens: `. **Q: If `eot_tokens: ` is not provided, what happens?** > A: If `eot_tokens: ` is not provided, the default behavior is the same as before. EOS tokens used to delimit turns are masked/unmasked depending on whether the turn is trainable. > Internally, this sets `eot_tokens: tokenizer.eos_token` and `train_on_eot: train_on_eos` (which defaults to `turn`). This transition helps clarify the naming and behavior of EOT/EOS tokens. **Q: `Data processing error: CAS service error`** > A: Try disabling XET with `export HF_HUB_DISABLE_XET=1` **Q: `torch._inductor.exc.LoweringException: NoValidChoicesError: No choices to select, please consider adding ATEN into max_autotune_gemm_backends config (defined in torch/_inductor/config.py) to allow at least one choice. `** > A: Depending on the version of torch, you may need to include this in your YAML: > ```yaml > flex_attn_compile_kwargs: > dynamic: false > mode: max-autotune-no-cudagraphs > ``` **Q: `ValueError("Backward pass should have cleared tracker of all tensors")`** > A: This may happen due to edge cases in using the modern OffloadActivations context manager for CUDA streams. If you encounter this error, you may have success using the naive implementation with `activation_offloading: legacy` in your YAML.
**Q: `Error parsing tool_calls arguments as JSON.`** > A: There is an error parsing string arguments to a dict. Please check your dataset and the error message for more details. ================================================ FILE: docs/fsdp_qlora.qmd ================================================ --- title: "FSDP + QLoRA" description: Use FSDP with QLoRA to fine-tune large LLMs on consumer GPUs. format: html: toc: true --- ## Background Using FSDP with QLoRA is essential for **fine-tuning larger (70b+ parameter) LLMs on consumer GPUs.** For example, you can use FSDP + QLoRA to train a 70b model on two 24GB GPUs[^1]. Below, we describe how to use this feature in Axolotl. ## Usage To enable `QLoRA` with `FSDP`, you need to perform the following steps: > [!Tip] > See the [example config](#example-config) file in addition to reading these instructions. 1. Set `adapter: qlora` in your axolotl config file. 2. Enable FSDP in your axolotl config, as [described here](multi-gpu.qmd#sec-fsdp). 3. Use one of the supported model types: `llama`, `mistral` or `mixtral`. ## Enabling Swap for FSDP2 If available memory is insufficient even after FSDP's CPU offloading, you can enable swap memory usage by setting `cpu_offload_pin_memory: false` alongside `offload_params: true` in the FSDP config. This disables memory pinning, allowing FSDP to use disk swap space as a fallback. Disabling memory pinning itself incurs performance overhead, and actually having to use swap adds more, but it may enable training larger models that would otherwise cause OOM errors on resource-constrained systems. ## Example Config [examples/llama-2/qlora-fsdp.yml](../examples/llama-2/qlora-fsdp.yml) contains an example of how to enable QLoRA + FSDP in axolotl. ## References - [PR #1378](https://github.com/axolotl-ai-cloud/axolotl/pull/1378) enabling QLoRA in FSDP in Axolotl.
- [Blog Post](https://www.answer.ai/posts/2024-03-06-fsdp-qlora.html) from the [Answer.AI](https://www.answer.ai/) team describing the work that enabled QLoRA in FSDP. - Related HuggingFace PRs enabling FSDP + QLoRA: - Accelerate [PR#2544](https://github.com/huggingface/accelerate/pull/2544) - Transformers [PR#29587](https://github.com/huggingface/transformers/pull/29587) - TRL [PR#1416](https://github.com/huggingface/trl/pull/1416) - PEFT [PR#1550](https://github.com/huggingface/peft/pull/1550) [^1]: This was enabled by [this work](https://www.answer.ai/posts/2024-03-06-fsdp-qlora.html) from the Answer.AI team. ================================================ FILE: docs/getting-started.qmd ================================================ --- title: "Quickstart" format: html: toc: true toc-depth: 3 number-sections: true execute: enabled: false --- This guide will walk you through your first model fine-tuning project with Axolotl. ## Quick Example {#sec-quick-example} Let's start by fine-tuning a small language model using LoRA. This example uses a 1B parameter model to ensure it runs on most GPUs. Assuming `axolotl` is installed (if not, see our [Installation Guide](installation.qmd)): 1. Download example configs: ```bash axolotl fetch examples ``` 2. Run the training: ```bash axolotl train examples/llama-3/lora-1b.yml ``` That's it! Let's understand what just happened. ## Understanding the Process {#sec-understanding} ### The Configuration File {#sec-config} The YAML configuration file controls everything about your training. Here's what (part of) our example config looks like: ```yaml base_model: NousResearch/Llama-3.2-1B load_in_8bit: true adapter: lora datasets: - path: teknium/GPT4-LLM-Cleaned type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/lora-out ``` ::: {.callout-tip} `load_in_8bit: true` and `adapter: lora` enable LoRA adapter finetuning. - To perform full finetuning, remove these two lines.
- To perform QLoRA finetuning, replace with `load_in_4bit: true` and `adapter: qlora`. ::: See our [config options](config-reference.qmd) for more details. ### Training {#sec-training} When you run `axolotl train`, Axolotl: 1. Downloads the base model 2. (If specified) applies QLoRA/LoRA adapter layers 3. Loads and processes the dataset 4. Runs the training loop 5. Saves the trained model and / or LoRA weights ## Your First Custom Training {#sec-custom} Let's modify the example for your own data: 1. Create a new config file `my_training.yml`: ```yaml base_model: NousResearch/Nous-Hermes-llama-1b-v1 load_in_8bit: true adapter: lora # Training settings micro_batch_size: 2 num_epochs: 3 learning_rate: 0.0003 # Your dataset datasets: - path: my_data.jsonl # Your local data file type: alpaca # Or other format ``` This specific config is for LoRA fine-tuning a model with instruction tuning data using the `alpaca` dataset format, which has the following format: ```json { "instruction": "Write a description of alpacas.", "input": "", "output": "Alpacas are domesticated South American camelids..." } ``` Please see our [Dataset Formats](dataset-formats) for more dataset formats and how to format them. 2. Prepare your JSONL data in the specified format (in this case, the expected `alpaca` format): ```json {"instruction": "Classify this text", "input": "I love this!", "output": "positive"} {"instruction": "Classify this text", "input": "Not good at all", "output": "negative"} ``` 3. Run the training: ```bash axolotl train my_training.yml ``` ## Common Tasks {#sec-common-tasks} ::: {.callout-tip} The same yaml file is used for training, inference, and merging. ::: ### Testing Your Model {#sec-testing} After training, test your model: ```bash axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out" ``` More details can be found in [Inference](inference.qmd). 
### Using a UI {#sec-ui} Launch a Gradio interface: ```bash axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out" --gradio ``` ### Preprocessing Data {#sec-preprocessing} For large datasets, preprocess first: ```bash axolotl preprocess my_training.yml ``` Please make sure to set `dataset_prepared_path: ` in your config to set the path to save the prepared dataset. More details can be found in [Dataset Preprocessing](dataset_preprocessing.qmd). ### Merging LoRA weights {#sec-merging-lora} To merge the LoRA weights back into the base model, run: ```bash axolotl merge-lora my_training.yml --lora-model-dir="./outputs/lora-out" ``` The merged model will be saved in the `{output_dir}/merged` directory. More details can be found in [Merging LoRA weights](inference.qmd#sec-merging). ## Next Steps {#sec-next-steps} Now that you have the basics, you might want to: - Try different model architectures - Experiment with hyperparameters - Use more advanced training methods - Scale up to larger models Check our other guides for details on these topics: - [Configuration Guide](config-reference.qmd) - Full configuration options - [Dataset Loading](dataset_loading.qmd) - Loading datasets from various sources - [Dataset Formats](dataset-formats) - Working with different data formats - [Multi-GPU Training](multi-gpu.qmd) - [Multi-Node Training](multi-node.qmd) ================================================ FILE: docs/gradient_checkpointing.qmd ================================================ --- title: Gradient Checkpointing and Activation Offloading --- Gradient checkpointing and activation offloading are techniques used to optimize the performance of deep learning models by reducing the memory footprint and improving computational efficiency. 
### Enabling Gradient Checkpointing ```yaml gradient_checkpointing: true ``` ### Enabling Activation Offloading ```yaml gradient_checkpointing: true # required for activation offloading activation_offloading: true ``` Activation offloading variants: The default `activation_offloading: true` offloads activations to CPU and uses CUDA streams to overlap the communications and computations when offloading. The `activation_offloading: legacy` setting naively offloads activations to CPU without additional optimizations. For resource-constrained environments with limited CPU memory, `activation_offloading: disk` offloads activations to disk instead of CPU RAM so that much larger context lengths can be trained with minimal memory. ================================================ FILE: docs/inference.qmd ================================================ --- title: "Inference and Merging" format: html: toc: true toc-depth: 3 number-sections: true execute: enabled: false --- This guide covers how to use your trained models for inference, including model loading, interactive testing, merging adapters, and common troubleshooting steps. ## Quick Start {#sec-quickstart} ::: {.callout-tip} For inference and merging, use the same config that was used for training.
::: ### Basic Inference {#sec-basic} ::: {.panel-tabset} ## LoRA Models ```{.bash} axolotl inference your_config.yml --lora-model-dir="./lora-output-dir" ``` ## Full Fine-tuned Models ```{.bash} axolotl inference your_config.yml --base-model="./completed-model" ``` ::: ## Advanced Usage {#sec-advanced} ### Gradio Interface {#sec-gradio} Launch an interactive web interface: ```{.bash} axolotl inference your_config.yml --gradio ``` ### File-based Prompts {#sec-file-prompts} Process prompts from a text file: ```{.bash} cat /tmp/prompt.txt | axolotl inference your_config.yml \ --base-model="./completed-model" --prompter=None ``` ### Memory Optimization {#sec-memory} For large models or limited memory: ```{.bash} axolotl inference your_config.yml --load-in-8bit=True ``` ## Merging LoRA Weights {#sec-merging} Merge LoRA adapters with the base model: ```{.bash} axolotl merge-lora your_config.yml --lora-model-dir="./completed-model" ``` ### Memory Management for Merging {#sec-memory-management} ::: {.panel-tabset} ## Configuration Options ```{.yaml} gpu_memory_limit: 20GiB # Adjust based on your GPU lora_on_cpu: true # Process on CPU if needed ``` ## Force CPU Merging ```{.bash} CUDA_VISIBLE_DEVICES="" axolotl merge-lora ... ``` ::: ## Tokenization {#sec-tokenization} ### Common Issues {#sec-tokenization-issues} ::: {.callout-warning} Tokenization mismatches between training and inference are a common source of problems. ::: To debug: 1. Check training tokenization: ```{.bash} axolotl preprocess your_config.yml --debug ``` 2. Verify inference tokenization by decoding tokens before model input 3. 
Compare token IDs between training and inference ### Special Tokens {#sec-special-tokens} Configure special tokens in your YAML: ```{.yaml} special_tokens: bos_token: "<s>" eos_token: "</s>" unk_token: "<unk>" tokens: - "<|im_start|>" - "<|im_end|>" ``` ## Troubleshooting {#sec-troubleshooting} ### Common Problems {#sec-common-problems} ::: {.panel-tabset} ## Memory Issues - Use 8-bit loading - Reduce batch sizes - Try CPU offloading ## Token Issues - Verify special tokens - Check tokenizer settings - Compare training and inference preprocessing ## Performance Issues - Verify model loading - Check prompt formatting - Check temperature/sampling settings ::: For more details, see our [debugging guide](debugging.qmd). ================================================ FILE: docs/input_output.qmd ================================================ --- title: Template-free prompt construction description: "Template-free prompt construction with the `input_output` format" --- This documentation has moved [here](dataset-formats/template_free.qmd). ================================================ FILE: docs/installation.qmd ================================================ --- title: "Installation" format: html: toc: true toc-depth: 3 number-sections: true execute: enabled: false --- This guide covers all the ways you can install and set up Axolotl for your environment. ## Requirements {#sec-requirements} - NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU - Python ≥3.11 - PyTorch ≥2.6.0 ## Installation Methods {#sec-installation-methods} ::: {.callout-important} Please make sure to have PyTorch installed before installing Axolotl in your local environment. Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/) ::: ::: {.callout-important} For Blackwell GPUs, please use PyTorch 2.9.1 and CUDA 12.8.
::: ### PyPI Installation (Recommended) {#sec-pypi} ```{.bash} pip3 install -U packaging setuptools wheel ninja pip3 install --no-build-isolation axolotl[flash-attn,deepspeed] ``` We use `--no-build-isolation` in order to detect the installed PyTorch version (if installed) in order not to clobber it, and so that we set the correct version of dependencies that are specific to the PyTorch version or other installed co-dependencies. ### uv Installation {#sec-uv} uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments. Install uv if not already installed ```{.bash} curl -LsSf https://astral.sh/uv/install.sh | sh source $HOME/.local/bin/env ``` Choose your CUDA version to use with PyTorch; e.g. `cu124`, `cu126`, `cu128`, then create the venv and activate ```{.bash} export UV_TORCH_BACKEND=cu126 uv venv --no-project --relocatable source .venv/bin/activate ``` Install PyTorch - PyTorch 2.6.0 recommended ```{.bash} uv pip install packaging setuptools wheel uv pip install torch==2.6.0 uv pip install awscli pydantic ``` Install axolotl from PyPi ```{.bash} uv pip install --no-build-isolation axolotl[deepspeed,flash-attn] # optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO uv pip install --no-build-isolation axolotl[deepspeed,flash-attn,vllm] ``` ### Edge/Development Build {#sec-edge-build} For the latest features between releases: ```{.bash} git clone https://github.com/axolotl-ai-cloud/axolotl.git cd axolotl pip3 install -U packaging setuptools wheel ninja pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]' ``` ### Docker {#sec-docker} ```{.bash} docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest ``` For development with Docker: ```{.bash} docker compose up -d ``` ::: {.callout-tip} ### Advanced Docker Configuration ```{.bash} docker run 
--privileged --gpus '"all"' --shm-size 10g --rm -it \ --name axolotl --ipc=host \ --ulimit memlock=-1 --ulimit stack=67108864 \ --mount type=bind,src="${PWD}",target=/workspace/axolotl \ -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \ axolotlai/axolotl:main-latest ``` ::: ::: {.callout-important} For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.9.1` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.9.1`. ::: Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available. ## Cloud Environments {#sec-cloud} ### Cloud GPU Providers {#sec-cloud-gpu} For providers supporting Docker: - Use `axolotlai/axolotl-cloud:main-latest` - Available on: - [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz) - [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=axolotl&utm_medium=partner&utm_campaign=template_launch_july2025&utm_content=docs_link) - [PRIME Intellect](https://app.primeintellect.ai/dashboard/create-cluster?image=axolotl&location=Cheapest&security=Cheapest&show_spot=true) - [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl) - [Novita](https://novita.ai/gpus-console?templateId=311) - [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl) - [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c) ### Google Colab {#sec-colab} [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/axolotl-ai-cloud/axolotl/blob/main/examples/colab-notebooks/colab-axolotl-example.ipynb#scrollTo=msOCO4NRmRLa) ## Platform-Specific Instructions {#sec-platform-specific} ### macOS {#sec-macos} ```{.bash} pip3 install --no-build-isolation -e '.' ``` See @sec-troubleshooting for Mac-specific issues. ### Windows {#sec-windows} ::: {.callout-important} We recommend using WSL2 (Windows Subsystem for Linux) or Docker. 
::: ## Environment Managers {#sec-env-managers} ### Conda/Pip venv {#sec-conda} 1. Install Python ≥3.11 2. Install PyTorch: https://pytorch.org/get-started/locally/ 3. Install Axolotl: ```{.bash} pip3 install -U packaging setuptools wheel ninja pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]' ``` 4. (Optional) Login to Hugging Face: ```{.bash} hf auth login ``` ## Troubleshooting {#sec-troubleshooting} If you encounter installation issues, see our [FAQ](faq.qmd) and [Debugging Guide](debugging.qmd). ================================================ FILE: docs/lora_optims.qmd ================================================ --- title: "LoRA Optimizations" description: "Custom autograd functions and Triton kernels in Axolotl for optimized LoRA fine-tuning" --- Inspired by [Unsloth](https://github.com/unslothai/unsloth), we've implemented two optimizations for LoRA and QLoRA fine-tuning, supporting both single GPU and multi-GPU (including the DDP, DeepSpeed, and FSDP2 settings) training. These include (1) SwiGLU and GEGLU activation function Triton kernels, and (2) LoRA MLP and attention custom autograd functions. Our goal was to leverage operator fusion and tensor re-use in order to improve speed and reduce memory usage during the forward and backward passes of these calculations. We currently support several common model architectures, including (but not limited to): - `llama` - `mistral` - `qwen2` - `gemma` - `gemma2` - `gemma3`
The set of models we support is currently limited by our attention patching strategy, which assumes (and replaces) specific code blocks for query / key / value and output projections: ```python ORIGINAL_QKV_CODE = """ query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) """.lstrip( "\n" ) ORIGINAL_O_CODE = """ attn_output = self.o_proj(attn_output) """.lstrip( "\n" ) ``` Is replaced with: ```python PATCHED_QKV_CODE = """ query_states, key_states, value_states = self.apply_qkv(hidden_states) query_states = query_states.view(hidden_shape).transpose(1, 2) key_states = key_states.view(hidden_shape).transpose(1, 2) value_states = value_states.view(hidden_shape).transpose(1, 2) """.lstrip( "\n" ) PATCHED_O_CODE = """ attn_output = self.apply_o(attn_output) """.lstrip( "\n" ) ``` Where `apply_qkv` and `apply_o` are defined in the `axolotl.kernels.lora` module. We welcome testing of other model architectures and / or PRs to expand our patching logic to be compatible with more of them.
::: {.callout-tip} Check out our [LoRA optimizations blog](https://axolotlai.substack.com/p/accelerating-lora-fine-tuning-with). ::: ## Usage These optimizations can be enabled in your Axolotl config YAML file. The `lora_mlp_kernel` option enables the optimized MLP path, while `lora_qkv_kernel` and `lora_o_kernel` enable the fused query-key-value projection and optimized output projection, respectively. ```yaml lora_mlp_kernel: true lora_qkv_kernel: true lora_o_kernel: true ``` ::: {.callout-note} Currently, LoRA kernels are not supported for RLHF training, only SFT. ::: ::: {.callout-warning} LoRA kernels do not support remote modeling code. ::: ## Requirements - One or more NVIDIA or AMD GPUs (in order to use the Triton kernels) - Note: Set `TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1` to enable [memory-efficient attention on AMD GPUs](https://github.com/ROCm/aotriton/issues/16#issuecomment-2346675491) - Targeted LoRA adapters cannot use Dropout - This may limit model expressivity / cause overfitting - Targeted LoRA adapters cannot have bias terms - This may limit model expressivity Models with pre-existing LoRA adapters that use Dropout or have bias terms may need to be re-finetuned without these features in order to be useful. ## Implementation details ### Custom autograd functions The LoRA MLP autograd function optimizes the entire MLP computation path. It fuses the LoRA and base weight computations together and provides a single, efficient backward pass for the entire MLP block. For attention components, similar optimizations are provided through a function that handles the query, key, and value projections, and a function that handles the output projection. They are designed to work with the existing `transformers` attention implementation via some monkey-patching logic. ### Triton kernels Two activation functions (SwiGLU and GeGLU) are implemented with Triton kernels for improved speed and memory performance. 
These kernels handle both the forward and backward passes. ### Integration The custom autograd functions and Triton kernels are designed to work together. The autograd function manages the high-level computation flow and gradient tracking, while calling the Triton kernels for the activation function computation. During the backward pass, the kernel computes both the activation output and the required gradients, which the autograd function then uses to compute the final gradients for the entire computation path. ## Future Work - Support for additional model architectures - Support for dropout and bias - Additional operator fusions ================================================ FILE: docs/lr_groups.qmd ================================================ --- title: Learning Rate Groups description: "Setting different learning rates by module name" --- ## Background Inspired by LoRA+, Axolotl allows practitioners to specify separate learning rates for each module or groups of modules in a model. ## Example ```yaml lr_groups: - name: o_proj modules: - self_attn.o_proj.weight lr: 1e-6 - name: q_proj modules: - model.layers.2.self_attn.q_proj.weight lr: 1e-5 learning_rate: 2e-5 ``` In this example, we have a default learning rate of 2e-5 across the entire model, but we have a separate learning rate of 1e-6 for all the self attention `o_proj` modules across all layers, and a learning rate of 1e-5 for the 3rd layer's self attention `q_proj` module. ::: {.callout-note} We currently only support varying `lr`. If you're interested in adding support for others (`weight_decay`), we welcome PRs.
See https://github.com/axolotl-ai-cloud/axolotl/blob/613bcf90e58f3ab81d3827e7fc572319908db9fb/src/axolotl/core/trainers/mixins/optimizer.py#L17 ::: ================================================ FILE: docs/mac.qmd ================================================ --- title: Mac M-series description: Mac M-series support --- Currently, Axolotl on Mac is only partially usable: many of Axolotl's dependencies, including PyTorch, do not support MPS or have incomplete support. Current support: - [x] Support for all models - [x] Full training of models - [x] LoRA training - [x] Sample packing - [ ] FP16 and BF16 (awaiting AMP support for MPS in PyTorch) - [ ] Tri-dao's flash-attn (until it is supported use sdp_attention as an alternative) - [ ] xformers - [ ] bitsandbytes (meaning no 4/8 bits loading and bnb optimizers) - [ ] qlora - [ ] DeepSpeed Untested: - FSDP ================================================ FILE: docs/mixed_precision.qmd ================================================ --- title: "Mixed Precision Training" format: html: toc: true toc-depth: 3 number-sections: true code-tools: true execute: enabled: false --- Mixed precision training uses lower precision data types to reduce memory usage and increase training speed while maintaining model quality. Axolotl supports several mixed precision formats: - **FP16** - Half precision 16-bit (Pascal generation+) - **BF16** - Brain Float 16-bit (Ampere generation+) - **FP8** - 8-bit floating point (Hopper generation+) ## FP16 Mixed Precision {#sec-fp16} ### Overview {#sec-fp16-overview} FP16 is the traditional half-precision format. It is supported on older GPUs but can be less numerically stable than BF16.
### Configuration {#sec-fp16-config} ```{.yaml} fp16: true ``` ### FP16 Considerations {#sec-fp16-considerations} - May require gradient scaling to prevent underflow - Less numerically stable than BF16 - Can cause training instability with some model architectures - Consider using BF16 if your hardware supports it ## BF16 Mixed Precision {#sec-bf16} ### Overview {#sec-bf16-overview} BF16 (Brain Float 16) offers better numerical stability than FP16 and is the recommended mixed precision format for modern GPUs. It provides the same dynamic range as FP32 while using half the memory. ### Configuration {#sec-bf16-config} ```{.yaml} # Automatic BF16 detection (recommended) bf16: auto # Or explicitly enable bf16: true # For evaluation with BF16 bf16: full # Equivalent to bf16_full_eval in the HF trainer ``` ## FP8 Mixed Precision {#sec-fp8} ::: {.callout-note} FP8 support is experimental and requires compatible hardware (H100, H200) and recent PyTorch versions with TorchAO. ::: ### What is FP8? {#sec-fp8-overview} FP8 (8-bit floating point) can provide significant time savings compared to FP16/BF16 while maintaining training stability. Axolotl's implementation uses PyTorch's TorchAO library with "tensorwise" scaling strategy. ### Requirements {#sec-fp8-software} - Hopper+ GPUs (H100/H200) - PyTorch 2.7+ (+ compatible TorchAO version) - CUDA 12.4+ ### Configuration {#sec-fp8-config} Add to your YAML config: ```{.yaml} # Enable FP8 mixed precision fp8: true # Optional: Enable FP8 for FSDP all-gather operations fp8_enable_fsdp_float8_all_gather: true # Enable torch.compile (almost always necessary for FP8 speedups) torch_compile: true ``` ::: {.callout-important} **torch.compile is critical for FP8 performance** FP8 training requires `torch_compile: true` to see meaningful speedups. Without compilation, FP8 may actually be slower and use more memory than FP16/BF16. 
::: ### Advanced FP8 Configs {#sec-fp8-advanced} For [FSDP](multi-gpu.qmd#sec-fsdp) (Fully Sharded Data Parallel) training: ```{.yaml} fp8: true fp8_enable_fsdp_float8_all_gather: true torch_compile: true # FSDP configuration fsdp_version: 2 fsdp_config: offload_params: false cpu_ram_efficient_loading: true auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: LlamaDecoderLayer state_dict_type: FULL_STATE_DICT reshard_after_forward: true ``` ## Best Practices {#sec-best-practices} ### Choosing Precision Format {#sec-choosing-format} - **Start with automatic detection**: `bf16: auto` - **For Hopper+ (H100/H200)**: Try FP8 + torch.compile for maximum speed - **For Ampere (A100/RTX 30/40)**: Use BF16 - **For older Pascal/Turing GPUs**: Use FP16 with caution - **For very old or unsupported GPUs**: Use FP32 ### Validation and Testing {#sec-validation} Always validate your mixed precision setup: - **Start with a small dataset** to verify stability - **Monitor loss curves** for irregularities - **Compare with FP32 baseline** when possible - **Test evaluation metrics** match expectations ### FP8 Particulars {#sec-fp8-details} - Use cases - Single GPU training - Multi GPU training with FSDP2 or Deepspeed - Speedups - Please refer to the [TorchAO FP8 training benchmarks](https://github.com/pytorch/ao/tree/main/torchao/float8#rowwise-scaling) for expected matmul speedups for different (M, K, N) settings - Concrete number for LLaMA 3 8B training can be found [here](https://github.com/pytorch/ao/tree/main/torchao/float8#training-benchmarks) - Known issues: - FP8 + DDP + `torch.compile` (causes [error](https://gist.github.com/djsaunde/0c1664c32e44a64d31b5e01b4aafe5c4)) - FP8 + FSDP2 + `torch.compile` + FSDP2 activation checkpointing tends to be _slower_ than the BF16 equivalent training - Flash Attention 2 does not play nicely with `torch.compile` See `examples/llama-3/3b-fp8-fsdp2.yaml` for an optimized example config. 
Enabling FP8 mixed precision + FP8 all-gather training results in ~10% faster iterations per second vs. BF16 for a relatively small (3B param) model For more information on multi-GPU training, see our [Multi-GPU guide](multi-gpu.qmd). ================================================ FILE: docs/multi-gpu.qmd ================================================ --- title: "Multi-GPU" format: html: toc: true toc-depth: 3 # number-sections: true code-tools: true execute: enabled: false --- This guide covers advanced training configurations for multi-GPU setups using Axolotl. ## Overview {#sec-overview} When training on multiple GPUs, Axolotl supports 3 sharding/parallelism strategies. Additionally, you can layer specific optimization features on top of that strategy. You generally cannot combine these strategies; they are mutually exclusive. 1. **DeepSpeed**: Powerful optimization library, supports ZeRO stages 1-3. 2. **FSDP (Fully Sharded Data Parallel)**: PyTorch's native sharding implementation (Recommended). 3. **DDP (Distributed Data Parallel)**: PyTorch's native parallelism implementation (Default if neither of the above are selected). These features can often be combined with the strategies above: * **Sequence Parallelism**: Splits long sequences across GPUs (Compatible with DDP, DeepSpeed, and FSDP). * **FSDP + QLoRA**: Combines 4-bit quantization with FSDP (Specific to FSDP). 
## DeepSpeed {#sec-deepspeed} ### Configuration {#sec-deepspeed-config} Add to your YAML config: ```{.yaml} deepspeed: deepspeed_configs/zero1.json ``` ### Usage {#sec-deepspeed-usage} ```{.bash} # Fetch deepspeed configs (if not already present) axolotl fetch deepspeed_configs # Passing arg via config axolotl train config.yml # Passing arg via cli axolotl train config.yml --deepspeed deepspeed_configs/zero1.json ``` ### ZeRO Stages {#sec-zero-stages} We provide default configurations for: - ZeRO Stage 1 (`zero1.json`) - ZeRO Stage 1 with torch compile (`zero1_torch_compile.json`) - ZeRO Stage 2 (`zero2.json`) - ZeRO Stage 3 (`zero3.json`) - ZeRO Stage 3 with bf16 (`zero3_bf16.json`) - ZeRO Stage 3 with bf16 and CPU offload params(`zero3_bf16_cpuoffload_params.json`) - ZeRO Stage 3 with bf16 and CPU offload params and optimizer (`zero3_bf16_cpuoffload_all.json`) ::: {.callout-tip} Choose the configuration that offloads the least amount to memory while still being able to fit on VRAM for best performance. Start from Stage 1 -> Stage 2 -> Stage 3. ::: ## Fully Sharded Data Parallel (FSDP) {#sec-fsdp} FSDP allows you to shard model parameters, gradients, and optimizer states across data parallel workers. ::: {.callout-note} FSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl. ::: ### FSDP + QLoRA {#sec-fsdp-qlora} For combining FSDP with QLoRA, see our [dedicated guide](fsdp_qlora.qmd). ### Migrating from FSDP1 to FSDP2 {#sec-migrate-fsdp1-fsdp2} To migrate your config from FSDP1 to FSDP2, you must use the `fsdp_version` top-level config field to specify the FSDP version, and also follow the config field mapping below to update field names. 
#### Config mapping FSDP1 | FSDP2 -------- | -------- fsdp_sharding_strategy | reshard_after_forward fsdp_backward_prefetch_policy | **REMOVED** fsdp_backward_prefetch | **REMOVED** fsdp_forward_prefetch | **REMOVED** fsdp_sync_module_states | **REMOVED** fsdp_cpu_ram_efficient_loading | cpu_ram_efficient_loading fsdp_state_dict_type | state_dict_type fsdp_use_orig_params | **REMOVED** fsdp_activation_checkpointing | activation_checkpointing For more details, please see the migration guide in the [torchtitan repo](https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md). In Axolotl, if you were using the following FSDP1 config: ```{.yaml} fsdp_version: 1 fsdp_config: fsdp_offload_params: false fsdp_cpu_ram_efficient_loading: true fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD ``` You can migrate to the following FSDP2 config: ```{.yaml} fsdp_version: 2 fsdp_config: offload_params: false cpu_ram_efficient_loading: true auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: Qwen3DecoderLayer state_dict_type: FULL_STATE_DICT reshard_after_forward: true ``` ### FSDP1 (deprecated) {#sec-fsdp-config} ::: {.callout-note} Using `fsdp` to configure FSDP is deprecated and will be removed in an upcoming release of Axolotl. Please use `fsdp_config` as above instead. ::: ```{.yaml} fsdp: - full_shard - auto_wrap fsdp_config: fsdp_offload_params: true fsdp_state_dict_type: FULL_STATE_DICT fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer ``` ## Sequence parallelism {#sec-sequence-parallelism} We support sequence parallelism (SP) via the [ring-flash-attention](https://github.com/zhuzilin/ring-flash-attention) project. This allows one to split up sequences across GPUs, which is useful in the event that a single sequence causes OOM errors during model training. 
See our [dedicated guide](sequence_parallelism.qmd) for more information. ## Performance Optimization {#sec-performance} ### Liger Kernel Integration {#sec-liger} Please see [docs](custom_integrations.qmd#liger) for more info. ## Troubleshooting {#sec-troubleshooting} ### NCCL Issues {#sec-nccl} For NCCL-related problems, see our [NCCL troubleshooting guide](nccl.qmd). ### Common Problems {#sec-common-problems} ::: {.panel-tabset} ## Memory Issues - Reduce `micro_batch_size` - Reduce `eval_batch_size` - Adjust `gradient_accumulation_steps` - Consider using a higher ZeRO stage ## Training Instability - Start with DeepSpeed ZeRO-2 - Monitor loss values - Check learning rates ::: For more detailed troubleshooting, see our [debugging guide](debugging.qmd). ================================================ FILE: docs/multi-node.qmd ================================================ --- title: Multi Node description: How to use Axolotl on multiple machines --- The below are three ways to train multi-node in Axolotl. ::: {.callout-important} Each machine needs a copy of Axolotl, we suggest using the same commit to ensure compatibility. You will also need to have the same configuration file for your model on each machine. Make sure the main machine is reachable by other machines. 
::: ## Accelerate You will need to create a configuration for accelerate, either by running `accelerate config` and following the instructions, or by using the preset below: ~/.cache/huggingface/accelerate/default_config.yaml ```yaml compute_environment: LOCAL_MACHINE debug: false distributed_type: FSDP downcast_bf16: 'no' machine_rank: 0 # Set to 0 for the main machine, increment by one for other machines main_process_ip: 10.0.0.4 # Set to main machine's IP main_process_port: 5000 main_training_function: main mixed_precision: bf16 num_machines: 2 # Change to the number of machines num_processes: 4 # That's the total number of GPUs, (for example: if you have 2 machines with 4 GPU, put 8) rdzv_backend: static same_network: true tpu_env: [] tpu_use_cluster: false tpu_use_sudo: false use_cpu: false ``` Configure your model to use FSDP in the Axolotl yaml. For example: ```yaml fsdp_version: 2 fsdp_config: offload_params: true state_dict_type: FULL_STATE_DICT auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: LlamaDecoderLayer reshard_after_forward: true ``` All you have to do now is launch using accelerate as you would usually do on each machine and voila, the processes will start once you have launched accelerate on every machine. ## Raytrain Please see ray train doc [here](ray-integration.qmd). ## Torchrun If you are using Infiniband, we recommend torchrun to utilize the full bandwidth.
Set the following env (change buffersize/socketname depending on your system): ```bash export NCCL_IB_DISABLE=0 export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond" export NCCL_BUFFSIZE=2097152 ``` Run the following on each node: ### Option 1: New Axolotl CLI with launcher args (Recommended) ```bash axolotl train config.yaml --launcher torchrun -- --nnodes $num_nodes --nproc_per_node $gpu_per_node --rdzv_id $rdzv_id --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:$head_node_port" ``` ### Option 2: Direct torchrun (Legacy) ```bash torchrun --nnodes $num_nodes --nproc_per_node $gpu_per_node --rdzv_id $rdzv_id --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:$head_node_port" -m axolotl.cli.train config.yaml ``` Please make sure to substitute the placeholder variables: - `num_nodes`: Number of nodes (containing GPUs) - `gpu_per_node`: Number of gpus per node - `head_node_ip`: IP of the head node (make sure other machines can connect to this) - `head_node_port`: Port of the head node (make sure other machines can connect to this. Default 29400) - `rdzv_id`: A unique job ID that is used by the job across nodes. The new CLI approach (Option 1) is recommended as it provides consistent argument handling and works seamlessly with other Axolotl CLI features. 
More info on the available configs can be found in the PyTorch docs [here](https://pytorch.org/docs/stable/elastic/run.html). ================================================ FILE: docs/multimodal.qmd ================================================ --- title: MultiModal / Vision Language Models (BETA) format: html: toc: true toc-depth: 3 --- ## Supported Models - [Mllama](#sec-mllama) - [Llama4](#sec-llama4) - [Pixtral](#sec-pixtral) - [Llava-1.5](#sec-llava-15) - [Mistral-Small-3.1](#sec-mistral-small-31) - [Mistral-Small-4](#sec-mistral-small-4) - [Magistral-Small-2509](#sec-magistral-small-2509) - [Voxtral](#sec-voxtral) - [Gemma-3](#sec-gemma-3) - [Gemma-3n](#sec-gemma-3n) - [Qwen2-VL](#sec-qwen2-vl) - [Qwen2.5-VL](#sec-qwen25-vl) - [Qwen3-VL](#sec-qwen3-vl) - [Qwen3.5](#sec-qwen3-5) - [GLM-4.6V](#sec-glm-4-6v) - [SmolVLM2](#sec-smolvlm2) - [LFM2-VL](#sec-lfm2-vl) - [Intern-VL](#sec-intern-vl) ## Usage Multimodal support is limited and doesn't have full feature parity. Here are the hyperparams you'll need to use to finetune a multimodal model. ```yaml processor_type: AutoProcessor skip_prepare_dataset: true remove_unused_columns: false # leave columns in place as they are needed to handle image embeddings during training sample_packing: false # not yet supported with multimodal chat_template: # see in next section if specified # example dataset datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] # (optional) if doing lora, only finetune the Language model, # leave the vision model and vision tower frozen # load_in_8bit: true adapter: lora lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' # (optional) if you want to resize images to a set size image_size: 512 image_resize_algorithm: bilinear ``` Please see the [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples) folder for full configs.
::: {.callout-tip} Some of our chat_templates have been extended to support broader dataset types. This should not break any existing configs. ::: ::: {.callout-note} As of now, we do not truncate nor drop samples based on `sequence_len` as each arch has different ways to process non-text tokens. We are looking for help on this. ::: ### Mllama {#sec-mllama} ```yaml base_model: meta-llama/Llama-3.2-11B-Vision-Instruct chat_template: llama3_2_vision ``` ### Llama4 {#sec-llama4} ```yaml base_model: meta-llama/Llama-4-Scout-17B-16E-Instruct chat_template: llama4 ``` ### Pixtral {#sec-pixtral} ```yaml base_model: mistralai/Pixtral-12B-2409 chat_template: pixtral ``` ### Llava-1.5 {#sec-llava-15} ```yaml base_model: llava-hf/llava-1.5-7b-hf chat_template: llava ``` ### Mistral-Small-3.1 {#sec-mistral-small-31} ::: {.callout-tip} Please make sure to install vision lib via `pip install 'mistral-common[opencv]==1.8.5'` ::: ```yaml base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503 ``` ### Mistral-Small-4 {#sec-mistral-small-4} ```yaml base_model: mistralai/Mistral-Small-4-119B-2603 ``` ### Magistral-Small-2509 {#sec-magistral-small-2509} ::: {.callout-tip} Please make sure to install vision lib via `pip install 'mistral-common[opencv]==1.8.5'` ::: ```yaml base_model: mistralai/Magistral-Small-2509 ``` ### Voxtral {#sec-voxtral} ::: {.callout-tip} Please make sure to install audio lib via `pip3 install librosa==0.11.0 'mistral_common[audio]==1.8.3'` ::: ```yaml base_model: mistralai/Voxtral-Mini-3B-2507 processor_type: VoxtralProcessor ``` ### Gemma-3 {#sec-gemma-3} ::: {.callout-tip} The Gemma3-1B model is a text-only model, so please train as regular text model. ::: For multi-modal 4B/12B/27B models, use the following config: ```yaml base_model: google/gemma-3-4b-it chat_template: gemma3 ``` ### Gemma-3n {#sec-gemma-3n} ::: {.callout-warning} The model's initial loss and grad norm will be very high. We suspect this to be due to the Conv in the vision layers. 
::: ::: {.callout-tip} Please make sure to install `timm` via `pip3 install timm==1.0.17` ::: ```yaml base_model: google/gemma-3n-E2B-it chat_template: gemma3n ``` ### Qwen2-VL {#sec-qwen2-vl} ```yaml base_model: Qwen/Qwen2-VL-7B-Instruct chat_template: qwen2_vl ``` ### Qwen2.5-VL {#sec-qwen25-vl} ```yaml base_model: Qwen/Qwen2.5-VL-7B-Instruct chat_template: qwen2_vl # same as qwen2-vl ``` ### Qwen3-VL {#sec-qwen3-vl} ```yaml base_model: Qwen/Qwen3-VL-4B-Instruct chat_template: qwen2_vl # same as qwen2-vl ``` ### Qwen3.5 {#sec-qwen3-5} ```yaml base_model: Qwen/Qwen3.5-9B chat_template: qwen3_5 ``` ### GLM-4.6V {#sec-glm-4-6v} Both GLM-4.6V (106B MoE) and GLM-4.6V-Flash (9B) are supported. ```yaml # GLM-4.6V (106B MoE version) base_model: zai-org/GLM-4.6V # OR GLM-4.6V-Flash (9B version) base_model: zai-org/GLM-4.6V-Flash ``` ### SmolVLM2 {#sec-smolvlm2} ::: {.callout-tip} Please make sure to install `num2words` via `pip3 install num2words==0.5.14` ::: ```yaml base_model: HuggingFaceTB/SmolVLM2-500M-Video-Instruct ``` ### LFM2-VL {#sec-lfm2-vl} ::: {.callout-warning} Please uninstall `causal-conv1d` via `pip3 uninstall -y causal-conv1d` ::: ```yaml base_model: LiquidAI/LFM2-VL-450M ``` ### Intern-VL {#sec-intern-vl} ::: {.callout-tip} Please make sure to install `timm` via `pip3 install timm==1.0.19` ::: ```yaml base_model: OpenGVLab/InternVL3_5-8B ``` ## Dataset Format For multi-modal datasets, we adopt an extended `chat_template` format similar to OpenAI's Message format. - A message is a list of `role` and `content`. - `role` can be `system`, `user`, `assistant`, etc. - `content` is a list of `type` and (`text`, `image`, `path`, `url`, `base64`, or `audio`). ### Image ::: {.callout-note} For backwards compatibility: - If the dataset has a `images` or `image` column of `list[Image]`, it will be appended to the first `content` list as `{"type": "image", "image": ...}`. 
However, if the content already has a `{"type": "image"}` entry but no `image` key, the image will be assigned to that entry's `image` key instead. - If `content` is a string, it will be converted to a list with `type` as `text`. ::: For image loading, you can use the following keys within `content` alongside `"type": "image"`: - `"path": "/path/to/image.jpg"` - `"url": "https://example.com/image.jpg"` - `"base64": "..."` - `"image": PIL.Image` ### Audio For audio loading, you can use the following keys within `content` alongside `"type": "audio"`: - `"path": "/path/to/audio.mp3"` - `"url": "https://example.com/audio.mp3"` - `"audio": np.ndarray` ::: {.callout-tip} You may need to install `librosa` via `pip3 install librosa==0.11.0`. ::: ### Video ::: {.callout-warning} This is not well tested at the moment. We welcome contributors! ::: For video loading, you can use the following keys within `content` alongside `"type": "video"`: - `"path": "/path/to/video.mp4"` - `"url": "https://example.com/video.mp4"` - `"video": np.ndarray | list[PIL.Image.Image] | torch.Tensor` (or list of the aforementioned) ### Example Here is an example of a multi-modal dataset: ```json [ { "messages": [ { "role": "system", "content": [ {"type": "text", "text": "You are a helpful assistant."} ] }, { "role": "user", "content": [ {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"}, {"type": "text", "text": "Describe this image in detail."} ] }, { "role": "assistant", "content": [ {"type": "text", "text": "The image is a bee."} ] } ] } ] ``` ## FAQ 1. `PIL.UnidentifiedImageError: cannot identify image file ...` `PIL` could not retrieve the file at `url` using `requests`. Please check the URL for typos. Another possible reason is that the request is blocked by the server.
================================================ FILE: docs/multipack.qmd ================================================ --- title: Multipack (Sample Packing) description: Multipack is a technique to pack multiple sequences into a single batch to increase training throughput. --- ## Visualization of Multipack with Flash Attention Because Flash Attention simply drops the attention mask, we do not need to construct a 4d attention mask. We only need to concatenate the sequences into a single batch and let flash attention know where each new sequence begins. 4k context, bsz =4, each character represents 256 tokens; X represents a padding token ``` 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 [[ A A A A A A A A A A A ] B B B B B B ] C C C C C C C ] D D D D ]] [[ E E E E E E E E ] [ F F F F ] [ G G G ] [ H H H H ]] [[ I I I ] [ J J J ] [ K K K K K] [ L L L ]] ``` after padding to longest input in each step ``` 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 [[ A A A A A A A A A A A ] B B B B B B X X X X X X ] C C C C C C C X X X X ] D D D D X X X X X X X ]] [[ E E E E E E E E ] [ F F F F X X X X ] [ G G G X X X X X ] [ H H H H X X X X ]] [[ I I I X X ] [ J J J X X ] [ K K K K K ] [ L L L X X ]] ``` w packing ( note it's the same effective number of tokens per step, but a true bsz of 1) ``` 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 [[ A A A A A A A A A A A B B B B B B C C C C C C C D D D D E E E E E E E E F F F F F G G G H H H H I I I J J J J K K K K K L L L X ]] ``` cu_seqlens: [[ 0, 11, 17, 24, 28, 36, 41, 44, 48, 51, 55, 60, 64]] ## Multipack without Flash Attention Multipack can still be achieved without Flash attention, but with lower packing efficiency as we are not able to join multiple batches into a single batch due to context length limits without flash attention.
We can use either Pytorch's Scaled Dot Product Attention implementation or native Pytorch attention implementation along with [4d attention masks](https://github.com/huggingface/transformers/pull/27539) to pack sequences together and avoid cross attention. axolotl ================================================ FILE: docs/nccl.qmd ================================================ --- title: NCCL description: Troubleshooting NCCL issues --- NVIDIA NCCL is a library to facilitate and optimize multi-GPU communication operations, such as broadcast, all-gather, reduce, all-reduce, etc. Broadly, NCCL configuration is highly environment-specific and is configured via several [environment variables](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html). A common NCCL-related problem occurs when a long-running operation times out causing the training process to abort: ```text Watchdog caught collective operation timeout: WorkNCCL(SeqNum=42, OpType=ALLGATHER, Timeout(ms)=1800000) ran for 1806948 milliseconds before timing out. ``` Often, this timeout will happen after 30 minutes (the default setting) and is accompanied by below-average power consumption with near 100% GPU utilization before the error is raised. Nvidia recommends [disabling PCI access control services (ACS)](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html#pci-access-control-services-acs) as a possible solution if this is available to you. Forcing cross-GPU communication via [NVLink](https://en.wikipedia.org/wiki/NVLink) may help without increasing timeouts. 
To verify that your configuration is leveraging NVLink run the following command: ```bash nvidia-smi nvlink --status ``` To force NCCL to use NVLink, simply set this in the environment: ```bash export NCCL_P2P_LEVEL=NVL ``` If NVLink is not available in your environment there are other options for ``NCCL_P2P_LEVEL`` in the table below: | NCCL_P2P_LEVEL | Description | | -------------- | ----------- | | PIX | P2P data transfers through no more than a single PCIe bridge. Faster data transfer rates vs. paths involving multiple bridges, but slower compared to direct GPU-to-GPU communication. | | PXB | P2P data transfers through multiple PCIe bridges but not going through the PCIe Host Bridge; this path involves a complex routing process, potentially incurring a moderate level of latency. | | PHB | P2P data transfers occur over the PCIe and through a PCIe Host Bridge, typically involving the CPU, which can facilitate direct memory access but might introduce additional latency compared to more direct paths (e.g. PIX, NVL). | To validate that acceptable data transfer speeds exist for your training job, running [NCCL Tests](https://github.com/NVIDIA/nccl-tests/blob/master/README.md) can help pinpoint bottlenecks, for example: ```bash ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 3 ``` It can be useful when debugging NCCL communication timeouts to activate additional logging in both PyTorch and NCCL: ```bash export NCCL_DEBUG=INFO export NCCL_DEBUG_SUBSYS=ALL export TORCH_DISTRIBUTED_DEBUG=INFO export TORCHELASTIC_ERROR_FILE=/PATH/TO/torcherror.log ``` Finally, if you believe your training job needs more time you can increase the timeout past 30 minutes by setting the ``ddp_timeout`` value in the Axolotl configuration. See [PyTorch init_process_group](https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) for documentation on this value.
================================================ FILE: docs/nd_parallelism.qmd ================================================ --- title: "N-D Parallelism (Beta)" --- Axolotl enables training models at scale by composing different parallelism techniques. This is essential when: - A model's weights are too large to fit on a single GPU's memory. - A model's activations, especially with very long contexts, are too large for a single GPU. - You want to accelerate training by using multiple GPUs or nodes. or combinations of the above! ## Core Concepts Parallelism strategies can be combined. The key is understanding how each one divides the workload. PyTorch's `DeviceMesh` is the modern way to manage these combinations, creating a logical grid of your GPUs and assigning different parallel strategies to different dimensions of the grid. ### Data Parallelism {#sec-dp} Data Parallelism focuses on splitting the global data batch across GPUs. - Distributed Data Parallel (DDP): The classic approach. The full model is replicated on every GPU. Each GPU processes a different slice of the data batch. Gradients are then averaged across all GPUs after the backward pass to keep the models synchronized. This can substantially improve data throughput compared to single-device training, but requires that each GPU is able to hold the entire model, its gradients, and optimizer states. - [Fully Sharded Data Parallel (FSDP)](multi-gpu.qmd#sec-fsdp): A highly memory-efficient form of data parallelism (inspired by DeepSpeed's ZeRO). Instead of replicating the model, FSDP shards the model's *parameters, gradients, and optimizer states* across the GPUs in the data-parallel group. During computation, each GPU receives the specific parameters it needs via an `all_gather` operation just before they are used, and they can be discarded immediately after (`reshard_after_forward`).
- FSDP maps to ZeRO stages: - ZeRO-2 (`reshard_after_forward=False`): Shards gradients and optimizer states. Model weights are replicated on each GPU. - ZeRO-3 (`reshard_after_forward=True`): Shards gradients, optimizer states, AND model parameters. This provides the most memory savings at the cost of more communication (re-gathering parameters for both forward and backward passes). ### [Experimental] Tensor Parallelism (TP) {#sec-tp} Also known as "horizontal model parallelism," as described in the [Megatron-LM paper](https://arxiv.org/pdf/1909.08053.pdf). Instead of splitting the batch, TP splits the model's layers themselves across GPUs. - How it works: For a linear layer `Y = XA`, the weight matrix `A` is split column-wise (`A = [A_1, A_2]`). The computation becomes `Y_1 = XA_1` and `Y_2 = XA_2`, which can happen in parallel on different GPUs. The final output `Y` is simply the concatenation of `Y_1` and `Y_2`. Check [this comment](https://github.com/huggingface/transformers/issues/10321#issuecomment-783543530) for more detailed info. - Requirement: TP involves frequent, small communications within a forward/backward pass. It requires a very fast interconnect between GPUs (e.g., NVLink) and is typically not recommended across different nodes. ### Context Parallelism (CP) {#sec-cp} Context Parallelism, also called [Sequence Parallelism](sequence_parallelism.qmd), addresses the memory bottleneck from long sequences. The input sequence itself is split along the sequence length dimension and distributed across GPUs. - How it works: If you have a sequence of 8192 tokens and a `context_parallel_size` of 4, each GPU will only handle a chunk of 2048 tokens. - The Challenge: Attention is not local; every token needs to "attend to" every other token. Splitting the sequence breaks this. - The Solution (`ring-flash-attention`): An efficient communication protocol is used. 
To compute attention for its local sequence chunk, each GPU passes its Key-Value (KV) cache to its neighbor in a "ring." After `N-1` steps, every GPU has seen the KV-cache from all other GPUs, allowing it to compute the correct attention values for its chunk. This is implemented using the highly optimized `flash-attention` kernel at each step. ### Hybrid Sharding Data Parallel (HSDP) {#sec-hsdp} HSDP is a 2D strategy that intelligently combines FSDP and DDP, typically for multi-node training. - Intra-Node (within a machine): Use FSDP. This is efficient because GPUs on the same node have fast interconnects (NVLink), making the `all_gather` operations for sharded parameters fast. - Inter-Node (across machines): Use DDP. The gradient synchronization between nodes is less frequent than FSDP's parameter gathering, making it a better fit for the slower node-to-node network (e.g., Ethernet/Infiniband). - Example: With 2 nodes of 8 GPUs each (16 total), you could have `dp_shard_size=8` (FSDP within each node) and `dp_replicate_size=2` (DDP across the two nodes). ## Usage ```yaml # FSDP config. See https://docs.axolotl.ai/docs/multi-gpu.html#sec-fsdp fsdp_version: 2 fsdp_config: # ... # The number of GPUs to shard the model parameters across (FSDP dimension). dp_shard_size: 4 # The number of times to replicate the sharded model (DDP dimension). dp_replicate_size: 2 # Number of GPUs for Tensor Parallelism. tensor_parallel_size: 1 # (default is 1, no TP) # Number of GPUs for Context/Sequence Parallelism. context_parallel_size: 1 # (default is 1, no CP) ``` Note: We recommend FSDP. DeepSpeed is only compatible with `tensor_parallel_size`. ## Examples ::: {.callout-tip} See our example configs [here](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/distributed-parallel). ::: 1. HSDP on 2 nodes with 4 GPUs each (8 GPUs total): - You want FSDP within each node and DDP across nodes. - Set `dp_shard_size: 4` and `dp_replicate_size: 2`. 2. 
FSDP + TP on a single 8-GPU node: - You want to split the model across 4 GPUs using FSDP, and further split each layer across 2 GPUs with TP. - Set `dp_shard_size: 4` and `tensor_parallel_size: 2`. 3. FSDP + CP on a single 8-GPU node for long context: - You want to shard the model across all 8 GPUs and also split the sequence length across all 8 GPUs. - Set `dp_shard_size: 8` and `context_parallel_size: 8`. Note: this means the data parallel group and context parallel group are the same. A more common setup might be to shard across a smaller group. ## Support Matrix This matrix describes how different parallelism methods can be combined in Axolotl. | Combination | `dp_replicate_size` | `dp_shard_size` | `tp_size` | `cp_size` | Status & Notes | | --- | :---: | :---: |:---:|:---:|---| | **FSDP** (ZeRO-3) | 1 | >1 | 1 | 1 | ✅ Fully supported. Shards model across all GPUs. | | **HSDP** | >1 | >1 | 1 | 1 | ✅ Fully supported. FSDP intra-node, DDP inter-node. | | **FSDP + TP** | 1 | >1 | >1 | 1 | ✅ **2D Parallelism**. Shards the model across a `dp_shard` group, and TP-splits layers within the `tp` group. | | **HSDP + TP** | >1 | >1 | >1 | 1 | ✅ **3D Parallelism**. A powerful but complex combination. | | **FSDP + CP** | 1 | >1 | 1 | >1 | ✅ **2D Parallelism**. Combines FSDP with context parallelism. | | **FSDP + TP + CP**| 1 | >1 | >1| >1| ✅ **3D Parallelism**. Another advanced combination. | | DDP + TP/CP | >1 | 1 | >1 | >1 | ❌ **Not Supported**. The `ParallelismConfig` explicitly prevents this, as composing pure DDP with TP or CP is currently not supported. You should use FSDP + TP/CP instead (`dp_shard_size > 1`). | | Just TP / CP | 1 | 1 | >1 | >1 | ✅ Supported. Useful for inference or when the model fits on one GPU but context is too long. 
| - `tp_size` refers to `tensor_parallel_size` - `cp_size` refers to `context_parallel_size` ================================================ FILE: docs/optimizations.qmd ================================================ --- title: Optimizations Guide description: A guide to the performance and memory optimizations available in Axolotl. --- Axolotl includes numerous optimizations to speed up training, reduce memory usage, and handle large models. This guide provides a high-level overview and directs you to the detailed documentation for each feature. ## Speed Optimizations These optimizations focus on increasing training throughput and reducing total training time. ### Sample Packing Improves GPU utilization by combining multiple short sequences into a single packed sequence for training. This requires enabling one of the [attention](#attention-implementations) implementations below. - **Config:** `sample_packing: true` - **Learn more:** [Sample Packing](multipack.qmd) ### Attention Implementations Using an optimized attention implementation is critical for training speed. - **[Flash Attention 2](https://github.com/Dao-AILab/flash-attention)**: `flash_attention: true`. **(Recommended)** The industry standard for fast attention on modern GPUs. Requires Ampere or higher. For AMD, check [AMD Support](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#amd-rocm-support). - **[Flex Attention](https://pytorch.org/blog/flexattention/)**: `flex_attention: true`. - **[SDP Attention](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)**: `sdp_attention: true`. PyTorch's native implementation. - **[Xformers](https://github.com/facebookresearch/xformers)**: `xformers_attention: true`. Works with FP16. *Note: You should only enable one attention backend.* ### LoRA Optimizations Leverages optimized kernels to accelerate LoRA training and reduce memory usage. 
- **Learn more:** [LoRA Optimizations Documentation](lora_optims.qmd) ## Memory Optimizations These techniques help you fit larger models or use bigger batch sizes on your existing hardware. ### Parameter Efficient Finetuning (LoRA & QLoRA) Drastically reduces memory by training a small set of "adapter" parameters instead of the full model. This is the most common and effective memory-saving technique. - Examples: Find configs with `lora` or `qlora` in the [examples directory](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-3). - Config Reference: See `adapter`, `load_in_4bit`, and `load_in_8bit` in the [Configuration Reference](config-reference.qmd). ### Gradient Checkpointing & Activation Offloading These techniques save VRAM by changing how activations are handled. - Gradient Checkpointing: re-computes activations during the backward pass, trading compute time for VRAM. - Activation Offloading: moves activations to CPU RAM or disk, trading I/O overhead for VRAM. - Learn more: [Gradient Checkpointing and Offloading Docs](gradient_checkpointing.qmd) ### Cut Cross Entropy (CCE) Reduces VRAM usage by using an optimized cross-entropy loss calculation. - **Learn more:** [Custom Integrations - CCE](custom_integrations.qmd#cut-cross-entropy) ### Liger Kernels Provides efficient Triton kernels to improve training speed and reduce memory usage. - **Learn more:** [Custom Integrations - Liger Kernels](custom_integrations.qmd#liger-kernels) ### Expert Kernels Optimized kernel implementations for Mixture of Experts (MoE) model training. - **ScatterMoE**: Triton-based MoE kernels with fused LoRA support. - **SonicMoE**: CUTLASS-based MoE kernels for NVIDIA Hopper and Blackwell GPUs. - **Learn more:** [Custom Integrations - Kernels Integration](custom_integrations.qmd#kernels-integration) ## Long Context Models Techniques to train models on sequences longer than their original context window. 
### RoPE Scaling Extends a model's context window by interpolating its Rotary Position Embeddings. - **Config:** Pass the `rope_scaling` config under the `overrides_of_model_config: `. To learn how to set RoPE, check the respective model config. ### Sequence Parallelism Splits long sequences across multiple GPUs, enabling training with sequence lengths that would not fit on a single device. - **Learn more:** [Sequence Parallelism Documentation](sequence_parallelism.qmd) ### Arctic Long Sequence Training (ALST) ALST is a recipe that combines several techniques to train long-context models efficiently. It typically involves: - TiledMLP to reduce memory usage in MLP layers. - Tiled Loss functions (like [CCE](#cut-cross-entropy-cce) or [Liger](#liger-kernels)). - Activation Offloading to CPU. - Example: [ALST Example Configuration](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst) ## Large Models (Distributed Training) To train models that don't fit on a single GPU, you'll need to use a distributed training strategy like FSDP or DeepSpeed. These frameworks shard the model weights, gradients, and optimizer states across multiple GPUs and nodes. - **Learn more:** [Multi-GPU Guide](multi-gpu.qmd) - **Learn more:** [Multi-Node Guide](multi-node.qmd) ### N-D Parallelism (Beta) For advanced scaling, Axolotl allows you to compose different parallelism techniques (e.g., Data, Tensor, Sequence Parallelism). This is a powerful approach to train an extremely large model by overcoming multiple bottlenecks at once. - **Learn more:** [N-D Parallelism Guide](nd_parallelism.qmd) ## Quantization Techniques to reduce the precision of model weights for memory savings. ### 4-bit Training (QLoRA) The recommended approach for quantization-based training. It loads the base model in 4-bit using `bitsandbytes` and then trains QLoRA adapters. See [Parameter Efficient Finetuning](#parameter-efficient-finetuning-lora-qlora) for details.
### FP8 Training Enables training with 8-bit floating point precision on supported hardware (e.g., NVIDIA Hopper series GPUs) for significant speed and memory gains. - **Example:** [Llama 3 FP8 FSDP Example](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/llama-3/3b-fp8-fsdp2.yaml) ### Quantization Aware Training (QAT) Simulates quantization effects during training, helping the model adapt and potentially improving the final accuracy of the quantized model. - **Learn more:** [QAT Documentation](qat.qmd) ### GPTQ Allows you to finetune LoRA adapters on top of a model that has already been quantized using the GPTQ method. - **Example:** [GPTQ LoRA Example](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/llama-2/gptq-lora.yml) ### MoE Expert Quantization Quantizes MoE expert weights on load to reduce VRAM when training MoE models with adapters. Required for Transformers v5+ MoE models where experts use fused `nn.Parameter` tensors. - **Config:** `quantize_moe_experts: true` - **Learn more:** [MoE Expert Quantization](expert_quantization.qmd) ================================================ FILE: docs/optimizers.qmd ================================================ --- title: Optimizers description: Configuring optimizers --- ## Overview Axolotl supports all optimizers supported by [transformers OptimizerNames](https://github.com/huggingface/transformers/blob/51f94ea06d19a6308c61bbb4dc97c40aabd12bad/src/transformers/training_args.py#L142-L187) Here is a list of optimizers supported by transformers as of `v4.54.0`: - `adamw_torch` - `adamw_torch_fused` - `adamw_torch_xla` - `adamw_torch_npu_fused` - `adamw_apex_fused` - `adafactor` - `adamw_anyprecision` - `adamw_torch_4bit` - `adamw_torch_8bit` - `ademamix` - `sgd` - `adagrad` - `adamw_bnb_8bit` - `adamw_8bit` # alias for adamw_bnb_8bit - `ademamix_8bit` - `lion_8bit` - `lion_32bit` - `paged_adamw_32bit` - `paged_adamw_8bit` - `paged_ademamix_32bit` - `paged_ademamix_8bit` - 
`paged_lion_32bit` - `paged_lion_8bit` - `rmsprop` - `rmsprop_bnb` - `rmsprop_bnb_8bit` - `rmsprop_bnb_32bit` - `galore_adamw` - `galore_adamw_8bit` - `galore_adafactor` - `galore_adamw_layerwise` - `galore_adamw_8bit_layerwise` - `galore_adafactor_layerwise` - `lomo` - `adalomo` - `grokadamw` - `schedule_free_radam` - `schedule_free_adamw` - `schedule_free_sgd` - `apollo_adamw` - `apollo_adamw_layerwise` - `stable_adamw` ## Custom Optimizers Enable custom optimizers by passing a string to the `optimizer` argument. Each optimizer will receive beta and epsilon args, however, some may accept additional args which are detailed below. ### optimi_adamw ```yaml optimizer: optimi_adamw ``` ### ao_adamw_4bit Deprecated: Please use `adamw_torch_4bit`. ### ao_adamw_8bit Deprecated: Please use `adamw_torch_8bit`. ### ao_adamw_fp8 ```yaml optimizer: ao_adamw_fp8 ``` ### adopt_adamw GitHub: [https://github.com/iShohei220/adopt](https://github.com/iShohei220/adopt) Paper: [https://arxiv.org/abs/2411.02853](https://arxiv.org/abs/2411.02853) ```yaml optimizer: adopt_adamw ``` ### came_pytorch GitHub: [https://github.com/yangluo7/CAME/tree/master](https://github.com/yangluo7/CAME/tree/master) Paper: [https://arxiv.org/abs/2307.02047](https://arxiv.org/abs/2307.02047) ```yaml optimizer: came_pytorch # optional args (defaults below) adam_beta1: 0.9 adam_beta2: 0.999 adam_beta3: 0.9999 adam_epsilon: 1e-30 adam_epsilon2: 1e-16 ``` ### muon Blog: [https://kellerjordan.github.io/posts/muon/](https://kellerjordan.github.io/posts/muon/) Paper: [https://arxiv.org/abs/2502.16982v1](https://arxiv.org/abs/2502.16982v1) ```yaml optimizer: muon ``` ### dion Microsoft's Dion (DIstributed OrthoNormalization) optimizer is a scalable and communication-efficient orthonormalizing optimizer that uses low-rank approximations to reduce gradient communication. 
GitHub: [https://github.com/microsoft/dion](https://github.com/microsoft/dion) Paper: [https://arxiv.org/pdf/2504.05295](https://arxiv.org/pdf/2504.05295) Note: Implementation written for PyTorch 2.7+ for DTensor ```yaml optimizer: dion dion_lr: 0.01 dion_momentum: 0.95 lr: 0.00001 # learning rate for embeddings and parameters that fallback to AdamW ``` ================================================ FILE: docs/qat.qmd ================================================ --- title: "Quantization Aware Training (QAT)" back-to-top-navigation: true toc: true toc-expand: 2 toc-depth: 4 --- ## Overview [Quantization Aware Training](https://pytorch.org/blog/introduction-to-quantization-on-pytorch/#quantization-aware-training) (QAT) is a technique for improving the accuracy of models which are quantized by applying "fake" quantizations to the model's weights (and optionally, activations) during training. This fake quantization allows for the model to adjust for noise introduced by the quantization, so when the model is eventually quantized, the accuracy loss is minimized. We use the quantization techniques implemented in [torchao](https://github.com/pytorch/ao) to provide support for QAT and post-training quantization (PTQ) in axolotl. We recommend reviewing the excellent QAT tutorial in the [torchtune library](https://pytorch.org/torchtune/main/tutorials/qat_finetune.html#quantizing-the-qat-model), and the QAT documentation in the [torchao library](https://github.com/pytorch/ao/tree/main/torchao/quantization/qat), for more details. ## Configuring QAT in Axolotl To enable QAT in axolotl, add the following to your configuration file: ```yaml qat: activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4", "int8", "float8" weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4", "fp8", and "nvfp4". group_size: # Optional[int] = 32. 
The number of elements in each group for per-group fake quantization fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after ``` We support the following quantization schemas: - `Int4WeightOnly` (requires the `fbgemm-gpu` extra when installing Axolotl) - `Int8DynamicActivationInt4Weight` - `Float8DynamicActivationFloat8Weight` - `Float8DynamicActivationInt4Weight` - `NVFP4` Once you have finished training, you must quantize your model using the same quantization configuration that you used to train it. You can use the [`quantize`](./quantize.qmd) command to do this. ================================================ FILE: docs/quantize.qmd ================================================ --- title: "Quantization with torchao" back-to-top-navigation: true toc: true toc-expand: 2 toc-depth: 4 --- Quantization is a technique to lower the memory footprint of your model, potentially at the cost of accuracy or model performance. We support quantizing your model using the [torchao](https://github.com/pytorch/ao) library. Quantization is supported for both post-training quantization (PTQ) and quantization-aware training (QAT). ::: {.callout-note} We do not currently support quantization techniques such as GGUF, GPTQ, or EXL2. ::: ## Configuring Quantization in Axolotl Quantization is configured using the `quantization` key in your configuration file. ```yaml base_model: # The path to the model to quantize. quantization: activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4", "int8", "float8" weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4", "fp8", and "nvfp4". group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization quantize_embedding: # Optional[bool] = False. Whether to quantize the embedding layer.
output_dir: # The path to the output directory. ``` Once quantization is complete, your quantized model will be saved in the `{output_dir}/quantized` directory. You may also use the `quantize` command to quantize a model which has been trained with [QAT](./qat.qmd) - you can do this by using the existing QAT configuration file which you used to train the model: ```yaml # qat.yml qat: activation_dtype: int8 weight_dtype: int4 group_size: 256 output_dir: # The path to the output directory used during training where the final checkpoint has been saved. ``` ```bash axolotl quantize qat.yml ``` This ensures that an identical quantization configuration is used to quantize the model as was used to train it. ::: {.callout-note} If you have configured pushing to hub with `hub_model_id`, your model hub name will have the quantization schema appended to it, e.g. `axolotl-ai-cloud/qat-nvfp4-llama3B` will become `axolotl-ai-cloud/qat-nvfp4-llama3B-nvfp4w` ::: ================================================ FILE: docs/ray-integration.qmd ================================================ --- title: Ray Train description: How to use Axolotl with Ray Train --- Axolotl supports using Ray as an alternative to `accelerate` for orchestrating training. This is especially useful for multi-node training since you only have to set up code and dependencies on a single node and launch training as if you were using a single node. With the `--use-ray` CLI flag, Axolotl will use Ray Train's [`TorchTrainer`](https://docs.ray.io/en/latest/train/api/doc/ray.train.torch.TorchTrainer.html#ray.train.torch.TorchTrainer) to run training. ## Ray cluster setup A prerequisite for using the Ray Train integration is to set up a Ray cluster on your desired node(s). For a detailed guide on how you can get started with Ray clusters, check the official Ray docs [here](https://docs.ray.io/en/latest/cluster/getting-started.html). Every Ray cluster has one _head_ node and a set of worker nodes.
The head node is just like any other worker node, but it also runs certain special processes related to scheduling and orchestration. Ray-enabled scripts are run on the head node and, depending on the resources (number of CPUs, GPUs, etc.) they request, are scheduled to run certain tasks on the worker nodes. For more on key concepts behind a Ray cluster, you can refer to this [doc](https://docs.ray.io/en/latest/cluster/key-concepts.html#cluster-key-concepts). ## Sanity check To run a sanity check on whether your Ray cluster is set up properly, execute the following on the head node: ```bash ray status ``` The output should have a summary of your Ray cluster - list of all the nodes in your cluster, the number of CPUs and GPUs in your cluster, etc. For example, if you have a cluster with 1 CPU-only head node and 2 4xL40S worker nodes, the output can look like this: ``` Node status --------------------------------------------------------------- Active: 1 head Idle: 2 4xL40S:48CPU-384GB Pending: (no pending nodes) Recent failures: (no failures) Resources --------------------------------------------------------------- Usage: 0.0/96.0 CPU 0.0/8.0 GPU 0B/800.00GiB memory 0B/229.57GiB object_store_memory Demands: (no resource demands) ``` You should also be able to see the same on the [Ray dashboard](https://docs.ray.io/en/latest/ray-observability/getting-started.html). ## Configuring training with Ray Train You can find an example configuration at `examples/llama-3/lora-1b-ray.yml`. The key parameters to note here are: ```yaml use_ray: true ray_num_workers: 4 # optional resources_per_worker: GPU: 1 ``` - `use_ray`: This is the flag that enables the Ray Train integration. You can either use the corresponding `--use-ray` flag in the CLI or set `use_ray` in the config file. - `ray_num_workers`: This is the number of workers/GPUs to use for training.
- `resources_per_worker`: This is the Ray [resource request](https://docs.ray.io/en/latest/ray-core/scheduling/resources.html) for each worker. This can be used to request a specific GPU type or a custom resource for each worker. For example, if your Ray cluster has GPUs of different types, and you only want to use NVIDIA L40S GPUs, you can do: ```yaml resources_per_worker: accelerator_type:L40S: 0.001 ``` ## Launching training You can simply run the following command on the head node: ```bash axolotl train examples/llama-3/lora-1b-ray.yml --use-ray ``` This will launch training on the head node and workers will be scheduled automatically by Ray Train to run on the appropriate head or worker nodes. You can also monitor training progress on the Ray dashboard. Coming back to the example on a Ray cluster with 1 head node and 2 4xL40S worker nodes, let's say you want to make use of all 8 GPUs. You would be able to just set `ray_num_workers: 8` and run the previous command. The Cluster tab will show the following: ![Ray dashboard](./images/ray-cluster-dashboard.png) ================================================ FILE: docs/reward_modelling.qmd ================================================ --- title: "Reward Modelling" description: "Reward models are used to guide models towards behaviors that are preferred by humans, by training over large datasets annotated with human preferences." --- ### Overview Reward modelling is a technique used to train models to predict the reward or value of a given input. This is particularly useful in reinforcement learning scenarios where the model needs to evaluate the quality of its actions or predictions. We support the reward modelling techniques supported by `trl`. ### (Outcome) Reward Models Outcome reward models are trained using data which contains preference annotations for an entire interaction between the user and model (i.e., rather than per-turn or per-step).
For improved training stability, you can use the `center_rewards_coefficient` parameter to encourage mean-zero reward outputs ([see TRL docs](https://huggingface.co/docs/trl/v0.10.1/en/reward_trainer#centering-rewards)). ```yaml base_model: google/gemma-2-2b model_type: AutoModelForSequenceClassification num_labels: 1 tokenizer_type: AutoTokenizer reward_model: true chat_template: gemma datasets: - path: argilla/distilabel-intel-orca-dpo-pairs type: bradley_terry.chat_template val_set_size: 0.1 eval_steps: 100 ``` Bradley-Terry chat templates expect single-turn conversations in the following format: ```json { "system": "...", // optional "input": "...", "chosen": "...", "rejected": "..." } ``` ### Process Reward Models (PRM) ::: {.callout-tip} Check out our [PRM blog](https://axolotlai.substack.com/p/process-reward-models). ::: Process reward models are trained using data which contains preference annotations for each step in a series of interactions. Typically, PRMs are trained to provide reward signals over each step of a reasoning trace and are used for downstream reinforcement learning. ```yaml base_model: Qwen/Qwen2.5-3B model_type: AutoModelForTokenClassification num_labels: 2 process_reward_model: true datasets: - path: trl-lib/math_shepherd type: stepwise_supervised split: train val_set_size: 0.1 eval_steps: 100 ``` Please see [stepwise_supervised](dataset-formats/stepwise_supervised.qmd) for more details on the dataset format. ================================================ FILE: docs/rlhf.qmd ================================================ --- title: "RLHF (Beta)" description: "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human feedback." back-to-top-navigation: true toc: true toc-expand: 2 toc-depth: 4 --- ## Overview Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human feedback. 
Various methods include, but are not limited to: - [Direct Preference Optimization (DPO)](#dpo) - [Identity Preference Optimization (IPO)](#ipo) - [Kahneman-Tversky Optimization (KTO)](#kto) - [Odds Ratio Preference Optimization (ORPO)](#orpo) - [Group Relative Policy Optimization (GRPO)](#grpo) - [Group Reward-Decoupled Policy Optimization (GDPO)](#gdpo) ## RLHF using Axolotl ::: {.callout-important} This is a BETA feature and many features are not fully implemented. You are encouraged to open new PRs to improve the integration and functionality. ::: We rely on the [TRL](https://github.com/huggingface/trl) library for implementations of various RL training methods, which we wrap and expose in Axolotl. Each method has its own supported ways of loading datasets and prompt formats. ::: {.callout-tip} You can find what each method supports by going into `src/axolotl/prompt_strategies/{method}` where `{method}` is one of our supported methods. The `type: ` can be retrieved from `{method}.{function_name}`. ::: ### DPO Example config: ```yaml rl: dpo datasets: - path: Intel/orca_dpo_pairs split: train type: chatml.intel - path: argilla/ultrafeedback-binarized-preferences split: train type: chatml ``` DPO supports the following types with the following dataset format: #### chatml.argilla ```json { "system": "...", // optional "instruction": "...", "chosen_response": "...", "rejected_response": "..." } ``` #### chatml.argilla_chat ```json { "chosen": [ {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."} ], "rejected": [ {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."} ] } ``` #### chatml.icr ```json { "system": "...", // optional "input": "...", "chosen": "...", "rejected": "..." } ``` #### chatml.intel ```json { "system": "...", // optional "question": "...", "chosen": "...", "rejected": "..." } ``` #### chatml.prompt_pairs ```json { "system": "...", // optional "prompt": "...", "chosen": "...", "rejected": "..."
} ``` #### chatml.ultra ```json { "system": "...", // optional "prompt": "...", "chosen": [ {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."} ], "rejected": [ {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."} ] } ``` #### llama3.argilla ```json { "system": "...", // optional "instruction": "...", "chosen_response": "...", "rejected_response": "..." } ``` #### llama3.argilla_chat ```json { "chosen": [ {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."} ], "rejected": [ {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."} ] } ``` #### llama3.icr ```json { "system": "...", // optional "input": "...", "chosen": "...", "rejected": "..." } ``` #### llama3.intel ```json { "system": "...", // optional "question": "...", "chosen": "...", "rejected": "..." } ``` #### llama3.prompt_pairs ```json { "system": "...", // optional "prompt": "...", "chosen": "...", "rejected": "..." } ``` #### llama3.ultra ```json { "system": "...", // optional "prompt": "...", "chosen": [ {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."} ], "rejected": [ {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."} ] } ``` #### zephyr.nectar ```json { "prompt": "...", "answers": [ { "answer": "...", "rank": 1 }, { "answer": "...", "rank": 2 } // ... more answers with ranks ] } ``` #### chat_template.argilla_chat ```json { "chosen": [ {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."} ], "rejected": [ {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."} ] } ``` #### chat_template.default ```yaml rl: dpo datasets: - path: ... 
split: train type: chat_template.default field_messages: "messages" field_chosen: "chosen" field_rejected: "rejected" message_property_mappings: role: role content: content roles: user: ["user"] assistant: ["assistant"] system: ["system"] ``` Sample input format: ```json { "messages": [ { "role": "system", "content": "..." }, { "role": "user", "content": "..." }, // ... more messages ], "chosen": { "role": "assistant", "content": "..." }, "rejected": { "role": "assistant", "content": "..." } } ``` #### user_defined.default For custom behaviors, ```yaml rl: dpo datasets: - path: ... split: train type: field_prompt: "prompt" field_system: "system" field_chosen: "chosen" field_rejected: "rejected" prompt_format: "{prompt}" chosen_format: "{chosen}" rejected_format: "{rejected}" ``` The input format is a simple JSON input with customizable fields based on the above config. ```json { "system": "...", // optional "prompt": "...", "chosen": "...", "rejected": "..." } ``` ### IPO As IPO is just DPO with a different loss function, all supported dataset formats for [DPO](#dpo) are also supported for IPO. 
```yaml rl: ipo ``` ### ORPO Paper: https://arxiv.org/abs/2403.07691 ```yaml rl: orpo orpo_alpha: 0.1 remove_unused_columns: false chat_template: chatml datasets: - path: argilla/ultrafeedback-binarized-preferences-cleaned type: chat_template.argilla ``` ORPO supports the following types with the following dataset format: #### chat_template.argilla ```json { "system": "...", // optional "prompt": "...", // if available, will be taken as user message for single-turn instead of from list below // chosen/rejected should be same till last content and only even-number of alternating user/assistant turns "chosen": [ {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."} ], "rejected": [ {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."} ] } ``` ### KTO ```yaml rl: kto rl_beta: 0.1 # default kto_desirable_weight: 1.0 # default kto_undesirable_weight: 1.0 # default remove_unused_columns: false datasets: - path: argilla/ultrafeedback-binarized-preferences-cleaned-kto type: llama3.ultra split: train gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: true ``` KTO supports the following types with the following dataset format: #### chatml.argilla ```json { "system": "...", // optional "instruction": "...", "completion": "..." } ``` #### chatml.argilla_chat ```json { "chosen": [ {"role": "user", "content": "..."} ], "completion": [ {"role": "assistant", "content": "..."} ] } ``` #### chatml.intel ```json { "system": "...", // optional "question": "...", "completion": "..." } ``` #### chatml.prompt_pairs ```json { "system": "...", // optional "prompt": "...", "completion": "..." } ``` #### chatml.ultra ```json { "system": "...", // optional "prompt": "...", "completion": "..." } ``` #### llama3.argilla ```json { "system": "...", // optional "instruction": "...", "completion": "..." 
} ``` #### llama3.argilla_chat ```json { "completion": [ {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."} ] } ``` #### llama3.intel ```json { "system": "...", // optional "question": "...", "completion": "..." } ``` #### llama3.prompt_pairs ```json { "system": "...", // optional "prompt": "...", "completion": "..." } ``` #### llama3.ultra ```json { "system": "...", // optional "prompt": "...", "completion": "..." } ``` #### user_defined.default For custom behaviors, specify the field names and format strings under `type`: ```yaml rl: kto datasets: - path: ... split: train type: field_prompt: "prompt" field_system: "system" field_completion: "completion" field_label: "label" prompt_format: "{prompt}" completion_format: "{completion}" ``` The input format is a simple JSON input with customizable fields based on the above config. ```json { "system": "...", // optional "prompt": "...", "completion": "...", "label": "..." } ``` ### GRPO ::: {.callout-tip} Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/grpo_code). ::: In the latest GRPO implementation, `vLLM` is used to significantly speed up trajectory generation during training. In this example, we're using 4 GPUs - 2 for training, and 2 for vLLM: ::: {.callout-important} Make sure you've installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. `pip install axolotl[vllm]`. ::: ```yaml base_model: Qwen/Qwen2.5-1.5B-Instruct vllm: host: 0.0.0.0 port: 8000 tensor_parallel_size: 2 gpu_memory_utilization: 0.85 dtype: auto # max_model_len: # you may find it useful to set the vLLM model context length if you know this beforehand rl: grpo trl: use_vllm: true vllm_server_host: 0.0.0.0 vllm_server_port: 8000 vllm_server_timeout: 300 ``` ```bash CUDA_VISIBLE_DEVICES=2,3 axolotl vllm-serve grpo.yaml ``` Your `vLLM` instance will now attempt to spin up, and it's time to kick off training utilizing our remaining two GPUs. 
In another terminal, execute: ```bash CUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2 ``` ::: {.callout-note} Due to TRL's implementation with vLLM, the vLLM instance must use the last N GPUs instead of the first N GPUs. This is why in the example above, we use `CUDA_VISIBLE_DEVICES=2,3` for the vLLM instance. ::: #### Reward functions GRPO uses custom reward functions and transformations. Please have them ready locally. For example, to load OpenAI's GSM8K and use a random reward for completions: ```python # rewards.py import random def rand_reward_func(completions, **kwargs) -> list[float]: return [random.uniform(0, 1) for _ in completions] def oai_gsm8k_transform(cfg, *args, **kwargs): def transform_fn(example, tokenizer=None): label = example["answer"].split("####")[-1].strip().replace(",", "") return { "prompt": [{"role": "user", "content": example["question"]},], "answer": label, } return transform_fn, {"remove_columns": ["question"]} ``` ```yaml rl: grpo trl: beta: 0.001 max_completion_length: 256 use_vllm: True num_generations: 4 reward_funcs: ["rewards.rand_reward_func"] # format: '{file_name}.{fn_name}' reward_weights: [1.0] datasets: - path: openai/gsm8k name: main type: rewards.oai_gsm8k_transform # format: '{file_name}.{fn_name}' ``` To see other examples of custom reward functions, please see [TRL GRPO Docs](https://github.com/huggingface/trl/blob/main/docs/source/grpo_trainer.md#using-a-custom-reward-function). To see all configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/v0.9.2/src/axolotl/utils/schemas/trl.py). #### OpenEnv Rollout Functions GRPO supports custom rollout functions for OpenEnv-style environments, enabling interactive tasks like web browsing, code execution, or tool use. This allows you to implement custom generation logic that interacts with external environments. 
For example, to implement a simple math-solving environment with step-by-step verification: ```python # math_env.py import re def math_solver_rollout(model, processing_class, prompts, generation_config=None): """ Custom rollout function that generates step-by-step math solutions. Args: model: The language model processing_class: The tokenizer/processing_class prompts: List of prompt dicts (with 'messages' key for chat format) generation_config: Optional generation configuration Returns: List of completion strings """ completions = [] for prompt in prompts: # Apply chat template to prompt messages = prompt.get("messages", []) formatted_prompt = processing_class.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) # Generate step-by-step solution full_response = "" for step in range(5): # Max 5 reasoning steps current_input = formatted_prompt + full_response + "\nNext step:" inputs = processing_class(current_input, return_tensors="pt").to(model.device) outputs = model.generate( **inputs, max_new_tokens=100, generation_config=generation_config, ) step_text = processing_class.decode( outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True ) # Check if solution is complete if "FINAL ANSWER:" in step_text: full_response += step_text break full_response += step_text + "\n" completions.append(full_response) return completions def math_reward(prompts, completions, answers, **kwargs): """Reward function that checks mathematical correctness""" rewards = [] for completion, correct_answer in zip(completions, answers): # Extract predicted answer match = re.search(r"FINAL ANSWER:\s*(.+)", completion) predicted = match.group(1).strip() if match else "" # Compare with correct answer reward = 1.0 if predicted == str(correct_answer) else 0.0 rewards.append(reward) return rewards def math_transform(cfg, *args, **kwargs): """Transform dataset to GRPO format with answer field""" def transform_fn(example, processing_class=None): return { "prompt": 
[{"role": "user", "content": example["question"]}], "answer": str(example["answer"]), } return transform_fn, {"remove_columns": ["question"]} ``` ```yaml rl: grpo trl: beta: 0.001 max_completion_length: 512 num_generations: 4 rollout_func: "math_env.math_solver_rollout" # Custom rollout function reward_funcs: ["math_env.math_reward"] reward_weights: [1.0] datasets: - path: openai/gsm8k name: main type: math_env.math_transform ``` The `rollout_func` parameter accepts a fully qualified name (e.g., `module_name.function_name`) that points to a callable function in your local directory. The function receives: - `model`: The language model - `processing_class`: The tokenizer/processing class - `prompts`: List of prompt dictionaries - `generation_config` (optional): Generation configuration And should return a list of completion strings. For more OpenEnv examples, see [TRL OpenEnv Documentation](https://huggingface.co/docs/trl/main/en/openenv). #### GRPO with DAPO/Dr. GRPO loss The DAPO paper and subsequently Dr. GRPO paper proposed an alternative loss function for GRPO to remediate the penalty in longer responses. ```yaml trl: loss_type: dr_grpo # Normalizes loss based on max completion length (default: 256) max_completion_length: ``` For more information, see [GRPO docs](https://huggingface.co/docs/trl/v0.17.0/en/grpo_trainer#loss-types). #### Async GRPO Async GRPO overlaps vLLM generation with training by producing rollouts in a background thread. While the model trains on the current batch, the next batch is already being generated. This can significantly reduce wall-clock time per step. 
```yaml trl: use_data_producer: true # Enable data producer protocol use_vllm: true async_prefetch: true # Generate rollouts in background thread prefetch_depth: 1 # Number of rollouts to prefetch vllm_sync_interval: 2 # Sync weights to vLLM every N steps ``` ::: {.callout-note} Because the background thread generates completions with slightly stale model weights, async GRPO uses importance sampling correction to account for the distribution shift. This is controlled by `vllm_importance_sampling_correction: true` (default when async is enabled). ::: ##### vLLM LoRA Sync By default, weight sync to vLLM merges the LoRA adapter into the base model and broadcasts all parameters via NCCL. LoRA sync is a faster alternative that saves only the adapter weights to the filesystem and has vLLM load them natively using Punica kernels. ```yaml adapter: lora lora_r: 32 lora_alpha: 64 lora_target_linear: true trl: vllm_lora_sync: true # Enable native LoRA sync ``` When `vllm_lora_sync: true` is set, axolotl automatically selects the LoRA-aware vLLM serve module. Start vLLM as usual: ```bash CUDA_VISIBLE_DEVICES=0 axolotl vllm-serve config.yaml ``` Then start training on a separate GPU: ```bash CUDA_VISIBLE_DEVICES=1 axolotl train config.yaml ``` ::: {.callout-tip} LoRA sync is especially beneficial with multi-GPU training (FSDP/DeepSpeed), where NCCL merge-sync can cause GPU contention with vLLM generation. ::: ##### Streaming Partial Batch Instead of scoring the entire batch at once, streaming mode scores one prompt group at a time. This enables finer-grained zero-advantage skipping and reduces peak memory usage during scoring. ```yaml trl: streaming_partial_batch: true ``` ##### Importance Sampling Correction When using async prefetch, completions are generated from a slightly older version of the model. Importance sampling (IS) correction adjusts the policy gradient to account for this distribution shift. 
```yaml trl: vllm_importance_sampling_correction: true # Enable IS correction importance_sampling_level: token # 'token' or 'sequence' off_policy_mask_threshold: 0.5 # Mask sequences with IS ratio below this ``` - `importance_sampling_level: token` applies per-token IS ratios (recommended with Liger kernel) - `importance_sampling_level: sequence` applies per-sequence IS ratios - `off_policy_mask_threshold` masks out sequences where the IS ratio indicates they are too far off-policy ##### Replay Buffer The replay buffer caches rollout groups that had learning signal (non-zero reward variance) and uses them to replace zero-signal groups in later batches. ```yaml trl: replay_buffer_size: 100 # Max cached groups (0 = disabled) replay_recompute_logps: true # Recompute log-probs for replayed data (recommended) ``` ::: {.callout-note} When `replay_recompute_logps: true` (default), old log-probabilities are recomputed using the current model weights. This fixes the IS mismatch that would otherwise occur when replaying stale data. ::: ##### Deferred Re-rolling Failed prompts (where the model produces zero reward for all generations) are buffered and re-injected into later batches when the model may be better equipped to solve them. ```yaml trl: reroll_start_fraction: 0.5 # Start re-rolling after 50% of training reroll_max_groups: 1 # Max groups to replace per batch ``` ##### Zero-Advantage Batch Skipping When all advantages in a micro-batch are zero (no learning signal), the forward/backward pass is skipped entirely. This is enabled by default and logged as `skipped_zero_adv_batches=1`. ```yaml trl: skip_zero_advantage_batches: true # default ``` ##### Parallel Reward Workers Reward functions that use `signal.alarm()` (e.g., `math_verify`) must run in the main thread. Parallel reward workers use subprocesses to work around this limitation while enabling concurrent reward computation. 
```yaml trl: reward_num_workers: 4 # Number of subprocess workers (1 = no parallelism) ``` ##### Full Async GRPO Example ```yaml base_model: Qwen/Qwen2.5-1.5B-Instruct vllm: host: 0.0.0.0 port: 8000 gpu_memory_utilization: 0.35 dtype: auto adapter: lora lora_r: 32 lora_alpha: 64 lora_target_linear: true rl: grpo trl: use_data_producer: true use_vllm: true async_prefetch: true prefetch_depth: 1 vllm_sync_interval: 2 vllm_lora_sync: true streaming_partial_batch: true vllm_importance_sampling_correction: true off_policy_mask_threshold: 0.5 importance_sampling_level: token num_generations: 8 max_completion_length: 512 reward_funcs: - rewards.accuracy_reward reroll_start_fraction: 0.5 replay_buffer_size: 100 reward_num_workers: 4 skip_zero_advantage_batches: true datasets: - path: AI-MO/NuminaMath-TIR type: rewards.prompt_transform split: train gradient_accumulation_steps: 4 micro_batch_size: 2 max_steps: 500 learning_rate: 1e-5 bf16: true gradient_checkpointing: true ``` ```bash # Terminal 1: Start vLLM on GPU 0 CUDA_VISIBLE_DEVICES=0 axolotl vllm-serve config.yaml # Terminal 2: Train on GPU 1 CUDA_VISIBLE_DEVICES=1 axolotl train config.yaml ``` ##### Multi-GPU Async GRPO Async GRPO supports FSDP and DeepSpeed ZeRO-3 for multi-GPU training. vLLM runs on one GPU while training is distributed across the remaining GPUs. 
**FSDP:** ```yaml fsdp: - full_shard - auto_wrap fsdp_config: fsdp_transformer_layer_cls_to_wrap: Qwen2DecoderLayer gradient_checkpointing_kwargs: use_reentrant: false ``` **DeepSpeed ZeRO-3:** ```yaml deepspeed: deepspeed_configs/zero3_bf16.json gradient_checkpointing_kwargs: use_reentrant: true # Required for ZeRO-3 ``` ```bash # Terminal 1: Start vLLM on GPU 0 CUDA_VISIBLE_DEVICES=0 axolotl vllm-serve config.yaml # Terminal 2: Train on GPUs 0,1 CUDA_VISIBLE_DEVICES=0,1 accelerate launch --num_processes 2 -m axolotl.cli.train config.yaml ``` ::: {.callout-important} With multi-GPU async prefetch, only rank 0 generates completions in the background thread. Results are broadcast to all ranks on the main thread. This avoids FSDP/DeepSpeed collective deadlocks from unsynchronized background threads. ::: ### GDPO GDPO (Group Reward-Decoupled Policy Optimization) extends GRPO for multi-reward training. It addresses the **reward advantage collapse** problem by normalizing each reward function independently before combining them. ::: {.callout-tip} Use GDPO when training with multiple reward functions. For single reward, GRPO and GDPO produce equivalent results. ::: Paper: [https://arxiv.org/pdf/2501.05242](https://arxiv.org/pdf/2501.05242) GDPO uses TRL's native `multi_objective_aggregation` parameter under the hood. When you set `rl: gdpo`, axolotl automatically configures TRL to use `normalize_then_sum` aggregation. 
```yaml base_model: Qwen/Qwen2.5-1.5B-Instruct vllm: host: 0.0.0.0 port: 8000 tensor_parallel_size: 2 gpu_memory_utilization: 0.85 rl: gdpo trl: beta: 0.001 max_completion_length: 256 use_vllm: true num_generations: 4 reward_funcs: - rewards.format_reward - rewards.correctness_reward reward_weights: [1.0, 2.0] datasets: - path: openai/gsm8k name: main type: rewards.oai_gsm8k_transform ``` You can also use GRPO with explicit aggregation control: ```yaml rl: grpo trl: multi_objective_aggregation: normalize_then_sum # GDPO behavior # or: sum_then_normalize # Default GRPO behavior ``` #### GDPO vs GRPO | Aspect | GRPO | GDPO | |--------|------|------| | **Aggregation** | `sum_then_normalize` | `normalize_then_sum` | | **Multi-reward** | May collapse advantages | Preserves reward signals | | **Single reward** | Standard behavior | Equivalent to GRPO | #### Why GDPO? When using multiple rewards with GRPO, different reward combinations can produce identical advantages: ``` # Example: format + correctness rewards [format=0, correct=3] → sum=3 [format=1, correct=2] → sum=3 ← GRPO sees these as equal! [format=2, correct=1] → sum=3 [format=3, correct=0] → sum=3 ``` GDPO normalizes each reward independently, preserving their relative differences. #### Reward Functions GDPO uses the same reward function format as GRPO: ```python # rewards.py def format_reward(completions, **kwargs) -> list[float]: return [1.0 if len(c) > 10 else 0.0 for c in completions] def correctness_reward(completions, answers, **kwargs) -> list[float]: rewards = [] for completion, answer in zip(completions, answers): # Your scoring logic here rewards.append(score) return rewards ``` #### Sequence Parallelism GDPO supports sequence parallelism for long-context training: ```yaml rl: gdpo context_parallel_size: 2 ``` ### SimPO SimPO uses [CPOTrainer](https://huggingface.co/docs/trl/main/en/cpo_trainer) but with an alternative loss function. 
```yaml rl: simpo rl_beta: 0.1 # default in CPOTrainer cpo_alpha: 1.0 # default in CPOTrainer simpo_gamma: 0.5 # default in CPOTrainer ``` This method uses the same dataset format as [DPO](#dpo). ### Using local dataset files ```yaml datasets: - ds_type: json data_files: - orca_rlhf.jsonl split: train type: chatml.intel ``` ### TRL auto-unwrapping for PEFT TRL supports auto-unwrapping PEFT models for RL training paradigms which rely on a reference model. This significantly reduces memory pressure as an additional reference model does not need to be loaded, and reference model log-probabilities can be obtained by disabling PEFT adapters. This is enabled by default. To turn it off, pass the following config: ```yaml # load ref model when adapter training. rl_adapter_ref_model: true ``` ================================================ FILE: docs/scripts/examples-allowlist.yml ================================================ examples: # December 2025 - name: kimi-linear title: Kimi Linear - name: plano title: Plano Orchestrator - name: mimo title: MiMo - name: internvl3_5 title: InternVL 3.5 # AllenAI - name: olmo3 title: OLMo 3 # ArceeAI - name: trinity title: Trinity - name: arcee title: Arcee AFM # MistralAI - name: ministral3/think title: Ministral 3 Thinking - name: ministral3/vision title: Ministral 3 Vision - name: magistral/think title: Magistral Thinking - name: magistral/vision title: Magistral Vision - name: ministral title: Ministral - name: mistral-small title: Mistral Small 3.1/3.2 - name: voxtral title: Voxtral - name: devstral title: Devstral - name: mistral title: Mistral 7B # Meta - name: llama-4 title: Llama 4 - name: llama-2 title: Llama 2 # Alibaba - name: qwen3-next title: Qwen 3 Next - name: qwen3 title: Qwen 3 # Google - name: gemma3n title: Gemma 3n # Swiss AI - name: apertus title: Apertus # GPT-OSS - name: gpt-oss title: GPT-OSS - name: seed-oss title: Seed-OSS # Microsoft - name: phi title: Phi # SmolVLM - name: smolvlm2 title: SmolVLM 2 
# IBM - name: granite4 title: Granite 4 # LiquidAI - name: LiquidAI title: Liquid Foundation Models 2 # Other - name: hunyuan title: Hunyuan - name: jamba title: Jamba - name: orpheus title: Orpheus ================================================ FILE: docs/scripts/generate_config_docs.py ================================================ # type: ignore """ Quarto documentation generation from Pydantic models. Uses Pydantic model source code to automatically group fields, including inherited fields from parent classes. """ import ast import inspect import textwrap import types import typing from typing import Any, FrozenSet, Type, Union from pydantic import BaseModel from axolotl.utils.schemas.config import AxolotlInputConfig class QuartoGenerator: """Generate Quarto documentation from Pydantic models.""" def __init__(self): self._class_fields_cache = {} self._inheritance_map_cache = {} self._nested_models_cache = {} def _get_direct_fields(self, cls: Type[BaseModel]) -> FrozenSet[str]: """Get fields defined directly in a single class (not inherited).""" if cls in self._class_fields_cache: return self._class_fields_cache[cls] fields = set() # Get annotated fields if hasattr(cls, "__annotations__"): fields.update(cls.__annotations__.keys()) # Filter out private/special methods fields = {f for f in fields if not f.startswith("_")} result = frozenset(fields) self._class_fields_cache[cls] = result return result def _is_pydantic_model(self, type_obj) -> bool: """Check if a type is a Pydantic BaseModel.""" return inspect.isclass(type_obj) and issubclass(type_obj, BaseModel) def _extract_nested_type(self, field_type) -> Any: """Extract the actual type from complex type annotations.""" # Handle Annotated types (Python 3.9+) if hasattr(typing, "get_origin") and hasattr(typing, "get_args"): origin = typing.get_origin(field_type) args = typing.get_args(field_type) if origin is not None: # Handle Annotated[SomeType, ...] 
- extract the first argument if hasattr(typing, "Annotated") and origin is typing.Annotated: if args: return self._extract_nested_type( args[0] ) # Recursively process the actual type # Handle list[SomeType], List[SomeType], etc. elif origin in (list, typing.List): if args: return self._extract_nested_type( args[0] ) # Extract element type # Handle Union types (including | syntax) elif origin is typing.Union: # Get non-None types from the Union non_none_types = [arg for arg in args if arg is not type(None)] if len(non_none_types) >= 1: # Prioritize Pydantic models over primitive types pydantic_models = [ arg for arg in non_none_types if self._is_pydantic_model(arg) ] if pydantic_models: # Return the first Pydantic model found return self._extract_nested_type(pydantic_models[0]) # No Pydantic models, return the first non-None type return self._extract_nested_type(non_none_types[0]) # Handle new Python 3.10+ union syntax (PeftConfig | None) if hasattr(field_type, "__class__") and field_type.__class__ is types.UnionType: # Get non-None types from the Union non_none_types = [ arg for arg in field_type.__args__ if arg is not type(None) ] if len(non_none_types) >= 1: # Prioritize Pydantic models over primitive types pydantic_models = [ arg for arg in non_none_types if self._is_pydantic_model(arg) ] if pydantic_models: return self._extract_nested_type(pydantic_models[0]) return self._extract_nested_type(non_none_types[0]) # Handle old typing.Union syntax (fallback) if hasattr(field_type, "__origin__"): if field_type.__origin__ is Union: # Get non-None types from the Union non_none_types = [ arg for arg in field_type.__args__ if arg is not type(None) ] if len(non_none_types) >= 1: # Prioritize Pydantic models over primitive types pydantic_models = [ arg for arg in non_none_types if self._is_pydantic_model(arg) ] if pydantic_models: return self._extract_nested_type(pydantic_models[0]) return self._extract_nested_type(non_none_types[0]) # Handle other generic types like 
dict[str, Any], etc. elif hasattr(field_type, "__args__"): return field_type return field_type def _extract_all_pydantic_models_from_type( self, field_type ) -> list[type[BaseModel]]: """Extract all Pydantic models from a type annotation, including from Unions.""" models = [] if field_type is None: return models # Handle Annotated types if hasattr(typing, "get_origin") and hasattr(typing, "get_args"): origin = typing.get_origin(field_type) args = typing.get_args(field_type) if origin is not None: # Handle Annotated[SomeType, ...] - extract from the first argument if hasattr(typing, "Annotated") and origin is typing.Annotated: if args: models.extend( self._extract_all_pydantic_models_from_type(args[0]) ) return models # Handle list[SomeType], List[SomeType], etc. if origin in (list, typing.List): if args: models.extend( self._extract_all_pydantic_models_from_type(args[0]) ) return models # Handle Union types if origin is typing.Union: for arg in args: if arg is not type(None): # Skip None type models.extend( self._extract_all_pydantic_models_from_type(arg) ) return models # Handle new Python 3.10+ union syntax if hasattr(field_type, "__class__") and field_type.__class__ is types.UnionType: for arg in field_type.__args__: if arg is not type(None): # Skip None type models.extend(self._extract_all_pydantic_models_from_type(arg)) return models # Handle old typing.Union syntax (fallback) if hasattr(field_type, "__origin__") and field_type.__origin__ is Union: for arg in field_type.__args__: if arg is not type(None): # Skip None type models.extend(self._extract_all_pydantic_models_from_type(arg)) return models # Check if this type itself is a Pydantic model if self._is_pydantic_model(field_type): models.append(field_type) return models def _get_nested_models( self, model_class: type[BaseModel], visited=None ) -> dict[str, type[BaseModel]]: """Get all nested Pydantic models from a model class.""" if visited is None: visited = set() # Avoid infinite recursion if model_class 
in visited: return {} if model_class in self._nested_models_cache: return self._nested_models_cache[model_class] visited.add(model_class) nested_models = {} # Check all fields in the model for field_info in model_class.model_fields.values(): field_type = self._extract_nested_type(field_info.annotation) if self._is_pydantic_model(field_type): nested_models[field_type.__name__] = field_type # Recursively get nested models from this nested model deeper_nested = self._get_nested_models(field_type, visited.copy()) nested_models.update(deeper_nested) self._nested_models_cache[model_class] = nested_models return nested_models def _build_inheritance_map(self, child_class: Type[BaseModel]): """Build inheritance map for a class and all its parents.""" if child_class in self._inheritance_map_cache: return self._inheritance_map_cache[child_class] inheritance_map = {} # Get MRO and filter out BaseModel and object mro_classes = [ cls for cls in child_class.__mro__ if cls not in (BaseModel, object) and hasattr(cls, "__annotations__") ] # Process each class in the MRO for cls in mro_classes: inheritance_map[cls] = self._get_direct_fields(cls) self._inheritance_map_cache[child_class] = inheritance_map return inheritance_map def _wrap_comment(self, text: str, width: int = 88) -> list[str]: """Wrap a comment to specified width, accounting for '# ' prefix.""" if not text.strip(): return ["#"] # Account for "# " prefix (2 characters) content_width = width - 2 wrapped_lines = textwrap.wrap(text, width=content_width) return [f"# {line}" for line in wrapped_lines] def _extract_type_from_source( self, model_class: type[BaseModel], field_name: str ) -> str: """Extract the actual type annotation text from source code, checking inheritance chain.""" # Use inheritance map to check classes efficiently inheritance_map = self._build_inheritance_map(model_class) # Check classes in MRO order for cls in model_class.__mro__: if cls in inheritance_map and field_name in inheritance_map[cls]: 
type_annotation = self._get_type_from_class_source(cls, field_name) if type_annotation != "unknown": return type_annotation return "unknown" def _get_type_from_class_source(self, class_obj: type, field_name: str) -> str: """Extract type annotation from a specific class's source code.""" try: source = inspect.getsource(class_obj) tree = ast.parse(source) except (OSError, TypeError): return "unknown" # Find the class definition for node in tree.body: if isinstance(node, ast.ClassDef) and node.name == class_obj.__name__: # Find the field assignment for body_node in node.body: if isinstance(body_node, ast.AnnAssign) and isinstance( body_node.target, ast.Name ): if body_node.target.id == field_name and body_node.annotation: return ast.unparse(body_node.annotation) break return "unknown" def _extract_field_groups_from_all_classes( self, model_class: type[BaseModel] ) -> list[dict]: """Extract field groups from all classes in the inheritance hierarchy.""" all_groups = [] inheritance_map = self._build_inheritance_map(model_class) # Get all Pydantic base classes in MRO order (most specific first) # This puts AxolotlInputConfig fields first, then parent class fields pydantic_classes = [ cls for cls in model_class.__mro__ if cls in inheritance_map and inheritance_map[cls] ] # Extract groups from each class for cls in pydantic_classes: class_groups = self._extract_field_groups_from_source(cls) for group in class_groups: all_groups.append(group) # If no groups found, create a default grouping by class if not all_groups: for cls in pydantic_classes: fields_in_class = inheritance_map[cls] if fields_in_class: all_groups.append( { "fields": list(fields_in_class), } ) return all_groups def _extract_field_groups_from_source( self, model_class: type[BaseModel] ) -> list[dict]: """Extract field groups from source code based on blank lines and comments.""" try: source = inspect.getsource(model_class) tree = ast.parse(source) except (OSError, TypeError): # Fallback if we can't get source 
code fields_in_class = self._get_direct_fields(model_class) if fields_in_class: return [ { "fields": list(fields_in_class), } ] return [] groups = [] current_group_fields = [] current_group_comment = None # Find the class definition class_node = None for node in ast.walk(tree): if isinstance(node, ast.ClassDef) and node.name == model_class.__name__: class_node = node break if not class_node: fields_in_class = self._get_direct_fields(model_class) if fields_in_class: return [ { "fields": list(fields_in_class), } ] return [] # Parse the source lines to detect groupings source_lines = source.split("\n") # Get fields that are actually defined in this specific class fields_in_class = self._get_direct_fields(model_class) # Find assignments that correspond to model fields for THIS class only field_assignments = [] for node in class_node.body: if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name): field_name = node.target.id if field_name in fields_in_class: field_assignments.append( { "name": field_name, "lineno": node.lineno, "end_lineno": getattr(node, "end_lineno", node.lineno), } ) if not field_assignments: if fields_in_class: return [ { "fields": list(fields_in_class), } ] return [] # Sort by line number field_assignments.sort(key=lambda x: x["lineno"]) # Group fields based on blank lines and comments for i, field_info in enumerate(field_assignments): field_name = field_info["name"] current_line = field_info["lineno"] # Check if this starts a new group (blank line before or significant gap) is_new_group = False if i == 0: is_new_group = True else: prev_end_line = field_assignments[i - 1]["end_lineno"] # Check for blank lines or comments between fields lines_between = source_lines[prev_end_line : current_line - 1] has_blank_line = any(line.strip() == "" for line in lines_between) has_comment = any( line.strip().startswith("#") for line in lines_between ) # Start new group if there's a blank line or comment, or significant gap if has_blank_line or 
has_comment or (current_line - prev_end_line > 3): is_new_group = True if is_new_group and current_group_fields: # Save the previous group groups.append( { "fields": current_group_fields.copy(), "description": current_group_comment, } ) current_group_fields = [] current_group_comment = None current_group_fields.append(field_name) # Add the final group if current_group_fields: groups.append( { "fields": current_group_fields, "description": current_group_comment, } ) return groups def _generate_field_documentation( self, model_class: type[BaseModel], field_name: str, field_info: dict, field_type_str: str, is_required: bool, indent_level: int = 0, visited_models: set = None, ) -> list[str]: """Generate documentation for a single field, expanding nested models inline.""" if visited_models is None: visited_models = set() lines = [] indent = " " * indent_level # Get the actual field type for nested model detection if field_name in model_class.model_fields: pydantic_field_info = model_class.model_fields[field_name] actual_field_type = pydantic_field_info.annotation else: actual_field_type = None # Add description comment if available description = field_info.get("description", "") if description: wrapped_lines = self._wrap_comment(description, width=88 - len(indent)) for line in wrapped_lines: lines.append(f"{indent}{line}") # Extract nested Pydantic models from the type annotation nested_models = self._extract_all_pydantic_models_from_type(actual_field_type) # Filter out already visited models to prevent infinite recursion expandable_models = [ model for model in nested_models if model not in visited_models ] if expandable_models: # This field contains Pydantic models that can be expanded # Show the field with its full type annotation field_line = f"{indent}{field_name}: {field_type_str}" if field_info.get("default") is not None: field_line += f" = {field_info['default']}" if is_required: field_line += " (required)" lines.append(field_line) # Add to visited to prevent 
infinite recursion new_visited = visited_models.copy() new_visited.update(expandable_models) # Expand each nested Pydantic model for i, nested_model in enumerate(expandable_models): if i > 0: lines.append("\n") lines.append(f"{indent} # For {nested_model.__name__}:") # Get nested model schema try: nested_schema = nested_model.model_json_schema() nested_properties = nested_schema.get("properties", {}) nested_required = nested_schema.get("required", []) except Exception: # Fallback: use model fields directly nested_properties = {} nested_required = [] for ( nested_field_name, nested_field_info, ) in nested_model.model_fields.items(): nested_description = "" if ( hasattr(nested_field_info, "json_schema_extra") and nested_field_info.json_schema_extra ): nested_description = ( nested_field_info.json_schema_extra.get( "description", "" ) ) elif ( hasattr(nested_field_info, "description") and nested_field_info.description ): nested_description = nested_field_info.description nested_default_val = None if ( hasattr(nested_field_info, "default") and nested_field_info.default is not None ): if str(nested_field_info.default) != "PydanticUndefined": nested_default_val = nested_field_info.default nested_properties[nested_field_name] = { "type": "unknown", "description": nested_description, "default": nested_default_val, } if nested_field_info.is_required(): nested_required.append(nested_field_name) # Get field groups for the nested model nested_field_groups = self._extract_field_groups_from_all_classes( nested_model ) # Generate nested fields with increased indentation for i, group in enumerate(nested_field_groups): if not group["fields"]: continue # Add blank line between groups (except before first group) if i > 0: lines.append("") # Process nested fields for nested_field_name in group["fields"]: if nested_field_name not in nested_properties: continue nested_field_info = nested_properties[nested_field_name] nested_field_type = self._extract_type_from_source( nested_model, 
def generate_qmd(
    self,
    model_class: type[BaseModel],
    title: str | None = None,
    expand_nested: bool = True,
) -> str:
    """Auto-generate config reference documentation including inherited fields.

    Args:
        model_class: Pydantic model whose fields are documented.
        title: Front-matter title; defaults to ``"<ClassName> Reference"``.
        expand_nested: When true, every field is rendered through
            ``_generate_field_documentation``; otherwise each field becomes a
            single ``name: type`` line preceded by its wrapped description.

    Returns:
        The QMD document as one string: YAML front matter followed by a
        single fenced ``yaml`` block. Runs of three or more newlines are
        collapsed to two and the text ends with exactly one newline.
    """
    if title is None:
        title = f"{model_class.__name__} Reference"

    # Try to get JSON schema, with fallback for serialization issues
    try:
        schema = model_class.model_json_schema()
        properties = schema.get("properties", {})
        required = schema.get("required", [])
    except Exception as e:
        print(
            f"Warning: Could not generate JSON schema ({e}). Using model fields instead."
        )
        # Fallback: use model fields directly
        properties = {}
        required = []
        for field_name, field_info in model_class.model_fields.items():
            # Extract description from json_schema_extra or field info
            description = ""
            if (
                hasattr(field_info, "json_schema_extra")
                and field_info.json_schema_extra
            ):
                description = field_info.json_schema_extra.get("description", "")
            elif hasattr(field_info, "description") and field_info.description:
                description = field_info.description

            # Get default value
            default_val = None
            if hasattr(field_info, "default") and field_info.default is not None:
                # Handle special Pydantic default markers
                if str(field_info.default) != "PydanticUndefined":
                    default_val = field_info.default

            # The fallback cannot recover a schema type, so it is recorded as
            # "unknown"; the displayed type comes from the source lookup below.
            properties[field_name] = {
                "type": "unknown",
                "description": description,
                "default": default_val,
            }

            if field_info.is_required():
                required.append(field_name)

    # Extract field groups from all classes in inheritance hierarchy
    field_groups = self._extract_field_groups_from_all_classes(model_class)

    # Start building QMD content
    qmd_lines = [
        "---",
        f"title: {title}",
        "description: A complete list of all configuration options.",
        "---",
        "",
    ]

    # Generate one big code block with all fields (inline nested expansion)
    qmd_lines.append("```yaml")

    for i, group in enumerate(field_groups):
        if not group["fields"]:
            continue

        # Add blank line between groups (except before first group)
        if i > 0:
            qmd_lines.append("")

        # Process fields in the order they appear in source
        for field_name in group["fields"]:
            # Fields present in a group but absent from the schema/fallback
            # properties are skipped.
            if field_name not in properties:
                continue

            field_info = properties[field_name]
            field_type = self._extract_type_from_source(model_class, field_name)
            is_required = field_name in required

            if expand_nested:
                # Check if this field has nested models
                if field_name in model_class.model_fields:
                    pydantic_field_info = model_class.model_fields[field_name]
                    nested_models = self._extract_all_pydantic_models_from_type(
                        pydantic_field_info.annotation
                    )
                    has_nested = bool(nested_models)
                else:
                    has_nested = False

                # Add blank line before nested config
                if has_nested:
                    qmd_lines.append("")

                # Use the new inline generation method; a fresh visited set is
                # passed for every top-level field.
                field_lines = self._generate_field_documentation(
                    model_class,
                    field_name,
                    field_info,
                    field_type,
                    is_required,
                    indent_level=0,
                    visited_models=set(),
                )
                qmd_lines.extend(field_lines)

                # Add blank line after nested config
                if has_nested:
                    qmd_lines.append("")
            else:
                # Original simple approach
                description = field_info.get("description", "")
                default = field_info.get("default")

                # Add wrapped comment for description
                if description:
                    wrapped_lines = self._wrap_comment(description)
                    qmd_lines.extend(wrapped_lines)

                line = f"{field_name}: {field_type}"
                if default is not None:
                    line += f" = {default}"
                if is_required:
                    line += " (required)"
                qmd_lines.append(line)

    qmd_lines.append("```")

    # Join all lines and clean up any double newlines
    content = "\n".join(qmd_lines)

    # Replace multiple consecutive newlines with just two newlines (one blank line)
    import re

    content = re.sub(r"\n{3,}", "\n\n", content)

    # Ensure single newline at the very end
    content = content.rstrip("\n") + "\n"

    return content
hyphens for spaces).""" s = re.sub(r"[^a-zA-Z0-9\s\-]+", "", name.strip()) s = re.sub(r"\s+", "-", s).strip("-").lower() return s or "example" def read_allowlist(): with open(ALLOWLIST_YML, "r", encoding="utf-8") as f: data = yaml.safe_load(f) or {} items = data.get("examples", []) if not isinstance(items, list): raise ValueError("`examples` must be a list in examples-allowlist.yml") return items def find_readme(folder: Path) -> Path | None: for name in ("README.md", "Readme.md", "readme.md"): p = folder / name if p.exists(): return p return None def remove_first_h1(md: str) -> tuple[str, str | None]: """ Remove the first H1 from markdown and return (modified_md, h1_title). The H1 is removed since we use the frontmatter title instead. """ lines = md.splitlines() result = [] h1_title = None skipped_first = False for line in lines: if not skipped_first and line.startswith("# "): h1_title = line[2:].strip() skipped_first = True continue result.append(line) return "\n".join(result), h1_title IMG_RE = re.compile(r"!\[[^\]]*\]\(([^)]+)\)") LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)") def rewrite_and_copy_assets(md: str, src_dir: Path, dest_assets_root: Path) -> str: """ Copy local image assets referenced in markdown to docs/examples/assets/... and rewrite the links. 
def rewrite_readme_links(
    md: str,
    src_dir: Path,
    examples_dir: Path,
    parent_index_only: set,
    current_src_path: str,
    allowlist_entries: set,
    current_output_path: str,
) -> str:
    """
    Rewrite links between README.md files to point to the correct .qmd files.

    Args:
        md: Markdown text whose ``[text](url)`` links are rewritten.
        src_dir: Directory of the README being processed; relative link
            targets are resolved against it.
        examples_dir: Root that targets must live under to be rewritten.
        parent_index_only: Top-level names that map to ``<name>/index.qmd``
            instead of ``<name>.qmd``.
        current_src_path: Not read anywhere in this function.
        allowlist_entries: Names for which a nested ``a/b/README.md`` link maps
            to ``a/b.qmd``.
        current_output_path: ``/``-separated output path of the page being
            written; used to compute the relative URL.

    Returns:
        ``md`` with matching links replaced. Remote URLs, anchors, non-``.md``
        targets, targets outside ``examples_dir`` and links that raise
        ``ValueError``/``IndexError`` during rewriting are left unchanged.
    """

    def repl(m):
        text = m.group(1)
        url = m.group(2).strip()

        # Skip remote URLs and anchor links
        if re.match(r"^(https?:)?//", url) or url.startswith("#"):
            return m.group(0)

        # Skip non-markdown files (this also skips ``file.md#anchor`` URLs,
        # since they do not end with ".md")
        if not url.lower().endswith(".md"):
            return m.group(0)

        # Resolve the target path
        try:
            target_path = (src_dir / url).resolve()

            # Check if target is outside examples_dir
            try:
                rel_path = target_path.relative_to(examples_dir)
            except ValueError:
                # Target is outside examples_dir, leave as-is
                return m.group(0)

            parts = list(rel_path.parts)

            # Determine the output path for the target
            if len(parts) > 0 and parts[-1].lower() in ("readme.md", "readme"):
                # This is a README link
                if len(parts) == 1:
                    # Link to root README -> index.qmd
                    target_output = "index.qmd"
                elif len(parts) == 2:
                    # NOTE(review): parts come from a resolved path, so a "."
                    # component presumably never appears here - confirm.
                    if parts[0] == ".":
                        # Current directory README
                        target_output = "index.qmd"
                    else:
                        # subdir/README.md
                        parent_dir = parts[0]
                        if parent_dir in parent_index_only:
                            target_output = f"{parent_dir}/index.qmd"
                        else:
                            target_output = f"{parent_dir}.qmd"
                else:
                    # Deeper nesting: parent/subdir/README.md
                    # Build the full path like "parent/subdir"
                    full_path = "/".join(parts[:-1])  # Remove README.md

                    # Check if this exact path is in allowlist
                    if full_path in allowlist_entries:
                        # This is a sub-entry with its own entry -> use .qmd
                        target_output = f"{full_path}.qmd"
                    elif parts[0] == ".":
                        # ./subdir/README.md -> check if subdir has own entry
                        subdir = parts[1]
                        if subdir in parent_index_only:
                            target_output = f"{subdir}/index.qmd"
                        else:
                            target_output = f"{subdir}.qmd"
                    else:
                        # parent/subdir where parent doesn't have own entry
                        target_output = f"{full_path}/index.qmd"
            else:
                # Regular .md file -> convert to .qmd, keep path structure
                # (drops the trailing "md" and appends "qmd")
                target_output = "/".join(parts)[:-2] + "qmd"

            # Compute relative path from current output file to target
            current_parts = current_output_path.split("/")
            target_parts = target_output.split("/")

            # Special case: if current is a subdir file and target is a single-component file at root
            # Example: current="magistral/vision", target="magistral.qmd"
            if len(current_parts) > 1 and len(target_parts) == 1:
                # Current is in subdir, target is at root level
                # Go up to root: ../ for each level
                up_count = len(current_parts) - 1
                rel_parts = [".."] * up_count + [target_parts[0]]
                new_url = "/".join(rel_parts)
            else:
                # Find common prefix (the last component of current_parts is
                # the page itself, so it is excluded from the comparison)
                i = 0
                while (
                    i < min(len(current_parts) - 1, len(target_parts))
                    and current_parts[i] == target_parts[i]
                ):
                    i += 1

                # Build relative path: go up (../) then down to target
                up_count = len(current_parts) - 1 - i
                rel_parts = [".."] * up_count + target_parts[i:]

                if not rel_parts or rel_parts == [".."]:
                    # Points to same directory or parent
                    new_url = "/".join(rel_parts) if rel_parts else "."
                else:
                    new_url = "/".join(rel_parts)

            return f"[{text}]({new_url})"
        except (ValueError, IndexError):
            # Best effort: keep the original link when rewriting fails
            return m.group(0)

    return LINK_RE.sub(repl, md)
sub-pages # Skip if it was already included as part of a parent section if path not in processed_sections: lines.append(f" - docs/models/{path}.qmd") yaml_content = "\n".join(lines) + "\n" # Pattern to match only the Model Guides contents, stopping at the next item # in Getting Started (lines starting with 12 spaces: same level as the section) pattern = r'( - section: "Model Guides"\n contents:)([^\n]*|.*?)(?=\n - |\n - section:|\n\nformat:)' def replacement(match): prefix = match.group(1) return prefix + "\n" + yaml_content new_content = re.sub(pattern, replacement, content, flags=re.DOTALL) if new_content != content: quarto_yml.write_text(new_content, encoding="utf-8") print(f"Updated {quarto_yml}") else: print(f"No changes needed for {quarto_yml}") def main(): allow = read_allowlist() if not EXAMPLES_DIR.exists(): print(f"[WARN] {EXAMPLES_DIR} not found", file=sys.stderr) return (OUTPUT_DIR / "assets").mkdir(parents=True, exist_ok=True) # First pass: identify which parents have their own entry vs only sub-entries parent_entries = set() # Parents that have their own entry parent_with_subs = set() # Parents that have sub-entries allowlist_entries = set() # All entries in allowlist for item in allow: if isinstance(item, str): name = item else: name = item.get("name") allowlist_entries.add(name) if "/" in name: parent = name.split("/")[0] parent_with_subs.add(parent) else: parent_entries.add(name) # Parents with subs that DON'T have their own entry -> use index.qmd parent_index_only = parent_with_subs - parent_entries generated = [] seen_dirs = set() # Track which parent directories we've created index for for item in allow: if isinstance(item, str): name = item title = None else: name = item.get("name") title = item.get("title") if not name: print(f"[WARN] Skipping item without name: {item}", file=sys.stderr) continue src_dir = EXAMPLES_DIR / name if not src_dir.exists() or not src_dir.is_dir(): print(f"[WARN] Skipping {name} (not a directory)", file=sys.stderr) 
continue readme = find_readme(src_dir) if not readme: print(f"[WARN] Skipping {name} (no README.md)", file=sys.stderr) continue md = readme.read_text(encoding="utf-8") # Determine output path first (needed for link rewriting) parts = name.split("/") if len(parts) == 1: # Simple case: no subdirectory out_path = OUTPUT_DIR / f"{parts[0]}.qmd" sidebar_path = parts[0] else: # Has subdirectory: e.g., magistral/think parent = parts[0] child = "-".join(parts[1:]) # handle nested subdirs out_path = OUTPUT_DIR / parent / f"{child}.qmd" sidebar_path = f"{parent}/{child}" # Remove the first H1 (we use frontmatter title instead) md, _ = remove_first_h1(md) # Rewrite links between README files md = rewrite_readme_links( md, src_dir, EXAMPLES_DIR, parent_index_only, name, allowlist_entries, sidebar_path, ) md = rewrite_and_copy_assets(md, src_dir, OUTPUT_DIR) # Handle parent page generation for sub-entries if len(parts) > 1: # Has subdirectory: e.g., magistral/think parent = parts[0] # Create parent.qmd if not already done and parent doesn't have own entry if parent not in seen_dirs and parent in parent_index_only: parent_readme = find_readme(EXAMPLES_DIR / parent) if parent_readme: parent_md = parent_readme.read_text(encoding="utf-8") parent_md, _ = remove_first_h1(parent_md) parent_md = rewrite_readme_links( parent_md, EXAMPLES_DIR / parent, EXAMPLES_DIR, parent_index_only, parent, allowlist_entries, parent, ) parent_md = rewrite_and_copy_assets( parent_md, EXAMPLES_DIR / parent, OUTPUT_DIR ) parent_title = parent.replace("-", " ").replace("_", " ").title() write_qmd(OUTPUT_DIR / f"{parent}.qmd", parent_title, parent_md) generated.append((parent, parent, parent_title)) seen_dirs.add(parent) if not title: title = name.replace("/", " ").replace("-", " ").title() write_qmd(out_path, title, md) generated.append((sidebar_path, name, title)) # Index page - preserve allowlist order if generated: listing = "\n".join( [f"- [{title}]({path}.qmd)" for path, name, title in generated] ) 
index_md = ( "# Model Guides\n\nBelow are the curated examples for training various model architectures:\n\n" + listing + "\n" ) index_fm = ( "---\nexecute:\n eval: false\nformat:\n html:\n toc: true\n---\n\n" ) (OUTPUT_DIR / "index.qmd").write_text(index_fm + index_md, encoding="utf-8") # Auto-update _quarto.yml to keep sidebar in sync update_quarto_yml(generated) if __name__ == "__main__": main() ================================================ FILE: docs/sequence_parallelism.qmd ================================================ --- title: Sequence Parallelism description: Train with long sequences split across multiple GPUs. --- Sequence parallelism is a technique that splits sequences across multiple GPUs, allowing you to train with very long sequences that wouldn't fit on a single GPU. Each GPU processes a different portion of the sequence, and the results are aggregated through a ring communication pattern. ## When to Use Sequence Parallelism Use sequence parallelism when: - You need to train with sequence lengths that don't fit into a single GPU's memory - You have multiple GPUs available - You're experiencing OOM (Out Of Memory) errors with long sequences ## Configuration To enable sequence parallelism, add the following to your configuration file: ```yaml # Set to a divisor (> 1) of the number of GPUs available context_parallel_size: 4 # Split sequences across 4 GPUs # Optional; strides across the key dimension. Larger values use more memory but should make training faster. heads_k_stride: 1 # Optional; one of "varlen_llama3" or "batch_ring". Defaults to # "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise. ring_attn_func: ``` The `context_parallel_size` should be a divisor of the total number of GPUs. For example: - With 8 GPUs, valid values would be 2, 4, or 8 - With 4 GPUs, valid values would be 2 or 4 ## Implementation Details When sequence parallelism is enabled: 1. 
Each sequence is divided into equal chunks across the GPUs in a sequence parallel group 2. The data collator handles the chunking of input_ids, attention_mask, labels, and position_ids 3. Position IDs are adjusted to maintain proper relative positions 4. The trainer uses special ring communication patterns for attention operations ## Requirements To use sequence parallelism, you need: - Multiple GPUs (at least 2) - The `ring-flash-attn` package. Install with: - `pip install "axolotl[ring-flash-attn]"` (preferred) - `pip install "ring-flash-attn>=0.1.4"` ## Limitations - Flash attention must be enabled for this to work (`flash_attention: true` in config YAML) - May have a small performance overhead due to communication between GPUs ## Example ```yaml base_model: meta-llama/Llama-3-8B-Instruct sequence_len: 8192 ... context_parallel_size: 4 # Split each sequence into 4 parts, one per GPU # Optional; strides across the key dimension. Larger values use more memory but should make training faster. heads_k_stride: 1 # Optional; one of "varlen_llama3" or "batch_ring". Defaults to # "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise. ring_attn_func: ... ``` This will train the Llama 3 8B model with 8K context length, with each sequence split into 4 subsequences of length 2048 across 4 GPUs. ## Sample Packing with Sequence Parallelism Sequence parallelism is compatible with Axolotl's sample packing functionality. When using both features together: 1. Samples are first packed together 2. The packed sequences are then divided across GPUs in the sequence parallel group 3. Position IDs are automatically adjusted to maintain proper relative positions ## Effect on Batch Size When using sequence parallelism, your effective global batch size is **divided** by the `context_parallel_size`. 
This happens because: - Each group of `context_parallel_size` GPUs works on the same batch (just different parts of each sequence) - The number of batches processed per step decreases For example: - With 8 GPUs and no sequence parallelism: 8 different batches processed per step - With 8 GPUs and `context_parallel_size=4`: Only 2 different batches processed per step (each split across 4 GPUs) - If your per-GPU `micro_batch_size` is 2, the global batch size decreases from 16 to 4 ================================================ FILE: docs/streaming.qmd ================================================ --- title: Streaming Datasets description: How to use streaming mode for large-scale datasets and memory-efficient training order: 10 --- Streaming enables memory-efficient training with large datasets by loading data incrementally rather than loading the entire dataset into memory at once. Use streaming when: - Your dataset is too large to fit in memory (e.g. when you're doing pretraining with massive text corpora) - You want to start training immediately without preprocessing the entire dataset Streaming works with both remote and locally stored datasets! ::: {.callout-note} Streaming currently only supports a single dataset. Multi-dataset support will be added soon. 
::: ## Configuration ### Basic Streaming Enable streaming mode by setting the `streaming` flag: ```yaml streaming: true ``` ### Pretraining with Streaming For pretraining tasks, streaming is automatically enabled when using `pretraining_dataset`: ```yaml pretraining_dataset: - path: HuggingFaceFW/fineweb-edu type: pretrain text_column: text split: train # Optionally, enable sample packing streaming_multipack_buffer_size: 10000 sample_packing: true ``` ### SFT with Streaming For supervised fine-tuning with streaming: ```yaml streaming: true datasets: - path: tatsu-lab/alpaca type: alpaca split: train # Optionally, enable sample packing streaming_multipack_buffer_size: 10000 sample_packing: true ``` ## Configuration Options ### `streaming_multipack_buffer_size` Controls the buffer size for multipack streaming (default: 10,000). This determines how many samples are buffered before packing. Larger buffers can improve packing efficiency but use more memory. ### `shuffle_merged_datasets` When enabled, shuffles the streaming dataset using the buffer. This requires additional memory for the shuffle buffer. ## Sample Packing with Streaming Sample packing is supported for streaming datasets. When enabled, multiple samples are packed into a single sequence to maximize GPU utilization: ```yaml sample_packing: true streaming_multipack_buffer_size: 10000 # For SFT: attention is automatically isolated between packed samples # For pretraining: control with pretrain_multipack_attn pretrain_multipack_attn: true # prevent cross-attention between packed samples ``` For more information, see our [documentation](multipack.qmd) on multipacking. 
## Important Considerations ### Memory Usage While streaming reduces memory usage compared to loading entire datasets, you still need to consider: - You can control the memory usage by adjusting `streaming_multipack_buffer_size` - Sample packing requires buffering multiple samples - Shuffling requires additional memory for the shuffle buffer ### Performance - Streaming may have slightly higher latency compared to preprocessed datasets, as samples are processed on-the-fly - Network speed and disk read speed are important when streaming from remote sources or a local dataset, respectively - Consider using `axolotl preprocess` for smaller or more frequently used datasets ### Evaluation Datasets Evaluation datasets are not streamed to ensure consistent evaluation metrics. They're loaded normally even when training uses streaming. ## Examples See the `examples/streaming/` directory for complete configuration examples: - `pretrain.yaml`: Pretraining with streaming dataset - `sft.yaml`: Supervised fine-tuning with streaming ================================================ FILE: docs/telemetry.qmd ================================================ --- title: Telemetry description: A description of the telemetry implementation in Axolotl. --- # Telemetry in Axolotl Axolotl implements anonymous telemetry to help maintainers understand how the library is used and where users encounter issues. This data helps prioritize features, optimize performance, and fix bugs. ## Data Collection We collect: - System info: OS, Python version, Axolotl version, PyTorch version, Transformers version, etc. - Hardware info: CPU count, memory, GPU count and models - Runtime metrics: Training progress, memory usage, timing information - Usage patterns: Models (from a whitelist) and configurations used - Error tracking: Stack traces and error messages (sanitized to remove personal information) Personally identifiable information (PII) is not collected. 
## Implementation Telemetry is implemented using PostHog and consists of: - `axolotl.telemetry.TelemetryManager`: A singleton class that initializes the telemetry system and provides methods for tracking events. - `axolotl.telemetry.errors.send_errors`: A decorator that captures exceptions and sends sanitized stack traces. - `axolotl.telemetry.runtime_metrics.RuntimeMetricsTracker`: A class that tracks runtime metrics during training. - `axolotl.telemetry.callbacks.TelemetryCallback`: A Trainer callback that sends runtime metrics telemetry. The telemetry system will block training startup for 10 seconds to ensure users are aware of data collection, unless telemetry is explicitly enabled or disabled. ## Opt-Out Mechanism Telemetry is **enabled by default** on an opt-out basis. To disable it, set `AXOLOTL_DO_NOT_TRACK=1` or `DO_NOT_TRACK=1`. A warning message will be logged on start to clearly inform users about telemetry. We will remove this after some period. To hide the telemetry warning message that is displayed at startup (of `train`, etc.), explicitly set: `AXOLOTL_DO_NOT_TRACK=0` (enable telemetry) or `AXOLOTL_DO_NOT_TRACK=1` (explicitly disable telemetry). ## Privacy - All path-like config information is automatically redacted from telemetry data - Model information is only collected for whitelisted organizations - See `axolotl/telemetry/whitelist.yaml` for the set of whitelisted organizations - Each run generates a unique anonymous ID - This allows us to link different telemetry events within the same training run - Telemetry is only sent from the main process to avoid duplicate events ================================================ FILE: docs/torchao.qmd ================================================ --- title: "PyTorch ao" description: "Custom data types and layouts for training and inference" --- To use experimental optimizers (`AdamWFp8`, `AdamW4bit`, `AdamW8bit`) from PyTorch ao, please install the package as shown below. 
::: {.callout-tip} Some experimental optimizers are already present in regular PyTorch, so please re-check if you actually need this package! ::: ### Installation Stable Release from the PyTorch index ```bash pip install torchao --extra-index-url https://download.pytorch.org/whl/cu121 # full options are cpu/cu118/cu121/cu124 ``` Nightly release ```bash pip install --pre torchao-nightly --index-url https://download.pytorch.org/whl/nightly/cu121 # full options are cpu/cu118/cu121/cu124 ``` ================================================ FILE: docs/unsloth.qmd ================================================ --- title: "Unsloth" description: "Hyper-optimized QLoRA finetuning for single GPUs" --- ### Overview Unsloth provides hand-written optimized kernels for LLM finetuning that slightly improve speed and VRAM over standard industry baselines. ::: {.callout-important} Due to breaking changes in transformers `v4.48.0`, users will need to downgrade to `<=v4.47.1` to use this patch. This will later be deprecated in favor of [LoRA Optimizations](lora_optims.qmd). ::: ### Installation The following will install the correct unsloth and extras from source. ```bash python scripts/unsloth_install.py | sh ``` ### Usage Axolotl exposes a few configuration options to try out unsloth and get most of the performance gains. Our unsloth integration is currently limited to the following model architectures: - llama These options are specific to LoRA finetuning and cannot be used for multi-GPU finetuning ```yaml unsloth_lora_mlp: true unsloth_lora_qkv: true unsloth_lora_o: true ``` These options are composable and can be used with multi-gpu finetuning ```yaml unsloth_cross_entropy_loss: true unsloth_rms_norm: true unsloth_rope: true ``` ### Limitations - Single GPU only; i.e. no multi-GPU support - No deepspeed or FSDP support (requires multi-gpu) - LoRA + QLoRA support only. No full fine tunes or fp8 support. - Limited model architecture support. 
Llama, Phi, Gemma, Mistral only - No MoE support. ================================================ FILE: examples/LiquidAI/README.md ================================================ # Finetune Liquid Foundation Models 2 (LFM2) with Axolotl [Liquid Foundation Models 2 (LFM2)](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38) are a family of small, open-weight models from [Liquid AI](https://www.liquid.ai/) focused on quality, speed, and memory efficiency. Liquid AI released text-only [LFM2](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38) and text+vision [LFM2-VL](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa) models. LFM2 features a new hybrid Liquid architecture with multiplicative gates, short-range convolutions, and grouped query attention, enabling fast training and inference. This guide shows how to fine-tune both the LFM2 and LFM2-VL models with Axolotl. Thanks to the team at LiquidAI for giving us early access to prepare for these releases. ## Getting Started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). Here is an example of how to install from pip: ```bash # Ensure you have a compatible version of Pytorch installed pip3 install packaging setuptools wheel ninja pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0' ``` 2. Run one of the finetuning examples below. **LFM2** ```bash # FFT SFT (1x48GB @ 25GiB) axolotl train examples/LiquidAI/lfm2-350m-fft.yaml ``` **LFM2-VL** ```bash # LoRA SFT (1x48GB @ 2.7GiB) axolotl train examples/LiquidAI/lfm2-vl-lora.yaml ``` **LFM2-MoE** ```bash pip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6 # LoRA SFT (1x48GB @ 16.2GiB) axolotl train examples/LiquidAI/lfm2-8b-a1b-lora.yaml ``` ### TIPS - **Installation Error**: If you encounter `ImportError: ... 
undefined symbol ...` or `ModuleNotFoundError: No module named 'causal_conv1d_cuda'`, the `causal-conv1d` package may have been installed incorrectly. Try uninstalling it: ```bash pip uninstall -y causal-conv1d ``` - **Dataset Loading**: Read more on how to load your own dataset in our [documentation](https://docs.axolotl.ai/docs/dataset_loading.html). - **Dataset Formats**: - For LFM2 models, the dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). - For LFM2-VL models, Axolotl follows the multi-content Messages format. See our [Multimodal docs](https://docs.axolotl.ai/docs/multimodal.html#dataset-format) for details. ## Optimization Guides - [Optimizations Guide](https://docs.axolotl.ai/docs/optimizations.html) ## Related Resources - [LFM2 Blog](https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models) - [LFM2-VL Blog](https://www.liquid.ai/blog/lfm2-vl-efficient-vision-language-models) - [LFM2-MoE Blog](https://www.liquid.ai/blog/lfm2-8b-a1b-an-efficient-on-device-mixture-of-experts) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/LiquidAI/lfm2-350m-fft.yaml ================================================ base_model: LiquidAI/LFM2-350M plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin eot_tokens: - "<|im_end|>" datasets: - path: mlabonne/FineTome-100k type: chat_template split: train[:20%] field_messages: conversations message_field_role: from message_field_content: value dataset_prepared_path: last_run_prepared val_set_size: 0.05 output_dir: ./outputs/out sequence_len: 4096 sample_packing: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 4 num_epochs: 1 optimizer: 
adamw_torch_fused lr_scheduler: cosine learning_rate: 5e-5 bf16: true tf32: true gradient_checkpointing: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 2 saves_per_epoch: 1 weight_decay: 0.0 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/LiquidAI/lfm2-8b-a1b-lora.yaml ================================================ base_model: LiquidAI/LFM2-8B-A1B plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: true eot_tokens: - "<|im_end|>" datasets: - path: mlabonne/FineTome-100k type: chat_template split: train[:20%] field_messages: conversations message_field_role: from message_field_content: value dataset_prepared_path: last_run_prepared val_set_size: 0.05 output_dir: ./outputs/out sequence_len: 4096 sample_packing: true adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: 'model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 4 num_epochs: 1 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 5e-5 bf16: true tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 2 saves_per_epoch: 1 weight_decay: 0.0 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/LiquidAI/lfm2-vl-lora.yaml ================================================ base_model: LiquidAI/LFM2-VL-450M trust_remote_code: true model_type: AutoModelForImageTextToText processor_type: AutoProcessor plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin # these 3 lines are needed for now to handle vision chat templates w 
images skip_prepare_dataset: true remove_unused_columns: false sample_packing: false datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out adapter: lora lora_model_dir: sequence_len: 8192 pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: true fp16: tf32: true gradient_checkpointing: true logging_steps: 1 flash_attention: true eager_attention: warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/alst/README.md ================================================ # Arctic Long Sequence Training (ALST) Artic Long Sequence Training (ALST) is a technique for training long context models using a variety of optimization techniques. It is a combination of: - TiledMLP: Leverage tiling over the sequence dimension on MLP layers to reduce memory usage - Tiled Loss: Using optimized loss functions like Liger-Kernel or Cut Cross Entropy to reduce memory usage - Activation Offloading: Offload activations to CPU RAM to reduce memory usage For more information, you can check out the ALST paper [here](https://www.arxiv.org/abs/2506.13996). 
## Usage ```yaml tiled_mlp: true # See Sequence Parallelism docs # https://docs.axolotl.ai/docs/sequence_parallelism.html context_parallel_size: int plugins: # See Cut Cross Entropy docs # https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin # or Liger Kernel docs # https://docs.axolotl.ai/docs/custom_integrations.html#liger-kernels - axolotl.integrations.liger.LigerPlugin # ... ``` ================================================ FILE: examples/alst/llama3-8b-deepspeed-alst.yaml ================================================ base_model: meta-llama/Llama-3.1-8B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name datasets: - path: togethercomputer/Long-Data-Collections type: completion field: text data_files: - pretrain/rp_sub.jsonl.zst - path: princeton-nlp/TextbookChapters type: completion field: chapter dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out sequence_len: 500_000 min_sample_len: 200_000 sample_packing: true tiled_mlp: true context_parallel_size: 8 plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_8bit lr_scheduler: cosine learning_rate: 2e-5 bf16: auto tf32: true gradient_checkpointing: true activation_offloading: legacy resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_steps: 100 saves_per_epoch: 1 evals_per_epoch: 2 weight_decay: 0.0 special_tokens: pad_token: <|end_of_text|> deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_all.json # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/alst/llama3-8b-fsdp2-alst.yaml ================================================ base_model: meta-llama/Llama-3.1-8B # Automatically upload checkpoint and final model to HF # 
hub_model_id: username/custom_model_name datasets: - path: togethercomputer/Long-Data-Collections type: completion field: text data_files: - pretrain/rp_sub.jsonl.zst - path: princeton-nlp/TextbookChapters type: completion field: chapter dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out sequence_len: 500_000 min_sample_len: 200_000 sample_packing: true tiled_mlp: true context_parallel_size: 8 plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_8bit lr_scheduler: cosine learning_rate: 2e-5 bf16: auto tf32: true gradient_checkpointing: true activation_offloading: legacy resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_steps: 100 saves_per_epoch: 1 evals_per_epoch: 2 weight_decay: 0.0 special_tokens: pad_token: <|end_of_text|> fsdp_version: 2 fsdp_config: offload_params: false # offloading is currently not compatible with SP + torchao optimizer state_dict_type: SHARDED_STATE_DICT auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: LlamaDecoderLayer reshard_after_forward: true # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/apertus/README.md ================================================ # Finetune Swiss-AI's Apertus with Axolotl [Apertus](https://huggingface.co/collections/swiss-ai/apertus-llm-68b699e65415c231ace3b059) is a family of opensource models trained by Swiss-ai. This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as Apertus is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html). 
Here is an example of how to install from main for pip: ```bash # Ensure you have Pytorch installed (Pytorch 2.6.0 min) git clone https://github.com/axolotl-ai-cloud/axolotl.git cd axolotl pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja pip3 install --no-build-isolation -e '.[flash-attn]' # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy python scripts/cutcrossentropy_install.py | sh ``` 2. (Optional, highly recommended) Install XIELU CUDA ```bash ## Recommended for reduced VRAM and faster speeds # Point to CUDA toolkit directory # For those using our Docker image, use the below path. export CUDA_HOME=/usr/local/cuda pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps ``` For any installation errors, see [XIELU Installation Issues](#xielu-installation-issues) 3. Run the finetuning example: ```bash axolotl train examples/apertus/apertus-8b-qlora.yaml ``` This config uses about 8.7 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀 ### Tips - For inference, the official Apertus team recommends `top_p=0.9` and `temperature=0.8`. - You can instead use full paremter fine-tuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). - The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). ### XIELU Installation Issues #### `ModuleNotFoundError: No module named 'torch'` Please check these one by one: - Running in correct environment - Env has PyTorch installed - CUDA toolkit is at `CUDA_HOME` If those didn't help, please try the below solutions: 1. Pass env for CMAKE and try install again: ```bash Python_EXECUTABLE=$(which python) pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps ``` 2. 
Git clone the repo and manually hardcode python path: ```bash git clone https://github.com/nickjbrowning/XIELU cd xielu git checkout 59d6031 cd xielu nano CMakeLists.txt # or vi depending on your preference ``` ```diff execute_process( - COMMAND ${Python_EXECUTABLE} -c "import torch.utils; print(torch.utils.cmake_prefix_path)" + COMMAND /root/miniconda3/envs/py3.11/bin/python -c "import torch.utils; print(torch.utils.cmake_prefix_path)" RESULT_VARIABLE TORCH_CMAKE_PATH_RESULT OUTPUT_VARIABLE TORCH_CMAKE_PATH_OUTPUT ERROR_VARIABLE TORCH_CMAKE_PATH_ERROR ) ``` ```bash pip3 install . --no-build-isolation --no-deps ``` ## Optimization Guides - [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) - [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) - [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html) ## Related Resources - [Apertus Tech Report](https://github.com/swiss-ai/apertus-tech-report/blob/main/Apertus_Tech_Report.pdf) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/apertus/apertus-8b-qlora.yaml ================================================ base_model: swiss-ai/Apertus-8B-Instruct-2509 # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: false load_in_4bit: true datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/lora-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: 
wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/arcee/README.md ================================================ # Finetune ArceeAI's AFM with Axolotl [Arcee Foundation Models (AFM)](https://huggingface.co/collections/arcee-ai/afm-45b-68823397c351603014963473) are a family of 4.5B parameter open weight models trained by Arcee.ai. This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking. Thanks to the team at Arcee.ai for using Axolotl in supervised fine-tuning the AFM model. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as AFM is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html). Here is an example of how to install from main for pip: ```bash # Ensure you have Pytorch installed (Pytorch 2.6.0 min) git clone https://github.com/axolotl-ai-cloud/axolotl.git cd axolotl pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja pip3 install --no-build-isolation -e '.[flash-attn]' # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy python scripts/cutcrossentropy_install.py | sh ``` 2. Run the finetuning example: ```bash axolotl train examples/arcee/afm-4.5b-qlora.yaml ``` This config uses about 7.8GiB VRAM. Let us know how it goes. Happy finetuning! 🚀 ### TIPS - For inference, the official Arcee.ai team recommends `top_p: 0.95`, `temperature: 0.5`, `top_k: 50`, and `repeat_penalty: 1.1`. 
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). - The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). ## Optimization Guides - [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) - [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) - [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html) ## Related Resources - [AFM Blog](https://docs.arcee.ai/arcee-foundation-models/introduction-to-arcee-foundation-models) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/arcee/afm-4.5b-qlora.yaml ================================================ base_model: arcee-ai/AFM-4.5B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: false load_in_4bit: true datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/lora-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 
flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/archived/README.md ================================================ # Archived Examples This directory contains examples that are no longer maintained and may no longer be functional. We keep them around for archival purposes in case they are useful to others. ================================================ FILE: examples/archived/cerebras/btlm-ft.yml ================================================ base_model: cerebras/btlm-3b-8k-base # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: GPT2Tokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true tokenizer_use_fast: true tokenizer_legacy: true push_dataset_to_hub: hf_use_auth_token: true datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: last_prepared_run val_set_size: 0.05 adapter: lora_model_dir: sequence_len: 2048 max_packed_sequence_len: sample_packing: false sample_packing_eff_est: sample_packing_seq_len_multiplier: total_num_tokens: lora_r: lora_alpha: lora_dropout: lora_target_modules: lora_target_linear: wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: output_dir: ./outputs/btlm-out gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_fused adam_beta2: 0.95 adam_eps: 0.000000001 max_grad_norm: 1.0 torchdistx_path: lr_scheduler: cosine lr_quadratic_warmup: true learning_rate: 0.000085 train_on_inputs: true group_by_length: false bf16: auto tf32: true gradient_checkpointing: false resume_from_checkpoint: logging_steps: 1 flash_attention: true sdp_attention: flash_optimum: gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 
save_total_limit: weight_decay: 0.1 special_tokens: pad_token: "<|endoftext|>" fsdp: # - full_shard # - auto_wrap fsdp_config: # fsdp_state_dict_type: FULL_STATE_DICT # fsdp_transformer_layer_cls_to_wrap: BTLMBlock ================================================ FILE: examples/archived/cerebras/qlora.yml ================================================ base_model: cerebras/Cerebras-GPT-1.3B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true push_dataset_to_hub: datasets: - path: teknium/GPT4-LLM-Cleaned type: alpaca dataset_prepared_path: val_set_size: 0.05 adapter: qlora lora_model_dir: sequence_len: 2048 lora_r: 16 lora_alpha: 32 lora_dropout: 0.05 lora_target_modules: - c_fc - c_attn - c_proj lora_target_linear: wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: output_dir: ./outputs/qlora-out batch_size: 4 micro_batch_size: 4 num_epochs: 2 optimizer: paged_adamw_8bit torchdistx_path: lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 xformers_attention: true flash_attention: gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 special_tokens: pad_token: "<|endoftext|>" ================================================ FILE: examples/archived/code-llama/13b/lora.yml ================================================ base_model: codellama/CodeLlama-13b-hf # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: CodeLlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: true load_in_4bit: false datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 
16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: bos_token: "" eos_token: "" unk_token: "" ================================================ FILE: examples/archived/code-llama/13b/qlora.yml ================================================ base_model: codellama/CodeLlama-13b-hf # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: CodeLlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/qlora-out adapter: qlora lora_model_dir: sequence_len: 4096 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: paged_adamw_32bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: bos_token: "" eos_token: "" unk_token: "" ================================================ FILE: examples/archived/code-llama/34b/lora.yml ================================================ base_model: codellama/CodeLlama-34b-hf # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: CodeLlamaTokenizer # Automatically upload 
checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: true load_in_4bit: false datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: bos_token: "" eos_token: "" unk_token: "" ================================================ FILE: examples/archived/code-llama/34b/qlora.yml ================================================ base_model: codellama/CodeLlama-34b-hf # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: CodeLlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/qlora-out adapter: qlora lora_model_dir: sequence_len: 4096 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: paged_adamw_32bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: bos_token: "" eos_token: "" 
unk_token: "" ================================================ FILE: examples/archived/code-llama/7b/lora.yml ================================================ base_model: codellama/CodeLlama-7b-hf # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: CodeLlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: true load_in_4bit: false datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: bos_token: "" eos_token: "" unk_token: "" ================================================ FILE: examples/archived/code-llama/7b/qlora.yml ================================================ base_model: codellama/CodeLlama-7b-hf # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: CodeLlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/qlora-out adapter: qlora lora_model_dir: sequence_len: 4096 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 
micro_batch_size: 2 num_epochs: 4 optimizer: paged_adamw_32bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: bos_token: "" eos_token: "" unk_token: "" ================================================ FILE: examples/archived/code-llama/README.md ================================================ # Overview This is an example of CodeLLaMA configuration for 7b, 13b and 34b. The 7b variant fits on any 24GB VRAM GPU and will take up about 17 GB of VRAM during training if using qlora and 20 GB if using lora. On a RTX 4090 it trains 3 epochs of the default dataset in about 15 minutes. The 13b variant will fit if you change these settings to these values: gradient_accumulation_steps: 2 micro_batch_size: 1 The 34b variant does not fit on 24GB of VRAM - you will need something with +40 gb VRAM that also supports flash attention v2 - A6000 or A100 are good choices. 
```shell accelerate launch scripts/finetune.py examples/code-llama/[MODEL_SIZE]/qlora.yml ``` or ```shell accelerate launch scripts/finetune.py examples/code-llama/[MODEL_SIZE]/lora.yml ``` ================================================ FILE: examples/archived/dbrx/16bit-lora.yaml ================================================ base_model: LnL-AI/dbrx-base-converted-v2 # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true datasets: - path: tatsu-lab/alpaca type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out sequence_len: 512 sample_packing: false pad_to_sequence_len: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: adapter: lora lora_model_dir: lora_r: 8 lora_alpha: 16 lora_dropout: 0.05 # w1, w2, & v1 will hang the trainer lora_target_modules: - q_proj # attn - k_proj # attn - v_proj # attn - out_proj # attn - layer # router # - w1 # - w2 # - v1 gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer: paged_adamw_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: false # don't use with fsdp_activation_checkpointing gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 fsdp: - full_shard - auto_wrap fsdp_config: fsdp_limit_all_gathers: true fsdp_sync_module_states: true fsdp_offload_params: false fsdp_use_orig_params: false fsdp_cpu_ram_efficient_loading: true fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_transformer_layer_cls_to_wrap: DbrxBlock fsdp_state_dict_type: FULL_STATE_DICT fsdp_activation_checkpointing: true ================================================ FILE: examples/archived/dbrx/8bit-lora.yaml ================================================ base_model: LnL-AI/dbrx-base-converted-v2 # Automatically upload 
checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true load_in_8bit: true load_in_4bit: false datasets: - path: tatsu-lab/alpaca type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out sequence_len: 512 sample_packing: false pad_to_sequence_len: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: adapter: lora lora_model_dir: lora_r: 8 lora_alpha: 16 lora_dropout: 0.05 # w1, w2, & v1 will hang the trainer lora_target_modules: - q_proj # attn - k_proj # attn - v_proj # attn - out_proj # attn - layer # router # - w1 # - w2 # - v1 gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer: paged_adamw_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: false # don't use with fsdp_activation_checkpointing gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 fsdp: - full_shard - auto_wrap fsdp_config: fsdp_limit_all_gathers: true fsdp_sync_module_states: true fsdp_offload_params: false fsdp_use_orig_params: false fsdp_cpu_ram_efficient_loading: true fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_transformer_layer_cls_to_wrap: DbrxBlock fsdp_state_dict_type: FULL_STATE_DICT fsdp_activation_checkpointing: true ================================================ FILE: examples/archived/dbrx/README.md ================================================ # DBRX MoE Currently, for LoRA, only the `q_proj`, `k_proj`, `v_proj`, `out_proj` and `layer` Linear layers are trainable. We are using the "converted" base models based on [this issue](https://huggingface.co/databricks/dbrx-instruct/discussions/10) where the Experts are fused as an `nn.Parameter` rather than an `nn.Linear` layer. 
However, the implementation is still a bit buggy and attempting to train a LoRA adapter over those `w1`, `w2` and `v1` layers results in the trainer hanging. ### FSDP We've tested using the [`LnL-AI/dbrx-base-converted-v2`](https://huggingface.co/LnL-AI/dbrx-base-converted-v2) model as the base model for FSDP. The high memory usage seen w/ FSDP is due to FSDP not supporting 8bit optimizers. - 16-bit LoRA w/ FSDP - ✅ w/o CPU Offload - 8x80GB uses ~80GiB/gpu - ❌ w/ CPU Offload - `paged_adamw_8bit` optimizer errors from being on cpu - ✅ 8-bit LoRA w/ FSDP - ❌ 4-bit QLoRA w/ FSDP - errors w/: `Error an illegal memory access was encountered at line 90 in file /src/csrc/ops.cu` - ✅ bf16 full finetune w/ FSDP, freezing all but first 8 layers (8x80GB uses ~78GiB/gpu) ### Deepspeed WIP ================================================ FILE: examples/archived/dbrx/fft-ds-zero3.yaml ================================================ base_model: LnL-AI/dbrx-base-converted-v2 # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true datasets: - path: tatsu-lab/alpaca type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out sequence_len: 512 sample_packing: false pad_to_sequence_len: false unfrozen_parameters: - transformer.blocks.[0-7]. 
wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer: paged_adamw_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 deepspeed: deepspeed_configs/zero3_bf16.json ================================================ FILE: examples/archived/deepcoder/deepcoder-14B-preview-lora.yml ================================================ base_model: agentica-org/DeepCoder-14B-Preview # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: true load_in_4bit: false strict: false datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true eval_sample_packing: false adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: ================================================ FILE: examples/archived/falcon/config-7b-lora.yml ================================================ base_model: tiiuae/falcon-7b # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name # required by falcon custom 
model code: https://huggingface.co/tiiuae/falcon-7b/tree/main trust_remote_code: true load_in_8bit: true load_in_4bit: false gptq: false push_dataset_to_hub: datasets: - path: teknium/GPT4-LLM-Cleaned type: alpaca:chat dataset_prepared_path: val_set_size: 0.05 adapter: lora lora_model_dir: sequence_len: 2048 max_packed_sequence_len: lora_r: 16 lora_alpha: 32 lora_dropout: 0.0 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: output_dir: ./outputs/falcon-7b batch_size: 2 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine learning_rate: 0.00003 bf16: auto tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 xformers_attention: true flash_attention: gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: "<|endoftext|>" bos_token: "<|endoftext|>" eos_token: "<|endoftext|>" ================================================ FILE: examples/archived/falcon/config-7b-qlora.yml ================================================ # 1b: tiiuae/falcon-rw-1b # 40b: tiiuae/falcon-40b base_model: tiiuae/falcon-7b # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name # required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main trust_remote_code: true load_in_8bit: false # enable 4bit for QLoRA load_in_4bit: true gptq: false push_dataset_to_hub: datasets: - path: QingyiSi/Alpaca-CoT data_files: - Chain-of-Thought/formatted_cot_data/gsm8k_train.json type: "alpaca:chat" dataset_prepared_path: val_set_size: 0.05 # enable QLoRA adapter: qlora lora_model_dir: sequence_len: 2048 max_packed_sequence_len: # hyperparameters from QLoRA paper Appendix B.2 # "We find hyperparameters to be largely robust across 
datasets" lora_r: 64 lora_alpha: 16 # 0.1 for models up to 13B # 0.05 for 33B and 65B models lora_dropout: 0.05 # add LoRA modules on all linear layers of the base model lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: output_dir: ./outputs/qlora-out # QLoRA paper Table 9 # - 16 for 7b & 13b # - 32 for 33b, 64 for 65b # Max size tested on A6000 # - 7b: 40 # - 40b: 4 # decrease if OOM, increase for max VRAM utilization micro_batch_size: 1 gradient_accumulation_steps: 2 num_epochs: 4 # Optimizer for QLoRA optimizer: paged_adamw_32bit torchdistx_path: lr_scheduler: cosine # QLoRA paper Table 9 # - 2e-4 for 7b & 13b # - 1e-4 for 33b & 65b learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true # stop training after this many evaluation losses have increased in a row # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback early_stopping_patience: 3 resume_from_checkpoint: auto_resume_from_checkpoints: true logging_steps: 1 xformers_attention: true flash_attention: gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.000001 special_tokens: pad_token: "<|endoftext|>" bos_token: "<|endoftext|>" eos_token: "<|endoftext|>" ================================================ FILE: examples/archived/falcon/config-7b.yml ================================================ base_model: tiiuae/falcon-7b # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name # required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main trust_remote_code: true gptq: false push_dataset_to_hub: datasets: - path: teknium/GPT4-LLM-Cleaned type: alpaca:chat dataset_prepared_path: val_set_size: 0.05 adapter: lora_model_dir: sequence_len: 2048 
max_packed_sequence_len: lora_r: 64 lora_alpha: 32 lora_dropout: 0.0 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: output_dir: ./outputs/falcon-7b batch_size: 2 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine learning_rate: 0.00003 bf16: auto tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 xformers_attention: true flash_attention: gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: "<|endoftext|>" bos_token: "<|endoftext|>" eos_token: "<|endoftext|>" ================================================ FILE: examples/archived/gemma/qlora.yml ================================================ # use google/gemma-7b if you have access base_model: mhenrichsen/gemma-7b # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true # huggingface repo datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca val_set_size: 0.1 output_dir: ./outputs/out adapter: qlora lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true sequence_len: 4096 sample_packing: true eval_sample_packing: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 3 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: ================================================ FILE: examples/archived/gptj/qlora.yml ================================================ base_model: EleutherAI/gpt-j-6b # 
Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true push_dataset_to_hub: datasets: - path: teknium/GPT4-LLM-Cleaned type: alpaca dataset_prepared_path: val_set_size: 0.05 adapter: qlora lora_model_dir: sequence_len: 2048 max_packed_sequence_len: lora_r: 8 lora_alpha: 32 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: output_dir: ./outputs/qlora-out gradient_accumulation_steps: 2 micro_batch_size: 2 num_epochs: 2 optimizer: paged_adamw_8bit torchdistx_path: lr_scheduler: cosine learning_rate: 0.0001 bf16: auto tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 xformers_attention: true flash_attention: gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 special_tokens: pad_token: "<|endoftext|>" ================================================ FILE: examples/archived/jeopardy-bot/config.yml ================================================ base_model: huggyllama/llama-7b # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false datasets: - path: openaccess-ai-collective/jeopardy type: jeopardy dataset_prepared_path: val_set_size: 0.02 adapter: lora_model_dir: sequence_len: 512 max_packed_sequence_len: lora_r: lora_alpha: lora_dropout: lora_target_modules: lora_fan_in_fan_out: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: output_dir: ./outputs/jeopardy-bot-7b gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine learning_rate: 0.00003 bf16: auto tf32: true resume_from_checkpoint: logging_steps: 5 xformers_attention: true flash_attention: gptq_groupsize: 
gptq_model_v1: warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 tokens: bos_token: "" eos_token: "" unk_token: "" ================================================ FILE: examples/archived/mpt-7b/README.md ================================================ # MPT-7B ```shell accelerate launch scripts/finetune.py examples/mpt-7b/config.yml ``` ================================================ FILE: examples/archived/mpt-7b/config.yml ================================================ base_model: mosaicml/mpt-7b # optionally might have model_type or tokenizer_type tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true # required for mpt as their model class is not merged into transformers yet load_in_8bit: false datasets: - path: vicgalle/alpaca-gpt4 type: alpaca dataset_prepared_path: val_set_size: 0.02 adapter: lora_model_dir: sequence_len: 2048 max_packed_sequence_len: lora_r: 8 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: - q_proj - v_proj lora_fan_in_fan_out: false wandb_project: mpt-alpaca-7b wandb_entity: wandb_watch: wandb_name: wandb_log_model: output_dir: ./outputs/mpt-alpaca-7b gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine learning_rate: 0.0000002 bf16: auto tf32: true resume_from_checkpoint: logging_steps: 5 flash_attention: gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0001 tokens: pad_token: "<|padding|>" bos_token: "<|endoftext|>" eos_token: "<|endoftext|>" unk_token: "<|endoftext|>" ================================================ FILE: examples/archived/openllama-3b/README.md ================================================ # openllama-3b Basic full tune ```shell accelerate launch scripts/finetune.py examples/openllama-3b/config.yml ``` LoRA ```shell accelerate launch scripts/finetune.py 
examples/openllama-3b/lora.yml ``` QLoRA ```shell accelerate launch scripts/finetune.py examples/openllama-3b/qlora.yml ``` ================================================ FILE: examples/archived/openllama-3b/config.yml ================================================ base_model: openlm-research/open_llama_3b_v2 # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name push_dataset_to_hub: datasets: - path: teknium/GPT4-LLM-Cleaned type: alpaca dataset_prepared_path: val_set_size: 0.02 adapter: lora_model_dir: sequence_len: 1024 sample_packing: true lora_r: lora_alpha: lora_dropout: lora_target_modules: lora_target_linear: wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: output_dir: ./outputs/openllama-out gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine learning_rate: 0.000003 float16: true bf16: false fp16: false tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 special_tokens: bos_token: "" eos_token: "" unk_token: "" ================================================ FILE: examples/archived/openllama-3b/lora.yml ================================================ base_model: openlm-research/open_llama_3b_v2 # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: true load_in_4bit: false push_dataset_to_hub: datasets: - path: teknium/GPT4-LLM-Cleaned type: alpaca dataset_prepared_path: val_set_size: 0.02 adapter: lora lora_model_dir: sequence_len: 1024 sample_packing: true lora_r: 8 
lora_alpha: 16 lora_dropout: 0.0 lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: output_dir: ./outputs/lora-out gradient_accumulation_steps: 1 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine learning_rate: 0.0002 bf16: false fp16: true tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 special_tokens: bos_token: "" eos_token: "" unk_token: "" ================================================ FILE: examples/archived/openllama-3b/qlora.yml ================================================ base_model: openlm-research/open_llama_3b_v2 # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true push_dataset_to_hub: datasets: - path: teknium/GPT4-LLM-Cleaned type: alpaca dataset_prepared_path: val_set_size: 0.05 adapter: qlora lora_model_dir: sequence_len: 1024 sample_packing: true lora_r: 8 lora_alpha: 32 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: output_dir: ./outputs/qlora-out gradient_accumulation_steps: 1 micro_batch_size: 2 num_epochs: 4 optimizer: paged_adamw_32bit torchdistx_path: lr_scheduler: cosine learning_rate: 0.0002 bf16: false fp16: true tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 special_tokens: bos_token: "" eos_token: "" unk_token: "" ================================================ FILE: 
examples/archived/pythia/lora.yml ================================================ base_model: EleutherAI/pythia-1.4b-deduped # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: true datasets: - path: teknium/GPT4-LLM-Cleaned type: alpaca dataset_prepared_path: val_set_size: 0.05 adapter: lora lora_model_dir: sequence_len: 512 lora_r: 16 lora_alpha: 32 lora_dropout: 0.05 lora_target_modules: - query_key_value lora_target_linear: lora_fan_in_fan_out: true # pythia/GPTNeoX lora specific wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: output_dir: ./outputs/lora-alpaca-pythia gradient_accumulation_steps: 1 micro_batch_size: 4 num_epochs: 4 learning_rate: 0.00001 bf16: auto tf32: true resume_from_checkpoint: weight_decay: 0.1 evals_per_epoch: 4 logging_steps: 1 ================================================ FILE: examples/archived/pythia-12b/README.md ================================================ # Pythia 12B - Single-GPU A100 only (?) ```shell python scripts/finetune.py examples/pythia-12b/config.yml ``` ⚠️ Multiple-GPU A100 - Doesn't seem to work with multi-gpu without causing OOM! 
⚠️ ================================================ FILE: examples/archived/pythia-12b/config.yml ================================================ base_model: EleutherAI/pythia-12b-deduped base_model_ignore_patterns: pytorch* # prefer safetensors # optionally might have model_type or tokenizer_type model_type: GPTNeoXForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name gptq: false device_map: auto datasets: - path: vicgalle/alpaca-gpt4 type: alpaca dataset_prepared_path: val_set_size: 0.05 adapter: lora_model_dir: sequence_len: 2048 max_packed_sequence_len: 2048 lora_r: 64 lora_alpha: 32 lora_dropout: 0.0 lora_target_linear: true lora_fan_in_fan_out: true # pythia/GPTNeoX lora specific wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: output_dir: ./outputs/pythia-12b gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 5 learning_rate: 0.00003 optimizer: adamw_bnb_8bit lr_scheduler: cosine bf16: false fp16: false float16: true tf32: true flash_optimum: true resume_from_checkpoint: gradient_checkpointing: true ================================================ FILE: examples/archived/qwen/README.md ================================================ # Qwen TODO # Qwen2 MoE ✅ multipack ✅ qwen2_moe 4-bit QLoRA ✅ qwen2_moe 16-bit LoRA ❓ qwen2_moe 8-bit LoRA ================================================ FILE: examples/archived/qwen/lora.yml ================================================ base_model: Qwen/Qwen-7B # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true load_in_8bit: true load_in_4bit: false datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/lora-out sequence_len: 2048 # supports up to 8192 
sample_packing: false pad_to_sequence_len: adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: false resume_from_checkpoint: logging_steps: 1 flash_attention: warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: ================================================ FILE: examples/archived/qwen/qlora.yml ================================================ base_model: Qwen/Qwen-7B # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true load_in_8bit: false load_in_4bit: true datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/lora-out sequence_len: 2048 # supports up to 8192 sample_packing: false pad_to_sequence_len: adapter: qlora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: false resume_from_checkpoint: logging_steps: 1 flash_attention: warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: ================================================ FILE: examples/archived/qwen/qwen2-moe-lora.yaml ================================================ base_model: Qwen/Qwen1.5-MoE-A2.7B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name 
trust_remote_code: true datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/out sequence_len: 1024 # supports up to 32k sample_packing: false pad_to_sequence_len: false adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 4 optimizer: paged_adamw_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: ================================================ FILE: examples/archived/qwen/qwen2-moe-qlora.yaml ================================================ base_model: Qwen/Qwen1.5-MoE-A2.7B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true load_in_8bit: false load_in_4bit: true datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/out sequence_len: 1024 # supports up to 32k sample_packing: false pad_to_sequence_len: false adapter: qlora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 4 optimizer: paged_adamw_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: ================================================ FILE: 
examples/archived/redpajama/README.md ================================================ # RedPajama 3B preview release ```shell accelerate launch scripts/finetune.py examples/redpajama/config-3b.yml ``` ================================================ FILE: examples/archived/redpajama/config-3b.yml ================================================ base_model: togethercomputer/RedPajama-INCITE-Chat-3B-v1 # optionally might have model_type or tokenizer_type model_type: GPTNeoXForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: load_in_8bit: false datasets: - path: vicgalle/alpaca-gpt4 type: alpaca dataset_prepared_path: val_set_size: 0.02 adapter: lora_model_dir: sequence_len: 2048 max_packed_sequence_len: lora_r: 8 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: - q_proj - v_proj lora_fan_in_fan_out: false wandb_project: redpajama-alpaca-3b wandb_entity: wandb_watch: wandb_name: wandb_log_model: output_dir: ./outputs/redpajama-alpaca-3b batch_size: 4 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine learning_rate: 0.0000002 bf16: auto tf32: true resume_from_checkpoint: logging_steps: 5 flash_attention: gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0001 tokens: pad_token: "<|padding|>" bos_token: "<|endoftext|>" eos_token: "<|endoftext|>" unk_token: "<|endoftext|>" ================================================ FILE: examples/archived/replit-3b/config-lora.yml ================================================ base_model: replit/replit-code-v1-3b # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true load_in_8bit: false datasets: - path: vicgalle/alpaca-gpt4 type: alpaca dataset_prepared_path: val_set_size: 0.05 adapter: lora lora_model_dir: sequence_len: 2048 max_packed_sequence_len: 
lora_r: 8 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: - Wqkv - mlp_up - mlp_down wandb_project: lora-replit wandb_entity: wandb_watch: wandb_name: wandb_log_model: output_dir: ./outputs/lora-replit batch_size: 8 micro_batch_size: 1 num_epochs: 4 optimizer: torchdistx_path: lr_scheduler: learning_rate: 0.00001 bf16: auto tf32: true gradient_checkpointing: resume_from_checkpoint: logging_steps: 1 flash_attention: gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0 #special_tokens: ================================================ FILE: examples/archived/stablelm-2/1.6b/fft.yml ================================================ base_model: stabilityai/stablelm-2-1_6b # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.05 output_dir: ./outputs/out sequence_len: 4096 sample_packing: true adapter: lora_model_dir: lora_r: lora_alpha: lora_dropout: lora_target_linear: wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true flash_attn_cross_entropy: false flash_attn_rms_norm: true flash_attn_fuse_mlp: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 deepspeed: #deepspeed_configs/zero2.json # multi-gpu only weight_decay: 0.1 special_tokens: ================================================ FILE: examples/archived/stablelm-2/1.6b/lora.yml ================================================ base_model: stabilityai/stablelm-2-1_6b # optionally might have 
model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true load_in_8bit: true load_in_4bit: false datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true flash_attn_cross_entropy: false flash_attn_rms_norm: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: ================================================ FILE: examples/archived/stablelm-2/README.md ================================================ # StableLM 2 This repository contains examples for training and processing using StableLM-2. It also includes a section to help you estimate the GPU requirements for your specific use case. ## Estimating GPU Requirements | type | deepspeed | batch size | context length | vRAM GPU (GBs) | |---------------|-----------|------------|----------------|----------------| | full finetune | N/A | 1 | 4096 | ~21.5GBs | | full finetune | zero2 | 1 | 4096 | ~20GBs | | lora | N/A | 1 | 4096 | ~16.6GBs | The above are estimates and might differ slightly depending on the setup, for example whether you pack your sequence lengths or not (the above assumes you do to length 4096). 
This blog post from Hamel Husain was a great resource for estimating these numbers: https://hamel.dev/notes/llm/03_estimating_vram.html ## Training We have example scripts here for both full finetuning and lora using the popular alpaca dataset: ```shell # preprocess the dataset CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess examples/stablelm-2/1.6b/lora.yml ``` Single GPU Training: ```shell python -m axolotl.cli.train examples/stablelm-2/1.6b/fft.yml --deepspeed deepspeed_configs/zero2.json # OR python -m axolotl.cli.train examples/stablelm-2/1.6b/lora.yml ``` Multinode GPU Training with `accelerate`: ```shell # make sure you've configured accelerate properly accelerate launch -m axolotl.cli.train examples/stablelm-2/1.6b/fft.yml --deepspeed deepspeed_configs/zero2.json ``` ================================================ FILE: examples/archived/starcoder2/qlora.yml ================================================ base_model: bigcode/starcoder2-3b # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.2 output_dir: ./outputs/qlora adapter: qlora lora_model_dir: sequence_len: 8192 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_run_id: wandb_log_model: gradient_accumulation_steps: 8 micro_batch_size: 2 num_epochs: 3 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 2e-5 bf16: auto fp16: false tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 eval_steps: saves_per_epoch: 4 save_steps: save_total_limit: 2 weight_decay: special_tokens: ================================================ FILE: examples/archived/tiny-llama/README.md ================================================ # Overview This is a
simple example of how to finetune TinyLlama1.1B using either lora or qlora: LoRa: ``` accelerate launch -m axolotl.cli.train examples/tiny-llama/lora.yml ``` qLoRa: ``` accelerate launch -m axolotl.cli.train examples/tiny-llama/qlora.yml ``` Both take about 10 minutes to complete on a 4090. ================================================ FILE: examples/archived/tiny-llama/lora-mps.yml ================================================ base_model: TinyLlama/TinyLlama_v1.1 # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: true load_in_4bit: false datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0 output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true eval_sample_packing: false adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 0.0002 bf16: auto fp16: false tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: false warmup_ratio: 0.1 evals_per_epoch: 0 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: ================================================ FILE: examples/archived/tiny-llama/lora.yml ================================================ base_model: TinyLlama/TinyLlama_v1.1 # optionally might have model_type or tokenizer_type tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: true load_in_4bit: false datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/lora-out sequence_len: 4096 
sample_packing: true eval_sample_packing: false adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: ================================================ FILE: examples/archived/tiny-llama/pretrain.yml ================================================ base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name max_steps: 200 pretraining_dataset: - path: allenai/c4 name: en type: pretrain dataset_prepared_path: val_set_size: 0.0 output_dir: ./outputs/model-out sequence_len: 2048 sample_packing: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 special_tokens: ================================================ FILE: examples/archived/tiny-llama/qlora.yml ================================================ base_model: TinyLlama/TinyLlama_v1.1 # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true 
datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/qlora-out adapter: qlora lora_model_dir: sequence_len: 4096 sample_packing: true eval_sample_packing: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: paged_adamw_32bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: ================================================ FILE: examples/archived/xgen-7b/xgen-7b-8k-qlora.yml ================================================ # An example finetuning Salesforce's XGen-7b model with 8k context using qlora # on Tim Dettmers' Guanaco dataset. base_model: Salesforce/xgen-7b-8k-base # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true load_in_8bit: false # enable 4bit for QLoRA load_in_4bit: true gptq: false push_dataset_to_hub: datasets: - path: timdettmers/openassistant-guanaco data_files: - openassistant_best_replies_train.jsonl type: "completion" dataset_prepared_path: val_set_size: 0.05 # enable QLoRA adapter: qlora lora_model_dir: sequence_len: 8192 max_packed_sequence_len: # hyperparameters from QLoRA paper Appendix B.2 # "We find hyperparameters to be largely robust across datasets" lora_r: 64 lora_alpha: 16 # 0.1 for models up to 13B # 0.05 for 33B and 65B models lora_dropout: 0.05 # add LoRA modules on all linear layers of the base model lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: output_dir:
./outputs/qlora-out # QLoRA paper Table 9 # - 16 for 7b & 13b # - 32 for 33b, 64 for 65b # Max size tested on A6000 # - 7b: 40 # - 40b: 4 # decrease if OOM, increase for max VRAM utilization micro_batch_size: 1 gradient_accumulation_steps: 1 num_epochs: 4 # Optimizer for QLoRA optimizer: paged_adamw_32bit torchdistx_path: lr_scheduler: cosine # QLoRA paper Table 9 # - 2e-4 for 7b & 13b # - 1e-4 for 33b & 65b learning_rate: 0.00002 bf16: auto tf32: false gradient_checkpointing: true # stop training after this many evaluation losses have increased in a row # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback early_stopping_patience: 3 resume_from_checkpoint: auto_resume_from_checkpoints: true logging_steps: 1 xformers_attention: true flash_attention: gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: eos_token: "<|endoftext|>" bos_token: "<|endoftext|>" unk_token: "<|endoftext|>" pad_token: "<|endoftext|>" ================================================ FILE: examples/archived/yi-34B-chat/README.md ================================================ # Overview This is an example of a Yi-34B-Chat configuration. It demonstrates that it is possible to finetune a 34B model on a GPU with 24GB of VRAM. Tested on an RTX 4090 with `python -m axolotl.cli.train examples/yi-34B-chat/qlora.yml`, a single epoch of finetuning on the alpaca dataset using qlora runs in 47 mins, using 97% of available memory.
================================================ FILE: examples/archived/yi-34B-chat/qlora.yml ================================================ base_model: 01-ai/Yi-34B-Chat # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true sequence_len: 1024 bf16: auto tf32: false flash_attention: true special_tokens: bos_token: "<|startoftext|>" eos_token: "<|endoftext|>" unk_token: "" # Data datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca warmup_ratio: 0.1 # Iterations num_epochs: 1 # Evaluation val_set_size: 0.1 evals_per_epoch: 5 eval_sample_packing: false eval_batch_size: 1 # LoRA output_dir: ./outputs/qlora-out adapter: qlora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_target_modules: # Sampling sample_packing: false pad_to_sequence_len: false # Batching gradient_accumulation_steps: 4 micro_batch_size: 1 gradient_checkpointing: true # wandb wandb_project: # Optimizer optimizer: paged_adamw_8bit lr_scheduler: cosine learning_rate: 0.0002 # Misc resume_from_checkpoint: logging_steps: 1 weight_decay: 0 ================================================ FILE: examples/cloud/baseten.yaml ================================================ provider: baseten project_name: secrets: - HF_TOKEN - WANDB_API_KEY gpu: h100 gpu_count: 8 node_count: 1 ================================================ FILE: examples/cloud/modal.yaml ================================================ project_name: volumes: - name: axolotl-data mount: /workspace/data - name: axolotl-artifacts mount: /workspace/artifacts # environment variables from local to set as secrets secrets: - HF_TOKEN - WANDB_API_KEY # Which branch of axolotl to use remotely branch: # additional custom commands when building the image dockerfile_commands: gpu: h100 gpu_count: 1 # Train 
specific configurations memory: 128 timeout: 86400 # Preprocess specific configurations memory_preprocess: 32 timeout_preprocess: 14400 ================================================ FILE: examples/cohere/command-r-7b-qlora.yml ================================================ base_model: CohereForAI/c4ai-command-r7b-12-2024 model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer load_in_8bit: false load_in_4bit: true # huggingface repo chat_template: cohere datasets: - path: cgato/SlimOrcaDedupCleaned type: chat_template field_messages: conversations message_property_mappings: role: from content: value val_set_size: 0.0 output_dir: ./outputs/out adapter: qlora lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true sequence_len: 2048 sample_packing: true eval_sample_packing: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/colab-notebooks/colab-axolotl-example.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "OPLSwmgdrB7g" }, "source": [ "# Fine-Tune Qwen3 14B with Axolotl\n", "\n", "[\"Built](https://github.com/axolotl-ai-cloud/axolotl)\n", "\n", "Axolotl is the most performant LLM post-training framework available, delivering faster training with efficient, consistent and stable performance. 
Train your workload and ship your product 30% faster; saving you both time and money.\n", "\n", "- ⭐ us on [GitHub](https://github.com/axolotl-ai-cloud/axolotl)\n", "- 📜 Read the [Docs](http://docs.axolotl.ai/)\n", "- 💬 Chat with us on [Discord](https://discord.gg/mnpEYgRUmD)\n", "- 📰 Get updates on [X/Twitter](https://x.com/axolotl_ai)\n" ] }, { "cell_type": "markdown", "metadata": { "id": "rVjKD7CbxIP3" }, "source": [ "# Installation\n", "\n", "Axolotl is easy to install from [pip](https://pypi.org/project/axolotl/), or use our [pre-built Docker images](http://docs.axolotl.ai/docs/docker.html) for a hassle free dependency experience. See our [docs](http://docs.axolotl.ai/docs/installation.html) for more information." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "msOCO4NRmRLa" }, "outputs": [], "source": [ "%%capture\n", "# This step can take ~5-10 minutes to install dependencies\n", "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n", "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@63b15e6\"" ] }, { "cell_type": "markdown", "metadata": { "id": "N0OW0YeksDLr" }, "source": [ "## Demo: Talk Like a Pirate\n", "\n", "In this demo, we are training the model ***to respond like a pirate***. This was chosen as a way to easily show how to train a model to respond in a certain style of your choosing (without being prompted) and is quite easy to validate within the scope of a Colab." ] }, { "cell_type": "markdown", "metadata": { "id": "8Du2fANTsNCK" }, "source": [ "### Upload your own dataset or use a Huggingface dataset\n", "\n", "You can choose to use your own JSONL file from your own [Google Drive](https://drive.google.com/drive/home); for example downloading the [Pirate-Ultrachat JSONL](https://huggingface.co/datasets/winglian/pirate-ultrachat-10k/blob/main/train.jsonl) to your Google Drive. 
JSONL datasets should be formatted similar to the [OpenAI dataset format](https://cookbook.openai.com/examples/chat_finetuning_data_prep).\n", "\n", "You can also simply use the [`winglian/pirate-ultrachat-10k`](https://huggingface.co/datasets/winglian/pirate-ultrachat-10k) dataset directly.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "fGEEjyQ-r_IV" }, "outputs": [], "source": [ "# Default to HF dataset location\n", "dataset_id = \"winglian/pirate-ultrachat-10k\"\n", "uploaded = {}" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "c5MyYqk7vIsG" }, "outputs": [], "source": [ "import os\n", "\n", "# Optionally, upload your own JSONL to your Google Drive\n", "GOOGLE_DRIVE_PATH = \"\" # ex: \"MyDrive/Colab\\ Notebooks/train.jsonl\"\n", "\n", "# \"Select All\" permissions, or you may get the error:\n", "# \"MessageError: Error: credential propagation was unsuccessful\"\n", "if GOOGLE_DRIVE_PATH:\n", " from google.colab import drive\n", "\n", " # Mount your Google Drive\n", " GOOGLE_DRIVE_MNT = \"/content/drive/\"\n", " drive.mount(GOOGLE_DRIVE_MNT, force_remount=True)\n", " tmp_path = os.path.join(GOOGLE_DRIVE_MNT, GOOGLE_DRIVE_PATH.lstrip(\"/\"))\n", " # make sure file exists\n", " if not os.path.isfile(tmp_path):\n", " raise ValueError(f\"File {tmp_path} does not exist\")\n", " dataset_id = tmp_path" ] }, { "cell_type": "markdown", "metadata": { "id": "U6pTk3A9xj1W" }, "source": [ "# Configure for Supervised Fine-Tuning (SFT)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 151, "referenced_widgets": [ "388f618924274d21a066f098f4f1e744", "7c95f85a2b1f47a1bd846d110c47bb3c", "083f9cda8d754c168beee10d2f8955a2", "62e1a65582f446a78612eaa804e08a7d", "487a177d020f4605834878b2fdc7afa3", "7fd44cf9ca6e4726bfd7ac21846d6a14", "366a343b62fa47d8985a3bd464d99f9e", "a0a11e929edd4189b79723d618522c33", "e87ea87fcff247b5bbcc331ba79a8dc2", 
"5e18768f7ad6434ba8b8b8a2e853e204", "bb33aec33a6447078c31bfd728942994" ] }, "id": "fdRioqytmTtX", "outputId": "f0acdcec-4b41-4a3f-ffed-c2d2d929158e" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2025-05-08 13:40:27,488] [INFO] [root.register:348] [PID:174] Attempting to load plugin: axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n", "[2025-05-08 13:40:27,493] [INFO] [root.register:351] [PID:174] Plugin loaded successfully: axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n", "[2025-05-08 13:40:27,959] [INFO] [axolotl.utils.schemas.config.check_eval_packing:721] [PID:174] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`\u001b[39m\n", "[2025-05-08 13:40:27,960] [INFO] [axolotl.utils.schemas.config.hint_sample_packing_padding:514] [PID:174] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing\u001b[39m\n", "[2025-05-08 13:40:27,961] [INFO] [axolotl.utils.schemas.config.check_bf16:1251] [PID:174] [RANK:0] bf16 support detected, but not enabled for this configuration.\u001b[39m\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "388f618924274d21a066f098f4f1e744", "version_major": 2, "version_minor": 0 }, "text/plain": [ "config.json: 0%| | 0.00/728 [00:00\"],\n", " }\n", " ],\n", " dataloader_prefetch_factor=8, # dataloader optimizations\n", " dataloader_num_workers=2,\n", " dataloader_pin_memory=True,\n", ")\n", "\n", "# validates the configuration\n", "cfg = load_cfg(config)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "715UpvnSoBIS" }, "outputs": [], "source": [ "from axolotl.utils import set_pytorch_cuda_alloc_conf\n", "\n", "set_pytorch_cuda_alloc_conf()" ] }, { "cell_type": "markdown", "metadata": { "id": "Vc6MC-hwyH-n" }, "source": [ "# Datasets\n", "\n", "Axolotl has a robust suite of loaders and transforms to parse most open datasets of any format into the appropriate chat template for your 
model. Axolotl will mask input tokens from the user's prompt so that the train loss is only calculated against the model's response. For more information, [see our documentation](http://docs.axolotl.ai/docs/dataset-formats/conversation.html) on dataset preparation.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000, "referenced_widgets": [ "b82aa8c57f7c422a9a9c90f333ed2a99", "c0991cf63ee6458b96e9a75e7a88b61a", "71c8af139cd248b1b51101fd46a93f35", "1d5117195d4b49eb8f1a73b18419f7ce", "3c21e4a511b4441192c03b7f1d0976e9", "ed28e2e0410d4e0b855467e798e53d66", "d93f134f802b4b69b575bdaf07dbd27c", "d0e9dce55cec4c1ca619a0ccf209d924", "4c727d40ef0443449afc31724ee79f0c", "0dea5caa27384f5689e3cab51f558727", "a6f48410b9964fefba0c3009a77dc838", "95caff42f08a4c2aa14c867b8f37f231", "de7c37ee83e24f0c889e84d07279c2ec", "9d4897eefb5f48259ffb2d23e332f752", "253017b0d0534e54ab44e181f6d7c82d", "27beaf06e41b472abdb544a43c720c5a", "34cf3df51fbc41cabfdbba153c007f0e", "ac764024cf1c4e08ba7749afd2cd20ac", "30a81da86f8043eca301e86a8651201a", "e8b7a81040904c1e89e58978223b1737", "1c6f1f10667545aaab958016ba7e2c94", "e6e969610738449887259063967f82b0", "a138859f19b74fc0928dc236ab5359db", "9b42e08b3c9548818488268768a118b1", "12b56912736849fea2ad8124456fdc5c", "879c8ab5873847a8833bd74123be90a4", "20352e5f58d24bb8b1f3940efd14fe4a", "d955dcaa0e944e719f3a06139dd54a03", "d3de2662c7964f1ba96e58da382af720", "97e36007e1304e1583fd81bfb13f0edd", "c65dc74c7d6f4bab8f7dd28455161dd8", "ef223e8504b64e3592589880326aaf41", "598da69727bd4fb8b1caf465ac736d7a", "5f86cd894de94c3280fadc1e2fd0ee13", "a20927bf5f2c41f58c1e31ac858ab36c", "0a46ad75c198463d843fb35e813642cb", "09007681cf8d42aeb8c1d2f6a74e470a", "ebc80d1a55fa47f4a5ea2756588569ec", "1811cda0644e4190a9469d1774435d82", "35c811d2ae8e43f3b5cecbdd3cfa857f", "b8e39e4dddc3497fbc29ae45c66da759", "63b4e563e85c4f03b1b72beda9577bcc", "b195f160ca20442fadd8b5aed0ee41af", 
"ca65e32eb52f48c09a84b33cb18f22cd", "7cd0b85ebd204b7aba908417811ce4e0", "7baeab52d6694c32b1efd1ea1a0a7782", "519a7b154022443db6703f04a9142bae", "d4183e9715f34d249942b8271cca3bdf", "da2347ac94764a3fa2743343cf0d3cd2", "93a44a11aa4846fa8efc6c1413ef1627", "a55060adc3564407ac81ad7297d34aaa", "d02274afd47b462291c745f261209d42", "0f417447a7bd4a33acca96fa37aec877", "63580b6fb30642479fe3000915bf551a", "8f726dbfb45d4528afa33e36a6313267", "03b093d592ba4386aa61f7b8483da660", "b8766a88716948cf968f4563531a76d9", "6f3a28b912714c6e931003549664bfa3", "16d1283741404b7bb319094c992fce01", "2a5bb0e818ab47be8cf6465988328503", "2b3a2659b12244bd8548320320016dbf", "0cd7efffbb3c4c4b972e63749f61ab97", "5ca240f31e6b44e3882c5eb37cd5a309", "5eb06edeb58e4930b1affef2a59eae81", "a4e5789584564049b83df7c6c54a3e08", "ff3a94b146a948b6907f5d80c7157f99", "258b7c635c1045329d4669e48c46ccd5", "6f68ed9889f54ad2ae8a3b95ac263a83", "80366349d81e4dcc892db6cd56e384f3", "c73055099c084dca996159e23e162d0b", "977f799afaac4a55b2dc1cffa7d5b63b", "41f3b32c2f6b4034ae7a3b9124e28bc7", "a10d0a76010f4e508c65a9b69ebc5156", "f8ef805b776145c3bfa9ba8d90972058", "cc587493c33c4f118d1b1170f85be24c", "e40d1c1ac9494b3bade9858324e7ffdf", "d65b6b060d9845779299491ac5599c31", "0f6907ebbc6242c8bde059cef1e1bd29", "5bdfd87fc6cd4f9dabef7cfee29c8060", "64f54d4a744a4627a07c3c0120276f3b", "65b75b9b8bc143cf997796af68ff6668", "d6fe74e4255444368f8f90a62157d869", "4d468f96ec924681ad65eb671674b93e", "ad7599de524549c48bf2d3124ad4b299", "0546d04aae644dde846c58a4afb598a6", "897b77a56c09479bb11d7f2a30997e55", "81c3db71ac704280ad030072655f1537", "042e091f75694c47aee761e760e76773", "ef0a3c7a6f14460fb4da096928ae249e", "07fb3a2c8315494e97b447e672dfae06", "ec030fc3c346426f9abc3a89892258d3", "e3fb3fc6afe04b3c9b7ac61809ce78fa", "c3be9109d63c485d9c0ef4f9bc0f9218", "12815f401eba44658caa7b2e490137a8", "30e02aa2d0d241979369e598287f2639", "dfd2a2649b8341ef913207526708aff1", "4f1977d7e4824ef1a14b65f0f42bba10", "c6164e05a1914ae48083db9ad7f4ef7c", 
"813621384dc748b0ad06775e22761c0b", "dc892a596f6942d7973c616c38f0eebb", "c84cc07789be48aebb322c23d355289e", "bed8726b8069434687c75452e21f19e5", "16a188a0b06d45f980dcf3933509fe0a", "60c1a0d765c14a1d888317e6a507e4ea", "0077aedc3d174560bce924ee89e9c006", "00321cce58884f6f9b3855a21fcd9187", "fa864b41586f4a7aa56aeafd1d84eb75", "3225603166b54e7aab766b9964a2f660", "349eee9f56d64f0cba6fc24ff2c50c9b", "7e5d3774060e4589aa65982da5ea4ef4", "7c2485c6cdfe463da6fdb35982a1070d", "ad1236893754446881e153adc9d5c962", "daee63fd167e4441a32324b51b00ad2b", "fe41858c6bd04c58840112b67c19a336", "d262c82138024169b9f3aa034ca756fa", "62e302ebdad64aada0ffe64ae1c873f3", "bd1b0dfed6d34d16af33a4a58330f5ec", "d07c8b97d3314f1c852e44bdd40f61ed", "ebb69a2c3d0a4299a484698287b3087c", "e5a82df528bb4e408797a3b6c2758f4a", "f113ebd8c1c34806bea4dd7ed3035173" ] }, "id": "KQQhgK8FoDfF", "outputId": "f69441d8-95f9-4885-c306-6c8709090ff6" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b82aa8c57f7c422a9a9c90f333ed2a99", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer_config.json: 0%| | 0.00/9.68k [00:00\u001b[39m\n", "[2025-05-08 13:41:00,845] [DEBUG] [axolotl.utils.models.load_tokenizer:442] [PID:174] [RANK:0] BOS: None / None\u001b[39m\n", "[2025-05-08 13:41:00,846] [DEBUG] [axolotl.utils.models.load_tokenizer:443] [PID:174] [RANK:0] PAD: 151643 / <|endoftext|>\u001b[39m\n", "[2025-05-08 13:41:00,847] [DEBUG] [axolotl.utils.models.load_tokenizer:444] [PID:174] [RANK:0] UNK: None / None\u001b[39m\n", "[2025-05-08 13:41:00,869] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:271] [PID:174] [RANK:0] Unable to find prepared dataset in last_run_prepared/97037817611d38b3a9c681753c3c4c95\u001b[39m\n", "[2025-05-08 13:41:00,870] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:272] [PID:174] [RANK:0] Loading raw datasets...\u001b[39m\n", "\u001b[33m[2025-05-08 13:41:00,870] [WARNING] 
[axolotl.utils.data.sft.load_tokenized_prepared_datasets:274] [PID:174] [RANK:0] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset.\u001b[39m\n", "[2025-05-08 13:41:00,871] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:281] [PID:174] [RANK:0] No seed provided, using default seed of 42\u001b[39m\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7cd0b85ebd204b7aba908417811ce4e0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "train.jsonl: 0%| | 0.00/27.3M [00:00system\\n' }}\n", " {%- if messages[0].role == 'system' %}\n", " {{- messages[0].content + '\\n\\n' }}\n", " {%- endif %}\n", " {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n", " {%- for tool in tools %}\n", " {{- \"\\n\" }}\n", " {{- tool | tojson }}\n", " {%- endfor %}\n", " {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}\n", "{%- else %}\n", " {%- if messages[0].role == 'system' %}\n", " {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n", " {%- endif %}\n", "{%- endif %}\n", "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n", "{%- for message in messages[::-1] %}\n", " {%- set index = (messages|length - 1) - loop.index0 %}\n", " {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('') and message.content.endswith('')) %}\n", " {%- set ns.multi_step_tool = false %}\n", " {%- set ns.last_query_index = index %}\n", " {%- endif %}\n", "{%- endfor %}\n", "{%- for message in messages %}\n", " {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n", " {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n", " 
{%- elif message.role == \"assistant\" %}\n", " {%- set content = message.content %}\n", " {%- set reasoning_content = '' %}\n", " {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n", " {%- set reasoning_content = message.reasoning_content %}\n", " {%- else %}\n", " {%- if '' in message.content %}\n", " {%- set content = message.content.split('')[-1].lstrip('\\n') %}\n", " {%- set reasoning_content = message.content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}\n", " {%- endif %}\n", " {%- endif %}\n", " {%- if loop.index0 > ns.last_query_index %}\n", " {%- if loop.last or (not loop.last and reasoning_content) %}\n", " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content.strip('\\n') + '\\n\\n\\n' + content.lstrip('\\n') }}\n", " {%- else %}\n", " {{- '<|im_start|>' + message.role + '\\n' + content }}\n", " {%- endif %}\n", " {%- else %}\n", " {{- '<|im_start|>' + message.role + '\\n' + content }}\n", " {%- endif %}\n", " {%- if message.tool_calls %}\n", " {%- for tool_call in message.tool_calls %}\n", " {%- if (loop.first and content) or (not loop.first) %}\n", " {{- '\\n' }}\n", " {%- endif %}\n", " {%- if tool_call.function %}\n", " {%- set tool_call = tool_call.function %}\n", " {%- endif %}\n", " {{- '\\n{\"name\": \"' }}\n", " {{- tool_call.name }}\n", " {{- '\", \"arguments\": ' }}\n", " {%- if tool_call.arguments is string %}\n", " {{- tool_call.arguments }}\n", " {%- else %}\n", " {{- tool_call.arguments | tojson }}\n", " {%- endif %}\n", " {{- '}\\n' }}\n", " {%- endfor %}\n", " {%- endif %}\n", " {{- '<|im_end|>\\n' }}\n", " {%- elif message.role == \"tool\" %}\n", " {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n", " {{- '<|im_start|>user' }}\n", " {%- endif %}\n", " {{- '\\n\\n' }}\n", " {{- message.content }}\n", " {{- '\\n' }}\n", " {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n", " {{- '<|im_end|>\\n' }}\n", " {%- endif %}\n", " {%- endif 
%}\n", "{%- endfor %}\n", "{%- if add_generation_prompt %}\n", " {{- '<|im_start|>assistant\\n' }}\n", " {%- if enable_thinking is defined and enable_thinking is false %}\n", " {{- '\\n\\n\\n\\n' }}\n", " {%- endif %}\n", "{%- endif %}\n", "---\u001b[39m\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "258b7c635c1045329d4669e48c46ccd5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Tokenizing Prompts (num_proc=2): 0%| | 0/9985 [00:00\n", " \n", " \n", " [25/25 09:25, Epoch 0/1]\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining Loss
11.092300
21.554200
31.041400
41.733800
51.430000
61.258500
71.343600
81.101700
91.086500
100.813200
110.689600
120.826700
131.541800
140.948000
151.357000
161.085800
171.516800
181.146800
190.834800
200.968000
211.388800
221.511500
231.338500
241.206600
251.504600

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[2025-05-07 22:12:42,746] [INFO] [axolotl.callbacks.on_step_end:128] [PID:1336] [RANK:0] cuda memory usage while training: 9.768GB (+3.287GB cache, +0.646GB misc)\u001b[39m\n", "[2025-05-07 22:21:46,859] [INFO] [axolotl.train.save_trained_model:231] [PID:1336] [RANK:0] Training completed! Saving pre-trained model to ./outputs/qwen-sft-pirate-rrr.\u001b[39m\n" ] } ], "source": [ "from axolotl.train import train\n", "\n", "# just train the first 25 steps for demo.\n", "# This is sufficient to align the model as we've used packing to maximize the trainable samples per step.\n", "cfg.max_steps = 25\n", "model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta)" ] }, { "cell_type": "markdown", "metadata": { "id": "j1b9ypF78eCb" }, "source": [ "# Inferencing the trained model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "r3_vHhif8YEs", "outputId": "e5050605-f6c9-421c-98f9-bde56a281eae" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ahoy there, matey! Shiver me timbers, ye be lookin' for the Pythagorean theorem, eh? Well, hold yer horses and listen up, for I'll be tellin' ye all about it in me own special way.\n", "\n", "The Pythagorean theorem be a real gem of a mathematical trick that helps ye find the length of a side of a right triangle. Now, a right triangle be a triangle with a right angle, which be that little corner that looks like a square. \n", "\n", "The theorem be named after a clever fellow named Pythagoras, who be a mathematician from ancient Greece. He discovered that if ye have a right triangle, the square of the length of the hypotenuse (that be the side opposite the right angle) be equal to the sum of the squares of the other two sides. 
\n", "\n", "In other words, if ye have a triangle with sides of length a, b, and c (\n" ] } ], "source": [ "from transformers import TextStreamer\n", "\n", "messages = [\n", " {\n", " \"role\": \"user\",\n", " \"content\": \"Explain the Pythagorean theorem to me.\",\n", " },\n", "]\n", "\n", "prompt = tokenizer.apply_chat_template(\n", " messages,\n", " add_generation_prompt=True,\n", " tokenize=False,\n", " enable_thinking=False,\n", ")\n", "\n", "outputs = model.generate(\n", " **tokenizer(prompt, return_tensors=\"pt\").to(\"cuda\"),\n", " max_new_tokens=192,\n", " temperature=1.0,\n", " top_p=0.8,\n", " top_k=32,\n", " streamer=TextStreamer(tokenizer, skip_prompt=True),\n", ")" ] }, { "cell_type": "markdown", "metadata": { "id": "HoGwT2JRSIjA" }, "source": [ "# Saving your trained model\n", "\n", "Axolotl automatically saves checkpoints to the `output_dir` path.\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5BmSbiy6NaaS", "outputId": "f5e1d913-7d55-42d2-8340-f9f1b0bc2b38" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 506M\n", "-rw-r--r-- 1 root root 845 May 7 22:21 adapter_config.json\n", "-rw-r--r-- 1 root root 491M May 7 22:21 adapter_model.safetensors\n", "-rw-r--r-- 1 root root 707 May 7 22:11 added_tokens.json\n", "drwxr-xr-x 2 root root 4.0K May 7 22:17 checkpoint-13\n", "drwxr-xr-x 2 root root 4.0K May 7 22:21 checkpoint-25\n", "-rw-r--r-- 1 root root 1.2K May 7 22:11 config.json\n", "-rw-r--r-- 1 root root 1.6M May 7 22:11 merges.txt\n", "-rw-r--r-- 1 root root 2.6K May 7 22:21 README.md\n", "-rw-r--r-- 1 root root 613 May 7 22:11 special_tokens_map.json\n", "-rw-r--r-- 1 root root 9.5K May 7 22:11 tokenizer_config.json\n", "-rw-r--r-- 1 root root 11M May 7 22:11 tokenizer.json\n", "-rw-r--r-- 1 root root 2.7M May 7 22:11 vocab.json\n" ] } ], "source": [ "# Show the saved checkpoints in the output_dir\n", "!ls -lh 
\"./outputs/qwen-sft-pirate-rrr\"" ] }, { "cell_type": "markdown", "metadata": { "id": "_PCIFWxuOZd6" }, "source": [ "Setting `hub_model_id: ` in the original config would have automatically uploaded the model to HuggingFace Hub (e.g. `hub_model_id: username/model_id`)\n", "\n", "If you prefer to manually upload the training artifacts, we can still upload the entire final checkpoint to HuggingFace from the CLI." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 955, "referenced_widgets": [ "c12ea43372ac4d57bb9605f1a429b397", "86816687746246b4a6105e8010384e25", "6f05e9bebf7b40c9835808e77de6c236", "c7433acd3c4841e6958ae8f7e87b1808", "19c1e38389fa46c7b7e2152a56e1df34", "0e067d8db8ed48308a718d5f57683fd1", "131065f118274a1586ac38e39ed84ef0", "8640ac440fbc4644b9a3af7ba3ae7183", "5cea7996f02040b187ece0bb2d6a8d1f", "2e257c8be2da40b4bb67a9e4ab6811f3", "56e3768bef5a4b9db4168c5c17f509c2", "62c028fdef904dedb9cdeca2b3bda725", "a7cf477e80fc43e0ad82c7997b076dce", "835bcc28a5564fb9b3d651bc8e32dc46", "9f1c9a0695384bdaa6f8b847ef89bee8", "b1bea589efa14258a9982071b87938bf", "590eef89881545aa8bbef9a8bbe7fb00", "4b1f04ff63d14a118fdd15814dff50e4", "39789237703c4a418134243055c9cbf5", "a3a945817f684328b34651fe052393ec" ] }, "id": "2yw8pLvlSMl8", "outputId": "6e489ab2-4abe-4e28-84ca-959f912433a4" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c12ea43372ac4d57bb9605f1a429b397", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(HTML(value='

\n", " sys.exit(main())\n", " ^^^^^^\n", " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/huggingface_cli.py\", line 57, in main\n", " service.run()\n", " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/upload.py\", line 207, in run\n", " print(self._upload())\n", " ^^^^^^^^^^^^^^\n", " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/upload.py\", line 302, in _upload\n", " return self.api.upload_folder(\n", " ^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n", " return fn(*args, **kwargs)\n", " ^^^^^^^^^^^^^^^^^^^\n", " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 1633, in _inner\n", " return fn(self, *args, **kwargs)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4942, in upload_folder\n", " commit_info = self.create_commit(\n", " ^^^^^^^^^^^^^^^^^^^\n", " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n", " return fn(*args, **kwargs)\n", " ^^^^^^^^^^^^^^^^^^^\n", " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 1633, in _inner\n", " return fn(self, *args, **kwargs)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4202, in create_commit\n", " self.preupload_lfs_files(\n", " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4483, in preupload_lfs_files\n", " _upload_xet_files(**upload_kwargs, create_pr=create_pr) # type: ignore [arg-type]\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n", " return fn(*args, **kwargs)\n", " ^^^^^^^^^^^^^^^^^^^\n", " File 
\"/usr/local/lib/python3.11/dist-packages/huggingface_hub/_commit_api.py\", line 592, in _upload_xet_files\n", " with progress_cm as progress:\n", " File \"/usr/local/lib/python3.11/dist-packages/tqdm/std.py\", line 1138, in __exit__\n", " def __exit__(self, exc_type, exc_value, traceback):\n", "\n", "KeyboardInterrupt\n", "^C\n" ] } ], "source": [ "from huggingface_hub import notebook_login\n", "\n", "# remove the partial epoch checkpoints\n", "!rm -rf \"./outputs/qwen-sft-pirate-rrr/checkpoint-*\"\n", "\n", "# HF Notebook login widget\n", "notebook_login()\n", "\n", "# upload the LoRA adapter for your model to HF, remember to update the username/model-name below\n", "!huggingface-cli upload --repo-type=model winglian/pirate-qwen-14B \"./outputs/qwen-sft-pirate-rrr\"" ] } ], "metadata": { "accelerator": "GPU", "colab": { "gpuType": "T4", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "00321cce58884f6f9b3855a21fcd9187": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "004d9177a6a14118a5930dc3cc13147b": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_a80410b919e442c49aea15acc1ce1a72", 
"IPY_MODEL_c6e00f5224364822bc4239b176686919", "IPY_MODEL_ec11d1e5ae7b42c883d9b1f38a65356e" ], "layout": "IPY_MODEL_734185351eb543fa9a00a881dcbb9fe7" } }, "0077aedc3d174560bce924ee89e9c006": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "03a3c744d716431488163b4358b80f92": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": 
null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "03b093d592ba4386aa61f7b8483da660": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_b8766a88716948cf968f4563531a76d9", "IPY_MODEL_6f3a28b912714c6e931003549664bfa3", "IPY_MODEL_16d1283741404b7bb319094c992fce01" ], "layout": "IPY_MODEL_2a5bb0e818ab47be8cf6465988328503" } }, "042e091f75694c47aee761e760e76773": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "0546d04aae644dde846c58a4afb598a6": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, 
"display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "054c8dffadba48c6b895a6cc62448ecc": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "07fb3a2c8315494e97b447e672dfae06": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_12815f401eba44658caa7b2e490137a8", "placeholder": "​", "style": "IPY_MODEL_30e02aa2d0d241979369e598287f2639", "value": "Drop Samples with Zero Trainable Tokens (num_proc=2): 100%" } }, "083f9cda8d754c168beee10d2f8955a2": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": 
"@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_a0a11e929edd4189b79723d618522c33", "max": 728, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_e87ea87fcff247b5bbcc331ba79a8dc2", "value": 728 } }, "09007681cf8d42aeb8c1d2f6a74e470a": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b195f160ca20442fadd8b5aed0ee41af", "placeholder": "​", "style": "IPY_MODEL_ca65e32eb52f48c09a84b33cb18f22cd", "value": " 11.4M/11.4M [00:00<00:00, 21.8MB/s]" } }, "0a46ad75c198463d843fb35e813642cb": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b8e39e4dddc3497fbc29ae45c66da759", "max": 11422654, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_63b4e563e85c4f03b1b72beda9577bcc", "value": 11422654 } }, "0aa8ab56b85f4171a79c3bc210594025": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": 
"@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "0b4c9753a7cb4354b8e5f187e6e1ad7c": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "0cd7efffbb3c4c4b972e63749f61ab97": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "0dea5caa27384f5689e3cab51f558727": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": 
null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "0e067d8db8ed48308a718d5f57683fd1": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b1bea589efa14258a9982071b87938bf", "placeholder": "​", "style": "IPY_MODEL_590eef89881545aa8bbef9a8bbe7fb00", "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks.
" } }, "0e50870ed0c643e0b6c18cc5d7ddae7f": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_bfcdbba993b74972a9e3e575f86908ff", "placeholder": "​", "style": "IPY_MODEL_6ebb2ec171414e47a14765505f64bb3c", "value": " 3.84G/3.84G [00:09<00:00, 664MB/s]" } }, "0e936d9dbf9c4fdd86bbfe9730dedc47": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "0f417447a7bd4a33acca96fa37aec877": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "0f480e3a0b0a45d2a2d2dec3cad923f3": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, 
"bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "0f6907ebbc6242c8bde059cef1e1bd29": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_5bdfd87fc6cd4f9dabef7cfee29c8060", "IPY_MODEL_64f54d4a744a4627a07c3c0120276f3b", "IPY_MODEL_65b75b9b8bc143cf997796af68ff6668" ], "layout": "IPY_MODEL_d6fe74e4255444368f8f90a62157d869" } }, "114dece49dba437c8572ef94b23c3b1e": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, 
"grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "12815f401eba44658caa7b2e490137a8": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "12b56912736849fea2ad8124456fdc5c": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": 
"1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_97e36007e1304e1583fd81bfb13f0edd", "max": 1671853, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_c65dc74c7d6f4bab8f7dd28455161dd8", "value": 1671853 } }, "131065f118274a1586ac38e39ed84ef0": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": "center", "align_self": null, "border": null, "bottom": null, "display": "flex", "flex": null, "flex_flow": "column", "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": "50%" } }, "158c8b85dbf34de6a94b4e35e2fc7d5a": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, 
"grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "16a188a0b06d45f980dcf3933509fe0a": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_349eee9f56d64f0cba6fc24ff2c50c9b", "placeholder": "​", "style": "IPY_MODEL_7e5d3774060e4589aa65982da5ea4ef4", "value": " 9985/9985 [00:04<00:00, 2604.11 examples/s]" } }, "16d1283741404b7bb319094c992fce01": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_a4e5789584564049b83df7c6c54a3e08", "placeholder": "​", "style": "IPY_MODEL_ff3a94b146a948b6907f5d80c7157f99", "value": " 9985/0 [00:00<00:00, 50763.46 examples/s]" } }, "1811cda0644e4190a9469d1774435d82": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", 
"model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "18357b321ce44d7b8bd9d1c886f69275": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_e366ae3fceec4566b9ed303d6c5f90af", "placeholder": "​", "style": "IPY_MODEL_5dd7d150dbe04f08b165ce7f2c27cd11", "value": "model-00008-of-00008.safetensors: 100%" } }, "19127c7bb1554ccbac877059f9a82db0": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, 
"_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "danger", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_e400cbf14bcc446a9d33b210cd93550b", "max": 3963750880, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_71002199df6b40c9a1ac40df5fb27a1b", "value": 3963750502 } }, "19c1e38389fa46c7b7e2152a56e1df34": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ButtonModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ButtonModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ButtonView", "button_style": "", "description": "Login", "disabled": false, "icon": "", "layout": "IPY_MODEL_835bcc28a5564fb9b3d651bc8e32dc46", "style": "IPY_MODEL_9f1c9a0695384bdaa6f8b847ef89bee8", "tooltip": "" } }, "1bec6297c90242a88672d195bc09d429": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, 
"overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "1c6f1f10667545aaab958016ba7e2c94": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "1d5117195d4b49eb8f1a73b18419f7ce": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_0dea5caa27384f5689e3cab51f558727", "placeholder": "​", "style": "IPY_MODEL_a6f48410b9964fefba0c3009a77dc838", "value": " 9.68k/9.68k [00:00<00:00, 812kB/s]" } }, "1f7d30f71bbd4547a9150d21da071055": { "model_module": "@jupyter-widgets/base", 
"model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "200df5e79b9244849e589ecb0250a520": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_f4a1795dc7514a718f478245f521f0ba", "placeholder": "​", "style": "IPY_MODEL_5e746eb25bbe416fb585fa24e79f5177", "value": "model-00002-of-00008.safetensors: 100%" } }, "20352e5f58d24bb8b1f3940efd14fe4a": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": 
"@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "253017b0d0534e54ab44e181f6d7c82d": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_1c6f1f10667545aaab958016ba7e2c94", "placeholder": "​", "style": "IPY_MODEL_e6e969610738449887259063967f82b0", "value": " 2.78M/2.78M [00:00<00:00, 17.8MB/s]" } }, "258b7c635c1045329d4669e48c46ccd5": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_6f68ed9889f54ad2ae8a3b95ac263a83", 
"IPY_MODEL_80366349d81e4dcc892db6cd56e384f3", "IPY_MODEL_c73055099c084dca996159e23e162d0b" ], "layout": "IPY_MODEL_977f799afaac4a55b2dc1cffa7d5b63b" } }, "279937fe03bc4e4eb25b472d7e9df163": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "danger", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b634bb73cfa743d09a5999101b840976", "max": 1912371880, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_742b1030acfd414bbd9d5327b7e3826d", "value": 1912371698 } }, "27beaf06e41b472abdb544a43c720c5a": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, 
"2860e3bb3baf4f7da058465850e800c5": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_3efd18ea8eaa41918894883da9541bfa", "IPY_MODEL_e09f1bcbb9d94c09be53e5e1303642c2", "IPY_MODEL_82177df57a494de8900c14c2f5185175" ], "layout": "IPY_MODEL_ccfcdc95baf646f8aeb3d516742383f2" } }, "2a51b36be41745468e4c2d7a21b1c0d2": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "2a5bb0e818ab47be8cf6465988328503": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": 
"1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "2b3a2659b12244bd8548320320016dbf": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, 
"right": null, "top": null, "visibility": null, "width": null } }, "2e257c8be2da40b4bb67a9e4ab6811f3": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "2e2b0c1599c341a198f632f46a40c90e": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_be724f04b03942b2a033a7e8898bb4fd", "placeholder": "​", "style": "IPY_MODEL_fcbab4d8dced41a18dfccce81e3a45a0", "value": "model-00005-of-00008.safetensors: 100%" } }, "3036608c71904ce9ae4bb2a9fa8802d9": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": 
{ "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_5ca6be24acb548cea130bd58e9954c7c", "placeholder": "​", "style": "IPY_MODEL_5cfb02ee044b4011a378efa8b54a370f", "value": " 3.96G/3.96G [00:10<00:00, 531MB/s]" } }, "30a81da86f8043eca301e86a8651201a": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "30e02aa2d0d241979369e598287f2639": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": 
"1.2.0", "_view_name": "StyleView", "description_width": "" } }, "3225603166b54e7aab766b9964a2f660": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "33b3b1d0295646edaac7b4822761aeb0": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "349eee9f56d64f0cba6fc24ff2c50c9b": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": 
null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "34c9c0137b504cd799c6bd6de69507c2": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "34cf3df51fbc41cabfdbba153c007f0e": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, 
"grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "35c811d2ae8e43f3b5cecbdd3cfa857f": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "35cc989ca3374e7dba0cb166febc4bde": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "366a343b62fa47d8985a3bd464d99f9e": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "37de928300e34184881039378bd75e7f": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": 
null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "388f618924274d21a066f098f4f1e744": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_7c95f85a2b1f47a1bd846d110c47bb3c", "IPY_MODEL_083f9cda8d754c168beee10d2f8955a2", "IPY_MODEL_62e1a65582f446a78612eaa804e08a7d" ], "layout": "IPY_MODEL_487a177d020f4605834878b2fdc7afa3" } }, "39789237703c4a418134243055c9cbf5": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, 
"flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "3aaecbf540f54a2db9ab0931e3b1fe57": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "3c21e4a511b4441192c03b7f1d0976e9": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", 
"_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "3efd18ea8eaa41918894883da9541bfa": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_8f5bd719974e41c3a8dd9a5b0d3d71e6", "placeholder": "​", "style": "IPY_MODEL_b87c84de30e84b3abf4871461fb9cbd3", "value": "Loading checkpoint shards: 100%" } }, "41f3b32c2f6b4034ae7a3b9124e28bc7": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": 
null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4471ff62258549fba9514bb67050f965": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_9cd5211b5d8b457aa0002f1d17b80028", "IPY_MODEL_19127c7bb1554ccbac877059f9a82db0", "IPY_MODEL_f4667818b9d34a09891cd727a429a610" ], "layout": "IPY_MODEL_9ed02dc43412471a9ab47f3620ccf3a5" } }, "4540927d98f54466b434ba4c0edf045d": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "487a177d020f4605834878b2fdc7afa3": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", 
"_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4b1f04ff63d14a118fdd15814dff50e4": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "LabelModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "LabelModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "LabelView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_39789237703c4a418134243055c9cbf5", "placeholder": "​", "style": "IPY_MODEL_a3a945817f684328b34651fe052393ec", "value": "Connecting..." 
} }, "4b27c267393640f28f6eae0875bd2ed9": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4c727d40ef0443449afc31724ee79f0c": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "4d05314858354e729d76094b3b0ce761": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": 
"HBoxView", "box_style": "", "children": [ "IPY_MODEL_c42acf646f344a88b8c11f81e67f7206", "IPY_MODEL_7be6f04c284e4326bb4ff3d301e7b3c6", "IPY_MODEL_ffdbb12a2f2c4d14911685e7683e0ef0" ], "layout": "IPY_MODEL_bee3501b2a17427784a717e50a85e7fa" } }, "4d468f96ec924681ad65eb671674b93e": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4f1977d7e4824ef1a14b65f0f42bba10": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "4fd114abe9f5494ab59858949f5055f1": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { 
"_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "500e272208a246089613bf788a165271": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_200df5e79b9244849e589ecb0250a520", "IPY_MODEL_cc94432d08464affa3e58b560bdad194", "IPY_MODEL_3036608c71904ce9ae4bb2a9fa8802d9" ], "layout": "IPY_MODEL_adacfdcc1b0140efac56918e9ccf064e" } }, "519a7b154022443db6703f04a9142bae": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_d02274afd47b462291c745f261209d42", "max": 27341251, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_0f417447a7bd4a33acca96fa37aec877", "value": 27341251 } }, "56e3768bef5a4b9db4168c5c17f509c2": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", 
"_view_name": "StyleView", "description_width": "" } }, "590eef89881545aa8bbef9a8bbe7fb00": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "598da69727bd4fb8b1caf465ac736d7a": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "5bdfd87fc6cd4f9dabef7cfee29c8060": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_4d468f96ec924681ad65eb671674b93e", "placeholder": "​", "style": "IPY_MODEL_ad7599de524549c48bf2d3124ad4b299", "value": "Dropping Long Sequences (num_proc=2): 100%" } }, "5ca240f31e6b44e3882c5eb37cd5a309": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, 
"align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": "20px" } }, "5ca6be24acb548cea130bd58e9954c7c": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "5cea7996f02040b187ece0bb2d6a8d1f": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": 
"DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "5cfb02ee044b4011a378efa8b54a370f": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "5dd7d150dbe04f08b165ce7f2c27cd11": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "5e18768f7ad6434ba8b8b8a2e853e204": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, 
"justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "5e5e15b0569b474c9620083b3ec6af55": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "5e746eb25bbe416fb585fa24e79f5177": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "5eb06edeb58e4930b1affef2a59eae81": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "5f86cd894de94c3280fadc1e2fd0ee13": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": 
"@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_a20927bf5f2c41f58c1e31ac858ab36c", "IPY_MODEL_0a46ad75c198463d843fb35e813642cb", "IPY_MODEL_09007681cf8d42aeb8c1d2f6a74e470a" ], "layout": "IPY_MODEL_ebc80d1a55fa47f4a5ea2756588569ec" } }, "60c1a0d765c14a1d888317e6a507e4ea": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "62c028fdef904dedb9cdeca2b3bda725": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": 
null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "62e1a65582f446a78612eaa804e08a7d": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_5e18768f7ad6434ba8b8b8a2e853e204", "placeholder": "​", "style": "IPY_MODEL_bb33aec33a6447078c31bfd728942994", "value": " 728/728 [00:00<00:00, 20.3kB/s]" } }, "62e302ebdad64aada0ffe64ae1c873f3": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": 
null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "63580b6fb30642479fe3000915bf551a": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "63b4e563e85c4f03b1b72beda9577bcc": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, 
"64f54d4a744a4627a07c3c0120276f3b": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_0546d04aae644dde846c58a4afb598a6", "max": 9985, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_897b77a56c09479bb11d7f2a30997e55", "value": 9985 } }, "65b75b9b8bc143cf997796af68ff6668": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_81c3db71ac704280ad030072655f1537", "placeholder": "​", "style": "IPY_MODEL_042e091f75694c47aee761e760e76773", "value": " 9985/9985 [00:02<00:00, 3977.47 examples/s]" } }, "67da6c4260574869aa24c3cbc1bc1654": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, 
"grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "6932489232ec4ab18a160b1e7fbcdfe1": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "6ebb2ec171414e47a14765505f64bb3c": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": 
"1.2.0", "_view_name": "StyleView", "description_width": "" } }, "6f05e9bebf7b40c9835808e77de6c236": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "PasswordModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "PasswordModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "PasswordView", "continuous_update": true, "description": "Token:", "description_tooltip": null, "disabled": false, "layout": "IPY_MODEL_2e257c8be2da40b4bb67a9e4ab6811f3", "placeholder": "​", "style": "IPY_MODEL_56e3768bef5a4b9db4168c5c17f509c2", "value": "" } }, "6f3a28b912714c6e931003549664bfa3": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_5ca240f31e6b44e3882c5eb37cd5a309", "max": 1, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_5eb06edeb58e4930b1affef2a59eae81", "value": 1 } }, "6f68ed9889f54ad2ae8a3b95ac263a83": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_41f3b32c2f6b4034ae7a3b9124e28bc7", "placeholder": "​", "style": "IPY_MODEL_a10d0a76010f4e508c65a9b69ebc5156", 
"value": "Tokenizing Prompts (num_proc=2): 100%" } }, "704f2f5a9b1c49d5a75a0025a5dda11b": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "71002199df6b40c9a1ac40df5fb27a1b": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "71c8af139cd248b1b51101fd46a93f35": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_d0e9dce55cec4c1ca619a0ccf209d924", "max": 9675, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_4c727d40ef0443449afc31724ee79f0c", "value": 9675 } }, "734185351eb543fa9a00a881dcbb9fe7": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": 
"LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "735d4f225b24414294fc1b213c61223c": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "742b1030acfd414bbd9d5327b7e3826d": { "model_module": "@jupyter-widgets/controls", 
"model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "77304d1a46b3468a98483e02ec0ac4a4": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "7baeab52d6694c32b1efd1ea1a0a7782": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": 
"IPY_MODEL_93a44a11aa4846fa8efc6c1413ef1627", "placeholder": "​", "style": "IPY_MODEL_a55060adc3564407ac81ad7297d34aaa", "value": "train.jsonl: 100%" } }, "7be6f04c284e4326bb4ff3d301e7b3c6": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "danger", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_9503a45960984adc97b58e16c50662e0", "max": 3963750880, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_da6e93f3e4984780b930fe7a706983ea", "value": 3963750502 } }, "7c2485c6cdfe463da6fdb35982a1070d": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_ad1236893754446881e153adc9d5c962", "IPY_MODEL_daee63fd167e4441a32324b51b00ad2b", "IPY_MODEL_fe41858c6bd04c58840112b67c19a336" ], "layout": "IPY_MODEL_d262c82138024169b9f3aa034ca756fa" } }, "7c95f85a2b1f47a1bd846d110c47bb3c": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_7fd44cf9ca6e4726bfd7ac21846d6a14", 
"placeholder": "​", "style": "IPY_MODEL_366a343b62fa47d8985a3bd464d99f9e", "value": "config.json: 100%" } }, "7cd0b85ebd204b7aba908417811ce4e0": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_7baeab52d6694c32b1efd1ea1a0a7782", "IPY_MODEL_519a7b154022443db6703f04a9142bae", "IPY_MODEL_d4183e9715f34d249942b8271cca3bdf" ], "layout": "IPY_MODEL_da2347ac94764a3fa2743343cf0d3cd2" } }, "7e5d3774060e4589aa65982da5ea4ef4": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "7fd44cf9ca6e4726bfd7ac21846d6a14": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, 
"justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "80366349d81e4dcc892db6cd56e384f3": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_f8ef805b776145c3bfa9ba8d90972058", "max": 9985, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_cc587493c33c4f118d1b1170f85be24c", "value": 9985 } }, "813621384dc748b0ad06775e22761c0b": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "81c3db71ac704280ad030072655f1537": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, 
"grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "82177df57a494de8900c14c2f5185175": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_67da6c4260574869aa24c3cbc1bc1654", "placeholder": "​", "style": "IPY_MODEL_94b9088614464f60a203de39dbcae853", "value": " 8/8 [01:47<00:00, 11.64s/it]" } }, "823f1c78f15043e38bbd4dca3932a86a": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_03a3c744d716431488163b4358b80f92", "max": 239, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_a5434ee714f9498d83870544b67c0cb7", "value": 239 } }, "835bcc28a5564fb9b3d651bc8e32dc46": { "model_module": "@jupyter-widgets/base", 
"model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "8640ac440fbc4644b9a3af7ba3ae7183": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, 
"object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "86816687746246b4a6105e8010384e25": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_8640ac440fbc4644b9a3af7ba3ae7183", "placeholder": "​", "style": "IPY_MODEL_5cea7996f02040b187ece0bb2d6a8d1f", "value": "

Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" } }, "879c8ab5873847a8833bd74123be90a4": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_ef223e8504b64e3592589880326aaf41", "placeholder": "​", "style": "IPY_MODEL_598da69727bd4fb8b1caf465ac736d7a", "value": " 1.67M/1.67M [00:00<00:00, 19.0MB/s]" } }, "897b77a56c09479bb11d7f2a30997e55": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "8bc9d8ba866c442b9118d9630009939c": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, 
"min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "8c4d4fc5a30f4e7cb3be53fe2adda33d": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "8f5bd719974e41c3a8dd9a5b0d3d71e6": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": 
null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "8f726dbfb45d4528afa33e36a6313267": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "9327977822be4b1294f80e876552e305": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_37de928300e34184881039378bd75e7f", "placeholder": "​", "style": "IPY_MODEL_0e936d9dbf9c4fdd86bbfe9730dedc47", "value": " 3.96G/3.96G [00:13<00:00, 273MB/s]" } }, "936d04b5fe1b4c63bf0b080e423d051b": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": 
null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "93a44a11aa4846fa8efc6c1413ef1627": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "94b9088614464f60a203de39dbcae853": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", 
"model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "9503a45960984adc97b58e16c50662e0": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "95caff42f08a4c2aa14c867b8f37f231": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_de7c37ee83e24f0c889e84d07279c2ec", "IPY_MODEL_9d4897eefb5f48259ffb2d23e332f752", 
"IPY_MODEL_253017b0d0534e54ab44e181f6d7c82d" ], "layout": "IPY_MODEL_27beaf06e41b472abdb544a43c720c5a" } }, "977f799afaac4a55b2dc1cffa7d5b63b": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "97e36007e1304e1583fd81bfb13f0edd": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, 
"grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "9858cb74a09748a39e8149baac96702c": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "9b42e08b3c9548818488268768a118b1": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_d955dcaa0e944e719f3a06139dd54a03", "placeholder": "​", "style": "IPY_MODEL_d3de2662c7964f1ba96e58da382af720", "value": "merges.txt: 100%" } }, "9cd5211b5d8b457aa0002f1d17b80028": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_6932489232ec4ab18a160b1e7fbcdfe1", "placeholder": "​", "style": 
"IPY_MODEL_4540927d98f54466b434ba4c0edf045d", "value": "model-00007-of-00008.safetensors: 100%" } }, "9d4897eefb5f48259ffb2d23e332f752": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_30a81da86f8043eca301e86a8651201a", "max": 2776833, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_e8b7a81040904c1e89e58978223b1737", "value": 2776833 } }, "9e333ed3b5014069ac1dd969255dd591": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "9ed02dc43412471a9ab47f3620ccf3a5": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, 
"height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "9f1c9a0695384bdaa6f8b847ef89bee8": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ButtonStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ButtonStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "button_color": null, "font_weight": "" } }, "9f56a2d9979c4bd8928c644c22c3ecdf": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "a0a11e929edd4189b79723d618522c33": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, 
"grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a10d0a76010f4e508c65a9b69ebc5156": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "a138859f19b74fc0928dc236ab5359db": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_9b42e08b3c9548818488268768a118b1", "IPY_MODEL_12b56912736849fea2ad8124456fdc5c", "IPY_MODEL_879c8ab5873847a8833bd74123be90a4" ], "layout": "IPY_MODEL_20352e5f58d24bb8b1f3940efd14fe4a" } }, "a1959759c5424da9961fb2a308d4dee4": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_3aaecbf540f54a2db9ab0931e3b1fe57", 
"placeholder": "​", "style": "IPY_MODEL_9e333ed3b5014069ac1dd969255dd591", "value": " 239/239 [00:00<00:00, 30.9kB/s]" } }, "a20927bf5f2c41f58c1e31ac858ab36c": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_1811cda0644e4190a9469d1774435d82", "placeholder": "​", "style": "IPY_MODEL_35c811d2ae8e43f3b5cecbdd3cfa857f", "value": "tokenizer.json: 100%" } }, "a3a945817f684328b34651fe052393ec": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "a44f630e099e43899f20a77084ae60cd": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_ed5ca967ad5342929e578ac6aa4dc4c0", "placeholder": "​", "style": "IPY_MODEL_af401d117d5047629d3a6e2361757b62", "value": "model-00001-of-00008.safetensors: 100%" } }, "a4e5789584564049b83df7c6c54a3e08": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": 
"@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a5434ee714f9498d83870544b67c0cb7": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "a55060adc3564407ac81ad7297d34aaa": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "a6f48410b9964fefba0c3009a77dc838": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": 
"DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "a7cf477e80fc43e0ad82c7997b076dce": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "a80410b919e442c49aea15acc1ce1a72": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_fa1282ccc7544e4f818e2f03ccffe4a5", "placeholder": "​", "style": "IPY_MODEL_bbbf575d2a4b4c6ea8389be79b2a6039", "value": "model.safetensors.index.json: 100%" } }, "ab93eabd7cea4b94b4b7a387f101e8a1": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": 
null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ac764024cf1c4e08ba7749afd2cd20ac": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "ad1236893754446881e153adc9d5c962": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_62e302ebdad64aada0ffe64ae1c873f3", "placeholder": "​", "style": "IPY_MODEL_bd1b0dfed6d34d16af33a4a58330f5ec", "value": "Saving the dataset (1/1 shards): 100%" } }, "ad7599de524549c48bf2d3124ad4b299": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": 
"StyleView", "description_width": "" } }, "adacfdcc1b0140efac56918e9ccf064e": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "af401d117d5047629d3a6e2361757b62": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "b191ac001a2e4962bc9a245fcdf26e6b": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", 
"align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b195f160ca20442fadd8b5aed0ee41af": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b1bea589efa14258a9982071b87938bf": { "model_module": "@jupyter-widgets/base", 
"model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b5b65414154544aa8a71b1a39164aad7": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, 
"object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b634bb73cfa743d09a5999101b840976": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b82aa8c57f7c422a9a9c90f333ed2a99": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_c0991cf63ee6458b96e9a75e7a88b61a", "IPY_MODEL_71c8af139cd248b1b51101fd46a93f35", "IPY_MODEL_1d5117195d4b49eb8f1a73b18419f7ce" ], "layout": "IPY_MODEL_3c21e4a511b4441192c03b7f1d0976e9" } }, 
"b8766a88716948cf968f4563531a76d9": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_2b3a2659b12244bd8548320320016dbf", "placeholder": "​", "style": "IPY_MODEL_0cd7efffbb3c4c4b972e63749f61ab97", "value": "Generating train split: " } }, "b87c84de30e84b3abf4871461fb9cbd3": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "b8e39e4dddc3497fbc29ae45c66da759": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": 
null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "bb33aec33a6447078c31bfd728942994": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "bbbf575d2a4b4c6ea8389be79b2a6039": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "bca2c7185b6749fd899c06a2ba4c5e46": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_0f480e3a0b0a45d2a2d2dec3cad923f3", "placeholder": "​", "style": "IPY_MODEL_fcb30372e7404c5d8a1ad4df91e6c7b2", "value": " 1.91G/1.91G [00:05<00:00, 444MB/s]" } }, "bd1b0dfed6d34d16af33a4a58330f5ec": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": 
"DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "be724f04b03942b2a033a7e8898bb4fd": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "bed8726b8069434687c75452e21f19e5": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_fa864b41586f4a7aa56aeafd1d84eb75", "max": 9985, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_3225603166b54e7aab766b9964a2f660", "value": 9985 } }, 
"bee3501b2a17427784a717e50a85e7fa": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "bfcdbba993b74972a9e3e575f86908ff": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, 
"max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "bff139df987d4a62abec6456cb27f3d4": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "danger", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_c1f9c267ba3f40039cdb5eb3267e8043", "max": 3963750880, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_33b3b1d0295646edaac7b4822761aeb0", "value": 3963750502 } }, "c0892a1881de4eb4bfabc6a68f87ae99": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_158c8b85dbf34de6a94b4e35e2fc7d5a", "placeholder": "​", "style": "IPY_MODEL_0b4c9753a7cb4354b8e5f187e6e1ad7c", "value": " 3.96G/3.96G [00:15<00:00, 564MB/s]" } }, "c0991cf63ee6458b96e9a75e7a88b61a": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", 
"_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_ed28e2e0410d4e0b855467e798e53d66", "placeholder": "​", "style": "IPY_MODEL_d93f134f802b4b69b575bdaf07dbd27c", "value": "tokenizer_config.json: 100%" } }, "c12ea43372ac4d57bb9605f1a429b397": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "VBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "VBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "VBoxView", "box_style": "", "children": [], "layout": "IPY_MODEL_131065f118274a1586ac38e39ed84ef0" } }, "c1314f241a434c41b45d84dc4d3b30f8": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "c1f9c267ba3f40039cdb5eb3267e8043": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": 
null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c33ced495f70464aa4a3a91922090853": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c3725c7f79fe415fbd1ea336f0cc9cf1": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "danger", "description": "", "description_tooltip": null, "layout": 
"IPY_MODEL_b191ac001a2e4962bc9a245fcdf26e6b", "max": 3841788544, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_054c8dffadba48c6b895a6cc62448ecc", "value": 3841788178 } }, "c3be9109d63c485d9c0ef4f9bc0f9218": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c42acf646f344a88b8c11f81e67f7206": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_8bc9d8ba866c442b9118d9630009939c", "placeholder": "​", "style": "IPY_MODEL_9f56a2d9979c4bd8928c644c22c3ecdf", "value": "model-00003-of-00008.safetensors: 100%" } }, 
"c6164e05a1914ae48083db9ad7f4ef7c": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c65dc74c7d6f4bab8f7dd28455161dd8": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "c6e00f5224364822bc4239b176686919": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", 
"_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_2a51b36be41745468e4c2d7a21b1c0d2", "max": 36514, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_4fd114abe9f5494ab59858949f5055f1", "value": 36514 } }, "c73055099c084dca996159e23e162d0b": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_e40d1c1ac9494b3bade9858324e7ffdf", "placeholder": "​", "style": "IPY_MODEL_d65b6b060d9845779299491ac5599c31", "value": " 9985/9985 [01:04<00:00, 189.08 examples/s]" } }, "c7433acd3c4841e6958ae8f7e87b1808": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "CheckboxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "CheckboxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "CheckboxView", "description": "Add token as git credential?", "description_tooltip": null, "disabled": false, "indent": true, "layout": "IPY_MODEL_62c028fdef904dedb9cdeca2b3bda725", "style": "IPY_MODEL_a7cf477e80fc43e0ad82c7997b076dce", "value": false } }, "c84cc07789be48aebb322c23d355289e": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", 
"description": "", "description_tooltip": null, "layout": "IPY_MODEL_0077aedc3d174560bce924ee89e9c006", "placeholder": "​", "style": "IPY_MODEL_00321cce58884f6f9b3855a21fcd9187", "value": "Add position_id column (Sample Packing) (num_proc=2): 100%" } }, "ca65e32eb52f48c09a84b33cb18f22cd": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "cc587493c33c4f118d1b1170f85be24c": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "cc94432d08464affa3e58b560bdad194": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "danger", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b5b65414154544aa8a71b1a39164aad7", "max": 3963750816, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_f0a58fbd0fca4340890041f99fa2f8c8", "value": 3963750438 } }, "ccfcdc95baf646f8aeb3d516742383f2": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { 
"_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "cdebbc55a1164c018546c2ac6f8c620c": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_a44f630e099e43899f20a77084ae60cd", "IPY_MODEL_c3725c7f79fe415fbd1ea336f0cc9cf1", "IPY_MODEL_0e50870ed0c643e0b6c18cc5d7ddae7f" ], "layout": "IPY_MODEL_c33ced495f70464aa4a3a91922090853" } }, "d02274afd47b462291c745f261209d42": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": 
"LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d07c8b97d3314f1c852e44bdd40f61ed": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d0e9dce55cec4c1ca619a0ccf209d924": { "model_module": "@jupyter-widgets/base", 
"model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d1f9b10c130542f094c8fd3d1e23b5e9": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, 
"object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d262c82138024169b9f3aa034ca756fa": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d3de2662c7964f1ba96e58da382af720": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "d4183e9715f34d249942b8271cca3bdf": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", 
"_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_63580b6fb30642479fe3000915bf551a", "placeholder": "​", "style": "IPY_MODEL_8f726dbfb45d4528afa33e36a6313267", "value": " 27.3M/27.3M [00:00<00:00, 31.0MB/s]" } }, "d43c6df07ddb466587807d6dbe1ff614": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_8c4d4fc5a30f4e7cb3be53fe2adda33d", "placeholder": "​", "style": "IPY_MODEL_e90658f4bcb642baa78426012f863152", "value": "model-00004-of-00008.safetensors: 100%" } }, "d65b6b060d9845779299491ac5599c31": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "d6fe74e4255444368f8f90a62157d869": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": 
null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d93f134f802b4b69b575bdaf07dbd27c": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "d955dcaa0e944e719f3a06139dd54a03": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, 
"min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "da2347ac94764a3fa2743343cf0d3cd2": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "da6e93f3e4984780b930fe7a706983ea": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "daee63fd167e4441a32324b51b00ad2b": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], 
"_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_d07c8b97d3314f1c852e44bdd40f61ed", "max": 9985, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_ebb69a2c3d0a4299a484698287b3087c", "value": 9985 } }, "dc892a596f6942d7973c616c38f0eebb": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_c84cc07789be48aebb322c23d355289e", "IPY_MODEL_bed8726b8069434687c75452e21f19e5", "IPY_MODEL_16a188a0b06d45f980dcf3933509fe0a" ], "layout": "IPY_MODEL_60c1a0d765c14a1d888317e6a507e4ea" } }, "dd0e646fad3f4a89ba23b39d162bd8d9": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_d43c6df07ddb466587807d6dbe1ff614", "IPY_MODEL_e0e8b840b8ea4d0d9db09afe99fa287d", "IPY_MODEL_9327977822be4b1294f80e876552e305" ], "layout": "IPY_MODEL_77304d1a46b3468a98483e02ec0ac4a4" } }, "de7c37ee83e24f0c889e84d07279c2ec": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", 
"_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_34cf3df51fbc41cabfdbba153c007f0e", "placeholder": "​", "style": "IPY_MODEL_ac764024cf1c4e08ba7749afd2cd20ac", "value": "vocab.json: 100%" } }, "dfd2a2649b8341ef913207526708aff1": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e09f1bcbb9d94c09be53e5e1303642c2": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", 
"description": "", "description_tooltip": null, "layout": "IPY_MODEL_e7d8e4fe58384e93a106de546068c65e", "max": 8, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_0aa8ab56b85f4171a79c3bc210594025", "value": 8 } }, "e0e8b840b8ea4d0d9db09afe99fa287d": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "danger", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_f7434f3e03124a1c938a39af79d7fa59", "max": 3963750880, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_c1314f241a434c41b45d84dc4d3b30f8", "value": 3963750502 } }, "e21e180307e5485cbbe908672fd6639a": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_2e2b0c1599c341a198f632f46a40c90e", "IPY_MODEL_bff139df987d4a62abec6456cb27f3d4", "IPY_MODEL_ebe1cc366d324ad59b264c8b3c431441" ], "layout": "IPY_MODEL_114dece49dba437c8572ef94b23c3b1e" } }, "e366ae3fceec4566b9ed303d6c5f90af": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": 
null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e3fb3fc6afe04b3c9b7ac61809ce78fa": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_c6164e05a1914ae48083db9ad7f4ef7c", "placeholder": "​", "style": "IPY_MODEL_813621384dc748b0ad06775e22761c0b", "value": " 9985/9985 [00:03<00:00, 3622.89 examples/s]" } }, "e400cbf14bcc446a9d33b210cd93550b": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, 
"grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e40d1c1ac9494b3bade9858324e7ffdf": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e575d87a7efe4ec7b1efde489839d4a6": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", 
"_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "e5a82df528bb4e408797a3b6c2758f4a": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e6e969610738449887259063967f82b0": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "e7d8e4fe58384e93a106de546068c65e": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", 
"_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e87ea87fcff247b5bbcc331ba79a8dc2": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "e8b7a81040904c1e89e58978223b1737": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "e90658f4bcb642baa78426012f863152": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": 
"DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "eb1c9535e6a546098b760528b2ea387c": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_18357b321ce44d7b8bd9d1c886f69275", "IPY_MODEL_279937fe03bc4e4eb25b472d7e9df163", "IPY_MODEL_bca2c7185b6749fd899c06a2ba4c5e46" ], "layout": "IPY_MODEL_1f7d30f71bbd4547a9150d21da071055" } }, "ebb69a2c3d0a4299a484698287b3087c": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "ebc80d1a55fa47f4a5ea2756588569ec": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": 
null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ebe1cc366d324ad59b264c8b3c431441": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_fba7aa824b38467ab3061b226114cdec", "placeholder": "​", "style": "IPY_MODEL_f3075dccbd2747b4a7913b66f44f2596", "value": " 3.96G/3.96G [00:13<00:00, 398MB/s]" } }, "ec030fc3c346426f9abc3a89892258d3": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_dfd2a2649b8341ef913207526708aff1", "max": 9985, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_4f1977d7e4824ef1a14b65f0f42bba10", "value": 9985 } }, "ec11d1e5ae7b42c883d9b1f38a65356e": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": 
"HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_936d04b5fe1b4c63bf0b080e423d051b", "placeholder": "​", "style": "IPY_MODEL_f1cef8e8dc2646fb9fd09f3b09081074", "value": " 36.5k/36.5k [00:00<00:00, 4.32MB/s]" } }, "ed28e2e0410d4e0b855467e798e53d66": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ed5ca967ad5342929e578ac6aa4dc4c0": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": 
null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "edc99591b9c747b689b94d0052fec14c": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ef0a3c7a6f14460fb4da096928ae249e": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": 
"@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_07fb3a2c8315494e97b447e672dfae06", "IPY_MODEL_ec030fc3c346426f9abc3a89892258d3", "IPY_MODEL_e3fb3fc6afe04b3c9b7ac61809ce78fa" ], "layout": "IPY_MODEL_c3be9109d63c485d9c0ef4f9bc0f9218" } }, "ef223e8504b64e3592589880326aaf41": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f0a58fbd0fca4340890041f99fa2f8c8": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", 
"bar_color": null, "description_width": "" } }, "f113ebd8c1c34806bea4dd7ed3035173": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "f1cef8e8dc2646fb9fd09f3b09081074": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "f3075dccbd2747b4a7913b66f44f2596": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "f365820a3d3c42b2948abfe32065de14": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_735d4f225b24414294fc1b213c61223c", "placeholder": "​", "style": "IPY_MODEL_5e5e15b0569b474c9620083b3ec6af55", "value": "generation_config.json: 100%" } }, 
"f4667818b9d34a09891cd727a429a610": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_4b27c267393640f28f6eae0875bd2ed9", "placeholder": "​", "style": "IPY_MODEL_9858cb74a09748a39e8149baac96702c", "value": " 3.96G/3.96G [00:11<00:00, 457MB/s]" } }, "f4a1795dc7514a718f478245f521f0ba": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f60a2bdb6b6b4e0e8c3508580e247132": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": 
"@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "danger", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_edc99591b9c747b689b94d0052fec14c", "max": 3963750880, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_35cc989ca3374e7dba0cb166febc4bde", "value": 3963750502 } }, "f7434f3e03124a1c938a39af79d7fa59": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f8ef805b776145c3bfa9ba8d90972058": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", 
"_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "fa1282ccc7544e4f818e2f03ccffe4a5": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "fa864b41586f4a7aa56aeafd1d84eb75": { "model_module": 
"@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "fba7aa824b38467ab3061b226114cdec": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": 
null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "fcb30372e7404c5d8a1ad4df91e6c7b2": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "fcbab4d8dced41a18dfccce81e3a45a0": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "fd4f333f7ece4450b04e1a9af1f9d2f6": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_d1f9b10c130542f094c8fd3d1e23b5e9", "placeholder": "​", "style": "IPY_MODEL_e575d87a7efe4ec7b1efde489839d4a6", "value": "model-00006-of-00008.safetensors: 100%" } }, "fe18bba7f3fb4c31bf840541f36b3425": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", 
"_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_fd4f333f7ece4450b04e1a9af1f9d2f6", "IPY_MODEL_f60a2bdb6b6b4e0e8c3508580e247132", "IPY_MODEL_c0892a1881de4eb4bfabc6a68f87ae99" ], "layout": "IPY_MODEL_1bec6297c90242a88672d195bc09d429" } }, "fe41858c6bd04c58840112b67c19a336": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_e5a82df528bb4e408797a3b6c2758f4a", "placeholder": "​", "style": "IPY_MODEL_f113ebd8c1c34806bea4dd7ed3035173", "value": " 9985/9985 [00:00<00:00, 44264.88 examples/s]" } }, "fea1b70fb46745feb5111b3929175b5d": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_f365820a3d3c42b2948abfe32065de14", "IPY_MODEL_823f1c78f15043e38bbd4dca3932a86a", "IPY_MODEL_a1959759c5424da9961fb2a308d4dee4" ], "layout": "IPY_MODEL_34c9c0137b504cd799c6bd6de69507c2" } }, "ff3a94b146a948b6907f5d80c7157f99": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": 
"@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "ffdbb12a2f2c4d14911685e7683e0ef0": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_ab93eabd7cea4b94b4b7a387f101e8a1", "placeholder": "​", "style": "IPY_MODEL_704f2f5a9b1c49d5a75a0025a5dda11b", "value": " 3.96G/3.96G [00:12<00:00, 656MB/s]" } } } } }, "nbformat": 4, "nbformat_minor": 0 } ================================================ FILE: examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml ================================================ base_model: deepcogito/cogito-v1-preview-llama-3B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: true load_in_4bit: false strict: false datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true eval_sample_packing: false adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config 
================================================ FILE: examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml ================================================ base_model: deepcogito/cogito-v1-preview-qwen-14B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: true load_in_4bit: false strict: false datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true eval_sample_packing: false adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/deepseek-v2/fft-fsdp-16b.yaml ================================================ base_model: deepseek-ai/DeepSeek-V2-Lite # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true datasets: - path: tatsu-lab/alpaca type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out sequence_len: 2048 sample_packing: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 8 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 2e-5 bf16: auto tf32: false gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: 
logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 2 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: fsdp: - full_shard - auto_wrap fsdp_config: fsdp_limit_all_gathers: true fsdp_sync_module_states: true fsdp_offload_params: true fsdp_use_orig_params: false fsdp_cpu_ram_efficient_loading: true fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/deepseek-v2/qlora-fsdp-2_5.yaml ================================================ base_model: axolotl-quants/DeepSeek-V2.5-bnb-nf4-bf16 # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true load_in_8bit: false load_in_4bit: true plugins: - axolotl.integrations.liger.LigerPlugin liger_rms_norm: true liger_glu_activation: true liger_fused_linear_cross_entropy: true chat_template: deepseek_v2 datasets: - path: mlabonne/FineTome-100k type: chat_template split: train[:20%] field_messages: conversations message_property_mappings: role: from content: value dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out sequence_len: 4096 sample_packing: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: adapter: qlora lora_r: 256 lora_alpha: 256 lora_target_linear: true peft_use_rslora: true gradient_accumulation_steps: 1 micro_batch_size: 8 num_epochs: 1 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 2e-5 bf16: auto tf32: false gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 2 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: fsdp: - full_shard - auto_wrap 
fsdp_config: fsdp_limit_all_gathers: true fsdp_sync_module_states: true fsdp_offload_params: true fsdp_use_orig_params: false fsdp_cpu_ram_efficient_loading: true fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/devstral/README.md ================================================ # Finetune Devstral with Axolotl Devstral Small is a 24B parameter open-source model from MistralAI found on HuggingFace [Devstral-Small-2505](https://huggingface.co/mistralai/Devstral-Small-2505) and [Devstral-Small-2507](https://huggingface.co/mistralai/Devstral-Small-2507). `Devstral-Small-2507` is the latest version of the model and has [function calling](https://mistralai.github.io/mistral-common/usage/tools/) support. This guide shows how to fine-tune it with Axolotl with multi-turn conversations with proper masking. The model was fine-tuned on top of [Mistral-Small-3.1](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503) without the vision layer and has a context of up to 128k tokens. Thanks to the team at MistralAI for giving us early access to prepare for this release. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). Here is an example of how to install from pip: ```bash # Ensure you have Pytorch installed (Pytorch 2.6.0 min) pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0' ``` 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage ```bash python scripts/cutcrossentropy_install.py | sh ``` 3.
Run the finetuning example: ```bash axolotl train examples/devstral/devstral-small-qlora.yml ``` This config uses about 21GB VRAM. Let us know how it goes. Happy finetuning! 🚀 ### TIPS - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). - The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). - Learn how to use function calling with Axolotl at [docs](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#using-tool-use). ## Optimization Guides - [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) - [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) - [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html) - [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) - [Liger Kernel](https://docs.axolotl.ai/docs/custom_integrations.html#liger-kernels) ## Limitations We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only. In addition, we do not support overriding tokens yet. ## Related Resources - [MistralAI Devstral Blog](https://mistral.ai/news/devstral) - [MistralAI Devstral 1.1 Blog](https://mistral.ai/news/devstral-2507) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Website](https://axolotl.ai) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ## Future Work - Add parity to Preference Tuning, RL, Multi-modal, etc. - Add parity to other tokenizer configs like overriding tokens. 
================================================ FILE: examples/devstral/devstral-small-qlora.yml ================================================ base_model: mistralai/Devstral-Small-2507 # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name # Enable to use mistral-common tokenizer tokenizer_use_mistral_common: true load_in_8bit: false load_in_4bit: true plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/qlora-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_torch lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true scaling_softmax: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 warmup_ratio: 0.05 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/distributed-parallel/README.md ================================================ # ND Parallelism Examples This directory contains example configurations for training models using ND Parallelism in Axolotl. These examples demonstrate how to compose different parallelism strategies (FSDP, TP, CP, HSDP) for efficient multi-GPU training. ## Quick Start 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). 2. 
Run the command below: ```bash # Train Qwen3 8B with FSDP + TP + CP on a single 8-GPU node axolotl train examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml # Train Llama 3.1 8B with HSDP + TP on 2 nodes (16 GPUs total) axolotl train examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml ``` ## Example Configurations ### Single Node (8 GPUs) **Qwen3 8B with FSDP + TP + CP** ([qwen3-8b-fsdp-tp-cp.yaml](./qwen3-8b-fsdp-tp-cp.yaml)) - Uses all 3 parallelism dimensions on a single node - Ideal for: when model weights, activations, and/or context are too large to fit on a single GPU ```yaml dp_shard_size: 2 # FSDP across 2 GPUs tensor_parallel_size: 2 # TP across 2 GPUs context_parallel_size: 2 # CP across 2 GPUs # Total: 2 × 2 × 2 = 8 GPUs ``` ### Multi-Node **Llama 3.1 8B with HSDP + TP** ([llama-3_1-8b-hsdp-tp.yaml](./llama-3_1-8b-hsdp-tp.yaml)) - FSDP & TP within nodes, DDP across nodes to minimize inter-node communication - Ideal for: Scaling to multiple nodes while maintaining training efficiency ```yaml dp_shard_size: 4 # FSDP within each 4-GPU group tensor_parallel_size: 2 # TP within each node dp_replicate_size: 2 # DDP across 2 groups # Total: (4 × 2) × 2 = 16 GPUs (2 nodes) ``` ## Learn More - [ND Parallelism Documentation](https://docs.axolotl.ai/docs/nd_parallelism.html) - [Blog: Accelerate ND-Parallel Guide](https://huggingface.co/blog/accelerate-nd-parallel) - [Multi-GPU Training Guide](https://docs.axolotl.ai/docs/multi-gpu.html) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml ================================================ base_model: meta-llama/Llama-3.1-8B plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin dp_shard_size: 4 dp_replicate_size: 2 tensor_parallel_size: 2 # context_parallel_size: 2 dataset_prepared_path: last_run_prepared special_tokens: pad_token: <|end_of_text|> fsdp_version: 2 fsdp_config:
offload_params: false state_dict_type: FULL_STATE_DICT auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: LlamaDecoderLayer reshard_after_forward: true datasets: - path: tatsu-lab/alpaca type: alpaca output_dir: ./outputs/ndp-out/ sequence_len: 2048 sample_packing: true flash_attention: true gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 2 optimizer: adamw_torch_fused lr_scheduler: constant_with_warmup learning_rate: 2e-6 bf16: true tf32: true logging_steps: 1 saves_per_epoch: 1 warmup_ratio: 0.1 ================================================ FILE: examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml ================================================ base_model: Qwen/Qwen3-8B plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin dp_shard_size: 2 # dp_replicate_size: 1 context_parallel_size: 2 tensor_parallel_size: 2 dataset_prepared_path: last_run_prepared fsdp_version: 2 fsdp_config: offload_params: false state_dict_type: FULL_STATE_DICT auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: Qwen3DecoderLayer reshard_after_forward: true datasets: - path: tatsu-lab/alpaca type: alpaca output_dir: ./outputs/ndp-out/ sequence_len: 8192 sample_packing: true flash_attention: true gradient_accumulation_steps: 1 micro_batch_size: 1 # must be 1 when using context parallel num_epochs: 2 optimizer: adamw_torch_fused lr_scheduler: constant_with_warmup learning_rate: 2e-6 bf16: true tf32: true logging_steps: 1 saves_per_epoch: 1 warmup_ratio: 0.1 special_tokens: ================================================ FILE: examples/eaft/eaft-example.yml ================================================ base_model: google/gemma-3-1b-it model_type: Gemma3ForCausalLM cls_model_config: Gemma3TextConfig # gemma3 doesn't seem to play nice with ddp ddp_find_unused_parameters: true chat_template: gemma3 eot_tokens: - load_in_8bit: false load_in_4bit: false strict: false datasets: - path: cgato/SlimOrcaDedupCleaned 
type: chat_template field_messages: conversations message_property_mappings: role: from content: value dataset_prepared_path: val_set_size: 0 output_dir: ./outputs/eaft-gemma-3-1b use_eaft: true eaft_alpha: 1.0 eaft_k: 20 sequence_len: 1024 sample_packing: false adapter: lora_model_dir: wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 eval_batch_size: 1 max_steps: 1000 evaluation_strategy: "no" optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 5e-5 train_on_inputs: false group_by_length: false bf16: auto fp16: tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false early_stopping_patience: resume_from_checkpoint: local_rank: logging_steps: 1 xformers_attention: flash_attention: true warmup_ratio: 0.1 weight_decay: 0.0 debug: deepspeed: fsdp: fsdp_config: special_tokens: ================================================ FILE: examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml ================================================ base_model: tiiuae/Falcon-H1-1.5B-Deep-Base # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true # huggingface repo chat_template: falcon_h1 datasets: - path: cgato/SlimOrcaDedupCleaned type: chat_template field_messages: conversations message_property_mappings: role: from content: value val_set_size: 0.0 output_dir: ./outputs/out adapter: qlora lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: - q_proj - k_proj - v_proj - o_proj - in_proj - gate_proj - up_proj - down_proj sequence_len: 2048 sample_packing: false eval_sample_packing: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: 
cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/falcon-h1/falcon-h1-1b-qlora.yaml ================================================ base_model: tiiuae/Falcon-H1-1.5B-Base # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true # huggingface repo chat_template: falcon_h1 datasets: - path: cgato/SlimOrcaDedupCleaned type: chat_template field_messages: conversations message_property_mappings: role: from content: value val_set_size: 0.0 output_dir: ./outputs/out adapter: qlora lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: - q_proj - k_proj - v_proj - o_proj - in_proj - gate_proj - up_proj - down_proj sequence_len: 2048 sample_packing: false eval_sample_packing: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/falcon-h1/falcon-h1-34b-qlora.yaml ================================================ base_model: 
tiiuae/Falcon-H1-34B-Base # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true # huggingface repo chat_template: falcon_h1 datasets: - path: cgato/SlimOrcaDedupCleaned type: chat_template field_messages: conversations message_property_mappings: role: from content: value val_set_size: 0.0 output_dir: ./outputs/out adapter: qlora lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: - q_proj - k_proj - v_proj - o_proj - in_proj - gate_proj - up_proj - down_proj sequence_len: 2048 sample_packing: false eval_sample_packing: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/falcon-h1/falcon-h1-3b-qlora.yaml ================================================ base_model: tiiuae/Falcon-H1-3B-Base # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true # huggingface repo chat_template: falcon_h1 datasets: - path: cgato/SlimOrcaDedupCleaned type: chat_template field_messages: conversations message_property_mappings: role: from content: value val_set_size: 0.0 output_dir: ./outputs/out adapter: qlora lora_r: 32 
lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: - q_proj - k_proj - v_proj - o_proj - in_proj - gate_proj - up_proj - down_proj sequence_len: 2048 sample_packing: false eval_sample_packing: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/falcon-h1/falcon-h1-500m-qlora.yaml ================================================ base_model: tiiuae/Falcon-H1-0.5B-Instruct # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true # huggingface repo chat_template: falcon_h1 datasets: - path: cgato/SlimOrcaDedupCleaned type: chat_template field_messages: conversations message_property_mappings: role: from content: value val_set_size: 0.0 output_dir: ./outputs/out adapter: qlora lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: - q_proj - k_proj - v_proj - o_proj - in_proj - gate_proj - up_proj - down_proj sequence_len: 2048 sample_packing: false eval_sample_packing: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false 
resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/falcon-h1/falcon-h1-7b-qlora.yaml ================================================ base_model: tiiuae/Falcon-H1-7B-Base # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true # huggingface repo chat_template: falcon_h1 datasets: - path: cgato/SlimOrcaDedupCleaned type: chat_template field_messages: conversations message_property_mappings: role: from content: value val_set_size: 0.0 output_dir: ./outputs/out adapter: qlora lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: - q_proj - k_proj - v_proj - o_proj - in_proj - gate_proj - up_proj - down_proj sequence_len: 2048 sample_packing: false eval_sample_packing: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/gemma2/qlora.yml ================================================ base_model: google/gemma-2-9b # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically 
upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true # huggingface repo chat_template: gemma datasets: - path: cgato/SlimOrcaDedupCleaned type: chat_template drop_system_message: true field_messages: conversations message_property_mappings: role: from content: value val_set_size: 0.0 output_dir: ./outputs/out adapter: qlora lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true sequence_len: 2048 sample_packing: true eval_sample_packing: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/gemma2/reward-model.yaml ================================================ base_model: google/gemma-2-2b # optionally might have model_type or tokenizer_type model_type: AutoModelForSequenceClassification num_labels: 1 tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name reward_model: true chat_template: gemma datasets: - path: argilla/distilabel-intel-orca-dpo-pairs type: bradley_terry.chat_template val_set_size: 0.0 output_dir: ./outputs/out remove_unused_columns: false sequence_len: 2048 sample_packing: false eval_sample_packing: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: true fp16: tf32: true gradient_checkpointing: true 
gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/gemma3/gemma-3-1b-qlora.yml ================================================ base_model: google/gemma-3-1b-it # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name # gemma3 doesn't seem to play nice with ddp ddp_find_unused_parameters: true load_in_8bit: false load_in_4bit: true # huggingface repo chat_template: gemma3 eot_tokens: - datasets: - path: cgato/SlimOrcaDedupCleaned type: chat_template field_messages: conversations message_property_mappings: role: from content: value val_set_size: 0.0 output_dir: ./outputs/out # Freeze vision tower unfrozen_parameters: - ^model\.language_model\..* - ^lm_head\..* adapter: qlora lora_r: 32 lora_alpha: 16 lora_dropout: 0 lora_target_linear: true sequence_len: 2048 sample_packing: true eval_sample_packing: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/gemma3/gemma-3-270m-qlora.yml ================================================ base_model: google/gemma-3-270m-it # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name 
# gemma3 doesn't seem to play nice with ddp ddp_find_unused_parameters: true load_in_8bit: false load_in_4bit: true # huggingface repo chat_template: gemma3 eot_tokens: - datasets: - path: cgato/SlimOrcaDedupCleaned type: chat_template field_messages: conversations message_property_mappings: role: from content: value val_set_size: 0.0 output_dir: ./outputs/out # Freeze vision tower unfrozen_parameters: - ^model\.language_model\..* - ^lm_head\..* adapter: qlora lora_r: 32 lora_alpha: 16 lora_dropout: 0 lora_target_linear: true sequence_len: 2048 sample_packing: true eval_sample_packing: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 special_tokens: ================================================ FILE: examples/gemma3/gemma-3-4b-qlora.yml ================================================ base_model: google/gemma-3-4b-it load_in_4bit: true # gemma3 doesn't seem to play nice with ddp ddp_find_unused_parameters: true chat_template: gemma3 eot_tokens: - datasets: - path: cgato/SlimOrcaDedupCleaned type: chat_template field_messages: conversations message_property_mappings: role: from content: value dataset_prepared_path: last_run_prepared val_set_size: 0.01 output_dir: ./outputs/out # Freeze vision tower unfrozen_parameters: - ^model\.language_model\..* - ^lm_head\..* adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: 
cosine learning_rate: 0.0002 bf16: true fp16: tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false logging_steps: 1 flash_attention: true eager_attention: warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/gemma3/gemma-3-4b-vision-qlora.yml ================================================ base_model: google/gemma-3-4b-it processor_type: AutoProcessor load_in_4bit: true # these 3 lines are needed for now to handle vision chat templates w images skip_prepare_dataset: true remove_unused_columns: false sample_packing: false # gemma3 doesn't seem to play nice with ddp ddp_find_unused_parameters: true chat_template: gemma3 eot_tokens: - datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] dataset_prepared_path: last_run_prepared val_set_size: 0.01 output_dir: ./outputs/out adapter: qlora lora_model_dir: sequence_len: 2048 pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0 lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: true fp16: tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false logging_steps: 1 flash_attention: true eager_attention: warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/gemma3n/README.md ================================================ # Finetune Gemma-3n with Axolotl Gemma-3n is a family of 
multimodal models from Google found on [HuggingFace](https://huggingface.co/collections/google/gemma-3n-685065323f5984ef315c93f4). This guide shows how to fine-tune it with Axolotl. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). Here is an example of how to install from pip: ```bash # Ensure you have PyTorch installed (PyTorch 2.6.0 min) pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0' ``` 2. In addition to Axolotl's requirements, Gemma-3n requires: ```bash pip3 install timm==1.0.17 # for loading audio data pip3 install librosa==0.11.0 ``` 3. Download sample dataset files ```bash # for text + vision + audio only wget https://huggingface.co/datasets/Nanobit/text-vision-audio-2k-test/resolve/main/African_elephant.jpg wget https://huggingface.co/datasets/Nanobit/text-vision-audio-2k-test/resolve/main/En-us-African_elephant.oga ``` 4. Run the finetuning example: ```bash # text only axolotl train examples/gemma3n/gemma-3n-e2b-qlora.yml # text + vision axolotl train examples/gemma3n/gemma-3n-e2b-vision-qlora.yml # text + vision + audio axolotl train examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml ``` Let us know how it goes. Happy finetuning! 🚀 WARNING: The loss and grad norm will be much higher than normal. We currently suspect this is inherent to the model. If anyone would like to submit a fix for this, we are happy to take a look. ### TIPS - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). - The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
- The multimodal dataset format follows the OpenAI multi-content Messages format as seen [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format). ## Optimization Guides - [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) - [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) - [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html) ## Related Resources - [Gemma 3n Blog](https://ai.google.dev/gemma/docs/gemma-3n) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/gemma3n/gemma-3n-e2b-qlora.yml ================================================ base_model: google/gemma-3n-E2B-it # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin cut_cross_entropy: true load_in_8bit: false load_in_4bit: true # for use with fft to only train on language model layers # unfrozen_parameters: # - model.language_model.* # - lm_head # - embed_tokens chat_template: gemma3n eot_tokens: - <end_of_turn> datasets: - path: cgato/SlimOrcaDedupCleaned type: chat_template split: train[:1%] field_messages: conversations message_property_mappings: role: from content: value val_set_size: 0.0 output_dir: ./outputs/out adapter: qlora lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 # lora_target_linear: # Does not work with gemma3n currently lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj' sequence_len: 2048 sample_packing: true eval_sample_packing: true pad_to_sequence_len: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto
tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 # flash_attention: true # Any attention impl does not work with gemma3n now warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 special_tokens: ================================================ FILE: examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml ================================================ base_model: google/gemma-3n-E2B-it processor_type: AutoProcessor # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin cut_cross_entropy: true # for use with fft to only train on language model layers # unfrozen_parameters: # - model.language_model.* # - lm_head # - embed_tokens load_in_4bit: true # these 3 lines are needed for now to handle vision chat templates w images skip_prepare_dataset: true remove_unused_columns: false sample_packing: false # gemma3 doesn't seem to play nice with ddp ddp_find_unused_parameters: true chat_template: gemma3n eot_tokens: - # sample dataset below requires downloading audio/image in advance # wget https://huggingface.co/datasets/Nanobit/text-vision-audio-2k-test/resolve/main/African_elephant.jpg # wget https://huggingface.co/datasets/Nanobit/text-vision-audio-2k-test/resolve/main/En-us-African_elephant.oga datasets: - path: Nanobit/text-vision-audio-2k-test type: chat_template dataset_prepared_path: val_set_size: 0.01 output_dir: ./outputs/out adapter: qlora lora_model_dir: sequence_len: 2048 pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: 
true fp16: tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false logging_steps: 1 # flash_attention: true # Any attention impl does not work with gemma3n now warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 ================================================ FILE: examples/gemma3n/gemma-3n-e2b-vision-qlora.yml ================================================ base_model: google/gemma-3n-E2B-it processor_type: AutoProcessor # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin cut_cross_entropy: true # for use with fft to only train on language model layers # unfrozen_parameters: # - model.language_model.* # - lm_head # - embed_tokens load_in_4bit: true # these 3 lines are needed for now to handle vision chat templates w images skip_prepare_dataset: true remove_unused_columns: false sample_packing: false # gemma3 doesn't seem to play nice with ddp ddp_find_unused_parameters: true chat_template: gemma3n eot_tokens: - datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] dataset_prepared_path: val_set_size: 0.01 output_dir: ./outputs/out adapter: qlora lora_model_dir: sequence_len: 2048 pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: true fp16: tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false logging_steps: 1 # flash_attention: true # Any attention impl does not work with gemma3n now warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 
================================================ FILE: examples/glm4/qlora-32b.yaml ================================================ base_model: THUDM/GLM-4-32B-0414 # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_4bit: true datasets: - path: teknium/GPT4-LLM-Cleaned type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0 output_dir: ./outputs/qlora-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true eval_sample_packing: true lora_r: 16 lora_alpha: 32 lora_dropout: 0.05 lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/glm45/README.md ================================================ # Finetune Z.ai's GLM-4.5-Air with Axolotl [GLM-4.5-Air](https://huggingface.co/zai-org/GLM-4.5-Air) is a MoE model by Z.ai. This guide shows how to fine-tune it with Axolotl. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage. 3. 
Run the finetuning example: ```bash # QLoRA (1x80GB @ ~63.4GiB/GPU) axolotl train examples/glm45/glm-45-air-qlora.yaml ``` ### Dataset In addition to the standard OpenAI Messages format, GLM-4.5 supports an extra parameter for thinking in the assistant section. ```json { "role": "assistant", "reasoning_content": "...", // or have
<think>...</think> in `content` "content": "..." } ``` Make sure you set the below extra attributes if needed: ```yaml datasets: - path: ... type: chat_template message_property_mappings: role: role content: content # tool_calls: tool_calls # uncomment if using tools # reasoning_content: reasoning_content # uncomment if have reasoning # Uncomment if training on tool role (you would rarely if ever need this) # eot_tokens: # - <|observation|> ``` ### Tips - The role name for tools in this template is `tool`. - You will see this Axolotl WARNING — this is expected as the template does not use EOS: ``` EOS token '<|endoftext|>' not found in chat_template. Please check if your template/EOS token is correct. ``` - You can run a full finetuning by removing `adapter: qlora`, `load_in_4bit: true`, and `quantize_moe_experts: true` from the config. - **LoRA kernels**: Incompatible with this model. Must be explicitly disabled (`lora_*_kernel: false`). - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). ## Optimization Guides Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
## Related Resources - [GLM-4.5-Air on HuggingFace](https://huggingface.co/zai-org/GLM-4.5-Air) - [GLM-4.5 Blog](https://z.ai/blog/glm-4.5) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/glm45/glm-45-air-qlora.yaml ================================================ base_model: zai-org/GLM-4.5-Air # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: false load_in_4bit: true quantize_moe_experts: true # important datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/lora-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 16 lora_alpha: 8 lora_dropout: 0 lora_target_modules: - q_proj - v_proj - k_proj - o_proj # lora_target_parameters: # - mlp.experts.gate_up_proj # - mlp.experts.down_proj lora_mlp_kernel: false lora_qkv_kernel: false lora_o_kernel: false gradient_accumulation_steps: 2 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/glm46v/README.md ================================================ # Finetune GLM-4.6V with Axolotl GLM-4.6V is a family of vision-language models from ZhipuAI found on [HuggingFace](https://huggingface.co/zai-org/GLM-4.6V). 
This guide shows how to fine-tune it with Axolotl for vision-language tasks. ## Getting started 1. Install Axolotl from source following the [installation guide](https://docs.axolotl.ai/docs/installation.html#sec-edge-build). 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage. 3. Run the fine-tuning: glm-4-6v-flash(9B) ```bash axolotl train examples/glm46v/glm-4-6v-flash-qlora.yaml ``` Let us know how it goes. Happy finetuning! 🚀 ## Tips - Vision datasets should follow the format described in the [multimodal docs](https://docs.axolotl.ai/docs/multimodal.html#dataset-format) - You can run a **full finetuning** by removing the `adapter: qlora` and `load_in_4bit: true` from the config. - Read more on how to load your own dataset in the [dataset loading docs](https://docs.axolotl.ai/docs/dataset_loading.html). ## Supported Models - **GLM-4.6V**: Full vision-language model (`zai-org/GLM-4.6V`) - **GLM-4.6V-Flash**: Faster variant (`zai-org/GLM-4.6V-Flash`) ## Optimization Guides Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html). 
## Related Resources - [ZhipuAI GLM-4.6V](https://huggingface.co/zai-org/GLM-4.6V) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/glm46v/glm-4-6v-flash-ddp.yaml ================================================ base_model: zai-org/GLM-4.6V-Flash trust_remote_code: true processor_type: AutoProcessor load_in_4bit: true # these 3 lines are needed for now to handle vision chat templates w images skip_prepare_dataset: true remove_unused_columns: false sample_packing: false ddp_find_unused_parameters: true output_dir: ./outputs/glm-4-6v-flash-qlora datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] adapter: qlora lora_r: 16 lora_alpha: 32 lora_dropout: 0.05 lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj sequence_len: 2048 gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false logging_steps: 1 sdp_attention: true warmup_ratio: 0.1 evals_per_epoch: 0 saves_per_epoch: 1 weight_decay: 0.0 ================================================ FILE: examples/glm46v/glm-4-6v-flash-qlora.yaml ================================================ base_model: zai-org/GLM-4.6V-Flash trust_remote_code: true processor_type: AutoProcessor load_in_4bit: true # these 3 lines are needed for now to handle vision chat templates w images skip_prepare_dataset: true remove_unused_columns: false sample_packing: false output_dir: ./outputs/glm-4-6v-flash-qlora datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] adapter: qlora lora_r: 16 lora_alpha: 32 lora_dropout: 0.05 lora_target_modules: - 
gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj sequence_len: 2048 gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true logging_steps: 1 sdp_attention: true warmup_ratio: 0.1 evals_per_epoch: 0 saves_per_epoch: 1 weight_decay: 0.0 ================================================ FILE: examples/glm47-flash/README.md ================================================ # Finetune Z.ai's GLM-4.7-Flash with Axolotl [GLM-4.7-Flash](https://huggingface.co/zai-org/GLM-4.7-Flash) is a 30B-A3B MoE model by Z.ai. This guide shows how to fine-tune it with Axolotl. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage. 3. Run the finetuning example: ```bash # QLoRA # - no target experts (1x48GB @ ~24GiB/GPU) # - target experts (1x48GB @ ~34GiB/GPU) axolotl train examples/glm47-flash/qlora.yaml # QLoRA FSDP2 no target experts (2x48GB @ ~29GiB/GPU) axolotl train examples/glm47-flash/qlora_fsdp.yaml ``` ```bash # LoRA # - no target experts (1x48GB @ ~35GiB/GPU) # - target experts (1x48GB @ OOM. Projected ~45-50GiB/GPU) axolotl train examples/glm47-flash/lora.yaml # LoRA FSDP2 no target experts (2x48GB @ ~43GiB/GPU) axolotl train examples/glm47-flash/lora_fsdp.yaml ``` ### MoE Expert Quantization & Expert LoRA This model quantize expert weights on load. To learn about expert quantization, expert LoRA targeting, and related limitations, see the [MoE Expert Quantization](https://docs.axolotl.ai/docs/expert_quantization.html) docs. ## Limitations - **lora_target_linear**: Incompatible for this model. - **LoRA kernels**: Incompatible with this model due to non-standard attention projections (DSA). 
Must be explicitly disabled (`lora_*_kernel: false`). ### TIPS - For inference, the official Z.ai team recommends these default settings (most tasks): - `temperature: 1.0` - `top_p: 0.95` - `max_new_tokens: 131072` - You can run a full finetuning by removing `adapter: qlora`, `load_in_4bit: true`, and `quantize_moe_experts: true` from the config. This is heavy, so we have not tested this. - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). ## Optimization Guides Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html). ## Related Resources - [GLM-4.7-Flash on HuggingFace](https://huggingface.co/zai-org/GLM-4.7-Flash) - [GLM-4.7 Blog](https://z.ai/blog/glm-4.7) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/glm47-flash/lora.yaml ================================================ base_model: zai-org/GLM-4.7-Flash plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: true quantize_moe_experts: true datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/glm4.7-flash-lora-8bit-out adapter: lora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0 lora_target_modules: - q_proj - v_proj - k_proj - o_proj # Uncomment to also target MoE expert weights: # lora_target_parameters: # - mlp.experts.gate_up_proj # - mlp.experts.down_proj # LoRA kernels incompatible with DSA attention lora_mlp_kernel: false lora_qkv_kernel: false lora_o_kernel: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_torch_8bit 
lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 ================================================ FILE: examples/glm47-flash/lora_fsdp.yaml ================================================ base_model: zai-org/GLM-4.7-Flash plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: true quantize_moe_experts: true datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/glm4.7-flash-lora-8bit-fsdp-out adapter: lora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0 lora_target_modules: - q_proj - v_proj - k_proj - o_proj # Uncomment to also target MoE expert weights: # lora_target_parameters: # - mlp.experts.gate_up_proj # - mlp.experts.down_proj # LoRA kernels incompatible with DSA attention lora_mlp_kernel: false lora_qkv_kernel: false lora_o_kernel: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_torch_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 fsdp_config: fsdp_version: 2 offload_params: false cpu_ram_efficient_loading: false auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: Glm4MoeLiteDecoderLayer state_dict_type: FULL_STATE_DICT sharding_strategy: FULL_SHARD reshard_after_forward: true activation_checkpointing: true ================================================ FILE: examples/glm47-flash/qlora.yaml ================================================ base_model: zai-org/GLM-4.7-Flash plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin 
load_in_4bit: true quantize_moe_experts: true datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/glm4.7-flash-qlora-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0 lora_target_modules: - q_proj - v_proj - k_proj - o_proj # Uncomment to also target MoE expert weights: # lora_target_parameters: # - mlp.experts.gate_up_proj # - mlp.experts.down_proj # LoRA kernels incompatible with DSA attention lora_mlp_kernel: false lora_qkv_kernel: false lora_o_kernel: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_torch_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 ================================================ FILE: examples/glm47-flash/qlora_fsdp.yaml ================================================ base_model: zai-org/GLM-4.7-Flash plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_4bit: true quantize_moe_experts: true datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/glm4.7-flash-qlora-fsdp-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0 lora_target_modules: - q_proj - v_proj - k_proj - o_proj # Uncomment to also target MoE expert weights: # lora_target_parameters: # - mlp.experts.gate_up_proj # - mlp.experts.down_proj # LoRA kernels incompatible with DSA attention lora_mlp_kernel: false lora_qkv_kernel: false lora_o_kernel: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: 
gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_torch_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 fsdp_config: fsdp_version: 2 offload_params: false cpu_ram_efficient_loading: false auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: Glm4MoeLiteDecoderLayer state_dict_type: FULL_STATE_DICT sharding_strategy: FULL_SHARD reshard_after_forward: true activation_checkpointing: true ================================================ FILE: examples/gpt-oss/README.md ================================================ # Finetune OpenAI's GPT-OSS with Axolotl [GPT-OSS](https://huggingface.co/collections/openai/gpt-oss-68911959590a1634ba11c7a4) are a family of open-weight MoE models trained by OpenAI, released in August 2025. There are two variants: 20B and 120B. In October 2025, OpenAI released safeguard models built upon GPT-OSS called [GPT-OSS-Safeguard](https://huggingface.co/collections/openai/gpt-oss-safeguard). They use the same architecture, so the same examples below can be re-used. This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). Here is an example of how to install from pip: ```bash # Ensure you have Pytorch installed (Pytorch 2.6.0 min) pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0' ``` 2. Choose one of the following configs below for training the 20B model. 
(for 120B, see [below](#training-120b)) ```bash # LoRA SFT linear layers (1x48GB @ ~44GiB) axolotl train examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml # FFT SFT with offloading (2x24GB @ ~21GiB/GPU) axolotl train examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml # FFT SFT (8x48GB @ ~36GiB/GPU or 4x80GB @ ~46GiB/GPU) axolotl train examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml ``` Note: Memory usage taken from `device_mem_reserved(gib)` from logs. ### Training 120B On 8xH100s, make sure you have ~3TB of free disk space. With each checkpoint clocking in at ~720GB, along with the base model, and final model output, you may need at least 3TB of free disk space to keep at least 2 checkpoints. ```bash # FFT SFT with offloading (8x80GB @ ~49GiB/GPU) axolotl train examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml ``` To simplify fine-tuning across 2 nodes × 8x H100 (80GB) GPUs, we've partnered with [Baseten](https://baseten.co) to showcase multi-node training of the 120B model using Baseten Truss. You can read more about this recipe on [Baseten's blog](https://www.baseten.co/blog/how-to-fine-tune-gpt-oss-120b-with-baseten-and-axolotl/). The recipe can be found on their [GitHub](https://github.com/basetenlabs/ml-cookbook/tree/main/examples/oss-gpt-120b-axolotl/training). ERRATA: Transformers saves the model Architecture prefixed with `FSDP` which needs to be manually renamed in `config.json`. See https://github.com/huggingface/transformers/pull/40207 for the status of this issue. ```bash sed -i 's/FSDPGptOssForCausalLM/GptOssForCausalLM/g' ./outputs/gpt-oss-out/config.json ``` When using SHARDED_STATE_DICT with FSDP, the final checkpoint should automatically merge the sharded weights to your configured `output_dir`. However, if that step fails due to a disk space error, you can take an additional step to merge the sharded weights. This step will automatically determine the last checkpoint directory and merge the sharded weights to `{output_dir}/merged`. 
```bash axolotl merge-sharded-fsdp-weights examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml mv ./outputs/gpt-oss-out/merged/* ./outputs/gpt-oss-out/ ``` ### How to set reasoning_effort in template? The harmony template has a feature to set the `reasoning_effort` during prompt building. The default is `medium`. If you would like to adjust this, you can add the following to your config: ```yaml chat_template_kwargs: reasoning_effort: "high" # low | medium | high ``` Currently, this applies globally. There is no method to apply per sample yet. If you are interested in adding this, please feel free to create an Issue to discuss. ### Inferencing your fine-tuned model #### vLLM GPT-OSS support in vLLM does not exist in a stable release yet. See https://x.com/MaziyarPanahi/status/1955741905515323425 for more information about using a special vllm-openai docker image for inferencing with vLLM. Optionally, vLLM can be installed from nightly: ```bash pip install --no-build-isolation --pre -U vllm --extra-index-url https://wheels.vllm.ai/nightly ``` and the vLLM server can be started with the following command (modify `--tensor-parallel-size 8` to match your environment): ```bash vllm serve ./outputs/gpt-oss-out/ --served-model-name axolotl/gpt-oss-20b --host 0.0.0.0 --port 8888 --tensor-parallel-size 8 ``` #### SGLang SGLang has 0-day support in main, see https://github.com/sgl-project/sglang/issues/8833 for infomation on installing SGLang from source. Once you've installed SGLang, run the following command to launch a SGLang server: ```bash python3 -m sglang.launch_server --model ./outputs/gpt-oss-out/ --served-model-name axolotl/gpt-oss-120b --host 0.0.0.0 --port 8888 --tp 8 ``` ### Tool use GPT-OSS has a comprehensive tool understanding. Axolotl supports tool calling datasets for Supervised Fine-tuning. 
Here is an example dataset config: ```yaml datasets: - path: Nanobit/text-tools-2k-test type: chat_template ``` See [Nanobit/text-tools-2k-test](https://huggingface.co/datasets/Nanobit/text-tools-2k-test) for the sample dataset. Refer to [our docs](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#using-tool-use) for more info. ### Thinking and chat_template masking conflict OpenAI’s Harmony template hides `thinking` in all non-final turns, which conflicts with Axolotl’s `chat_template` masking. If your dataset has `thinking` content mid-turn, there are two paths we recommend: - Train only on the last turn. This can be accomplished via chat_template's [train on last doc](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#training-on-last-message). - Adjust your dataset to only have `thinking` content in the last turn. ### TIPS - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). - The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). 
## Optimization Guides - [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) - [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) ## Related Resources - [GPT-OSS Blog](https://openai.com/index/introducing-gpt-oss/) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml ================================================ # the original mxfp4 quantized model is not supported with FSDP cpu_ram_efficient_loading # FSDP cpu_ram_efficient_loading is used to reduce the initial CPU memory usage when loading the model base_model: axolotl-ai-co/gpt-oss-120b-dequantized use_kernels: false dp_shard_size: 16 # requires 2x8xH100 nodes plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin experimental_skip_move_to_device: true # prevent OOM by NOT putting model to GPU before sharding datasets: - path: HuggingFaceH4/Multilingual-Thinking type: chat_template field_thinking: thinking template_thinking_key: thinking dataset_prepared_path: last_run_prepared val_set_size: 0 output_dir: ./outputs/gpt-oss-out/ save_total_limit: 2 # the 120B model can use up to 720GB of disk space per checkpoint, so let's only keep the last 2 sequence_len: 4096 sample_packing: true pad_to_sequence_len: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: trackio_project_name: trackio_run_name: trackio_space_id: gradient_accumulation_steps: 2 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_fused # 8bit optimizers do not work with FSDP2 offload lr_scheduler: constant_with_warmup learning_rate: 2e-5 bf16: true tf32: true flash_attention: true attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true 
activation_offloading: true logging_steps: 1 saves_per_epoch: 1 warmup_ratio: 0.03 special_tokens: eot_tokens: - "<|end|>" fsdp_version: 2 fsdp_config: offload_params: true state_dict_type: SHARDED_STATE_DICT auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: GptOssDecoderLayer reshard_after_forward: true cpu_ram_efficient_loading: true ================================================ FILE: examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml ================================================ base_model: openai/gpt-oss-20b use_kernels: false model_quantization_config: Mxfp4Config model_quantization_config_kwargs: dequantize: true plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin experimental_skip_move_to_device: true # prevent OOM by NOT putting model to GPU before sharding datasets: - path: HuggingFaceH4/Multilingual-Thinking type: chat_template field_thinking: thinking template_thinking_key: thinking dataset_prepared_path: last_run_prepared val_set_size: 0 output_dir: ./outputs/gpt-oss-out/ sequence_len: 4096 sample_packing: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: trackio_project_name: trackio_run_name: trackio_space_id: gradient_accumulation_steps: 2 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_8bit lr_scheduler: constant_with_warmup learning_rate: 2e-5 bf16: true tf32: true flash_attention: true attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true activation_offloading: true logging_steps: 1 saves_per_epoch: 1 warmup_ratio: 0.03 special_tokens: eot_tokens: - "<|end|>" # choose the zero3 configuration that best fits your system capabilities deepspeed: deepspeed_configs/zero3_bf16.json ================================================ FILE: examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml ================================================ base_model: openai/gpt-oss-20b use_kernels: true 
model_quantization_config: Mxfp4Config model_quantization_config_kwargs: dequantize: true plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin experimental_skip_move_to_device: true # prevent OOM by NOT putting model to GPU before sharding datasets: - path: HuggingFaceH4/Multilingual-Thinking type: chat_template field_thinking: thinking template_thinking_key: thinking dataset_prepared_path: ./outputs/last_run_prepared val_set_size: 0 output_dir: ./outputs/gpt-oss-out/ sequence_len: 4096 sample_packing: true pad_to_sequence_len: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: trackio_project_name: trackio_run_name: trackio_space_id: gradient_accumulation_steps: 2 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_fused # 8bit optimizers do not work with FSDP2 offload lr_scheduler: constant_with_warmup learning_rate: 2e-5 bf16: true tf32: true flash_attention: true attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true activation_offloading: true logging_steps: 1 saves_per_epoch: 1 warmup_ratio: 0.03 special_tokens: eot_tokens: - "<|end|>" fsdp_version: 2 fsdp_config: offload_params: true state_dict_type: SHARDED_STATE_DICT auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: GptOssDecoderLayer reshard_after_forward: true # cpu_ram_efficient_loading: true # cpu_ram_efficient_loading cannot be used with MXFP4 model quantization. 
# It can only be used with a dequantized model like `axolotl-ai-co/gpt-oss-120b-dequantized` ================================================ FILE: examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml ================================================ base_model: openai/gpt-oss-20b use_kernels: false model_quantization_config: Mxfp4Config model_quantization_config_kwargs: dequantize: true plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin experimental_skip_move_to_device: true # prevent OOM by NOT putting model to GPU before sharding datasets: - path: HuggingFaceH4/Multilingual-Thinking type: chat_template field_thinking: thinking template_thinking_key: thinking dataset_prepared_path: ./outputs/last_run_prepared val_set_size: 0 output_dir: ./outputs/gpt-oss-out/ sequence_len: 4096 sample_packing: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: trackio_project_name: trackio_run_name: trackio_space_id: gradient_accumulation_steps: 2 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_8bit lr_scheduler: constant_with_warmup learning_rate: 2e-5 bf16: true tf32: true flash_attention: true attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true activation_offloading: true logging_steps: 1 saves_per_epoch: 1 warmup_ratio: 0.03 special_tokens: eot_tokens: - "<|end|>" fsdp_version: 2 fsdp_config: offload_params: false state_dict_type: SHARDED_STATE_DICT auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: GptOssDecoderLayer reshard_after_forward: true # cpu_ram_efficient_loading: true ================================================ FILE: examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml ================================================ base_model: openai/gpt-oss-20b use_kernels: true model_quantization_config: Mxfp4Config model_quantization_config_kwargs: dequantize: true plugins: - 
axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin experimental_skip_move_to_device: true # prevent OOM by not putting model to GPU before sharding datasets: - path: HuggingFaceH4/Multilingual-Thinking type: chat_template field_thinking: thinking template_thinking_key: thinking dataset_prepared_path: last_run_prepared val_set_size: 0 output_dir: ./outputs/gpt-oss-out/ sequence_len: 4096 sample_packing: true adapter: lora lora_r: 8 lora_alpha: 16 lora_dropout: 0.0 # dropout not supported when using LoRA over expert parameters lora_target_linear: true # TODO: not supported for now, see peft#2710 #lora_target_parameters: # target the experts in the last two layers # - "22._checkpoint_wrapped_module.mlp.experts.gate_up_proj" # - "22._checkpoint_wrapped_module.mlp.experts.down_proj" # - "23._checkpoint_wrapped_module.mlp.experts.gate_up_proj" # - "23._checkpoint_wrapped_module.mlp.experts.down_proj" wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: trackio_project_name: trackio_run_name: trackio_space_id: gradient_accumulation_steps: 8 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_8bit lr_scheduler: constant_with_warmup learning_rate: 2e-4 bf16: true tf32: true flash_attention: true attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true activation_offloading: true logging_steps: 1 saves_per_epoch: 1 warmup_ratio: 0.1 special_tokens: eot_tokens: - "<|end|>" ================================================ FILE: examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml ================================================ base_model: openai/gpt-oss-safeguard-20b use_kernels: true model_quantization_config: Mxfp4Config model_quantization_config_kwargs: dequantize: true plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin experimental_skip_move_to_device: true # prevent OOM by not putting model to GPU before sharding datasets: - path: 
HuggingFaceH4/Multilingual-Thinking type: chat_template field_thinking: thinking template_thinking_key: thinking dataset_prepared_path: last_run_prepared val_set_size: 0 output_dir: ./outputs/gpt-oss-safeguard-out/ sequence_len: 4096 sample_packing: true adapter: lora lora_r: 8 lora_alpha: 16 lora_dropout: 0.0 # dropout not supported when using LoRA over expert parameters lora_target_linear: true # TODO: not supported for now, see peft#2710 #lora_target_parameters: # target the experts in the last two layers # - "22._checkpoint_wrapped_module.mlp.experts.gate_up_proj" # - "22._checkpoint_wrapped_module.mlp.experts.down_proj" # - "23._checkpoint_wrapped_module.mlp.experts.gate_up_proj" # - "23._checkpoint_wrapped_module.mlp.experts.down_proj" wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: trackio_project_name: trackio_run_name: trackio_space_id: gradient_accumulation_steps: 8 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_8bit lr_scheduler: constant_with_warmup learning_rate: 2e-4 bf16: true tf32: true flash_attention: true attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true activation_offloading: true logging_steps: 1 saves_per_epoch: 1 warmup_ratio: 0.1 special_tokens: eot_tokens: - "<|end|>" ================================================ FILE: examples/granite4/README.md ================================================ # Finetune IBM's Granite 4.0 with Axolotl [Granite 4.0](https://huggingface.co/collections/ibm-granite/granite-40-language-models) are a family of open source models trained by IBM Research. This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). 
You need to install from main as Granite4 is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html). Here is an example of how to install from main for pip: ```bash # Ensure you have Pytorch installed (Pytorch 2.7.1 min) git clone https://github.com/axolotl-ai-cloud/axolotl.git cd axolotl pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja pip3 install --no-build-isolation -e '.[flash-attn]' # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy python scripts/cutcrossentropy_install.py | sh ``` 2. Run the finetuning example: ```bash axolotl train examples/granite4/granite-4.0-tiny-fft.yaml ``` This config uses about 40.8GiB VRAM. Let us know how it goes. Happy finetuning! 🚀 ### TIPS - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). - The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). ### Limitation Adapter finetuning does not work at the moment. It would error with ```bash RuntimeError: mat1 and mat2 shapes cannot be multiplied (4096x3072 and 1x1179648) ``` In addition, if adapter training works, `lora_target_linear: true` will not work due to: ```bash ValueError: Target module GraniteMoeHybridParallelExperts() is not supported. 
``` ## Optimization Guides - [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) - [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) - [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html) ## Related Resources - [Granite Docs](https://www.ibm.com/granite/docs/models/granite) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/granite4/granite-4.0-tiny-fft.yaml ================================================ base_model: ibm-granite/granite-4.0-tiny-preview # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/model-out sequence_len: 2048 sample_packing: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/hunyuan/README.md ================================================ # Finetune HunYuan with Axolotl Tencent released a family of opensource models called HunYuan with varying parameter scales of 0.5B, 1.8B, 4B, and 7B scale for both Pre-trained and Instruct variants. 
The models can be found at [HuggingFace](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7). This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as HunYuan is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html). Here is an example of how to install from main for pip: ```bash # Ensure you have Pytorch installed (Pytorch 2.6.0 min) git clone https://github.com/axolotl-ai-cloud/axolotl.git cd axolotl pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja pip3 install --no-build-isolation -e '.[flash-attn]' # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy python scripts/cutcrossentropy_install.py | sh ``` 2. Run the finetuning example: ```bash axolotl train examples/hunyuan/hunyuan-v1-dense-qlora.yaml ``` This config uses about 4.7 GB VRAM. Let us know how it goes. Happy finetuning! 🚀 ### Dataset HunYuan Instruct models can choose to enter a slow think or fast think pattern. For best performance on fine-tuning their Instruct models, your dataset should be adjusted to match their pattern. ```python # fast think pattern messages = [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "/no_think What color is the sun?" }, {"role": "assistant", "content": "\n\n\n\nThe sun is yellow.\n"} ] # slow think pattern messages = [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "/no_think What color is the sun?" }, {"role": "assistant", "content": "\nThe user is asking about the color of the sun. 
I need to ...\n\n\nThe sun is yellow.\n"} ] ``` ### TIPS - For inference, the official Tencent team recommends ```json { "do_sample": true, "top_k": 20, "top_p": 0.8, "repetition_penalty": 1.05, "temperature": 0.7 } ``` - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). - The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). ## Optimization Guides - [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) - [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) - [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html) ## Related Resources - [Tencent HunYuan Blog](https://hunyuan.tencent.com/) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/hunyuan/hunyuan-v1-dense-qlora.yaml ================================================ base_model: tencent/Hunyuan-0.5B-Instruct # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: false load_in_4bit: true datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/lora-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 
micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/internvl3_5/README.md ================================================ # Finetune OpenGV's InternVL with Axolotl [InternVL 3.5](https://huggingface.co/OpenGVLab/InternVL3_5-8B-HF) is a family of powerful vision-language models supporting dynamic resolution and multi-image understanding by OpenGV. It features a ViT-style vision encoder and strong language model backbone for tasks like visual question answering, OCR, and scene text understanding. This guide shows how to fine-tune it with Axolotl. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). 2. Install `timm` for vision model support: ```bash pip install timm==1.0.19 ``` 3. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage. 4. Run the finetuning example: ```bash axolotl train examples/internvl3_5/internvl3_5-8b-qlora.yml ``` This config uses about 8.21 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀 ### Tips - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). - The dataset format follows the multi-modal format as seen [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format). ## Optimization Guides Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html). 
## Related Resources - [InternVL Paper](https://huggingface.co/papers/2508.18265) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/internvl3_5/internvl3_5-8b-qlora.yml ================================================ base_model: OpenGVLab/InternVL3_5-8B-HF processor_type: AutoProcessor plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_4bit: true # these 3 lines are needed for now to handle vision chat templates w images skip_prepare_dataset: true remove_unused_columns: false sample_packing: false datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] field_messages: messages dataset_prepared_path: last_run_prepared val_set_size: 0.01 output_dir: ./outputs/out adapter: qlora lora_model_dir: sequence_len: 2048 lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: true fp16: tf32: true gradient_checkpointing: true logging_steps: 1 flash_attention: true eager_attention: warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/jamba/README.md ================================================ # Jamba - ✅ qlora w/ deepspeed Zero-2 needs at least 2x GPUs and - 35GiB VRAM per GPU w minimal context length - 56GiB VRAM per GPU (w multipack enabled) - ✅ qlora w/ deepspeed Zero-3 needs at least 2x GPUs and 67GiB VRAM (wtf?) 
- ✅ qlora single-gpu, ~51GiB VRAM - ✅ multipack - ✅ FSDP - ❓ 8-bit LoRA ================================================ FILE: examples/jamba/qlora.yaml ================================================ base_model: ai21labs/Jamba-v0.1 # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true load_in_8bit: false load_in_4bit: true datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.0 output_dir: ./outputs/out sequence_len: 4096 sample_packing: false pad_to_sequence_len: false eval_sample_packing: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: adapter: qlora lora_r: 8 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true low_cpu_mem_usage: true gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 2 optimizer: paged_adamw_8bit lr_scheduler: cosine learning_rate: 0.00001 bf16: auto tf32: false gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/jamba/qlora_deepspeed.yaml ================================================ base_model: ai21labs/Jamba-v0.1 # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true load_in_8bit: false load_in_4bit: true datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.0 output_dir: ./outputs/out sequence_len: 4096 sample_packing: false pad_to_sequence_len: false eval_sample_packing: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: adapter: qlora lora_r: 8 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true 
low_cpu_mem_usage: true gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 2 optimizer: paged_adamw_8bit lr_scheduler: cosine learning_rate: 0.00001 bf16: auto tf32: false gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 deepspeed: deepspeed_configs/zero2.json weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/jamba/qlora_fsdp_large.yaml ================================================ base_model: ai21labs/AI21-Jamba-1.5-Large # optionally might have model_type or tokenizer_type tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_4bit: true use_tensorboard: true chat_template: jamba datasets: - path: cgato/SlimOrcaDedupCleaned type: chat_template drop_system_message: true field_messages: conversations message_property_mappings: role: from content: value dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: jamba-large-fsdp-qlora-ft adapter: qlora sequence_len: 2048 sample_packing: true lora_r: 16 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: [down_proj,gate_proj,in_proj,k_proj,o_proj,out_proj,q_proj,up_proj,v_proj,x_proj] lora_target_linear: false gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 2 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 0.00001 bf16: true tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: true logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 fsdp: - full_shard - auto_wrap fsdp_config: fsdp_limit_all_gathers: true fsdp_sync_module_states: true fsdp_offload_params: false fsdp_use_orig_params: false 
fsdp_cpu_ram_efficient_loading: true fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_transformer_layer_cls_to_wrap: JambaAttentionDecoderLayer,JambaMambaDecoderLayer fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/kimi-linear/README.md ================================================ # Finetune MoonshotAI's Kimi Linear with Axolotl [Kimi Linear](https://huggingface.co/collections/moonshotai/kimi-linear-a3b) is a MoE model (48B total, 3B active) by MoonshotAI using a hybrid linear attention architecture to achieve a 1M token context length. It uses Kimi Delta Attention (KDA), a refined version of Gated DeltaNet that reduces KV cache size by up to 75% and boosts decoding throughput by up to 6x for long contexts. This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking. **Note:** Axolotl uses experimental training code for Kimi Linear as their original modeling code is inference-only. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). 2. Install CCE via [docs](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) 3. Run the finetuning example: ```bash axolotl train examples/kimi-linear/kimi-48b-lora.yaml ``` This config uses about 98.7GiB VRAM. Let us know how it goes. Happy finetuning! ### TIPS - Kimi Linear requires `trust_remote_code: true`. - You can run a full finetuning by removing the `adapter: lora` and `load_in_8bit: true`. 
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html) - The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template) ## Optimization Guides See 👉 [docs](https://docs.axolotl.ai/docs/optimizations.html). ## Limitations This is not yet compatible with MoE kernels from transformers v5. ## Related Resources - [Kimi Linear Paper](https://huggingface.co/papers/2510.26692) - [Kimi Linear GitHub](https://github.com/MoonshotAI/Kimi-Linear) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/kimi-linear/kimi-48b-lora.yaml ================================================ base_model: moonshotai/Kimi-Linear-48B-A3B-Instruct # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: true load_in_4bit: false strict: false datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template split: train dataset_prepared_path: last_run_prepared val_set_size: 0.2 output_dir: ./outputs/lora-out adapter: lora lora_model_dir: sequence_len: 2048 sample_packing: true pad_to_sequence_len: true lora_r: 16 lora_alpha: 32 lora_dropout: 0.05 lora_fan_in_fan_out: lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_8bit lr_scheduler: cosine learning_rate: 0.0002 train_on_inputs: false group_by_length: false bf16: auto fp16: tf32: false gradient_checkpointing: true early_stopping_patience: resume_from_checkpoint: 
local_rank: logging_steps: 1 flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 warmup_ratio: 0.1 evals_per_epoch: 2 saves_per_epoch: 1 debug: deepspeed: weight_decay: 0.0 fsdp: fsdp_config: special_tokens: ================================================ FILE: examples/llama-2/README.md ================================================ # Overview This is an example of a llama-2 configuration for 7b and 13b. The yaml file contains configuration for the 7b variant, but you can just as well use the same settings for 13b. The 7b variant fits on any 24GB VRAM GPU and will take up about 17 GB of VRAM during training if using qlora and 20 GB if using lora. On an RTX 4090 it trains 3 epochs of the default dataset in about 15 minutes. The 13b variant will fit if you change these settings to these values: gradient_accumulation_steps: 2 micro_batch_size: 1 ```shell accelerate launch -m axolotl.cli.train examples/llama-2/qlora.yml ``` or ```shell accelerate launch -m axolotl.cli.train examples/llama-2/lora.yml ``` To launch a full finetuning with 16-bit precision: ```shell accelerate launch -m axolotl.cli.train examples/llama-2/fft_optimized.yml ``` ================================================ FILE: examples/llama-2/fft_optimized.yml ================================================ base_model: NousResearch/Llama-2-7b-hf # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.05 output_dir: ./outputs/out sequence_len: 4096 sample_packing: true adapter: lora_model_dir: lora_r: lora_alpha: lora_dropout: lora_target_linear: wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit 
lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true flash_attn_cross_entropy: false flash_attn_rms_norm: true flash_attn_fuse_mlp: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 deepspeed: #deepspeed_configs/zero2.json # multi-gpu only weight_decay: 0.1 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-2/gptq-lora.yml ================================================ base_model: TheBloke/Llama-2-7B-GPTQ # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name gptq: true gptq_disable_exllama: true tokenizer_use_fast: true tokenizer_legacy: true push_dataset_to_hub: hf_use_auth_token: true datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 adapter: lora lora_model_dir: sequence_len: 4096 sample_packing: lora_r: 8 lora_alpha: 32 lora_dropout: 0.05 lora_target_modules: - k_proj - o_proj - q_proj - v_proj lora_target_linear: wandb_project: wandb_watch: wandb_name: wandb_log_model: output_dir: ./outputs/model-out gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_torch_fused adam_beta2: 0.95 adam_eps: 0.00001 max_grad_norm: 1.0 torchdistx_path: lr_scheduler: cosine lr_quadratic_warmup: true learning_rate: 0.000017 bf16: false fp16: false float16: true tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: sdp_attention: flash_optimum: warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 special_tokens: bos_token: "<s>" eos_token: "</s>" unk_token: "<unk>" # save_first_step: true # uncomment this to validate checkpoint saving works with your 
config ================================================ FILE: examples/llama-2/lisa.yml ================================================ base_model: NousResearch/Llama-2-7b-hf # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name datasets: - path: teknium/GPT4-LLM-Cleaned type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.05 output_dir: ./outputs/lisa-out sequence_len: 4096 sample_packing: true adapter: lora_model_dir: lora_r: lora_alpha: lora_dropout: lora_target_linear: lisa_n_layers: 4 lisa_step_interval: 20 lisa_layers_attribute: model.layers wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 5e-5 # recommendation from lisa paper for 7b bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true flash_attn_cross_entropy: false flash_attn_rms_norm: true flash_attn_fuse_mlp: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 special_tokens: bos_token: "<s>" eos_token: "</s>" unk_token: "<unk>" # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-2/loftq.yml ================================================ base_model: NousResearch/Llama-2-7b-hf # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true adapter: lora lora_model_dir: lora_r: 32 
lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true peft: loftq_config: loftq_bits: 4 wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-2/lora.yml ================================================ base_model: NousResearch/Llama-2-7b-hf # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: true load_in_4bit: false datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-2/qlora-fsdp.yml ================================================ base_model: NousResearch/Llama-2-7b-hf # optionally might have 
model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true datasets: - path: yahma/alpaca-cleaned type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.05 output_dir: ./outputs/qlora-out adapter: qlora lora_model_dir: sequence_len: 512 sample_packing: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 4 num_epochs: 4 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 0.00001 bf16: auto tf32: false gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 fsdp: - full_shard - auto_wrap fsdp_config: fsdp_limit_all_gathers: true fsdp_sync_module_states: true fsdp_offload_params: true fsdp_use_orig_params: false fsdp_cpu_ram_efficient_loading: true fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer fsdp_state_dict_type: FULL_STATE_DICT # fsdp_cpu_offload_pin_memory: false # uncomment to enable swap memory usage when RAM is insufficient special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-2/qlora.yml ================================================ base_model: NousResearch/Llama-2-7b-hf # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca 
dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/qlora-out adapter: qlora lora_model_dir: sequence_len: 4096 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: paged_adamw_32bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-2/relora.yml ================================================ base_model: NousResearch/Llama-2-7b-hf model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer load_in_8bit: false load_in_4bit: true datasets: - path: teknium/GPT4-LLM-Cleaned type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/relora-out adapter: qlora lora_model_dir: sequence_len: 4096 sample_packing: true lora_r: 8 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true relora: true relora_prune_ratio: 0.9 relora_cpu_offload: false jagged_restart_steps: 150 jagged_restart_warmup_steps: 10 jagged_restart_anneal_steps: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 4 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: bos_token: "<s>" eos_token: "</s>" unk_token: "<unk>" # save_first_step: true # uncomment this to validate checkpoint saving works with your config 
================================================ FILE: examples/llama-3/3b-fp8-fsdp2.yaml ================================================ base_model: meta-llama/Llama-3.2-3B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: false strict: false plugins: - axolotl.integrations.liger.LigerPlugin liger_rope: true liger_rms_norm: true liger_glu_activation: true liger_layer_norm: true liger_fused_linear_cross_entropy: true datasets: - path: yahma/alpaca-cleaned type: alpaca output_dir: ./outputs/fp8_out/ sample_packing: true pad_to_sequence_len: true sequence_len: 512 flex_attention: true flex_attn_compile_kwargs: dynamic: false mode: max-autotune-no-cudagraphs torch_compile: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 16 num_epochs: 1 optimizer: adamw_torch_fused cosine_constant_lr_ratio: 0 cosine_min_lr_ratio: 1.0 learning_rate: 2e-5 save_only_model: true fp8: true fp8_enable_fsdp_float8_all_gather: true resume_from_checkpoint: logging_steps: 1 evals_per_epoch: 1 saves_per_epoch: 1 warmup_steps: 10 weight_decay: 0.0 fsdp_version: 2 fsdp_config: offload_params: false auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: LlamaDecoderLayer state_dict_type: FULL_STATE_DICT sharding_strategy: FULL_SHARD reshard_after_forward: true activation_checkpointing: false special_tokens: pad_token: <|end_of_text|> # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3/3b-qat-fsdp2.yaml ================================================ base_model: meta-llama/Llama-3.2-3B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: false strict: false plugins: - axolotl.integrations.liger.LigerPlugin liger_rope: true 
liger_rms_norm: true liger_glu_activation: true liger_layer_norm: true liger_fused_linear_cross_entropy: true datasets: - path: yahma/alpaca-cleaned type: alpaca split: train[:95%] output_dir: ./outputs/qat_out/ dataset_prepared_path: ./outputs/qat_out/dataset_prepared sample_packing: false sequence_len: 8192 flash_attention: true qat: activation_dtype: int8 weight_dtype: int4 group_size: 32 wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 16 num_epochs: 1 optimizer: adamw_torch_fused cosine_constant_lr_ratio: 0 cosine_min_lr_ratio: 1.0 learning_rate: 2e-5 save_only_model: true bf16: true resume_from_checkpoint: logging_steps: 1 evals_per_epoch: 1 saves_per_epoch: 1 warmup_ratio: 0.1 weight_decay: 0.0 fsdp: - full_shard - auto_wrap fsdp_config: fsdp_version: 2 fsdp_offload_params: false fsdp_cpu_ram_efficient_loading: false fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD fsdp_reshard_after_forward: true fsdp_activation_checkpointing: true special_tokens: pad_token: <|finetune_right_pad_id|> # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3/3b-qat-mxfp4.yaml ================================================ base_model: meta-llama/Llama-3.2-3B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: false strict: false plugins: - axolotl.integrations.liger.LigerPlugin liger_rope: true liger_rms_norm: true liger_glu_activation: true liger_layer_norm: true liger_fused_linear_cross_entropy: true datasets: - path: yahma/alpaca-cleaned type: alpaca split: train[:95%] output_dir: ./outputs/qat_out/ dataset_prepared_path: ./outputs/dataset_prepared sequence_len: 2048 flash_attention: true qat: 
activation_dtype: mxfp4 weight_dtype: mxfp4 group_size: 32 wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_checkpointing: true activation_offloading: true gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_8bit cosine_constant_lr_ratio: 0 cosine_min_lr_ratio: 1.0 learning_rate: 2e-5 save_only_model: true bf16: true resume_from_checkpoint: logging_steps: 1 evals_per_epoch: 1 saves_per_epoch: 1 warmup_ratio: 0.1 weight_decay: 0.0 special_tokens: pad_token: <|finetune_right_pad_id|> # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3/3b-qat-nvfp4.yaml ================================================ base_model: meta-llama/Llama-3.2-3B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: false strict: false plugins: - axolotl.integrations.liger.LigerPlugin liger_rope: true liger_rms_norm: true liger_glu_activation: true liger_layer_norm: true liger_fused_linear_cross_entropy: true datasets: - path: yahma/alpaca-cleaned type: alpaca split: train[:95%] output_dir: ./outputs/qat_out/ dataset_prepared_path: ./outputs/dataset_prepared sequence_len: 8192 flash_attention: true qat: activation_dtype: nvfp4 weight_dtype: nvfp4 group_size: 16 # only group_size of 16 is supported with nvfp4 wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_checkpointing: true gradient_accumulation_steps: 1 micro_batch_size: 64 num_epochs: 1 optimizer: adamw_torch_fused cosine_constant_lr_ratio: 0 cosine_min_lr_ratio: 1.0 learning_rate: 2e-5 save_only_model: true bf16: true resume_from_checkpoint: logging_steps: 1 evals_per_epoch: 1 saves_per_epoch: 1 warmup_ratio: 0.1 weight_decay: 0.0 special_tokens: pad_token: <|finetune_right_pad_id|> # save_first_step: true # uncomment this to validate checkpoint saving 
works with your config ================================================ FILE: examples/llama-3/README.md ================================================ # Llama-3 https://llama.meta.com/llama3/ [8B Base Model](https://huggingface.co/meta-llama/Meta-Llama-3-8B) - [Full Fine Tune](./fft-8b.yaml) - Single GPU @ 48GB VRAM - [LoRA](./lora-8b.yml) - Single GPU @ 11GB VRAM [70B Base Model](https://huggingface.co/meta-llama/Meta-Llama-3-70B) - [QLORA+FSDP](./qlora-fsdp-70b.yaml) - Dual GPU @ 21GB VRAM ================================================ FILE: examples/llama-3/diffusion/pretrain-1b.yaml ================================================ base_model: meta-llama/Llama-3.2-1B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name pretraining_dataset: - path: wikitext name: wikitext-103-raw-v1 type: completion field: text plugins: - axolotl.integrations.diffusion.DiffusionPlugin diffusion: noise_schedule: cosine min_mask_ratio: 0.15 max_mask_ratio: 0.85 num_diffusion_steps: 128 eps: 5e-4 importance_weighting: true mask_token_id: 128002 generate_samples: true generation_interval: 250 output_dir: ./outputs/model-out sequence_len: 512 sample_packing: true gradient_accumulation_steps: 8 micro_batch_size: 4 max_steps: 10000 warmup_ratio: 0.1 optimizer: adamw_8bit lr_scheduler: cosine learning_rate: 3e-4 sdp_attention: true bf16: auto tf32: true logging_steps: 1 save_strategy: steps save_steps: 1000 special_tokens: pad_token: "<|end_of_text|>" wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3/diffusion/sft-1b.yaml ================================================ base_model: meta-llama/Llama-3.2-1B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name datasets: - path: teknium/GPT4-LLM-Cleaned type: 
alpaca val_set_size: 0.05 plugins: - axolotl.integrations.diffusion.DiffusionPlugin diffusion: noise_schedule: cosine min_mask_ratio: 0.1 max_mask_ratio: 0.9 num_diffusion_steps: 128 eps: 1e-3 importance_weighting: true mask_token_id: 128002 generate_samples: true generation_interval: 250 output_dir: ./outputs/model-out sequence_len: 512 sample_packing: true eval_sample_packing: true gradient_accumulation_steps: 4 micro_batch_size: 4 num_epochs: 1 warmup_steps: 0.1 optimizer: adamw_8bit lr_scheduler: cosine learning_rate: 1e-5 bf16: auto tf32: true gradient_checkpointing: true resume_from_checkpoint: sdp_attention: true logging_steps: 1 save_strategy: best eval_strategy: epoch special_tokens: pad_token: "<|end_of_text|>" wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3/fft-8b-liger-fsdp.yaml ================================================ base_model: NousResearch/Meta-Llama-3.1-8B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.liger.LigerPlugin liger_rope: true liger_rms_norm: true liger_glu_activation: true liger_fused_linear_cross_entropy: true chat_template: llama3 datasets: - path: mlabonne/FineTome-100k type: chat_template split: train[:20%] field_messages: conversations message_property_mappings: role: from content: value dataset_prepared_path: last_run_prepared val_set_size: 0.02 output_dir: ./outputs/out sequence_len: 4096 sample_packing: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 2e-5 bf16: auto tf32: false gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 
flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 2 saves_per_epoch: 1 weight_decay: 0.0 fsdp: - full_shard - auto_wrap fsdp_config: fsdp_limit_all_gathers: true fsdp_sync_module_states: true fsdp_offload_params: true fsdp_use_orig_params: false fsdp_cpu_ram_efficient_loading: true fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD fsdp_backward_prefetch: BACKWARD_PRE special_tokens: pad_token: <|finetune_right_pad_id|> eos_token: <|eot_id|> # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3/fft-8b.yaml ================================================ base_model: NousResearch/Meta-Llama-3.1-8B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name datasets: - path: tatsu-lab/alpaca type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.05 output_dir: ./outputs/out sequence_len: 8192 sample_packing: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 8 micro_batch_size: 1 num_epochs: 1 optimizer: paged_adamw_8bit lr_scheduler: cosine learning_rate: 2e-5 bf16: auto tf32: false gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 2 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: <|end_of_text|> # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3/instruct-dpo-lora-8b.yml ================================================ base_model: meta-llama/Meta-Llama-3-8B-Instruct # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: 
AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name special_tokens: pad_token: <|finetune_right_pad_id|> eos_token: <|eot_id|> load_in_8bit: true load_in_4bit: false chat_template: llama3 rl: dpo datasets: - path: fozziethebeat/alpaca_messages_2k_dpo_test type: chat_template.default field_messages: conversation field_chosen: chosen field_rejected: rejected message_property_mappings: role: role content: content roles: system: - system user: - user assistant: - assistant dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: false adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3/instruct-lora-8b.yml ================================================ base_model: NousResearch/Meta-Llama-3-8B-Instruct # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: true load_in_4bit: false chat_template: llama3 datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: false adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true 
wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: <|end_of_text|> # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3/lora-1b-deduplicate-dpo.yml ================================================ base_model: meta-llama/Llama-3.2-1B # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: true load_in_4bit: false chat_template: llama3 rl: dpo datasets: - path: fozziethebeat/alpaca_messages_2k_dpo_test type: chat_template.default field_messages: conversation field_chosen: chosen field_rejected: rejected message_property_mappings: role: role content: content roles: system: - system user: - user assistant: - assistant - path: fozziethebeat/alpaca_messages_2k_dpo_test type: chat_template.default field_messages: conversation field_chosen: chosen field_rejected: rejected message_property_mappings: role: role content: content roles: system: - system user: - user assistant: - assistant dataset_exact_deduplication: true dataset_prepared_path: val_set_size: 0 output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: false adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: 
auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3/lora-1b-deduplicate-sft.yml ================================================ base_model: meta-llama/Llama-3.2-1B # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: true load_in_4bit: false datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.0 output_dir: ./outputs/lora-out dataset_exact_deduplication: true sequence_len: 4096 sample_packing: true eval_sample_packing: false adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_modules_to_save: - embed_tokens - lm_head wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: <|end_of_text|> # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3/lora-1b-kernels.yml ================================================ base_model: NousResearch/Llama-3.2-1B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name datasets: - path: teknium/GPT4-LLM-Cleaned type: 
alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/lora-out adapter: lora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 16 lora_alpha: 32 # Currently, we don't support dropout with our custom Triton kernels # lora_dropout: 0.05 lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj # These options enable our custom Triton kernels / autograd # functions for MLP and attention calculations lora_mlp_kernel: true lora_qkv_kernel: true lora_o_kernel: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: "<|end_of_text|>" # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3/lora-1b-ray.yml ================================================ base_model: NousResearch/Llama-3.2-1B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name datasets: - path: teknium/GPT4-LLM-Cleaned type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/lora-out adapter: lora lora_model_dir: sequence_len: 2048 sample_packing: true eval_sample_packing: true lora_r: 16 lora_alpha: 32 lora_dropout: 0.05 lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: 
false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 deepspeed: deepspeed_configs/zero3.json weight_decay: 0.0 special_tokens: pad_token: "<|end_of_text|>" use_ray: true ray_num_workers: 4 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3/lora-1b-sample-packing-sequentially.yml ================================================ base_model: meta-llama/Llama-3.2-1B # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: true load_in_4bit: false datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.0 output_dir: ./outputs/lora-out test_value: true sequence_len: 4096 sample_packing: true sample_packing_sequentially: true curriculum_sampling: true eval_sample_packing: false adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_modules_to_save: - embed_tokens - lm_head wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: <|end_of_text|> # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3/lora-1b.yml 
================================================ base_model: NousResearch/Llama-3.2-1B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name datasets: - path: teknium/GPT4-LLM-Cleaned type: alpaca val_set_size: 0.1 output_dir: ./outputs/lora-out adapter: lora lora_model_dir: sequence_len: 2048 sample_packing: true eval_sample_packing: true lora_r: 16 lora_alpha: 32 lora_dropout: 0.05 lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: "<|end_of_text|>" # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3/lora-8b.yml ================================================ base_model: NousResearch/Meta-Llama-3-8B # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: true load_in_4bit: false datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true eval_sample_packing: false adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_modules_to_save: - embed_tokens - lm_head wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 
num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: <|end_of_text|> # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3/opentelemetry-qlora.yml ================================================ base_model: NousResearch/Llama-3.2-1B model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer load_in_4bit: true datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca output_dir: ./outputs/opentelemetry-example adapter: qlora sequence_len: 512 sample_packing: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true # OpenTelemetry Configuration use_otel_metrics: true otel_metrics_host: "localhost" otel_metrics_port: 8000 # Disable WandB use_wandb: false gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: paged_adamw_32bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true logging_steps: 1 flash_attention: false warmup_ratio: 0.1 evals_per_epoch: 2 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: "<|end_of_text|>" ================================================ FILE: examples/llama-3/qlora-1b-gdpo.yaml ================================================ base_model: meta-llama/Llama-3.2-1B-Instruct chat_template: llama3 rl: gdpo trl: beta: 0.001 max_completion_length: 128 num_generations: 2 temperature: 0.7 top_p: 0.95 use_vllm: false multi_objective_aggregation: normalize_then_sum reward_funcs: - rwd.format_reward - rwd.correctness_reward reward_weights: [1.0, 2.0] log_completions: true num_completions_to_print: 3 scale_rewards: true datasets: - path: openai/gsm8k name: main split: train[:1000] type: 
rwd.gsm8k_transform val_set_size: 0.0 output_dir: ./outputs/llama3-gdpo-out sequence_len: 512 sample_packing: false pad_to_sequence_len: false gradient_accumulation_steps: 8 micro_batch_size: 1 num_epochs: 1 max_steps: 100 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 5e-5 weight_decay: 0.01 warmup_steps: 10 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false flash_attention: true logging_steps: 1 save_steps: 50 save_safetensors: true special_tokens: pad_token: "<|end_of_text|>" seed: 42 ================================================ FILE: examples/llama-3/qlora-1b-kto.yaml ================================================ base_model: meta-llama/Llama-3.2-1B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true rl: kto rl_beta: 0.5 kto_desirable_weight: 0.2 datasets: - path: argilla/ultrafeedback-binarized-preferences-cleaned-kto type: llama3.ultra split: train dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/qlora-out remove_unused_columns: false adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: false # not supported with kto eval_sample_packing: false pad_to_sequence_len: false lora_r: 32 lora_alpha: 64 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: "<|end_of_text|>" # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ 
FILE: examples/llama-3/qlora-1b.yml ================================================ base_model: NousResearch/Llama-3.2-1B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true datasets: - path: teknium/GPT4-LLM-Cleaned type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/qlora-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true eval_sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: "<|end_of_text|>" # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3/qlora-fsdp-405b.yaml ================================================ base_model: hugging-quants/Meta-Llama-3.1-405B-BNB-NF4-BF16 # optionally might have model_type or tokenizer_type tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_4bit: true datasets: - path: tatsu-lab/alpaca type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out/qlora-llama3_1-405b adapter: qlora sequence_len: 2048 sample_packing: true lora_r: 16 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 2 optimizer: 
adamw_torch_fused lr_scheduler: cosine learning_rate: 0.00001 bf16: true tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: true logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 fsdp: - full_shard - auto_wrap fsdp_config: fsdp_limit_all_gathers: true fsdp_sync_module_states: true fsdp_offload_params: true fsdp_use_orig_params: false fsdp_cpu_ram_efficient_loading: true fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD special_tokens: pad_token: <|finetune_right_pad_id|> # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3/qlora-fsdp-70b.yaml ================================================ base_model: casperhansen/llama-3-70b-fp16 # optionally might have model_type or tokenizer_type model_type: LlamaForCausalLM tokenizer_type: AutoTokenizer # PreTrainedTokenizerFast # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true datasets: - path: tatsu-lab/alpaca type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.05 output_dir: ./outputs/out/qlora-llama3-70b adapter: qlora lora_model_dir: sequence_len: 512 sample_packing: false lora_r: 8 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 0.00001 bf16: auto tf32: false gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 fsdp: - 
full_shard - auto_wrap fsdp_config: fsdp_limit_all_gathers: true fsdp_sync_module_states: true fsdp_offload_params: true fsdp_use_orig_params: false fsdp_cpu_ram_efficient_loading: true fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD special_tokens: pad_token: <|end_of_text|> # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3/qlora.yml ================================================ base_model: NousResearch/Meta-Llama-3-8B # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true datasets: - path: aaditya/alpaca_subset_1 type: alpaca dataset_prepared_path: val_set_size: 0 output_dir: ./outputs/qlora-out adapter: qlora lora_model_dir: sequence_len: 4096 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: paged_adamw_32bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: "<|end_of_text|>" # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3/sparse-finetuning.yaml ================================================ base_model: neuralmagic/Sparse-Llama-3.1-8B-2of4 plugins: - axolotl.integrations.llm_compressor.LLMCompressorPlugin load_in_8bit: 
false load_in_4bit: false strict: false datasets: - path: tatsu-lab/alpaca type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.05 output_dir: ./outputs/out sequence_len: 4096 sample_packing: true eval_sample_packing: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 8 micro_batch_size: 1 num_epochs: 1 optimizer: paged_adamw_8bit lr_scheduler: cosine learning_rate: 2e-5 train_on_inputs: false group_by_length: false bf16: auto fp16: tf32: false gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false early_stopping_patience: resume_from_checkpoint: logging_steps: 1 xformers_attention: flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 2 eval_table_size: saves_per_epoch: 1 debug: deepspeed: weight_decay: 0.0 fsdp: fsdp_config: special_tokens: pad_token: <|end_of_text|> llmcompressor: recipe: finetuning_stage: finetuning_modifiers: ConstantPruningModifier: targets: [ 're:.*q_proj.weight', 're:.*k_proj.weight', 're:.*v_proj.weight', 're:.*o_proj.weight', 're:.*gate_proj.weight', 're:.*up_proj.weight', 're:.*down_proj.weight', ] start: 0 save_compressed: true # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-3-vision/lora-11b.yaml ================================================ base_model: alpindale/Llama-3.2-11B-Vision-Instruct # optionally might have model_type or tokenizer_type or processor_type processor_type: AutoProcessor # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name # these 3 lines are needed for now to handle vision chat templates w images skip_prepare_dataset: true remove_unused_columns: false sample_packing: false chat_template: llama3_2_vision datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] dataset_prepared_path: val_set_size: 0.0 output_dir: 
./outputs/out adapter: lora lora_model_dir: sequence_len: 8192 pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: true fp16: tf32: true gradient_checkpointing: true logging_steps: 1 # flash_attention: true # use for text-only mode sdp_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-4/README.md ================================================ # Llama 4 by Meta AI ## Flash Attention vs Flex Attention While Flash Attention support is "enabled" for Llama-4, the upstream implementation is not correct and usage of Flex Attention is recommended. ## Available Examples ### Llama 4 Scout 17Bx16Experts (109B) Flex Attention - [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100-flex.yaml) - [Text Multi GPU QLoRA w/ FSDP2](./scout-qlora-flexattn-fsdp2.yaml) [//]: # (Flash Attention (Do not use)) [//]: # (- [Multi-Modal/Vision QLoRA w/ FSDP1](./scout-vision-qlora-fsdp.yaml)) [//]: # (- [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100.yaml)) [//]: # (- [Text Multi GPU QLoRA w/ FSDP1](./scout-qlora-fsdp1.yaml)) Our Single H100 implementation for Llama 4 Scout uses only 64.5GB VRAM for post-training with 4k context length @ 519 tokens/second.
[WandB logs here](https://wandb.ai/axolotl-ai/llama4-flexattn-qlora/runs/wpie7dkj) Multi-GPU (4xH100) for Llama 4 Scout uses 62.8GB VRAM/GPU @ 4k context length @ 280tps/gpu, [WandB logs here](https://wandb.ai/axolotl-ai/llama4-flexattn-qlora/runs/2lkezdj8) ### Llama 4 Maverick 17Bx128Experts (400B) Coming Soon ## Delinearized Llama 4 Models We provide a script to delinearize Llama 4 linearized models into regular HuggingFace Llama 4 models. ```bash axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir ``` Note: This only works with the non-quantized linearized model. If you have an adapter, merge it with the *non-quantized linearized* model before delinearizing. ================================================ FILE: examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml ================================================ base_model: axolotl-quants/Llama-4-Maverick-17B-128E-Linearized-bnb-nf4-bf16 model_type: Llama4ForConditionalGeneration # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.liger.LigerPlugin liger_glu_activation: true liger_rms_norm: true liger_layer_norm: true llama4_linearized_experts: true load_in_4bit: true adapter: qlora lora_r: 32 lora_alpha: 64 lora_target_modules: - self_attn.q_proj - self_attn.k_proj - self_attn.v_proj - self_attn.o_proj - shared_expert.gate_proj - shared_expert.up_proj - shared_expert.down_proj # - experts.gate_projs.[0-9]+$ # - experts.up_projs.[0-9]+$ # - experts.down_projs.[0-9]+$ lora_modules_to_save: # - lm_head # - embed_tokens chat_template: llama4 datasets: - path: mlabonne/FineTome-100k type: chat_template split: train[:20%] field_messages: conversations message_property_mappings: role: from content: value dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out sequence_len: 4096 sample_packing: true gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer:
adamw_torch_fused lr_scheduler: cosine learning_rate: 1e-4 bf16: true tf32: true logging_steps: 1 flash_attention: true gradient_checkpointing: offload gradient_checkpointing_kwargs: use_reentrant: false warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 fsdp: - auto_wrap - full_shard fsdp_config: fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer fsdp_limit_all_gathers: true fsdp_sync_module_states: true fsdp_offload_params: true fsdp_use_orig_params: false fsdp_cpu_ram_efficient_loading: true fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD special_tokens: pad_token: <|finetune_right_pad|> eos_token: <|eot|> # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml ================================================ base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16 model_type: Llama4ForConditionalGeneration # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name # torch_compile: true plugins: - axolotl.integrations.liger.LigerPlugin liger_glu_activation: true liger_rms_norm: true liger_layer_norm: true llama4_linearized_experts: true load_in_4bit: true adapter: qlora lora_r: 32 lora_alpha: 64 lora_target_modules: - self_attn.q_proj - self_attn.k_proj - self_attn.v_proj - self_attn.o_proj - shared_expert.gate_proj - shared_expert.up_proj - shared_expert.down_proj # - experts.gate_projs.[0-9]+$ # - experts.up_projs.[0-9]+$ # - experts.down_projs.[0-9]+$ lora_modules_to_save: - lm_head - embed_tokens chat_template: llama4 datasets: - path: mlabonne/FineTome-100k type: chat_template split: train[:20%] field_messages: conversations message_property_mappings: role: from content: value dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: 
./outputs/out sequence_len: 4096 sample_packing: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 2e-5 bf16: true tf32: true logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 fsdp: - auto_wrap - full_shard fsdp_config: fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer fsdp_limit_all_gathers: true fsdp_sync_module_states: true fsdp_offload_params: true fsdp_use_orig_params: false fsdp_cpu_ram_efficient_loading: true fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD fsdp_activation_checkpointing: true special_tokens: pad_token: <|finetune_right_pad|> eos_token: <|eot|> # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml ================================================ base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16 model_type: Llama4ForConditionalGeneration # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.liger.LigerPlugin liger_glu_activation: true liger_rms_norm: true liger_layer_norm: true llama4_linearized_experts: true load_in_4bit: true adapter: qlora lora_r: 32 lora_alpha: 64 lora_target_modules: - self_attn.q_proj - self_attn.k_proj - self_attn.v_proj - self_attn.o_proj - shared_expert.gate_proj - shared_expert.up_proj - shared_expert.down_proj # - experts.gate_projs.[0-9]+$ # - experts.up_projs.[0-9]+$ # - experts.down_projs.[0-9]+$ lora_modules_to_save: # - lm_head # - embed_tokens lora_mlp_kernel: true lora_qkv_kernel: true lora_o_kernel: true chat_template: llama4 datasets: - path: mlabonne/FineTome-100k 
type: chat_template split: train[:20%] field_messages: conversations message_property_mappings: role: from content: value dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out sequence_len: 4096 # up to 8k will work on a single H100 sample_packing: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_4bit lr_scheduler: cosine learning_rate: 1e-4 bf16: true tf32: true logging_steps: 1 flash_attention: true gradient_checkpointing: offload gradient_checkpointing_kwargs: use_reentrant: false warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: <|finetune_right_pad|> eos_token: <|eot|> # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml ================================================ base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16 model_type: Llama4ForConditionalGeneration processor_type: Llama4Processor # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name # these 3 lines are needed for now to handle vision chat templates w images skip_prepare_dataset: true remove_unused_columns: false sample_packing: false sequence_len: 4096 plugins: - axolotl.integrations.liger.LigerPlugin liger_glu_activation: true liger_rms_norm: true liger_layer_norm: true llama4_linearized_experts: true # use Axolotl's customized model load_in_4bit: true adapter: qlora lora_r: 32 lora_alpha: 64 lora_target_modules: - self_attn.q_proj - self_attn.k_proj - self_attn.v_proj - self_attn.o_proj - shared_expert.gate_proj - shared_expert.up_proj - shared_expert.down_proj - vision_adapter.mlp.fc1 - vision_adapter.mlp.fc2 # - experts.gate_projs.[0-9]+$ # - experts.up_projs.[0-9]+$ # - 
experts.down_projs.[0-9]+$ lora_modules_to_save: - lm_head - embed_tokens chat_template: llama4 datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_4bit lr_scheduler: cosine learning_rate: 2e-5 bf16: true tf32: true logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 fsdp: - auto_wrap - full_shard fsdp_config: fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer fsdp_limit_all_gathers: true fsdp_sync_module_states: true fsdp_offload_params: true fsdp_use_orig_params: false fsdp_cpu_ram_efficient_loading: true fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD fsdp_activation_checkpointing: true special_tokens: pad_token: <|finetune_right_pad|> eos_token: <|eot|> # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-4/scout-qlora-flexattn-fsdp2.yaml ================================================ base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16 model_type: Llama4ForConditionalGeneration # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.liger.LigerPlugin liger_glu_activation: true liger_rms_norm: true liger_layer_norm: true llama4_linearized_experts: true load_in_4bit: true adapter: qlora lora_r: 32 lora_alpha: 64 lora_target_modules: - self_attn.q_proj - self_attn.k_proj - self_attn.v_proj - self_attn.o_proj - shared_expert.gate_proj - shared_expert.up_proj - shared_expert.down_proj # - experts.gate_projs.[0-9]+$ # - experts.up_projs.[0-9]+$ # - experts.down_projs.[0-9]+$ lora_modules_to_save: # - lm_head # - 
embed_tokens chat_template: llama4 datasets: - path: mlabonne/FineTome-100k type: chat_template split: train[:20%] field_messages: conversations message_property_mappings: role: from content: value dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out sequence_len: 4096 sample_packing: true gradient_accumulation_steps: 1 micro_batch_size: 2 num_epochs: 3 optimizer: adamw_torch_4bit lr_scheduler: cosine learning_rate: 1e-4 bf16: true tf32: true logging_steps: 1 flex_attention: true flex_attn_compile_kwargs: dynamic: false mode: max-autotune-no-cudagraphs warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 fsdp: - auto_wrap - full_shard fsdp_config: fsdp_version: 2 fsdp_offload_params: false # fsdp_cpu_ram_efficient_loading: true # does not work with load_in_8bit/4bit fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer fsdp_state_dict_type: SHARDED_STATE_DICT fsdp_sharding_strategy: FULL_SHARD fsdp_reshard_after_forward: true fsdp_activation_checkpointing: true special_tokens: pad_token: <|finetune_right_pad|> eos_token: <|eot|> # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-4/scout-qlora-single-h100-flex.yaml ================================================ base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16 model_type: Llama4ForConditionalGeneration # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.liger.LigerPlugin - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin liger_glu_activation: true liger_rms_norm: true liger_layer_norm: true llama4_linearized_experts: true # needed with custom linearized experts model load_in_4bit: true adapter: qlora lora_r: 32 lora_alpha: 64 lora_target_modules: - self_attn.q_proj - self_attn.k_proj - 
self_attn.v_proj - self_attn.o_proj - shared_expert.gate_proj - shared_expert.up_proj - shared_expert.down_proj # - experts.gate_projs.[0-9]+$ # optionally train the moe experts # - experts.up_projs.[0-9]+$ # - experts.down_projs.[0-9]+$ lora_modules_to_save: # - lm_head # needed if modifying vocabulary # - embed_tokens lora_mlp_kernel: true lora_qkv_kernel: true lora_o_kernel: true chat_template: llama4 datasets: - path: mlabonne/FineTome-100k type: chat_template split: train[:20%] field_messages: conversations message_property_mappings: role: from content: value dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out sequence_len: 4096 # up to 8k will work on a single H100 sample_packing: true gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_4bit lr_scheduler: cosine learning_rate: 1e-4 bf16: true tf32: true torch_compile: true flex_attention: true flex_attn_compile_kwargs: dynamic: false mode: max-autotune-no-cudagraphs gradient_checkpointing: offload gradient_checkpointing_kwargs: use_reentrant: false logging_steps: 1 warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: <|finetune_right_pad|> eos_token: <|eot|> # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml ================================================ base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16 model_type: Llama4ForConditionalGeneration processor_type: Llama4Processor # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name # these 3 lines are needed for now to handle vision chat templates w images skip_prepare_dataset: true remove_unused_columns: false sample_packing: false sequence_len: 4096 plugins: - axolotl.integrations.liger.LigerPlugin liger_glu_activation: true 
liger_rms_norm: true liger_layer_norm: true llama4_linearized_experts: true # use Axolotl's customized model load_in_4bit: true adapter: qlora lora_r: 32 lora_alpha: 64 lora_target_modules: - self_attn.q_proj - self_attn.k_proj - self_attn.v_proj - self_attn.o_proj - shared_expert.gate_proj - shared_expert.up_proj - shared_expert.down_proj - vision_adapter.mlp.fc1 - vision_adapter.mlp.fc2 # - experts.gate_projs.[0-9]+$ # - experts.up_projs.[0-9]+$ # - experts.down_projs.[0-9]+$ lora_modules_to_save: - lm_head - embed_tokens chat_template: llama4 datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_4bit lr_scheduler: cosine learning_rate: 1e-4 bf16: true tf32: true logging_steps: 1 flex_attention: true flex_attn_compile_kwargs: dynamic: false mode: max-autotune-no-cudagraphs warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 fsdp: - auto_wrap - full_shard fsdp_config: fsdp_version: 2 fsdp_offload_params: false fsdp_cpu_ram_efficient_loading: true fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer fsdp_state_dict_type: SHARDED_STATE_DICT fsdp_sharding_strategy: FULL_SHARD fsdp_reshard_after_forward: true fsdp_activation_checkpointing: true special_tokens: pad_token: <|finetune_right_pad|> eos_token: <|eot|> # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/llava/lora-7b.yaml ================================================ base_model: llava-hf/llava-1.5-7b-hf processor_type: AutoProcessor # these 3 lines are needed for now to handle vision chat templates w images skip_prepare_dataset: true remove_unused_columns: false sample_packing: false chat_template: llava datasets: - 
path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] dataset_prepared_path: val_set_size: 0.0 output_dir: ./outputs/out adapter: lora lora_model_dir: sequence_len: 8192 pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: true fp16: tf32: true gradient_checkpointing: true logging_steps: 1 flash_attention: true eager_attention: warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/magistral/README.md ================================================ # Finetune Magistral Small with Axolotl Magistral Small is a 24B-parameter open-source model from MistralAI found on HuggingFace at [2506](https://huggingface.co/mistralai/Magistral-Small-2506), [2507](https://huggingface.co/mistralai/Magistral-Small-2507) (see [Thinking](#thinking)), and [2509](https://huggingface.co/mistralai/Magistral-Small-2509) (see [Vision](#vision)). This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking. MistralAI has also released a proprietary medium-sized version called Magistral Medium. Thanks to the team at MistralAI for giving us early access to prepare for these releases. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
Here is an example of how to install from pip: ```bash # Ensure you have Pytorch installed (Pytorch 2.7.0 min) pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0' ``` 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage ```bash python scripts/cutcrossentropy_install.py | sh ``` 3. Run the finetuning example: ```bash axolotl train examples/magistral/magistral-small-qlora.yaml ``` This config uses about 24GB VRAM. Let us know how it goes. Happy finetuning! 🚀 ### Thinking MistralAI has released their [2507](https://huggingface.co/mistralai/Magistral-Small-2507) model with thinking capabilities, enabling Chain-of-Thought reasoning with explicit thinking steps. 📚 **[See the Thinking fine-tuning guide →](./think/README.md)** ### Vision MistralAI has released their [2509](https://huggingface.co/mistralai/Magistral-Small-2509) model with vision capabilities. 📚 **[See the Vision fine-tuning guide →](./vision/README.md)** ### Tips - We recommend adding the same/similar SystemPrompt that the model is tuned for. You can find this within the repo's files titled `SYSTEM_PROMPT.txt`. - For inference, the official MistralAI team recommends `top_p: 0.95` and `temperature: 0.7` with `max_tokens: 40960`. - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). - The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). 
## Optimization Guides - [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) - [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) - [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html) ## Limitations We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only. In addition, we do not support overriding tokens yet. ## Related Resources - [MistralAI Magistral Blog](https://mistral.ai/news/magistral/) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ## Future Work - Add parity to Preference Tuning, RL, etc. - Add parity to other tokenizer configs like overriding tokens. ================================================ FILE: examples/magistral/magistral-small-fsdp-qlora.yaml ================================================ base_model: mistralai/Magistral-Small-2506 # Enable to use mistral-common tokenizer tokenizer_use_mistral_common: true # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: false load_in_4bit: true datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/lora-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true eval_sample_packing: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: 
resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 fsdp: - full_shard - auto_wrap fsdp_config: fsdp_state_dict_type: FULL_STATE_DICT fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer fsdp_activation_checkpointing: true # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/magistral/magistral-small-qlora.yaml ================================================ base_model: mistralai/Magistral-Small-2506 # Enable to use mistral-common tokenizer tokenizer_use_mistral_common: true # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: false load_in_4bit: true datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/lora-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/magistral/think/README.md ================================================ # Magistral Small Thinking Fine-tuning This guide covers fine-tuning [Magistral Small 
2507](https://huggingface.co/mistralai/Magistral-Small-2507) with thinking capabilities using Axolotl. The thinking model enables explicit Chain-of-Thought reasoning with separate thinking and response sections. ## Prerequisites Before starting, ensure you have: - Installed Axolotl (see [main README](../README.md)) ## Getting Started Run the thinking model fine-tuning: ```bash axolotl train examples/magistral/think/magistral-small-think-qlora.yaml ``` This config uses about 19.1 GiB VRAM. ### Tips - Dataset uses multi-content format with `type: thinking` support. See [Dataset Format](#dataset-format) below. - You cannot mix `content: str` and `content: list[dict]`, otherwise, dataset loading will fail. Keep it consistent. ## Dataset Format The thinking model requires the multi-content dataset format with support for an extra `type: thinking` content entry within system and assistant messages. Example format: ```json { "messages": [ { "role": "system", "content": [ { "type": "text", "text": "{SYSTEM_PROMPT}"} ] }, { "role": "user", "content": [ { "type": "text", "text": "Solve this step by step: What is 15% of 240?"} ] }, { "role": "assistant", "content": [ { "type": "thinking", "thinking": "I need to calculate 15% of 240. First, I'll convert 15% to decimal: 0.15. Then multiply: 0.15 × 240 = 36." }, { "type": "text", "text": "To find 15% of 240, I'll multiply 240 by 0.15:\n\n240 × 0.15 = 36\n\nTherefore, 15% of 240 is 36."
} ] } ] } ``` ### Advanced Options The `thinking` section supports an optional `closed` parameter: ```json { "type": "thinking", "thinking": "Internal reasoning here...", "closed": true // Default: true, controls adding the closing [/THINK] tag } ``` ================================================ FILE: examples/magistral/think/magistral-small-think-qlora.yaml ================================================ base_model: mistralai/Magistral-Small-2507 # Enable to use mistral-common tokenizer tokenizer_use_mistral_common: true # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: false load_in_4bit: true datasets: - path: Nanobit/text-think-2k-test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0 output_dir: ./outputs/lora-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/magistral/vision/README.md ================================================ # Magistral Small Vision Fine-tuning This guide covers fine-tuning [Magistral Small 2509](https://huggingface.co/mistralai/Magistral-Small-2509) with vision capabilities using Axolotl. 
## Prerequisites Before starting, ensure you have: - Installed Axolotl from source (see [main README](../README.md)) ## Getting started 1. Install the required vision lib: ```bash pip install 'mistral-common[opencv]==1.8.5' ``` 2. Download the example dataset image: ```bash wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg ``` 3. Run the fine-tuning: ```bash axolotl train examples/magistral/vision/magistral-small-vision-24B-qlora.yml ``` This config uses about 17GiB VRAM. WARNING: The loss and grad norm will be much higher than normal at first. We suspect this is inherent to the model at the moment. If anyone would like to submit a fix for this, we are happy to take a look. ### Tips Key differences from text-only model: - `max_tokens: 131072` for inference - Multi-modal dataset format required - Sample packing not supported ## Dataset Format The vision model requires multi-modal dataset format as documented [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format). One exception is that passing `"image": PIL.Image` is not supported. MistralTokenizer only supports `path`, `url`, and `base64` for now. Example: ```json { "messages": [ {"role": "system", "content": [{ "type": "text", "text": "{SYSTEM_PROMPT}"}]}, {"role": "user", "content": [ { "type": "text", "text": "What's in this image?"}, {"type": "image", "path": "path/to/image.jpg" } ]}, {"role": "assistant", "content": [{ "type": "text", "text": "..." }]} ] } ``` ## Limitations - Sample Packing is not supported for multi-modality training currently.
================================================ FILE: examples/magistral/vision/magistral-small-vision-24B-qlora.yml ================================================ base_model: mistralai/Magistral-Small-2509 processor_type: AutoProcessor # Enable to use mistral-common tokenizer tokenizer_use_mistral_common: true plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_4bit: true # these 3 lines are needed for now to handle vision chat templates w images skip_prepare_dataset: true remove_unused_columns: false sample_packing: false # sample dataset below requires downloading image in advance # wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg datasets: - path: Nanobit/text-vision-2k-test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.01 output_dir: ./outputs/out adapter: qlora lora_model_dir: sequence_len: 2048 lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: true fp16: tf32: true gradient_checkpointing: true logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/mamba/config.yml ================================================ base_model: state-spaces/mamba-2.8b # optionally might have model_type or tokenizer_type or tokenizer_config model_type: MambaLMHeadModel tokenizer_type: AutoTokenizer tokenizer_config: EleutherAI/gpt-neox-20b # Automatically upload checkpoint and final model to HF # hub_model_id: 
username/custom_model_name datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.0 output_dir: ./outputs/out sequence_len: 2048 sample_packing: false pad_to_sequence_len: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 2 optimizer: paged_adamw_8bit lr_scheduler: cosine learning_rate: 5e-5 train_on_inputs: false group_by_length: true bf16: auto tf32: true gradient_checkpointing: false resume_from_checkpoint: logging_steps: 1 flash_attention: warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/mimo/README.md ================================================ # Finetune Xiaomi's MiMo with Axolotl [MiMo](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL) is a family of models trained from scratch for reasoning tasks, incorporating **Multiple-Token Prediction (MTP)** as an additional training objective for enhanced performance and faster inference. Pre-trained on ~25T tokens with a three-stage data mixture strategy and optimized reasoning pattern density. This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). 2. Run the finetuning example: ```bash axolotl train examples/mimo/mimo-7b-qlora.yaml ``` This config uses about 17.2 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀 ### Tips - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). 
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). ## Optimization Guides Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html). ## Limitations **Cut Cross Entropy (CCE)**: Currently not supported. We plan to include CCE support for MiMo in the near future. ## Related Resources - [MiMo Paper](https://arxiv.org/abs/2505.07608) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/mimo/mimo-7b-qlora.yaml ================================================ base_model: XiaomiMiMo/MiMo-7B-RL trust_remote_code: true revision_of_model: 6299b5a # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name # CCE - N/A as of now # plugins: # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: false load_in_4bit: true datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/lora-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 # save_first_step: true # uncomment this to validate checkpoint saving works with your config 
================================================ FILE: examples/ministral/README.md ================================================ # Finetune Ministral with Axolotl Ministral is a family of open-weight models from MistralAI found on [HuggingFace](https://huggingface.co/mistralai/Ministral-8B-Instruct-2410). This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage. 3. Run the finetuning example: ```bash axolotl train examples/ministral/ministral-small-qlora.yaml ``` This config uses about 8.76 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀 ### Tips - We recommend adding the same/similar SystemPrompt that the model is tuned for. You can find this within the repo's files titled `SYSTEM_PROMPT.txt`. - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). - The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). ## Optimization Guides Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html). ## Limitations We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only. In addition, we do not support overriding tokens yet.
## Related Resources - [MistralAI Ministral Blog](https://mistral.ai/news/ministraux) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ## Future Work - Add parity to Preference Tuning, RL, etc. - Add parity to other tokenizer configs like overriding tokens. ================================================ FILE: examples/ministral/ministral-small-qlora.yaml ================================================ base_model: mistralai/Ministral-8B-Instruct-2410 # Enable to use mistral-common tokenizer tokenizer_use_mistral_common: true # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: false load_in_4bit: true datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/lora-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/ministral3/README.md ================================================ # Finetune Ministral3 with Axolotl Ministral3 is a family of open-weight models from MistralAI found on 
[HuggingFace](https://huggingface.co/collections/mistralai/ministral-3). This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking. Please see [Thinking](#thinking) and [Vision](#vision) for their respective fine-tuning. Thanks to the team at MistralAI for giving us early access to prepare for these releases. Note: This is still experimental given it is based on transformers v5 RC. ## Getting started 1. Install Axolotl from source following the [installation guide](https://docs.axolotl.ai/docs/installation.html#sec-edge-build). 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage. 3. Swap to the Axolotl transformers v5 branch ```bash cp examples/ministral3/ministral3-3b-qlora.yaml ministral3-3b-qlora.yaml git fetch git checkout transformers-v5 # Install packages for transformers v5 pip install -e . ``` 4. Run the fine-tuning: ```bash axolotl train ministral3-3b-qlora.yaml ``` Let us know how it goes. Happy finetuning! 🚀 ### Tips - We recommend adding the same/similar SystemPrompt that the model is tuned for. You can find this within the repo's files titled `SYSTEM_PROMPT.txt`. - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). - The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). ### Thinking Ministral3 2512 model supports thinking capabilities, enabling Chain-of-Thought reasoning with explicit thinking steps. 📚 **[See the Thinking fine-tuning guide →](./think/README.md)** ### Vision Ministral3 2512 model also supports vision capabilities. 
📚 **[See the Vision fine-tuning guide →](./vision/README.md)** ## Optimization Guides Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html). ## Limitations We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only. In addition, we do not support overriding tokens yet. ## Related Resources - [MistralAI Mistral3 Blog](https://mistral.ai/news/mistral-3) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ## Future Work - Add parity to Preference Tuning, RL, etc. - Add parity to other tokenizer configs like overriding tokens. ================================================ FILE: examples/ministral3/ministral3-3b-qlora.yaml ================================================ base_model: mistralai/Ministral-3-3B-Reasoning-2512 # Enable to use mistral-common tokenizer tokenizer_use_mistral_common: true # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: false load_in_4bit: true datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/lora-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true scaling_softmax: true 
warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/ministral3/think/README.md ================================================ # Ministral3 2512 Thinking Fine-tuning This guide covers fine-tuning [Ministral3 2512](https://huggingface.co/collections/mistralai/ministral-3) with thinking capabilities using Axolotl. The thinking model enables explicit Chain-of-Thought reasoning with separate thinking and response sections. ## Prerequisites Before starting, ensure you have: - Installed Axolotl (see [main README](../README.md)) ## Getting Started Run the thinking model fine-tuning: ```bash axolotl train examples/ministral3/think/ministral3-3b-think-qlora.yaml ``` This config uses about 4.76 GiB VRAM. ### Tips - Dataset uses multi-content format with `type: thinking` support. See [Dataset Format](#dataset-format) below. - You cannot mix `content: str` and `content: list[dict]`; otherwise, dataset loading will fail. Keep it consistent. ## Dataset Format The thinking model requires the multi-content dataset format with support for an extra `type: thinking` content entry within system and assistant messages. Example format: ```json { "messages": [ { "role": "system", "content": [ { "type": "text", "text": "{SYSTEM_PROMPT}"} ] }, { "role": "user", "content": [ { "type": "text", "text": "Solve this step by step: What is 15% of 240?"} ] }, { "role": "assistant", "content": [ { "type": "thinking", "thinking": "I need to calculate 15% of 240. First, I'll convert 15% to decimal: 0.15. Then multiply: 0.15 × 240 = 36." }, { "type": "text", "text": "To find 15% of 240, I'll multiply 240 by 0.15:\n\n240 × 0.15 = 36\n\nTherefore, 15% of 240 is 36."
} ] } ] } ``` ### Advanced Options The `thinking` section supports an optional `closed` parameter: ```json { "type": "thinking", "thinking": "Internal reasoning here...", "closed": true // Default: true, controls adding the closing [/THINK] tag } ``` ================================================ FILE: examples/ministral3/think/ministral3-3b-think-qlora.yaml ================================================ base_model: mistralai/Ministral-3-3B-Reasoning-2512 # Enable to use mistral-common tokenizer tokenizer_use_mistral_common: true # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: false load_in_4bit: true datasets: - path: Nanobit/text-think-2k-test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0 output_dir: ./outputs/lora-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/ministral3/vision/README.md ================================================ # Ministral3 2512 Vision Fine-tuning This guide covers fine-tuning [Ministral3 2512](https://huggingface.co/collections/mistralai/ministral-3) with vision capabilities using Axolotl. 
## Prerequisites Before starting, ensure you have: - Installed Axolotl from source (see [main README](../README.md)) ## Getting started 1. Install the required vision lib: ```bash pip install 'mistral-common[opencv]==1.8.6' ``` 2. Download the example dataset image: ```bash wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg ``` 3. Run the fine-tuning: ```bash axolotl train examples/ministral3/vision/ministral3-3b-vision-qlora.yml ``` WARNING: The loss and grad norm will be much higher than normal at first. We suspect this is inherent to the model at the moment. If anyone would like to submit a fix for this, we are happy to take a look. ### Tips Key differences from text-only model: - Multi-modal dataset format required - Sample packing not supported ## Dataset Format The vision model requires multi-modal dataset format as documented [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format). One exception is that passing `"image": PIL.Image` is not supported. MistralTokenizer only supports `path`, `url`, and `base64` for now. Example: ```json { "messages": [ {"role": "system", "content": [{ "type": "text", "text": "{SYSTEM_PROMPT}"}]}, {"role": "user", "content": [ { "type": "text", "text": "What's in this image?"}, {"type": "image", "path": "path/to/image.jpg" } ]}, {"role": "assistant", "content": [{ "type": "text", "text": "..." }]} ] } ``` ## Limitations - Sample Packing is not supported for multi-modality training currently.
================================================ FILE: examples/ministral3/vision/ministral3-3b-vision-qlora.yml ================================================ base_model: mistralai/Ministral-3-3B-Reasoning-2512 processor_type: AutoProcessor # Enable to use mistral-common tokenizer tokenizer_use_mistral_common: true plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_4bit: true # these 3 lines are needed for now to handle vision chat templates w images skip_prepare_dataset: true remove_unused_columns: false sample_packing: false # sample dataset below requires downloading image in advance # wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg datasets: - path: Nanobit/text-vision-2k-test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.01 output_dir: ./outputs/out adapter: qlora lora_model_dir: sequence_len: 2048 lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: true fp16: tf32: true gradient_checkpointing: true logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/mistral/README.md ================================================ **Mistral 7B** is a language model with a total of 7.3 billion parameters, showcasing a notable performance across a variety of benchmarks. 
Fine Tune: ```shell accelerate launch -m axolotl.cli.train examples/mistral/config.yml ``` If you run into CUDA OOM, use deepspeed with config zero2.json: ```shell accelerate launch -m axolotl.cli.train examples/mistral/config.yml --deepspeed deepspeed_configs/zero2.json ``` ================================================ FILE: examples/mistral/bigstral/bigstral-ds-zero3.yaml ================================================ base_model: mistral-community/Mixtral-8x22B-v0.1 # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true unfrozen_parameters: - ^lm_head.weight$ - ^model.embed_tokens.weight$ - model.layers.4[4-9]+.block_sparse_moe.gate - model.layers.4[4-9]+.block_sparse_moe.experts - model.layers.5[0-5]+.block_sparse_moe.gate - model.layers.5[0-5]+.block_sparse_moe.experts model_config: output_router_logits: true datasets: - path: tatsu-lab/alpaca type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.05 output_dir: ./outputs/out sequence_len: 2048 sample_packing: true gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 3 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0001 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true save_total_limit: 1 save_steps: deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_params.json weight_decay: 0.0 special_tokens: eos_token: "<|im_end|>" tokens: - "<|im_start|>" # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/mistral/config.yml ================================================ base_model: mistralai/Mistral-7B-v0.1 # optionally might have model_type or tokenizer_type model_type: MistralForCausalLM tokenizer_type: LlamaTokenizer # 
Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/out sequence_len: 8192 sample_packing: true eval_sample_packing: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.000005 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/mistral/dpo/mistral-dpo-qlora.yml ================================================ #Note that we are switching from the regular chat template to chatml. #If you experience problems with the special tokens, training for more epochs can help. #After training, merge the model before inference otherwise you might #face problems with the special tokens. 
base_model: mistralai/Mistral-7B-Instruct-v0.2 # optionally might have model_type or tokenizer_type model_type: MistralForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true chat_template: chatml rl: dpo datasets: - path: olivermolenschot/alpaca_messages_dpo_test type: chat_template.default field_messages: conversation field_chosen: chosen field_rejected: rejected message_property_mappings: role: role content: content dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/dpo-qlora sequence_len: 2048 sample_packing: false adapter: qlora lora_model_dir: lora_r: 8 lora_alpha: 16 lora_dropout: 0.2 lora_target_linear: true lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj lora_modules_to_save: - embed_tokens - lm_head wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 16 num_epochs: 6 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0001 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: false warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: bos_token: "<|im_start|>" eos_token: "<|im_end|>" # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/mistral/lora.yml ================================================ base_model: mistralai/Mistral-7B-v0.1 # optionally might have model_type or tokenizer_type model_type: MistralForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: true load_in_4bit: false datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.1 
output_dir: ./outputs/lora-out adapter: lora lora_model_dir: sequence_len: 8192 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/mistral/mistral-qlora-fsdp.yml ================================================ base_model: mistralai/Mixtral-8x7B-v0.1 # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true load_in_8bit: false load_in_4bit: true datasets: - path: tatsu-lab/alpaca type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.02 output_dir: ./outputs/qlora-out model_config: output_router_logits: true adapter: qlora lora_model_dir: sequence_len: 1024 sample_packing: false pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: paged_adamw_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true loss_watchdog_threshold: 5.0 
loss_watchdog_patience: 3 warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 fsdp: - full_shard - auto_wrap fsdp_config: fsdp_limit_all_gathers: true fsdp_sync_module_states: true fsdp_offload_params: false fsdp_use_orig_params: false fsdp_cpu_ram_efficient_loading: false fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer fsdp_state_dict_type: FULL_STATE_DICT fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/mistral/mixtral/mixtral-8x22b-qlora-fsdp.yml ================================================ base_model: mistral-community/Mixtral-8x22B-v0.1 # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true datasets: - path: tatsu-lab/alpaca type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.02 output_dir: ./outputs/qlora-out model_config: output_router_logits: true adapter: qlora lora_model_dir: sequence_len: 1024 sample_packing: false pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 fsdp: - full_shard - auto_wrap fsdp_config: fsdp_limit_all_gathers: true fsdp_sync_module_states: true fsdp_offload_params: true fsdp_use_orig_params: false fsdp_cpu_ram_efficient_loading: 
true fsdp_transformer_layer_cls_to_wrap: MixtralSparseMoeBlock fsdp_state_dict_type: FULL_STATE_DICT fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/mistral/mixtral/mixtral-qlora-fsdp.yml ================================================ base_model: mistralai/Mixtral-8x7B-v0.1 # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true load_in_8bit: false load_in_4bit: true datasets: - path: tatsu-lab/alpaca type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.02 output_dir: ./outputs/qlora-out model_config: output_router_logits: true adapter: qlora lora_model_dir: sequence_len: 1024 sample_packing: false pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 fsdp: - full_shard - auto_wrap fsdp_config: fsdp_limit_all_gathers: true fsdp_sync_module_states: true fsdp_offload_params: true fsdp_use_orig_params: false fsdp_cpu_ram_efficient_loading: true fsdp_transformer_layer_cls_to_wrap: MixtralSparseMoeBlock fsdp_state_dict_type: FULL_STATE_DICT fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_sharding_strategy: FULL_SHARD fsdp_forward_prefetch: false fsdp_backward_prefetch: BACKWARD_PRE special_tokens: # 
save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/mistral/mixtral/mixtral.yml ================================================ base_model: mistralai/Mixtral-8x7B-v0.1 # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true load_in_8bit: false load_in_4bit: true datasets: - path: tatsu-lab/alpaca type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/qlora-out ## You can optionally freeze the entire model and unfreeze a subset of parameters unfrozen_parameters: # - ^lm_head.weight$ # - ^model.embed_tokens.weight$[:32000] # - model.layers.2[0-9]+.block_sparse_moe.gate # - model.layers.2[0-9]+.block_sparse_moe.experts # - model.layers.3[0-9]+.block_sparse_moe.gate # - model.layers.3[0-9]+.block_sparse_moe.experts model_config: output_router_logits: true adapter: qlora lora_model_dir: sequence_len: 4096 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true #lora_target_modules: # - gate # - q_proj # - k_proj # - v_proj # - o_proj # - w1 # - w2 # - w3 wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 deepspeed: deepspeed_configs/zero2.json weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: 
examples/mistral/mixtral/mixtral_22.yml ================================================ base_model: mistral-community/Mixtral-8x22B-v0.1 # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true unfrozen_parameters: - ^lm_head.weight$ - ^model.embed_tokens.weight$ - model.layers.4[4-9]+.block_sparse_moe.gate - model.layers.4[4-9]+.block_sparse_moe.experts - model.layers.5[0-5]+.block_sparse_moe.gate - model.layers.5[0-5]+.block_sparse_moe.experts model_config: output_router_logits: true datasets: - path: yahma/alpaca-cleaned type: alpaca output_dir: ./outputs/out sequence_len: 8000 sample_packing: true gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 3 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0001 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true save_total_limit: 1 save_steps: deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_all.json weight_decay: 0.0 special_tokens: eos_token: "<|im_end|>" tokens: - "<|im_start|>" # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/mistral/mps/lora-mps.yml ================================================ base_model: mistralai/Mistral-7B-v0.1 # optionally might have model_type or tokenizer_type model_type: MistralForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0 output_dir: ./outputs/lora-out eval_sample_packing: false adapter: lora lora_model_dir: sequence_len: 4096 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 
lora_target_linear: true lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 8 micro_batch_size: 1 num_epochs: 2 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 0.0002 bf16: auto fp16: false tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: false sdp_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/mistral/orpo/mistral-qlora-orpo.yml ================================================ base_model: mistralai/Mistral-7B-v0.1 # optionally might have model_type or tokenizer_type model_type: MistralForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true rl: orpo orpo_alpha: 0.1 remove_unused_columns: false chat_template: chatml datasets: - path: argilla/ultrafeedback-binarized-preferences-cleaned type: chat_template.argilla dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/mistral-qlora-orpo-out adapter: qlora lora_model_dir: sequence_len: 4096 sample_packing: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true 
loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/mistral/qlora.yml ================================================ base_model: mistralai/Mistral-7B-v0.1 # optionally might have model_type or tokenizer_type model_type: MistralForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/qlora-out adapter: qlora lora_model_dir: sequence_len: 8192 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/mistral-small/README.md ================================================ # Mistral Small 3.1/3.2 Fine-tuning This guide covers fine-tuning [Mistral Small 3.1](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503) and [Mistral Small 3.2](https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506) with vision capabilities using Axolotl.
## Prerequisites Before starting, ensure you have: - Installed Axolotl (see [Installation docs](https://docs.axolotl.ai/docs/installation.html)) ## Getting Started 1. Install the required vision lib: ```bash pip install 'mistral-common[opencv]==1.8.5' ``` 2. Download the example dataset image: ```bash wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg ``` 3. Run the fine-tuning: ```bash axolotl train examples/mistral-small/mistral-small-3.1-24B-lora.yml ``` This config uses about 29.4 GiB VRAM. ## Dataset Format The vision model requires multi-modal dataset format as documented [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format). One exception is that passing `"image": PIL.Image` is not supported. MistralTokenizer only supports `path`, `url`, and `base64` for now. Example: ```json { "messages": [ {"role": "system", "content": [{ "type": "text", "text": "{SYSTEM_PROMPT}"}]}, {"role": "user", "content": [ { "type": "text", "text": "What's in this image?"}, {"type": "image", "path": "path/to/image.jpg" } ]}, {"role": "assistant", "content": [{ "type": "text", "text": "..." }]} ] } ``` ## Limitations - Sample Packing is not supported for multi-modality training currently.
================================================ FILE: examples/mistral-small/mistral-small-3.1-24B-lora.yml ================================================ base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503 processor_type: AutoProcessor # Enable to use mistral-common tokenizer tokenizer_use_mistral_common: true load_in_8bit: true # these 3 lines are needed for now to handle vision chat templates w images skip_prepare_dataset: true remove_unused_columns: false sample_packing: false # sample dataset below requires downloading image in advance # wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg datasets: - path: Nanobit/text-vision-2k-test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.01 output_dir: ./outputs/out adapter: lora lora_model_dir: sequence_len: 2048 pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: true fp16: tf32: true gradient_checkpointing: true logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/mistral4/README.md ================================================ # Finetune Mistral Small 4 with Axolotl Mistral Small 4 is a 119B parameter (6.5B active) multimodal MoE model from MistralAI that unifies instruct, reasoning, and coding capabilities into a single model. It is available on HuggingFace at [Mistral-Small-4-119B-2603](https://huggingface.co/mistralai/Mistral-Small-4-119B-2603). 
Thanks to the team at MistralAI for giving us early access to prepare for this release. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage 3. Install transformers from main ```bash pip install git+https://github.com/huggingface/transformers.git ``` 4. Run one of the example configs: ```bash # text-only axolotl train examples/mistral4/qlora-text.yml # no experts ~69 GiB, experts ~93 GiB axolotl train examples/mistral4/fft-text.yml # text + vision # run: wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg axolotl train examples/mistral4/qlora-vision.yml # no experts ~68 GiB axolotl train examples/mistral4/fft-vision.yml ``` Note: FFT configs provided as reference. Please adjust hyperparameters as needed. ## Reasoning Effort The chat template supports a `reasoning_effort` variable to control the model's reasoning depth: - `"none"` — instruct mode (default) - `"high"` — reasoning mode with explicit thinking steps Pass it via `chat_template_kwargs` under your dataset config: ```yaml datasets: - path: your/dataset type: chat_template chat_template_kwargs: reasoning_effort: high ``` ## Thinking Support The chat template supports a `thinking` content type in assistant messages for training on reasoning traces (rendered as `[THINK]...[/THINK]` blocks). To use thinking datasets, add the `thinking` mapping via `message_property_mappings`: ```yaml datasets: - path: your/thinking-dataset type: chat_template message_property_mappings: role: role content: content thinking: thinking chat_template_kwargs: reasoning_effort: high ``` See the [Magistral thinking guide](../magistral/think/README.md) for dataset format details. ## Tips - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). 
- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). - The vision model requires multi-modal dataset format as documented [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format). ## Related Resources - [MistralAI Mistral Small 4 Blog](https://mistral.ai/news/mistral-small-4) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/mistral4/fft-text.yml ================================================ base_model: axolotl-ai-co/Mistral-Small-4-119B-2603-BF16 plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin - axolotl.integrations.kernels.KernelsPlugin use_kernels: true use_sonicmoe: true # only train language model layers, freeze vision tower unfrozen_parameters: - model.language_model.* - lm_head - embed_tokens datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.01 output_dir: ./outputs/out sequence_len: 2048 sample_packing: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 2e-5 bf16: true tf32: true logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 fsdp_version: 2 fsdp_config: offload_params: false cpu_ram_efficient_loading: false state_dict_type: FULL_STATE_DICT auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: Mistral4DecoderLayer reshard_after_forward: true activation_checkpointing: true ================================================ FILE: examples/mistral4/fft-vision.yml ================================================ base_model: 
axolotl-ai-co/Mistral-Small-4-119B-2603-BF16 processor_type: AutoProcessor plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin - axolotl.integrations.kernels.KernelsPlugin use_kernels: true use_sonicmoe: true # vision requirements skip_prepare_dataset: true remove_unused_columns: false sample_packing: false datasets: - path: Nanobit/text-vision-2k-test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.01 output_dir: ./outputs/out sequence_len: 2048 wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 2e-5 bf16: true tf32: true logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 fsdp_version: 2 fsdp_config: offload_params: false cpu_ram_efficient_loading: false state_dict_type: FULL_STATE_DICT auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: Mistral4DecoderLayer reshard_after_forward: true activation_checkpointing: true ================================================ FILE: examples/mistral4/qlora-text.yml ================================================ base_model: axolotl-ai-co/Mistral-Small-4-119B-2603-BF16 plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_4bit: true quantize_moe_experts: true datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.01 output_dir: ./outputs/out adapter: qlora sequence_len: 2048 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' # uncomment to train on expert layers # lora_target_parameters: # - mlp.experts.gate_up_proj # - mlp.experts.down_proj # lora_mlp_kernel: false # lora_qkv_kernel: false # lora_o_kernel: false wandb_project: 
wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: true tf32: true gradient_checkpointing: true logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 ================================================ FILE: examples/mistral4/qlora-vision.yml ================================================ base_model: axolotl-ai-co/Mistral-Small-4-119B-2603-BF16 processor_type: AutoProcessor plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_4bit: true quantize_moe_experts: true # vision chat template requirements skip_prepare_dataset: true remove_unused_columns: false sample_packing: false datasets: - path: Nanobit/text-vision-2k-test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.01 output_dir: ./outputs/out adapter: qlora sequence_len: 2048 lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' # uncomment to train on expert layers # lora_target_parameters: # - mlp.experts.gate_up_proj # - mlp.experts.down_proj # lora_mlp_kernel: false # lora_qkv_kernel: false # lora_o_kernel: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: true tf32: true gradient_checkpointing: true logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 ================================================ FILE: examples/nemotron/nemotron-mini-4b-qlora.yaml ================================================ base_model: nvidia/Nemotron-Mini-4B-Instruct load_in_8bit: false load_in_4bit: true datasets: - path: 
fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/nemotron-mini-4b-qlora adapter: qlora lora_model_dir: sequence_len: 4096 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_target_modules: - q_proj - k_proj - v_proj - o_proj - up_proj - down_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 special_tokens: ================================================ FILE: examples/olmo3/README.md ================================================ # Finetune Allenai's Olmo 3 with Axolotl [Olmo 3](https://huggingface.co/collections/allenai/olmo-3) is a family of 7B and 32B open source models trained by the Allen Institute for Artificial Intelligence. This guide shows how to fine-tune them with Axolotl using multi-turn conversations and proper masking. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage. 3. Run the finetuning example: ```bash axolotl train examples/olmo3/olmo3-7b-qlora.yaml ``` This uses about 11.3 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀 ### TIPS - The example config can be re-used for Olmo and Olmo 2. - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). ## Optimization Guides Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html). ## Related Resources - [Olmo 3 Blog](https://allenai.org/blog/olmo3) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/olmo3/olmo3-7b-qlora.yaml ================================================ base_model: allenai/Olmo-3-7B-Instruct-SFT # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: false load_in_4bit: true datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/lora-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/orpheus/README.md ================================================ # Finetuning LLMs to output audio In this example, we finetune 
canopylabs/orpheus-3b-0.1-pretrained (a LLaMA 3.2 3b model) to output audio. The `finetune.yml` with the current settings will run on any Nvidia GPU with 45GB VRAM or more. If you adjust the batch size it can easily run on any GPU under 24GB. ## Dataset pre-processing for pre-training If you are adding another voice in English, please jump ahead to finetuning pre-processing. For this to work, we need to preprocess our dataset. Since we are expecting to output audio, we will need to add tokens to the tokenizer. The code below downloads the SNAC model, adds the correct tokens, and uploads the final dataset. ```python import torch from snac import SNAC from datasets import load_dataset from huggingface_hub import snapshot_download from datasets import load_dataset import random import torchaudio.transforms as T from transformers import AutoTokenizer import os my_original_dataset_name = "" name_to_push_dataset_to = "" dsn = my_original_dataset_name snapshot_download( repo_id=dsn, repo_type="dataset", revision="main", max_workers=64, ) ds = load_dataset(dsn, split="train") ds_sample_rate = ds[0]["audio"]["sampling_rate"] model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz") model = model.to("mps") def tokenise_audio(waveform): waveform = torch.from_numpy(waveform).unsqueeze(0) waveform = waveform.to(dtype=torch.float32) resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000) waveform = resample_transform(waveform) waveform = waveform.unsqueeze(0).to("cuda") #generate the codes from snac with torch.inference_mode(): codes = model.encode(waveform) all_codes = [] for i in range(codes[0].shape[1]): all_codes.append(codes[0][0][i].item()+128266) all_codes.append(codes[1][0][2*i].item()+128266+4096) all_codes.append(codes[2][0][4*i].item()+128266+(2*4096)) all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096)) all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096)) all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096))
all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096)) return all_codes def add_codes(example): # Always initialize codes_list to None codes_list = None try: answer_audio = example.get("audio") # If there's a valid audio array, tokenise it if answer_audio and "array" in answer_audio: audio_array = answer_audio["array"] codes_list = tokenise_audio(audio_array) except Exception as e: print(f"Skipping row due to error: {e}") # Keep codes_list as None if we fail example["codes_list"] = codes_list return example ds = ds.map(add_codes, remove_columns=["audio"]) #@title Load Tokenizer tokeniser_length = 128256 start_of_text = 128000 end_of_text = 128009 start_of_speech = tokeniser_length + 1 end_of_speech = tokeniser_length + 2 start_of_human = tokeniser_length + 3 end_of_human = tokeniser_length + 4 start_of_ai = tokeniser_length + 5 end_of_ai = tokeniser_length + 6 pad_token = tokeniser_length + 7 audio_tokens_start = tokeniser_length + 10 tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained" tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) num_proc = os.cpu_count() - 2 ds = ds.filter(lambda x: x["codes_list"] is not None) ds = ds.filter(lambda x: len(x["codes_list"]) > 0) #@title Create Input Ids def remove_duplicate_frames(example): vals = example["codes_list"] if len(vals) % 7 != 0: raise ValueError("Input list length must be divisible by 7") result = vals[:7] removed_frames = 0 for i in range(7, len(vals), 7): current_first = vals[i] previous_first = result[-7] if current_first != previous_first: result.extend(vals[i:i+7]) else: removed_frames += 1 example["codes_list"] = result return example ds = ds.map(remove_duplicate_frames, num_proc=num_proc) def create_input_ids(example): text_ids = tokenizer.encode({example['text']}, add_special_tokens=True) text_ids.append(end_of_text) example["text_tokens"] = text_ids input_ids = ( [start_of_human] + example["text_tokens"] + [end_of_human] + [start_of_ai] + [start_of_speech] + example["codes_list"] + 
[end_of_speech] + [end_of_ai] ) example["input_ids"] = input_ids example["labels"] = input_ids example["attention_mask"] = [1] * len(input_ids) return example ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"]) #@title Remove unnecessary columns columns_to_keep = ["input_ids", "labels", "attention_mask"] columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep] ds = ds.remove_columns(columns_to_remove) ds.push_to_hub(name_to_push_dataset_to) ``` ## Finetune pre-processing Use this code to add a new voice. ```python import torch from snac import SNAC from datasets import load_dataset from huggingface_hub import snapshot_download from datasets import load_dataset import random import torchaudio.transforms as T from transformers import AutoTokenizer import os my_original_dataset_name = "" name_to_push_dataset_to = "" dsn = my_original_dataset_name snapshot_download( repo_id=dsn, repo_type="dataset", revision="main", max_workers=64, ) ds = load_dataset(dsn, split="train") ds_sample_rate = ds[0]["audio"]["sampling_rate"] model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz") model = model.to("mps") def tokenise_audio(waveform): waveform = torch.from_numpy(waveform).unsqueeze(0) waveform = waveform.to(dtype=torch.float32) resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000) waveform = resample_transform(waveform) waveform = waveform.unsqueeze(0).to("cuda") #generate the codes from snac with torch.inference_mode(): codes = model.encode(waveform) all_codes = [] for i in range(codes[0].shape[1]): all_codes.append(codes[0][0][i].item()+128266) all_codes.append(codes[1][0][2*i].item()+128266+4096) all_codes.append(codes[2][0][4*i].item()+128266+(2*4096)) all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096)) all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096)) all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096)) 
all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096)) return all_codes def add_codes(example): # Always initialize codes_list to None codes_list = None try: answer_audio = example.get("audio") # If there's a valid audio array, tokenise it if answer_audio and "array" in answer_audio: audio_array = answer_audio["array"] codes_list = tokenise_audio(audio_array) except Exception as e: print(f"Skipping row due to error: {e}") # Keep codes_list as None if we fail example["codes_list"] = codes_list return example ds = ds.map(add_codes, remove_columns=["audio"]) #@title Load Tokenizer tokeniser_length = 128256 start_of_text = 128000 end_of_text = 128009 start_of_speech = tokeniser_length + 1 end_of_speech = tokeniser_length + 2 start_of_human = tokeniser_length + 3 end_of_human = tokeniser_length + 4 start_of_ai = tokeniser_length + 5 end_of_ai = tokeniser_length + 6 pad_token = tokeniser_length + 7 audio_tokens_start = tokeniser_length + 10 tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained" tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) num_proc = os.cpu_count() - 2 ds = ds.filter(lambda x: x["codes_list"] is not None) ds = ds.filter(lambda x: len(x["codes_list"]) > 0) #@title Create Input Ids def remove_duplicate_frames(example): vals = example["codes_list"] if len(vals) % 7 != 0: raise ValueError("Input list length must be divisible by 7") result = vals[:7] removed_frames = 0 for i in range(7, len(vals), 7): current_first = vals[i] previous_first = result[-7] if current_first != previous_first: result.extend(vals[i:i+7]) else: removed_frames += 1 example["codes_list"] = result return example ds = ds.map(remove_duplicate_frames, num_proc=num_proc) tok_info = '''*** HERE you can modify the text prompt i.e. if you wanted a multispeaker model like canopylabs/orpheus-3b-0.1-ft, you can pass: f"{example["source"]}: {example["text"]}", as is passed. 
''' print(tok_info) def create_input_ids(example): text_ids = tokenizer.encode(f"{example['speaker_id']}: {example['text']}", add_special_tokens=True) text_ids.append(end_of_text) example["text_tokens"] = text_ids input_ids = ( [start_of_human] + example["text_tokens"] + [end_of_human] + [start_of_ai] + [start_of_speech] + example["codes_list"] + [end_of_speech] + [end_of_ai] ) example["input_ids"] = input_ids example["labels"] = input_ids example["attention_mask"] = [1] * len(input_ids) return example ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"]) #@title Remove unnecessary columns columns_to_keep = ["input_ids", "labels", "attention_mask"] columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep] ds = ds.remove_columns(columns_to_remove) ds.push_to_hub(name_to_push_dataset_to) ``` ## Training After preprocessing is done, fill out the blanks in finetune.yml and simply run `axolotl train finetune.yml` ## Inference For inference, please refer to the original [orpheus github](https://github.com/canopyai/Orpheus-TTS/tree/main). 
================================================ FILE: examples/orpheus/finetune.yml ================================================ base_model: canopylabs/orpheus-3b-0.1-pretrained hub_model_id: plugins: - axolotl.integrations.liger.LigerPlugin liger_rope: true liger_rms_norm: true liger_glu_activation: true liger_fused_linear_cross_entropy: true datasets: - path: type: # leave empty to load pre-tokenized dataset_prepared_path: last_run_prepared val_set_size: 0.01 output_dir: ./outputs/out sequence_len: 8192 sample_packing: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 8 micro_batch_size: 4 num_epochs: 3 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 2e-5 bf16: auto tf32: false gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 5 saves_per_epoch: 5 weight_decay: 0.05 special_tokens: pad_token: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/phi/README.md ================================================ # Phi Due to some nuances with the phi code, please use deepspeed when training phi for full finetune. 
```shell accelerate launch -m axolotl.cli.train examples/phi/phi-ft.yml --deepspeed deepspeed_configs/zero1.json # OR python -m axolotl.cli.train examples/phi/phi-qlora.yml ``` ================================================ FILE: examples/phi/lora-3.5.yaml ================================================ base_model: microsoft/Phi-3.5-mini-instruct # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: true load_in_4bit: false chat_template: phi_3 datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: false adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 4 num_epochs: 2 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bfloat16: true bf16: true fp16: tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 4 weight_decay: 0.0 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/phi/phi-ft.yml ================================================ base_model: microsoft/phi-1_5 # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name datasets: - path: garage-bAInd/Open-Platypus type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/phi-sft-out sequence_len: 2048 sample_packing: true adapter: lora_model_dir: lora_r: lora_alpha: 
lora_dropout: lora_target_linear: wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_torch_fused adam_beta2: 0.95 adam_epsilon: 0.00001 max_grad_norm: 1.0 lr_scheduler: cosine learning_rate: 0.000003 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: True resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 resize_token_embeddings_to_32x: true special_tokens: pad_token: "<|endoftext|>" # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/phi/phi-qlora.yml ================================================ base_model: microsoft/phi-1_5 # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true datasets: - path: garage-bAInd/Open-Platypus type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/phi-sft-out sequence_len: 2048 sample_packing: true adapter: qlora lora_model_dir: lora_r: 64 lora_alpha: 32 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_torch_fused adam_beta2: 0.95 adam_epsilon: 0.00001 max_grad_norm: 1.0 lr_scheduler: cosine learning_rate: 0.000003 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: True resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 resize_token_embeddings_to_32x: true special_tokens: pad_token: "<|endoftext|>" # 
save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/phi/phi2-ft.yml ================================================ base_model: microsoft/phi-2 # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name datasets: - path: garage-bAInd/Open-Platypus type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/phi-sft-out sequence_len: 2048 sample_packing: true adapter: lora_model_dir: lora_r: lora_alpha: lora_dropout: lora_target_linear: wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_torch_fused adam_beta2: 0.95 adam_epsilon: 0.00001 max_grad_norm: 1.0 lr_scheduler: cosine learning_rate: 0.000003 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: True resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 resize_token_embeddings_to_32x: true special_tokens: pad_token: "<|endoftext|>" # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/phi/phi3-ft-fsdp.yml ================================================ base_model: microsoft/Phi-3-mini-4k-instruct # optionally might have model_type or tokenizer_type model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0 output_dir: ./phi-sft-out sequence_len: 4096 sample_packing: true trust_remote_code: true adapter: 
lora_model_dir: lora_r: lora_alpha: lora_dropout: lora_target_linear: wandb_project: phi3 wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 12 num_epochs: 2 optimizer: adamw_torch_fused adam_beta2: 0.95 adam_epsilon: 0.00001 max_grad_norm: 1.0 lr_scheduler: cosine learning_rate: 0.000003 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 fsdp: - full_shard - auto_wrap fsdp_config: fsdp_limit_all_gathers: true fsdp_sync_module_states: true fsdp_offload_params: true fsdp_use_orig_params: false fsdp_cpu_ram_efficient_loading: true fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_transformer_layer_cls_to_wrap: Phi3DecoderLayer fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD resize_token_embeddings_to_32x: true special_tokens: pad_token: "<|endoftext|>" # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/phi/phi3-ft.yml ================================================ base_model: microsoft/Phi-3-mini-4k-instruct # optionally might have model_type or tokenizer_type trust_remote_code: true model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name chat_template: phi_3 datasets: - path: garage-bAInd/Open-Platypus type: alpaca:phi dataset_prepared_path: val_set_size: 0.01 output_dir: ./out sequence_len: 4096 sample_packing: true adapter: lora lora_model_dir: lora_r: 64 lora_alpha: 32 lora_dropout: 0.05 lora_target_linear: true gradient_accumulation_steps: 1 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_torch_fused adam_beta2: 0.95 adam_epsilon: 0.00001 max_grad_norm: 1.0 lr_scheduler: cosine 
learning_rate: 5.0e-6 bf16: auto gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: True early_stopping_patience: 3 logging_steps: 1 flash_attention: true eval_steps: 1000 save_steps: 5000 eval_batch_size: 2 eval_sample_packing: false eval_table_size: 2 eval_max_new_tokens: 32 eval_causal_lm_metrics: ["perplexity"] do_causal_lm_eval: true warmup_ratio: 0.2 debug: true weight_decay: 0.1 resize_token_embeddings_to_32x: true # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/pixtral/lora-12b.yml ================================================ base_model: mistral-community/pixtral-12b processor_type: AutoProcessor # these 3 lines are needed for now to handle vision chat templates w images skip_prepare_dataset: true remove_unused_columns: false sample_packing: false chat_template: pixtral datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] dataset_prepared_path: val_set_size: 0.0 output_dir: ./outputs/out adapter: lora lora_model_dir: sequence_len: 8192 pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: true fp16: tf32: true gradient_checkpointing: true logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/plano/README.md ================================================ # Finetune Katanemo's Plano-Orchestrator with Axolotl 
[Plano-Orchestrator](https://huggingface.co/collections/katanemo/plano-orchestrator) is a family of 4B and 30B-A3B routing and orchestration models designed for multi-agent systems. It analyzes user intent and conversation context to make precise routing decisions, excelling at multi-turn context understanding, multi-intent detection, and context-dependent routing. This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage. 3. Run the finetuning example: ```bash axolotl train examples/plano/plano-4b-qlora.yaml ``` This config uses about 5.1 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀 ### Orchestration Prompt Plano-Orchestrator uses a specific orchestration prompt format for routing/agent decisions. Please check the [official model card](https://huggingface.co/katanemo/Plano-Orchestrator-4B) for proper prompt formatting and the `ORCHESTRATION_PROMPT` template. ### Tips - To use the larger [Plano-Orchestrator-30B-A3B](https://huggingface.co/katanemo/Plano-Orchestrator-30B-A3B) MoE model, simply change `base_model: katanemo/Plano-Orchestrator-30B-A3B` in the config and enable multi-GPU training if needed. - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). - The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). ## Optimization Guides Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html). 
## Related Resources - [Plano GitHub](https://github.com/katanemo/plano) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/plano/plano-4b-qlora.yaml ================================================ base_model: katanemo/Plano-Orchestrator-4B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: false load_in_4bit: true chat_template: qwen3 datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/lora-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/qat_nvfp4/Gemma3-12B_baseline.yml ================================================ base_model: google/gemma-3-12b-it # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: false strict: false plugins: - axolotl.integrations.liger.LigerPlugin liger_rope: true liger_rms_norm: true liger_glu_activation: true 
liger_layer_norm: true liger_fused_linear_cross_entropy: true seed: 42 chat_template: gemma3 datasets: - path: tatsu-lab/alpaca type: alpaca output_dir: ./outputs/out_gemma/ sequence_len: 8096 sample_packing: true flash_attention: true wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 16 num_epochs: 1 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 4e-5 bf16: true tf32: true resume_from_checkpoint: logging_steps: 1 # evals_per_epoch: 1 saves_per_epoch: 1 warmup_ratio: 0.1 weight_decay: 0.0 fsdp_version: 2 fsdp_config: offload_params: false cpu_ram_efficient_loading: true auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: Gemma3DecoderLayer state_dict_type: FULL_STATE_DICT sharding_strategy: FULL_SHARD reshard_after_forward: true activation_checkpointing: true special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/qat_nvfp4/Gemma3-12B_qat.yml ================================================ base_model: google/gemma-3-12b-it # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: false strict: false plugins: - axolotl.integrations.liger.LigerPlugin liger_rope: true liger_rms_norm: true liger_glu_activation: true liger_layer_norm: true liger_fused_linear_cross_entropy: true seed: 42 chat_template: gemma3 datasets: - path: tatsu-lab/alpaca type: alpaca output_dir: ./outputs/qat_out_gemma/ sequence_len: 8096 sample_packing: true flash_attention: true qat: activation_dtype: nvfp4 weight_dtype: nvfp4 group_size: 16 # only group_size of 16 is supported with nvfp4 wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 16 num_epochs: 1 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 4e-5 bf16: true tf32: true 
resume_from_checkpoint: logging_steps: 1 evals_per_epoch: 1 saves_per_epoch: 1 warmup_ratio: 0.1 weight_decay: 0.0 fsdp_version: 2 fsdp_config: offload_params: false cpu_ram_efficient_loading: true auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: Gemma3DecoderLayer state_dict_type: FULL_STATE_DICT sharding_strategy: FULL_SHARD reshard_after_forward: true activation_checkpointing: true special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/qat_nvfp4/Math-Gemma3-12B_baseline.yml ================================================ base_model: google/gemma-3-12b-it # Math finetuning configuration for Gemma3-12B # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: false strict: false plugins: - axolotl.integrations.liger.LigerPlugin liger_rope: true liger_rms_norm: true liger_glu_activation: true liger_layer_norm: true liger_fused_linear_cross_entropy: true seed: 42 chat_template: gemma3 datasets: - path: AI-MO/NuminaMath-CoT type: chat_template output_dir: ./outputs/out_math_gemma/ sequence_len: 4096 sample_packing: true flash_attention: true wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 8 num_epochs: 1 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 3e-5 bf16: true tf32: true resume_from_checkpoint: logging_steps: 1 # evals_per_epoch: 1 saves_per_epoch: 1 warmup_ratio: 0.1 weight_decay: 0.0 fsdp_version: 2 fsdp_config: offload_params: false cpu_ram_efficient_loading: true auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: Gemma3DecoderLayer state_dict_type: FULL_STATE_DICT sharding_strategy: FULL_SHARD reshard_after_forward: true activation_checkpointing: true special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config 
================================================ FILE: examples/qat_nvfp4/Math-Gemma3-12B_qat.yml ================================================ base_model: google/gemma-3-12b-it # Math finetuning configuration for Gemma3-12B # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: false strict: false plugins: - axolotl.integrations.liger.LigerPlugin liger_rope: true liger_rms_norm: true liger_glu_activation: true liger_layer_norm: true liger_fused_linear_cross_entropy: true seed: 42 chat_template: gemma3 datasets: - path: AI-MO/NuminaMath-CoT type: chat_template output_dir: ./outputs/qat_out_math_gemma/ sequence_len: 4096 sample_packing: true flash_attention: true qat: activation_dtype: nvfp4 weight_dtype: nvfp4 group_size: 16 # only group_size of 16 is supported with nvfp4 wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 8 num_epochs: 1 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 3e-5 bf16: true tf32: true resume_from_checkpoint: logging_steps: 1 # evals_per_epoch: 1 saves_per_epoch: 1 warmup_ratio: 0.1 weight_decay: 0.0 fsdp_version: 2 fsdp_config: offload_params: false cpu_ram_efficient_loading: true auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: Gemma3DecoderLayer state_dict_type: FULL_STATE_DICT sharding_strategy: FULL_SHARD reshard_after_forward: true activation_checkpointing: true special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/qat_nvfp4/Math-Gemma3-27B_baseline.yml ================================================ base_model: google/gemma-3-27b-it # Math finetuning configuration for Gemma3-27B # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: false strict: false plugins: - axolotl.integrations.liger.LigerPlugin liger_rope: true liger_rms_norm: true liger_glu_activation: true 
liger_layer_norm: true liger_fused_linear_cross_entropy: true seed: 42 chat_template: gemma3 datasets: - path: AI-MO/NuminaMath-CoT type: chat_template output_dir: ./outputs/out_math_gemma27/ sequence_len: 4096 sample_packing: true flash_attention: true wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 16 num_epochs: 1 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 5e-6 eta_min: 7e-7 bf16: true tf32: true resume_from_checkpoint: logging_steps: 1 # evals_per_epoch: 1 saves_per_epoch: 1 warmup_ratio: 0.1 weight_decay: 0.0 fsdp_version: 2 fsdp_config: offload_params: false cpu_ram_efficient_loading: true auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: Gemma3DecoderLayer state_dict_type: FULL_STATE_DICT sharding_strategy: FULL_SHARD reshard_after_forward: true activation_checkpointing: true special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/qat_nvfp4/Math-Gemma3-27B_qat.yml ================================================ base_model: google/gemma-3-27b-it # Math finetuning configuration for Gemma3-27B # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: false strict: false plugins: - axolotl.integrations.liger.LigerPlugin liger_rope: true liger_rms_norm: true liger_glu_activation: true liger_layer_norm: true liger_fused_linear_cross_entropy: true seed: 42 chat_template: gemma3 datasets: - path: AI-MO/NuminaMath-CoT type: chat_template output_dir: ./outputs/qat_out_math_gemma27/ sequence_len: 4096 sample_packing: true flash_attention: true qat: activation_dtype: nvfp4 weight_dtype: nvfp4 group_size: 16 # only group_size of 16 is supported with nvfp4 wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 16 num_epochs: 1 optimizer: adamw_torch_fused lr_scheduler: cosine 
learning_rate: 5e-6 eta_min: 7e-7 bf16: true tf32: true resume_from_checkpoint: logging_steps: 1 # evals_per_epoch: 1 saves_per_epoch: 1 warmup_ratio: 0.1 weight_decay: 0.0 fsdp_version: 2 fsdp_config: offload_params: false cpu_ram_efficient_loading: true auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: Gemma3DecoderLayer state_dict_type: FULL_STATE_DICT sharding_strategy: FULL_SHARD reshard_after_forward: true activation_checkpointing: true special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/qat_nvfp4/Math-Qwen2.5-72B_baseline.yml ================================================ base_model: Qwen/Qwen2.5-72B # Math finetuning configuration for Qwen2.5-72B (non-instruct) # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: false strict: false plugins: - axolotl.integrations.liger.LigerPlugin liger_rope: true liger_rms_norm: true liger_glu_activation: true liger_layer_norm: true liger_fused_linear_cross_entropy: true seed: 42 chat_template: qwen_25 datasets: - path: AI-MO/NuminaMath-CoT type: chat_template output_dir: ./outputs/out_math_72b/ sequence_len: 4096 sample_packing: true flash_attention: true wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 8 num_epochs: 1 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 5e-6 eta_min: 7e-7 bf16: true tf32: true resume_from_checkpoint: logging_steps: 1 # evals_per_epoch: 1 saves_per_epoch: 1 warmup_ratio: 0.1 weight_decay: 0.0 fsdp_version: 2 fsdp_config: offload_params: false cpu_ram_efficient_loading: true auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: Qwen2DecoderLayer state_dict_type: FULL_STATE_DICT sharding_strategy: FULL_SHARD reshard_after_forward: true activation_checkpointing: true special_tokens: # save_first_step: true # uncomment this to validate 
checkpoint saving works with your config ================================================ FILE: examples/qat_nvfp4/Math-Qwen2.5-72B_qat.yml ================================================ base_model: Qwen/Qwen2.5-72B # Math finetuning configuration for Qwen2.5-72B (non-instruct) # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: false strict: false plugins: - axolotl.integrations.liger.LigerPlugin liger_rope: true liger_rms_norm: true liger_glu_activation: true liger_layer_norm: true liger_fused_linear_cross_entropy: true seed: 42 chat_template: qwen_25 datasets: - path: AI-MO/NuminaMath-CoT type: chat_template output_dir: ./outputs/qat_out_math_72b/ sequence_len: 4096 sample_packing: true flash_attention: true qat: activation_dtype: nvfp4 weight_dtype: nvfp4 group_size: 16 # only group_size of 16 is supported with nvfp4 wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 8 num_epochs: 1 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 5e-6 eta_min: 7e-7 bf16: true tf32: true resume_from_checkpoint: logging_steps: 1 # evals_per_epoch: 1 saves_per_epoch: 1 warmup_ratio: 0.1 weight_decay: 0.0 fsdp_version: 2 fsdp_config: offload_params: false cpu_ram_efficient_loading: true auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: Qwen2DecoderLayer state_dict_type: FULL_STATE_DICT sharding_strategy: FULL_SHARD reshard_after_forward: true activation_checkpointing: true special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/qat_nvfp4/Qwen2.5-72B_baseline.yml ================================================ base_model: Qwen/Qwen2.5-72B # Alpaca finetuning configuration for Qwen2.5-72B # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: false strict: false plugins: - axolotl.integrations.liger.LigerPlugin liger_rope: true 
liger_rms_norm: true liger_glu_activation: true liger_layer_norm: true liger_fused_linear_cross_entropy: true seed: 42 chat_template: qwen_25 datasets: - path: tatsu-lab/alpaca type: alpaca output_dir: ./outputs/out_qwen72b/ sequence_len: 8096 sample_packing: true flash_attention: true wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 16 num_epochs: 1 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 2e-5 bf16: true tf32: true resume_from_checkpoint: logging_steps: 1 # evals_per_epoch: 1 saves_per_epoch: 1 warmup_ratio: 0.1 weight_decay: 0.0 fsdp_version: 2 fsdp_config: offload_params: false cpu_ram_efficient_loading: true auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: Qwen2DecoderLayer state_dict_type: FULL_STATE_DICT sharding_strategy: FULL_SHARD reshard_after_forward: true activation_checkpointing: true special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/qat_nvfp4/Qwen2.5-72B_qat.yml ================================================ base_model: Qwen/Qwen2.5-72B # Alpaca finetuning configuration for Qwen2.5-72B # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: false strict: false plugins: - axolotl.integrations.liger.LigerPlugin liger_rope: true liger_rms_norm: true liger_glu_activation: true liger_layer_norm: true liger_fused_linear_cross_entropy: true seed: 42 chat_template: qwen_25 datasets: - path: tatsu-lab/alpaca type: alpaca output_dir: ./outputs/qat_out_qwen72b/ sequence_len: 8096 sample_packing: true flash_attention: true qat: activation_dtype: nvfp4 weight_dtype: nvfp4 group_size: 16 # only group_size of 16 is supported with nvfp4 wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 16 num_epochs: 1 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 
2e-5 bf16: true tf32: true resume_from_checkpoint: logging_steps: 1 # evals_per_epoch: 1 saves_per_epoch: 1 warmup_ratio: 0.1 weight_decay: 0.0 fsdp_version: 2 fsdp_config: offload_params: false cpu_ram_efficient_loading: true auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: Qwen2DecoderLayer state_dict_type: FULL_STATE_DICT sharding_strategy: FULL_SHARD reshard_after_forward: true activation_checkpointing: true special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/qwen2/adamw-pretrain-fsdp2.yaml ================================================ base_model: Qwen/Qwen2.5-0.5B model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Use random initialization for fair comparison reinit_weights: true load_in_8bit: false load_in_4bit: false strict: false # Pretraining dataset pretraining_dataset: - path: allenai/c4 name: en type: pretrain split: train dataset_prepared_path: val_set_size: 0.0 output_dir: ./outputs/compare-adamw-pretrain sequence_len: 2048 sample_packing: true pad_to_sequence_len: true wandb_project: dist_muon wandb_entity: wandb_watch: wandb_name: adamw wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 4 num_epochs: 1 max_steps: 305 # AdamW optimizer settings (standard LR for AdamW) optimizer: adamw_torch_fused learning_rate: 0.0002 weight_decay: 0.01 lr_scheduler: cosine train_on_inputs: true group_by_length: false bf16: auto fp16: false tf32: false gradient_checkpointing: false logging_steps: 1 flash_attention: true warmup_steps: 10 evals_per_epoch: 0 saves_per_epoch: 1 # Reproducibility seed: 42 fsdp_config: fsdp_version: 2 fsdp_offload_params: false fsdp_state_dict_type: FULL_STATE_DICT fsdp_transformer_layer_cls_to_wrap: Qwen2DecoderLayer fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_cpu_ram_efficient_loading: false fsdp_reshard_after_forward: true special_tokens: 
================================================ FILE: examples/qwen2/dpo.yaml ================================================ base_model: Qwen/Qwen2.5-0.5B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name chat_template: qwen_25 rl: dpo datasets: - path: fozziethebeat/alpaca_messages_2k_dpo_test type: chat_template.default field_messages: conversation field_chosen: chosen field_rejected: rejected message_property_mappings: role: role content: content roles: system: - system user: - user assistant: - assistant dataset_prepared_path: val_set_size: 0.0 output_dir: ./outputs/dpo-out sequence_len: 2048 sample_packing: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/qwen2/muon-pretrain-fsdp2.yaml ================================================ base_model: Qwen/Qwen2.5-0.5B model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Use random initialization for fair comparison reinit_weights: true load_in_8bit: false load_in_4bit: false strict: false # Pretraining dataset pretraining_dataset: - path: allenai/c4 name: en type: pretrain split: train dataset_prepared_path: val_set_size: 0.0 output_dir: ./outputs/compare-muon-pretrain sequence_len: 2048 sample_packing: true pad_to_sequence_len: true wandb_project: dist_muon wandb_entity: wandb_watch: wandb_name: muon wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 4 num_epochs: 1 max_steps: 305 # Muon optimizer settings optimizer: muon learning_rate: 
0.02 weight_decay: 0.01 lr_scheduler: cosine train_on_inputs: true group_by_length: false bf16: auto fp16: false tf32: false gradient_checkpointing: false logging_steps: 1 flash_attention: true warmup_steps: 10 evals_per_epoch: 0 saves_per_epoch: 1 # Reproducibility seed: 42 fsdp_config: fsdp_version: 2 fsdp_offload_params: false fsdp_state_dict_type: FULL_STATE_DICT fsdp_transformer_layer_cls_to_wrap: Qwen2DecoderLayer fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_cpu_ram_efficient_loading: false fsdp_reshard_after_forward: true special_tokens: ================================================ FILE: examples/qwen2/prm.yaml ================================================ base_model: Qwen/Qwen2.5-3B # optionally might have model_type or tokenizer_type model_type: AutoModelForTokenClassification num_labels: 2 tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name process_reward_model: true chat_template: datasets: - path: trl-lib/math_shepherd type: stepwise_supervised step_separator: "\n" max_completion_length: train_on_last_step_only: false val_set_size: 0.2 output_dir: ./outputs/out remove_unused_columns: false sequence_len: 2048 sample_packing: false eval_sample_packing: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 8 eval_batch_size: 8 num_epochs: 1 optimizer: adamw_torch lr_scheduler: cosine learning_rate: 0.0002 bf16: true fp16: tf32: gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: eval_steps: 100 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/qwen2/qlora-fsdp.yaml ================================================ 
base_model: Qwen/Qwen2-7B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name trust_remote_code: true load_in_8bit: false load_in_4bit: true datasets: - path: tatsu-lab/alpaca type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/out sequence_len: 2048 sample_packing: true eval_sample_packing: true adapter: qlora lora_model_dir: lora_r: 32 lora_alpha: 64 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 fsdp: - full_shard - auto_wrap fsdp_config: fsdp_limit_all_gathers: true fsdp_sync_module_states: true fsdp_offload_params: true fsdp_use_orig_params: false fsdp_cpu_ram_efficient_loading: true fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_transformer_layer_cls_to_wrap: Qwen2DecoderLayer fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/qwen2/reward-model.yaml ================================================ base_model: Qwen/Qwen2.5-0.5B # optionally might have model_type or tokenizer_type model_type: AutoModelForSequenceClassification num_labels: 1 tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name reward_model: true chat_template: qwen_25 datasets: - path: argilla/distilabel-intel-orca-dpo-pairs type: bradley_terry.chat_template val_set_size: 0.0 output_dir: ./outputs/out 
remove_unused_columns: false sequence_len: 2048 sample_packing: false eval_sample_packing: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: true fp16: tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/qwen2-vl/lora-7b.yaml ================================================ base_model: Qwen/Qwen2-VL-7B-Instruct processor_type: AutoProcessor # these 3 lines are needed for now to handle vision chat templates w images skip_prepare_dataset: true remove_unused_columns: false sample_packing: false chat_template: qwen2_vl datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out adapter: lora lora_model_dir: sequence_len: 8192 pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: true fp16: tf32: true gradient_checkpointing: true logging_steps: 1 flash_attention: true eager_attention: warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: 
examples/qwen2_5-vl/lora-7b.yaml ================================================ base_model: Qwen/Qwen2.5-VL-7B-Instruct processor_type: AutoProcessor # these 3 lines are needed for now to handle vision chat templates w images skip_prepare_dataset: true remove_unused_columns: false sample_packing: false chat_template: qwen2_vl datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out adapter: lora lora_model_dir: sequence_len: 8192 pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: true fp16: tf32: true gradient_checkpointing: true logging_steps: 1 flash_attention: true eager_attention: warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/qwen3/32b-qlora.yaml ================================================ base_model: Qwen/Qwen3-32B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin strict: false chat_template: qwen3 datasets: - path: mlabonne/FineTome-100k type: chat_template split: train[:20%] field_messages: conversations message_property_mappings: role: from content: value val_set_size: 0.0 output_dir: ./outputs/out dataset_prepared_path: last_run_prepared sequence_len: 2048 sample_packing: true eval_sample_packing: true load_in_4bit: true adapter: qlora lora_r: 16 lora_alpha: 32 lora_target_modules: - q_proj 
- k_proj - v_proj - o_proj - down_proj - up_proj lora_mlp_kernel: true lora_qkv_kernel: true lora_o_kernel: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_4bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: offload gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/qwen3/8b-qat-fsdp2.yml ================================================ base_model: Qwen/Qwen3-8B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: false strict: false plugins: - axolotl.integrations.liger.LigerPlugin liger_rope: true liger_rms_norm: true liger_glu_activation: true liger_layer_norm: true liger_fused_linear_cross_entropy: true datasets: - path: tatsu-lab/alpaca type: alpaca output_dir: ./outputs/qat_out/ sequence_len: 2048 sample_packing: true flex_attention: true flex_attn_compile_kwargs: dynamic: false mode: max-autotune-no-cudagraphs qat: activation_dtype: int8 weight_dtype: int4 group_size: 256 fake_quant_after_n_steps: 1000 wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 2 max_steps: 2000 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 2e-5 bf16: true tf32: true resume_from_checkpoint: logging_steps: 1 evals_per_epoch: 1 saves_per_epoch: 1 warmup_ratio: 0.1 weight_decay: 0.0 fsdp: - full_shard - auto_wrap fsdp_config: fsdp_version: 2 fsdp_offload_params: false fsdp_cpu_ram_efficient_loading: true fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 
fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD fsdp_reshard_after_forward: true fsdp_activation_checkpointing: true special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/qwen3/README.md ================================================ # Finetune Qwen3 with Axolotl [Qwen3](https://huggingface.co/collections/Qwen/qwen3) is a family of open-source models trained by Alibaba. This guide shows how to fine-tune them with Axolotl with multi-turn conversations and proper masking. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage. 3. Run the finetuning example: ```bash axolotl train examples/qwen3/32b-qlora.yaml ``` Let us know how it goes. Happy finetuning! 🚀 ### Chat template masking a few tokens off If you notice that the `chat_template` masking for assistant prompts is off by a few tokens, please ensure that you add the following to your YAML config. ```yaml chat_template: qwen3 ``` ### TIPS - For inference, please check the official model card, as the recommended settings depend on your reasoning mode. - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). - The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). ## Optimization Guides Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
## Related Resources - [Qwen3 Blog](https://qwenlm.github.io/blog/qwen3/) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/qwen3/qlora-fsdp.yaml ================================================ base_model: Qwen/Qwen3-8B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name load_in_8bit: false load_in_4bit: true strict: false datasets: - path: tatsu-lab/alpaca type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/out sequence_len: 2048 sample_packing: true eval_sample_packing: true adapter: qlora lora_model_dir: lora_r: 32 lora_alpha: 64 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 fsdp: - full_shard - auto_wrap fsdp_config: fsdp_limit_all_gathers: true fsdp_sync_module_states: true fsdp_offload_params: true fsdp_use_orig_params: false fsdp_cpu_ram_efficient_loading: true fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/qwen3/reward-model.yaml ================================================ base_model: Skywork/Skywork-Reward-V2-Qwen3-8B model_type: 
AutoModelForSequenceClassification num_labels: 1 reward_model: true center_rewards_coefficient: 0.01 # Incentivize mean-zero rewards for improved stability chat_template: qwen3 datasets: - path: argilla/distilabel-intel-orca-dpo-pairs type: bradley_terry.chat_template val_set_size: 0.0 output_dir: ./outputs/out sequence_len: 8192 sample_packing: false eval_sample_packing: false pad_to_sequence_len: true deepspeed: deepspeed_configs/zero1.json wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 eval_batch_size: 1 num_epochs: 3 optimizer: adamw_bnb_8bit lr_scheduler: linear learning_rate: 0.00002 bf16: true tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false warmup_ratio: 0.1 logging_steps: 1 weight_decay: 0.01 ================================================ FILE: examples/qwen3-next/README.md ================================================ # Finetune Qwen3-Next with Axolotl [Qwen3-Next](https://huggingface.co/collections/Qwen/qwen3-next-68c25fd6838e585db8eeea9d) represents the next-generation foundation models optimized for extreme context length and large-scale parameter efficiency. The series introduces architectural innovations including Hybrid Attention (Gated DeltaNet + Gated Attention), High-Sparsity MoE with 1:50 activation ratio, and Multi-Token Prediction for enhanced performance and inference acceleration. This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage. 3. Install FLA for improved performance ```bash pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1 ``` 4. 
Run the finetuning example: ```bash axolotl train examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml ``` This config uses about 47 GiB (no target experts) and 71 GiB (target experts) of VRAM. Let us know how it goes. Happy finetuning! 🚀 ### TIPS - For inference, you can experiment with `temperature: 0.7`, `top_p: 0.8`, `top_k: 20`, and `min_p: 0`. - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. See the [Multi-GPU](#optimization-guides) section below. - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). - The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). ## Optimization Guides - [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) - [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) - [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html) ## Related Resources - [Qwen3-Next Blog](https://qwenlm.github.io/blog/qwen3_next/) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml ================================================ base_model: Qwen/Qwen3-Next-80B-A3B-Instruct # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: false load_in_4bit: true quantize_moe_experts: true datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/lora-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 16 lora_alpha: 8 lora_dropout: 0
lora_target_modules: - linear_attn.in_proj_ba - linear_attn.in_proj_qkvz - linear_attn.out_proj - shared_expert.up_proj - shared_expert.down_proj - shared_expert.gate_proj - shared_expert_gate - q_proj - v_proj - k_proj - o_proj # lora_target_parameters: # - mlp.experts.gate_up_proj # - mlp.experts.down_proj lora_mlp_kernel: false lora_qkv_kernel: false lora_o_kernel: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml ================================================ base_model: Qwen/Qwen3.5-122B-A10B plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin strict: false chat_template: qwen3_5 datasets: - path: mlabonne/FineTome-100k type: chat_template split: train[:20%] field_messages: conversations message_property_mappings: role: from content: value val_set_size: 0.0 output_dir: ./outputs/out dataset_prepared_path: last_run_prepared sequence_len: 2048 sample_packing: true load_in_4bit: true quantize_moe_experts: true adapter: qlora lora_r: 16 lora_alpha: 32 lora_dropout: 0 lora_target_modules: - q_proj - k_proj - v_proj - o_proj # Regex matching to target shared experts too # lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj' # Target experts # lora_target_parameters: # - mlp.experts.gate_up_proj # - mlp.experts.down_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 1 
num_epochs: 1 optimizer: adamw_torch_4bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true lora_mlp_kernel: false lora_qkv_kernel: false lora_o_kernel: false gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: fsdp_config: fsdp_version: 2 offload_params: true cpu_ram_efficient_loading: false auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: Qwen3_5MoeDecoderLayer state_dict_type: FULL_STATE_DICT sharding_strategy: FULL_SHARD reshard_after_forward: true activation_checkpointing: true ================================================ FILE: examples/qwen3.5/122b-a10b-moe-qlora.yaml ================================================ base_model: Qwen/Qwen3.5-122B-A10B plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin strict: false chat_template: qwen3_5 datasets: - path: mlabonne/FineTome-100k type: chat_template split: train[:20%] field_messages: conversations message_property_mappings: role: from content: value val_set_size: 0.0 output_dir: ./outputs/out dataset_prepared_path: last_run_prepared sequence_len: 2048 sample_packing: true load_in_4bit: true quantize_moe_experts: true adapter: qlora lora_r: 16 lora_alpha: 32 lora_dropout: 0 lora_target_modules: - q_proj - k_proj - v_proj - o_proj # Regex matching to target shared experts too # lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj' # Target experts # lora_target_parameters: # - mlp.experts.gate_up_proj # - mlp.experts.down_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_4bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true lora_mlp_kernel: false lora_qkv_kernel: 
false lora_o_kernel: false gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: ================================================ FILE: examples/qwen3.5/27b-fft.yaml ================================================ base_model: Qwen/Qwen3.5-27B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name # Full fine-tune (FFT) of the text-only path of Qwen3.5-27B. plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin strict: false chat_template: qwen3_5 datasets: - path: mlabonne/FineTome-100k type: chat_template split: train[:20%] field_messages: conversations message_property_mappings: role: from content: value val_set_size: 0.0 output_dir: ./outputs/out dataset_prepared_path: last_run_prepared sequence_len: 2048 sample_packing: true # Freeze vision encoder unfrozen_parameters: - model\.language_model\..* - lm_head\..* wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: ================================================ FILE: examples/qwen3.5/27b-qlora-fsdp.yaml ================================================ base_model: Qwen/Qwen3.5-27B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin strict: false chat_template: qwen3_5 datasets: - path: mlabonne/FineTome-100k type: chat_template split: train[:20%] 
field_messages: conversations message_property_mappings: role: from content: value val_set_size: 0.0 output_dir: ./outputs/out dataset_prepared_path: last_run_prepared sequence_len: 2048 sample_packing: true load_in_4bit: true adapter: qlora lora_r: 16 lora_alpha: 32 lora_target_modules: - q_proj - k_proj - v_proj - o_proj - down_proj - up_proj # Uncomment below to also target the linear attention projections. # These use separate in_proj_qkv / in_proj_z / out_proj (Qwen3.5-specific). # - linear_attn.in_proj_qkv # - linear_attn.in_proj_z # - linear_attn.out_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_4bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: fsdp_config: fsdp_version: 2 offload_params: false cpu_ram_efficient_loading: false auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: Qwen3_5DecoderLayer state_dict_type: FULL_STATE_DICT sharding_strategy: FULL_SHARD reshard_after_forward: true activation_checkpointing: true ================================================ FILE: examples/qwen3.5/27b-qlora.yaml ================================================ base_model: Qwen/Qwen3.5-27B # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin strict: false chat_template: qwen3_5 datasets: - path: mlabonne/FineTome-100k type: chat_template split: train[:20%] field_messages: conversations message_property_mappings: role: from content: value val_set_size: 0.0 output_dir: ./outputs/out dataset_prepared_path: last_run_prepared sequence_len: 2048 sample_packing: true 
load_in_4bit: true adapter: qlora lora_r: 16 lora_alpha: 32 lora_target_modules: - q_proj - k_proj - v_proj - o_proj - down_proj - up_proj # Uncomment below to also target the linear attention projections. # These use separate in_proj_qkv / in_proj_z / out_proj (Qwen3.5-specific). # - linear_attn.in_proj_qkv # - linear_attn.in_proj_z # - linear_attn.out_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_4bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: ================================================ FILE: examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml ================================================ base_model: Qwen/Qwen3.5-35B-A3B plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin strict: false chat_template: qwen3_5 datasets: - path: mlabonne/FineTome-100k type: chat_template split: train[:20%] field_messages: conversations message_property_mappings: role: from content: value val_set_size: 0.0 output_dir: ./outputs/out dataset_prepared_path: last_run_prepared sequence_len: 2048 sample_packing: true load_in_4bit: true quantize_moe_experts: true adapter: qlora lora_r: 16 lora_alpha: 32 lora_dropout: 0 lora_target_modules: - q_proj - k_proj - v_proj - o_proj # Regex matching to target shared experts too # lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj' # Target experts # lora_target_parameters: # - mlp.experts.gate_up_proj # - mlp.experts.down_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 1 num_epochs: 1 optimizer: 
adamw_torch_4bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true lora_mlp_kernel: false lora_qkv_kernel: false lora_o_kernel: false gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: fsdp_config: fsdp_version: 2 offload_params: true cpu_ram_efficient_loading: false auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: Qwen3_5MoeDecoderLayer state_dict_type: FULL_STATE_DICT sharding_strategy: FULL_SHARD reshard_after_forward: true activation_checkpointing: true ================================================ FILE: examples/qwen3.5/35b-a3b-moe-qlora.yaml ================================================ base_model: Qwen/Qwen3.5-35B-A3B plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin strict: false chat_template: qwen3_5 datasets: - path: mlabonne/FineTome-100k type: chat_template split: train[:20%] field_messages: conversations message_property_mappings: role: from content: value val_set_size: 0.0 output_dir: ./outputs/out dataset_prepared_path: last_run_prepared sequence_len: 2048 sample_packing: true load_in_4bit: true quantize_moe_experts: true adapter: qlora lora_r: 16 lora_alpha: 32 lora_dropout: 0 lora_target_modules: - q_proj - k_proj - v_proj - o_proj # Regex matching to target shared experts too # lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj' # Target experts # lora_target_parameters: # - mlp.experts.gate_up_proj # - mlp.experts.down_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_torch_4bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true lora_mlp_kernel: false lora_qkv_kernel: false lora_o_kernel: false 
gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: ================================================ FILE: examples/qwen3.5/9b-fft-vision.yaml ================================================ base_model: Qwen/Qwen3.5-9B processor_type: AutoProcessor # Required for multimodal training skip_prepare_dataset: true remove_unused_columns: false sample_packing: false chat_template: qwen3_5 datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out sequence_len: 4096 pad_to_sequence_len: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: ================================================ FILE: examples/qwen3.5/9b-lora-vision.yaml ================================================ base_model: Qwen/Qwen3.5-9B processor_type: AutoProcessor # These 3 lines are required for vision/multimodal training skip_prepare_dataset: true remove_unused_columns: false sample_packing: false chat_template: qwen3_5 datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out adapter: lora lora_model_dir: sequence_len: 8192 pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 # Targets the language model attention and MLP layers. 
lora_target_modules: - q_proj - k_proj - v_proj - o_proj - down_proj - up_proj # Uncomment to also target the linear attention (GatedDeltaNet) projections: # - linear_attn.in_proj_qkv # - linear_attn.in_proj_z # - linear_attn.out_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: true tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 ================================================ FILE: examples/qwen3.5/README.md ================================================ # Finetune Qwen3.5 with Axolotl [Qwen3.5](https://huggingface.co/collections/Qwen/qwen35) is a hybrid architecture model series combining Gated DeltaNet linear attention with standard Transformer attention. All Qwen3.5 models are early-fusion vision-language models: dense variants use `Qwen3_5ForConditionalGeneration` and MoE variants use `Qwen3_5MoeForConditionalGeneration`. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage. 3. Install FLA for sample packing support with the Gated DeltaNet linear attention layers: ```bash pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1 ``` > FLA is required when `sample_packing: true`. Without it, training raises a `RuntimeError` on packed sequences. Vision configs use `sample_packing: false` so FLA is optional there. 4. 
Pick any config from the table below and run: ```bash axolotl train examples/qwen3.5/<config>.yaml ``` Available configs: | Config | Model | Type | Peak VRAM | |---|---|---|---| | `9b-lora-vision.yaml` | Qwen3.5-9B | Vision+text LoRA, single GPU | — | | `9b-fft-vision.yaml` | Qwen3.5-9B | Vision+text FFT, single GPU | ~61 GiB | | `27b-qlora.yaml` | Qwen3.5-27B | Dense, text-only QLoRA | ~47 GiB | | `27b-fft.yaml` | Qwen3.5-27B | Dense, text-only FFT (vision frozen) | ~53 GiB | | `27b-qlora-fsdp.yaml` | Qwen3.5-27B | Dense, text-only QLoRA + FSDP2 | — | | `35b-a3b-moe-qlora.yaml` | Qwen3.5-35B-A3B | MoE, text-only QLoRA | — | | `35b-a3b-moe-qlora-fsdp.yaml` | Qwen3.5-35B-A3B | MoE, text-only QLoRA + FSDP2 | — | | `122b-a10b-moe-qlora.yaml` | Qwen3.5-122B-A10B | MoE, text-only QLoRA | — | | `122b-a10b-moe-qlora-fsdp.yaml` | Qwen3.5-122B-A10B | MoE, text-only QLoRA + FSDP2 | — | ### Gated DeltaNet Linear Attention Qwen3.5 interleaves standard attention with Gated DeltaNet linear attention layers. To apply LoRA to them, add to `lora_target_modules`: ```yaml lora_target_modules: # ... standard projections ... - linear_attn.in_proj_qkv - linear_attn.in_proj_z - linear_attn.out_proj ``` ### Routed Experts (MoE) To apply LoRA to routed expert parameters, add `lora_target_parameters`: ```yaml lora_target_parameters: - mlp.experts.gate_up_proj - mlp.experts.down_proj # - mlp.gate.weight # router ``` ### Shared Experts (MoE) Routed experts and shared experts both have `gate_up_proj`/`down_proj`, so a plain module name in `lora_target_modules` would match both. Use a regex to target only attention and shared expert projections, while `lora_target_parameters` above handles routed experts separately: ```yaml lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj' ``` ### TIPS - For inference hyperparameters, please see the respective model card details.
- You can run a full finetuning of smaller configs by removing `adapter: qlora` and `load_in_4bit: true`. See [Multi-GPU](#optimization-guides) below. - Read more on loading your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). - The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). - For **multimodal** finetuning, set `processor_type: AutoProcessor`, `skip_prepare_dataset: true`, and `remove_unused_columns: false` as shown in `9b-lora-vision.yaml`. ## Optimization Guides - [Optimizations Guide](https://docs.axolotl.ai/docs/optimizations.html) ## Related Resources - [Qwen3.5 Blog](https://qwenlm.github.io/blog/qwen3.5/) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/seed-oss/README.md ================================================ # Finetune ByteDance's Seed-OSS with Axolotl [Seed-OSS](https://huggingface.co/collections/ByteDance-Seed/seed-oss-68a609f4201e788db05b5dcd) are a series of 36B parameter open source models trained by ByteDance's Seed Team. This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). Here is an example of how to install from pip: ```bash # Ensure you have a compatible version of Pytorch installed pip3 install packaging setuptools wheel ninja pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0' # Install Cut Cross Entropy python scripts/cutcrossentropy_install.py | sh ``` 2. Run the finetuning example: ```bash axolotl train examples/seed-oss/seed-oss-36b-qlora.yaml ``` This config uses about 27.7 GiB VRAM. Let us know how it goes. 
Happy finetuning! 🚀 ### TIPS - For inference, the official Seed Team recommends `top_p=0.95` and `temperature=1.1`. - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). - The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). ## Optimization Guides Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html). ## Related Resources - [ByteDance Seed Website](https://seed.bytedance.com/) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/seed-oss/seed-oss-36b-qlora.yaml ================================================ base_model: ByteDance-Seed/Seed-OSS-36B-Instruct # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: false load_in_4bit: true datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/lora-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 # save_first_step: true # uncomment this to 
validate checkpoint saving works with your config ================================================ FILE: examples/slurm/README.md ================================================ # SLURM Multi-Node Training This directory contains an example SLURM script for running Axolotl training jobs across multiple nodes in a SLURM cluster. ## Prerequisites - Access to a SLURM cluster with GPU nodes - Axolotl installed on all nodes (see [installation docs](https://docs.axolotl.ai/docs/installation.html)) ## Usage ### Standard SLURM Clusters 1. Copy [`axolotl.slurm`](./axolotl.slurm) to your working directory. 2. Place your Axolotl config file (`train.yaml`) in the same directory. 3. Set the appropriate environment variables for the job: ```bash export HF_TOKEN="your-huggingface-token" # metric tracking # export WANDB_API_KEY="your-wandb-api-key" # ... ``` 4. Submit the job: ```bash sbatch --export=ALL,NUM_NODES=2,NUM_TRAINERS=8,PRIMARY_ADDR=<primary-node-address>,PRIMARY_PORT=29400 axolotl.slurm ``` Where: - `NUM_NODES`: Number of nodes to use - `NUM_TRAINERS`: GPUs per node (typically 8) - `PRIMARY_ADDR`: Hostname/IP of the master node - `PRIMARY_PORT`: Port for distributed training (default: 29400) 5. (Optional) Run other SLURM commands: ```bash # check job info scontrol show job axolotl-cli # check job queue squeue # check cluster status sinfo ``` ### RunPod Instant Clusters Axolotl works with RunPod Instant Clusters. This feature provides managed SLURM clusters with zero configuration. 1. **Deploy a SLURM Cluster**: - Go to [RunPod Instant Clusters](https://console.runpod.io/cluster) - Click "Create a Cluster" - Choose your GPU type, node count, and region - Choose an [Axolotl cloud docker image](https://docs.axolotl.ai/docs/docker.html#cloud) - Deploy the cluster 2. **Connect to the Controller Node**: Find the controller node in the RunPod console and connect via SSH 3.
**Follow the instructions in [Standard SLURM Clusters](#standard-slurm-clusters)** ## Additional Resources - [Axolotl Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) - [SLURM Documentation](https://slurm.schedmd.com/documentation.html) - [RunPod SLURM Clusters Guide](https://docs.runpod.io/instant-clusters/slurm-clusters) ================================================ FILE: examples/slurm/axolotl.slurm ================================================ #!/bin/bash # Prior to running this script, export your HF_TOKEN and WANDB_API_KEY to your environment; i.e. # export HF_TOKEN="..." # export WANDB_API_KEY="..." # # ---------- SBATCH commands ---------- # #SBATCH --job-name=axolotl-slurm-multinode #SBATCH --ntasks-per-node=1 #SBATCH --nodes=$NUM_NODES #SBATCH --gpus-per-task=8 #SBATCH --cpus-per-task=128 export TORCH_DIST_INIT_BARRIER=0 srun axolotl preprocess train.yaml srun axolotl train train.yaml --launcher torchrun -- \ --nproc_per_node=$NUM_TRAINERS --nnodes=$NUM_NODES \ --rdzv_id axolotl-cli --rdzv_backend c10d --rdzv_endpoint "${PRIMARY_ADDR}:${PRIMARY_PORT}" --rdzv-conf="join_timeout=1800" ================================================ FILE: examples/smolvlm2/README.md ================================================ # Finetune SmolVLM2 with Axolotl [SmolVLM2](https://huggingface.co/collections/HuggingFaceTB/smolvlm2-smallest-video-lm-ever-67ab6b5e84bf8aaa60cb17c7) are a family of lightweight, open-source multimodal models from HuggingFace designed to analyze and understand video, image, and text content. These models are built for efficiency, making them well-suited for on-device applications where computational resources are limited. Models are available in multiple sizes, including 2.2B, 500M, and 256M. This guide shows how to fine-tune SmolVLM2 models with Axolotl. ## Getting Started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). 
Here is an example of how to install from pip: ```bash # Ensure you have a compatible version of Pytorch installed pip3 install packaging setuptools wheel ninja pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0' ``` 2. Install an extra dependency: ```bash pip3 install num2words==0.5.14 ``` 3. Run the finetuning example: ```bash # LoRA SFT (1x48GB @ 6.8GiB) axolotl train examples/smolvlm2/smolvlm2-2B-lora.yaml ``` ## TIPS - **Dataset Format**: For video finetuning, your dataset must be compatible with the multi-content Messages format. For more details, see our documentation on [Multimodal Formats](https://docs.axolotl.ai/docs/multimodal.html#dataset-format). - **Dataset Loading**: Read more on how to prepare and load your own datasets in our [documentation](https://docs.axolotl.ai/docs/dataset_loading.html). ## Optimization Guides Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html). ## Related Resources - [SmolVLM2 Blog](https://huggingface.co/blog/smolvlm2) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/smolvlm2/smolvlm2-2B-lora.yaml ================================================ base_model: HuggingFaceTB/SmolVLM2-2.2B-Instruct trust_remote_code: true processor_type: AutoProcessor # these 3 lines are needed for now to handle vision chat templates w images skip_prepare_dataset: true remove_unused_columns: false sample_packing: false datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out adapter: lora lora_model_dir: sequence_len: 8192 pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: 'model.text_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' 
wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: true fp16: tf32: true gradient_checkpointing: true logging_steps: 1 flash_attention: true eager_attention: warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/streaming/README.md ================================================ # Streaming Dataset Examples This directory contains example configurations for using Axolotl's streaming dataset functionality, which enables memory-efficient training with large datasets. ## Examples Run the following examples with e.g. `axolotl train examples/streaming/sft.yaml`; no `axolotl preprocess` required! ### Pretraining (`pretrain.yaml`) Demonstrates streaming configuration for pretraining tasks using the fineweb-edu dataset with SmolLM2-135M. - Uses `pretraining_dataset` configuration for automatic streaming - Multipack attention control to prevent cross-attention between packed sequences - Buffer size configuration for memory management ### SFT (`sft.yaml`) Shows how to use streaming for supervised fine-tuning with the Alpaca dataset. 
- Explicit `streaming: true` flag for SFT datasets - Memory-efficient training on instruction datasets - Evaluation datasets are currently not streamed ## Key Configuration Options ### `streaming` - Enables streaming mode for standard datasets - Automatically enabled for `pretraining_dataset` ### `streaming_multipack_buffer_size` - Controls buffer size for sample packing (default: 10,000) - Larger values improve packing efficiency but use more memory - Adjust based on available memory ### `shuffle_merged_datasets` - Enables shuffling of streaming datasets - Requires additional memory for shuffle buffer ### `sample_packing` - Packs multiple samples into single sequences - Minimizes per-step padding tokens ## Performance Tips - Download small / frequently-used datasets locally for better performance - Larger buffer sizes improve packing efficiency ================================================ FILE: examples/streaming/pretrain.yaml ================================================ base_model: HuggingFaceTB/SmolLM2-135M # Streaming pretraining configuration pretraining_dataset: - path: HuggingFaceFW/fineweb-edu name: sample-10BT type: pretrain text_column: text split: train # Streaming-specific settings streaming_multipack_buffer_size: 10000 shuffle_merged_datasets: true # Training configuration max_steps: 1000 output_dir: ./outputs/smollm2-135m-pretrain-streaming # Sequence and packing settings sequence_len: 1024 sample_packing: true pretrain_multipack_attn: true # Prevent cross-attention between packed sequences flash_attention: true # Batch size settings gradient_accumulation_steps: 8 micro_batch_size: 1 # Optimizer and scheduler optimizer: adamw_torch lr_scheduler: cosine learning_rate: 5e-4 warmup_ratio: 0.1 weight_decay: 0.01 # Precision and performance bf16: auto tf32: true # Logging and checkpointing logging_steps: 10 save_strategy: steps save_steps: 250 save_total_limit: 3 # Weights & Biases (optional) wandb_project: wandb_entity: wandb_watch: wandb_name:
wandb_log_model: # Special tokens special_tokens: pad_token: "<|endoftext|>" # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/streaming/sft.yaml ================================================ base_model: HuggingFaceTB/SmolLM2-135M # Dataset configuration datasets: - path: tatsu-lab/alpaca type: alpaca split: train # Streaming-specific settings streaming: true streaming_multipack_buffer_size: 10000 shuffle_merged_datasets: true # Training configuration max_steps: 1000 output_dir: ./outputs/smollm2-135m-sft-streaming # Sequence and packing settings sequence_len: 1024 sample_packing: true flash_attention: true # Batch size settings gradient_accumulation_steps: 4 micro_batch_size: 1 # Optimizer and scheduler optimizer: adamw_torch lr_scheduler: cosine learning_rate: 2e-4 warmup_ratio: 0.1 weight_decay: 0.0 # Precision and performance bf16: auto tf32: true # Logging and checkpointing logging_steps: 10 save_strategy: steps save_steps: 100 save_total_limit: 3 # Weights & Biases (optional) wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: # Special tokens special_tokens: pad_token: "<|endoftext|>" # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/swanlab/README.md ================================================ # SwanLab Integration Examples This directory contains example configurations demonstrating SwanLab integration with Axolotl. ## Examples Overview ### 1. DPO with Completion Logging **File**: `dpo-swanlab-completions.yml` Demonstrates DPO (Direct Preference Optimization) training with RLHF completion table logging. 
**Features**: - Basic SwanLab experiment tracking - Completion table logging (prompts, chosen/rejected responses, rewards) - Memory-bounded buffer for long training runs - Cloud sync configuration **Best for**: RLHF practitioners who want to analyze model outputs qualitatively **Quick start**: ```bash export SWANLAB_API_KEY=your-api-key accelerate launch -m axolotl.cli.train examples/swanlab/dpo-swanlab-completions.yml ``` --- ### 2. LoRA with Performance Profiling **File**: `lora-swanlab-profiling.yml` Demonstrates standard LoRA fine-tuning with performance profiling enabled. **Features**: - SwanLab experiment tracking - Automatic profiling of trainer methods - Profiling metrics visualization - Performance optimization guidance **Best for**: Engineers optimizing training performance and comparing different configurations **Quick start**: ```bash export SWANLAB_API_KEY=your-api-key accelerate launch -m axolotl.cli.train examples/swanlab/lora-swanlab-profiling.yml ``` --- ### 3. Full-Featured DPO Production Setup **File**: `dpo-swanlab-full-featured.yml` Comprehensive production-ready configuration with ALL SwanLab features enabled. **Features**: - Experiment tracking with team workspace - RLHF completion logging - Performance profiling - Lark (Feishu) team notifications - Private deployment support - Production checklist and troubleshooting **Best for**: Production RLHF training with team collaboration **Quick start**: ```bash export SWANLAB_API_KEY=your-api-key export SWANLAB_LARK_WEBHOOK_URL=https://open.feishu.cn/... export SWANLAB_LARK_SECRET=your-webhook-secret accelerate launch -m axolotl.cli.train examples/swanlab/dpo-swanlab-full-featured.yml ``` --- ### 4. Custom Trainer Profiling (Python) **File**: `custom_trainer_profiling.py` Python code examples showing how to add SwanLab profiling to custom trainers. 
**Features**: - `@swanlab_profile` decorator examples - Context manager profiling for fine-grained timing - `ProfilingConfig` for advanced filtering and throttling - Multiple profiling patterns and best practices **Best for**: Advanced users creating custom trainers **Usage**: ```python from custom_trainer_profiling import CustomTrainerWithProfiling # See file for detailed examples and patterns ``` --- ## Feature Matrix | Example | Tracking | Completion Logging | Profiling | Lark Notifications | Team Workspace | |---------|----------|-------------------|-----------|-------------------|----------------| | dpo-swanlab-completions.yml | ✅ | ✅ | ✅ (auto) | ➖ (commented) | ➖ (commented) | | lora-swanlab-profiling.yml | ✅ | ➖ (disabled) | ✅ (auto) | ➖ (commented) | ➖ (commented) | | dpo-swanlab-full-featured.yml | ✅ | ✅ | ✅ (auto) | ✅ | ✅ | | custom_trainer_profiling.py | N/A | N/A | ✅ (manual) | N/A | N/A | --- ## Configuration Quick Reference ### Basic SwanLab Setup ```yaml plugins: - axolotl.integrations.swanlab.SwanLabPlugin use_swanlab: true swanlab_project: my-project swanlab_experiment_name: my-experiment swanlab_mode: cloud # cloud, local, offline, disabled ``` ### RLHF Completion Logging ```yaml swanlab_log_completions: true swanlab_completion_log_interval: 100 # Log every 100 steps swanlab_completion_max_buffer: 128 # Memory-bounded buffer ``` ### Lark Team Notifications ```yaml swanlab_lark_webhook_url: https://open.feishu.cn/... swanlab_lark_secret: your-webhook-secret # Required for production ``` ### Team Workspace ```yaml swanlab_workspace: my-research-team ``` ### Private Deployment ```yaml swanlab_web_host: https://swanlab.yourcompany.com swanlab_api_host: https://api.swanlab.yourcompany.com ``` --- ## Authentication ### Recommended: Environment Variable ```bash export SWANLAB_API_KEY=your-api-key export SWANLAB_LARK_WEBHOOK_URL=https://open.feishu.cn/... 
export SWANLAB_LARK_SECRET=your-webhook-secret ``` ### Alternative: Config File (less secure) ```yaml swanlab_api_key: your-api-key swanlab_lark_webhook_url: https://open.feishu.cn/... swanlab_lark_secret: your-webhook-secret ``` --- ## Common Use Cases ### Use Case 1: Migrate from WandB to SwanLab Start with `lora-swanlab-profiling.yml`, add your model/dataset config, disable WandB: ```yaml use_swanlab: true use_wandb: false ``` ### Use Case 2: Analyze DPO Model Outputs Use `dpo-swanlab-completions.yml`, adjust completion logging interval based on your training length: ```yaml swanlab_completion_log_interval: 50 # More frequent for short training swanlab_completion_log_interval: 200 # Less frequent for long training ``` ### Use Case 3: Optimize Training Performance Use `lora-swanlab-profiling.yml`, run multiple experiments with different optimizations: - Baseline: `flash_attention: false, gradient_checkpointing: false` - Flash Attention: `flash_attention: true` - Gradient Checkpointing: `gradient_checkpointing: true` - Both: `flash_attention: true, gradient_checkpointing: true` Compare profiling metrics in SwanLab dashboard. ### Use Case 4: Production RLHF with Team Collaboration Use `dpo-swanlab-full-featured.yml`, set up team workspace and Lark notifications: ```yaml swanlab_workspace: ml-team swanlab_lark_webhook_url: ... swanlab_lark_secret: ... ``` --- ## Viewing Your Experiments ### Cloud Mode Visit [https://swanlab.cn](https://swanlab.cn) and navigate to your project. 
**Dashboard sections**: - **Metrics**: Training loss, learning rate, profiling metrics - **Tables**: RLHF completions (for DPO/KTO/ORPO/GRPO) - **Config**: Hyperparameters and configuration - **System**: Resource usage (GPU, memory, CPU) - **Files**: Logged artifacts ### Local Mode ```bash swanlab watch ./swanlog # Open browser to http://localhost:5092 ``` --- ## Troubleshooting ### SwanLab not initializing ```bash # Check API key echo $SWANLAB_API_KEY # Verify SwanLab is installed pip show swanlab # Check config grep -A 5 "use_swanlab" your-config.yml ``` ### Completions not appearing - Verify you're using an RLHF trainer (DPO/KTO/ORPO/GRPO) - Check `swanlab_log_completions: true` - Wait for `swanlab_completion_log_interval` steps - Look for "Registered SwanLab RLHF completion logging" in logs ### Lark notifications not working - Test webhook manually: `curl -X POST "$SWANLAB_LARK_WEBHOOK_URL" ...` - Verify `SWANLAB_LARK_SECRET` is set correctly - Check bot is added to Lark group chat - Look for "Registered Lark notification callback" in logs ### Profiling metrics not appearing - Verify `use_swanlab: true` - Check SwanLab is initialized (look for init log message) - Profiling metrics are under "profiling/" namespace - Profiling auto-enabled when SwanLab is enabled --- ## Performance Notes ### Overhead Comparison | Feature | Overhead per Step | Memory Usage | |---------|------------------|--------------| | Basic tracking | < 0.1% | ~10 MB | | Completion logging | < 0.5% | ~64 KB (buffer=128) | | Profiling | < 0.1% | ~1 KB | | **Total** | **< 0.7%** | **~10 MB** | ### Best Practices 1. Use ONE logging tool in production (disable WandB/MLflow when using SwanLab) 2. Adjust completion log interval based on training length (100-200 steps) 3. Keep completion buffer size reasonable (128-512) 4. Profile critical path methods first (training_step, compute_loss) 5. 
Use ProfilingConfig to throttle high-frequency operations --- ## Further Reading - **Full Documentation**: [src/axolotl/integrations/swanlab/README.md](../../src/axolotl/integrations/swanlab/README.md) - **SwanLab Docs**: [https://docs.swanlab.cn](https://docs.swanlab.cn) - **Axolotl Docs**: [https://axolotl-ai-cloud.github.io/axolotl/](https://axolotl-ai-cloud.github.io/axolotl/) - **DPO Paper**: [Direct Preference Optimization](https://arxiv.org/abs/2305.18290) --- ## Contributing Found an issue or have an improvement? Please submit a PR or open an issue: - [Axolotl Issues](https://github.com/axolotl-ai-cloud/axolotl/issues) - [SwanLab Issues](https://github.com/SwanHubX/SwanLab/issues) ================================================ FILE: examples/swanlab/custom_trainer_profiling.py ================================================ """Example: Custom Trainer with SwanLab Profiling This example demonstrates how to add SwanLab profiling to your custom trainer. Features: - @swanlab_profile decorator for automatic profiling - swanlab_profiling_context for fine-grained profiling - ProfilingConfig for advanced filtering and throttling Usage: 1. Create your custom trainer extending AxolotlTrainer 2. Add @swanlab_profile decorators to methods you want to profile 3. Use swanlab_profiling_context for fine-grained profiling within methods 4. Enable SwanLab in your config (use_swanlab: true) See also: - examples/swanlab/lora-swanlab-profiling.yml for config - src/axolotl/integrations/swanlab/profiling.py for implementation """ from axolotl.core.trainers.base import AxolotlTrainer from axolotl.integrations.swanlab.profiling import ( ProfilingConfig, swanlab_profile, swanlab_profiling_context, swanlab_profiling_context_advanced, ) class CustomTrainerWithProfiling(AxolotlTrainer): """Custom trainer with SwanLab profiling enabled. This trainer demonstrates three profiling patterns: 1. Decorator-based profiling (@swanlab_profile) 2. 
Context manager profiling (swanlab_profiling_context) 3. Advanced profiling with filtering (ProfilingConfig) """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Create custom profiling config for high-frequency operations self.fast_op_config = ProfilingConfig( enabled=True, min_duration_ms=0.5, # Only log if duration > 0.5ms log_interval=50, # Log every 50th call ) # ======================================================================== # Pattern 1: Decorator-based Profiling # ======================================================================== # Best for: Methods you always want to profile # Overhead: ~2-5 microseconds per call (negligible) @swanlab_profile def training_step(self, model, inputs): """Main training step - always profile. Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.training_step """ return super().training_step(model, inputs) @swanlab_profile def compute_loss(self, model, inputs, return_outputs=False): """Loss computation - always profile. Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.compute_loss """ return super().compute_loss(model, inputs, return_outputs) @swanlab_profile def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None): """Prediction step - always profile. Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.prediction_step """ return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys) # ======================================================================== # Pattern 2: Fine-grained Context Manager Profiling # ======================================================================== # Best for: Profiling specific code blocks within a method # Use case: When you want to profile forward vs backward separately def complex_training_step(self, model, inputs): """Training step with fine-grained profiling. 
Profiling metrics: - profiling/Time taken: CustomTrainerWithProfiling.forward_pass - profiling/Time taken: CustomTrainerWithProfiling.backward_pass - profiling/Time taken: CustomTrainerWithProfiling.optimizer_step """ # Profile just the forward pass with swanlab_profiling_context(self, "forward_pass"): outputs = model(**inputs) loss = outputs.loss # Profile just the backward pass with swanlab_profiling_context(self, "backward_pass"): loss.backward() # Profile optimizer step with swanlab_profiling_context(self, "optimizer_step"): self.optimizer.step() self.optimizer.zero_grad() return outputs # ======================================================================== # Pattern 3: Advanced Profiling with Filtering # ======================================================================== # Best for: High-frequency operations where you want to throttle logging # Use case: Methods called 100+ times per step def _prepare_inputs(self, inputs): """Prepare inputs - throttled profiling. This method is called frequently (once per batch), so we throttle profiling to reduce overhead: - Only log if duration > 0.5ms (skip very fast operations) - Only log every 50th call (reduce logging frequency) Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.prepare_inputs """ with swanlab_profiling_context_advanced( self, "prepare_inputs", config=self.fast_op_config ): return super()._prepare_inputs(inputs) def _prepare_input_for_model(self, input_ids): """Another high-frequency operation - throttled profiling. 
Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.prepare_input_for_model """ with swanlab_profiling_context_advanced( self, "prepare_input_for_model", config=self.fast_op_config ): # Your custom input preparation logic return input_ids # ======================================================================== # Pattern 4: Exception-safe Profiling # ======================================================================== # Profiling is exception-safe: duration is logged even if method raises @swanlab_profile def potentially_failing_method(self): """This method may raise an exception. SwanLab profiling will still log the duration before re-raising. Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.potentially_failing_method """ # Do some work result = self._do_risky_computation() # If this raises, profiling duration is still logged if result < 0: raise ValueError("Invalid result") return result def _do_risky_computation(self): """Placeholder for risky computation.""" return 42 # ============================================================================ # Advanced Example: Custom ProfilingConfig Per Method # ============================================================================ class AdvancedProfilingTrainer(AxolotlTrainer): """Trainer with method-specific profiling configurations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Different profiling configs for different method types self.critical_path_config = ProfilingConfig( enabled=True, min_duration_ms=0.0, # Log everything on critical path log_interval=1, # Log every call ) self.fast_path_config = ProfilingConfig( enabled=True, min_duration_ms=1.0, # Only log if > 1ms log_interval=100, # Log every 100th call ) self.debug_config = ProfilingConfig( enabled=True, min_duration_ms=0.0, # Log everything log_interval=1, # Log every call ) def training_step(self, model, inputs): """Critical path - log everything.""" with 
swanlab_profiling_context_advanced( self, "training_step", config=self.critical_path_config ): return super().training_step(model, inputs) def _prepare_inputs(self, inputs): """Fast path - throttle logging.""" with swanlab_profiling_context_advanced( self, "prepare_inputs", config=self.fast_path_config ): return super()._prepare_inputs(inputs) def _debug_method(self, data): """Debug-only method - verbose logging.""" with swanlab_profiling_context_advanced( self, "debug_method", config=self.debug_config ): # Your debug logic pass # ============================================================================ # How to Use This Custom Trainer # ============================================================================ """ To use this custom trainer: 1. Save this file to your project (e.g., my_custom_trainer.py) 2. Create a config file that uses your custom trainer: # config.yml base_model: NousResearch/Llama-3.2-1B # ... other config ... plugins: - axolotl.integrations.swanlab.SwanLabPlugin use_swanlab: true swanlab_project: my-profiling-experiment # Optional: Specify custom trainer # (Or modify axolotl to use your custom trainer class) 3. Run training: export SWANLAB_API_KEY=your-api-key accelerate launch -m axolotl.cli.train config.yml 4. View profiling metrics in SwanLab dashboard: - profiling/Time taken: CustomTrainerWithProfiling.training_step - profiling/Time taken: CustomTrainerWithProfiling.forward_pass - profiling/Time taken: CustomTrainerWithProfiling.backward_pass - etc. 5. Compare profiling metrics across runs: - Run baseline without optimizations - Run with flash_attention enabled - Run with gradient_checkpointing enabled - Compare profiling metrics to see performance impact """ # ============================================================================ # Tips for Effective Profiling # ============================================================================ """ 1. 
Profile the critical path first: - training_step, compute_loss, prediction_step - These methods are called most frequently and have biggest impact 2. Use throttling for high-frequency operations: - Methods called 100+ times per step - Use log_interval=50 or log_interval=100 - Reduces profiling overhead and dashboard clutter 3. Filter noise with min_duration_ms: - Set min_duration_ms=1.0 to skip very fast operations - Focus on operations that actually take time 4. Compare across runs: - Run same config multiple times to check consistency - Compare different optimization strategies - Track profiling trends over time 5. Monitor distributed training: - Check for per-rank timing differences - Look for stragglers (slower ranks) - Identify synchronization bottlenecks 6. Disable profiling in production: - from axolotl.integrations.swanlab.profiling import DEFAULT_PROFILING_CONFIG - DEFAULT_PROFILING_CONFIG.enabled = False 7. Exception handling: - Profiling is exception-safe - Duration logged even if method raises - Useful for debugging methods that fail intermittently """ ================================================ FILE: examples/swanlab/dpo-swanlab-completions.yml ================================================ # SwanLab DPO Training Example with Completion Logging # # This example demonstrates DPO (Direct Preference Optimization) training # with SwanLab integration for experiment tracking and completion table logging. 
# # Features enabled: # - SwanLab experiment tracking # - RLHF completion table logging (prompts, chosen/rejected responses, rewards) # - Lark (Feishu) team notifications (optional) # # To run: # export SWANLAB_API_KEY=your-api-key # accelerate launch -m axolotl.cli.train examples/swanlab/dpo-swanlab-completions.yml # Model Configuration base_model: meta-llama/Meta-Llama-3-8B-Instruct model_type: LlamaForCausalLM tokenizer_type: AutoTokenizer special_tokens: pad_token: <|finetune_right_pad_id|> eos_token: <|eot_id|> # Quantization load_in_8bit: true load_in_4bit: false # LoRA Configuration adapter: lora lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true # DPO Configuration chat_template: llama3 rl: dpo datasets: - path: fozziethebeat/alpaca_messages_2k_dpo_test type: chat_template.default field_messages: conversation field_chosen: chosen field_rejected: rejected message_property_mappings: role: role content: content roles: system: - system user: - user assistant: - assistant # Dataset and Output dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/dpo-swanlab-out # Training Configuration sequence_len: 4096 sample_packing: false micro_batch_size: 2 gradient_accumulation_steps: 4 num_epochs: 4 # Optimization optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 warmup_ratio: 0.1 weight_decay: 0.0 # Precision bf16: auto tf32: false # Performance gradient_checkpointing: true flash_attention: true # Checkpointing and Logging logging_steps: 1 evals_per_epoch: 4 saves_per_epoch: 1 # ============================================================================ # SwanLab Integration # ============================================================================ plugins: - axolotl.integrations.swanlab.SwanLabPlugin # Basic SwanLab Configuration use_swanlab: true swanlab_project: dpo-training swanlab_experiment_name: llama-3-dpo-completions-demo swanlab_description: "DPO training with completion table logging" swanlab_mode: cloud # 
Options: cloud, local, offline, disabled # SwanLab Authentication # Recommended: Set via environment variable # export SWANLAB_API_KEY=your-api-key # Or set in config (less secure): # swanlab_api_key: your-api-key # Optional: Team workspace # swanlab_workspace: my-research-team # ============================================================================ # RLHF Completion Table Logging # ============================================================================ # # Automatically logs model completions to SwanLab for qualitative analysis: # - Prompts from your DPO dataset # - Chosen responses (preferred) # - Rejected responses (non-preferred) # - Reward differences # # View the table in SwanLab dashboard under "rlhf_completions" swanlab_log_completions: true swanlab_completion_log_interval: 100 # Log every 100 training steps swanlab_completion_max_buffer: 128 # Keep last 128 completions in memory # Memory Usage Notes: # - Buffer size 128: ~64 KB (default, recommended) # - Buffer size 512: ~256 KB (for more historical completions) # - Buffer size 1024: ~512 KB (maximum for very long training runs) # Performance Notes: # - Completion logging overhead: < 0.5% per training step # - Only logs every N steps to minimize impact # - Memory-bounded buffer prevents memory leaks # ============================================================================ # Optional: Lark (Feishu) Team Notifications # ============================================================================ # # Get real-time training notifications in your team chat # Uncomment to enable: # swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx # swanlab_lark_secret: your-webhook-secret # Recommended for production # Notifications sent for: # - Training start # - Training completion # - Training errors # - Metric milestones (if configured) # ============================================================================ # Optional: Private SwanLab Deployment # 
============================================================================ # # For enterprise users with private SwanLab deployment: # swanlab_web_host: https://swanlab.yourcompany.com # swanlab_api_host: https://api.swanlab.yourcompany.com # ============================================================================ # Disable WandB if you're migrating from it # ============================================================================ # wandb_project: # wandb_entity: # use_wandb: false ================================================ FILE: examples/swanlab/dpo-swanlab-full-featured.yml ================================================ # SwanLab Full-Featured DPO Training Example # # This example demonstrates ALL SwanLab integration features: # - Experiment tracking with cloud sync # - RLHF completion table logging # - Performance profiling # - Lark (Feishu) team notifications # - Team workspace collaboration # # Use this as a reference for production RLHF training setups. # # To run: # export SWANLAB_API_KEY=your-api-key # export SWANLAB_LARK_WEBHOOK_URL=https://open.feishu.cn/... 
# export SWANLAB_LARK_SECRET=your-webhook-secret # accelerate launch -m axolotl.cli.train examples/swanlab/dpo-swanlab-full-featured.yml # ============================================================================ # Model Configuration # ============================================================================ base_model: meta-llama/Meta-Llama-3-8B-Instruct model_type: LlamaForCausalLM tokenizer_type: AutoTokenizer special_tokens: pad_token: <|finetune_right_pad_id|> eos_token: <|eot_id|> # Quantization for efficient training load_in_8bit: true load_in_4bit: false # ============================================================================ # LoRA Configuration # ============================================================================ adapter: lora lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true # Target all linear layers # ============================================================================ # DPO (Direct Preference Optimization) Configuration # ============================================================================ chat_template: llama3 rl: dpo # Enable DPO trainer datasets: - path: fozziethebeat/alpaca_messages_2k_dpo_test type: chat_template.default field_messages: conversation field_chosen: chosen field_rejected: rejected message_property_mappings: role: role content: content roles: system: - system user: - user assistant: - assistant # ============================================================================ # Dataset and Output Configuration # ============================================================================ dataset_prepared_path: val_set_size: 0.05 output_dir: ./outputs/dpo-swanlab-full-featured-out # ============================================================================ # Training Configuration # ============================================================================ sequence_len: 4096 sample_packing: false micro_batch_size: 2 gradient_accumulation_steps: 4 num_epochs: 4 # 
============================================================================ # Optimization # ============================================================================ optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 warmup_ratio: 0.1 weight_decay: 0.0 # ============================================================================ # Precision and Performance # ============================================================================ bf16: auto tf32: false gradient_checkpointing: true flash_attention: true # ============================================================================ # Checkpointing and Logging # ============================================================================ logging_steps: 1 evals_per_epoch: 4 saves_per_epoch: 1 # ============================================================================ # SwanLab Integration - Full Configuration # ============================================================================ plugins: - axolotl.integrations.swanlab.SwanLabPlugin # ------------------------------------------------------------------------------ # Basic SwanLab Configuration # ------------------------------------------------------------------------------ use_swanlab: true swanlab_project: dpo-production swanlab_experiment_name: llama-3-dpo-full-featured-v1 swanlab_description: | Production DPO training with all SwanLab features enabled: - Completion table logging for qualitative analysis - Performance profiling for optimization - Lark notifications for team collaboration swanlab_mode: cloud # Options: cloud, local, offline, disabled # ------------------------------------------------------------------------------ # Team Collaboration # ------------------------------------------------------------------------------ # Workspace for team collaboration (shared experiments) swanlab_workspace: ml-research-team # Authentication (recommended: use environment variable) # export SWANLAB_API_KEY=your-api-key # Or set in config 
(less secure): # swanlab_api_key: your-api-key # ------------------------------------------------------------------------------ # RLHF Completion Table Logging # ------------------------------------------------------------------------------ # Automatically logs model completions for qualitative analysis: # - Prompts from your DPO dataset # - Chosen responses (preferred) # - Rejected responses (non-preferred) # - Reward differences # # View in SwanLab dashboard under "rlhf_completions" table swanlab_log_completions: true swanlab_completion_log_interval: 100 # Log every 100 steps swanlab_completion_max_buffer: 256 # Larger buffer for long training runs # Buffer size recommendations: # - 128: Default, ~64 KB memory (recommended for most cases) # - 256: ~128 KB memory (this config, good for longer training) # - 512: ~256 KB memory (maximum for very long runs) # ------------------------------------------------------------------------------ # Lark (Feishu) Team Notifications # ------------------------------------------------------------------------------ # Get real-time training notifications in your team chat # # Notifications sent for: # - Training start # - Training completion # - Training errors # - Metric milestones (if configured) # Recommended: Set via environment variables # export SWANLAB_LARK_WEBHOOK_URL=https://open.feishu.cn/... # export SWANLAB_LARK_SECRET=your-webhook-secret # Or set in config (less secure): # swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx # swanlab_lark_secret: your-webhook-secret # REQUIRED for production # Security note: ALWAYS use swanlab_lark_secret in production to prevent # unauthorized parties from sending fake notifications to your team chat. # ------------------------------------------------------------------------------ # Performance Profiling # ------------------------------------------------------------------------------ # Profiling is automatically enabled when SwanLab is enabled. 
# Metrics logged to SwanLab under "profiling/" namespace: # profiling/Time taken: AxolotlTrainer.training_step # profiling/Time taken: AxolotlTrainer.compute_loss # profiling/Time taken: AxolotlTrainer.prediction_step # # Use these metrics to: # - Identify bottlenecks in training loop # - Compare performance across different configurations # - Monitor performance regressions over time # - Debug unexpected slowdowns # For custom profiling in your own trainer, see: # examples/swanlab/custom_trainer_profiling.py # ------------------------------------------------------------------------------ # Optional: Private SwanLab Deployment # ------------------------------------------------------------------------------ # For enterprise users with private SwanLab deployment: # swanlab_web_host: https://swanlab.yourcompany.com # swanlab_api_host: https://api.swanlab.yourcompany.com # ------------------------------------------------------------------------------ # Optional: Model Checkpointing to SwanLab # ------------------------------------------------------------------------------ # Log model checkpoints to SwanLab (coming soon) swanlab_log_model: false # ============================================================================ # Disable Other Logging Tools (Recommended) # ============================================================================ # Using multiple logging tools simultaneously can impact performance: # - Expected overhead: ~1-2% per logger # - Potential config/callback conflicts # # For production training, use ONLY SwanLab: # wandb_project: # use_wandb: false # # use_mlflow: false # # use_comet: false # ============================================================================ # Expected Training Behavior # ============================================================================ # With this configuration, you should see: # # 1. 
SwanLab Initialization (rank 0 only): # INFO: SwanLab initialized for project: dpo-production # INFO: SwanLab experiment: llama-3-dpo-full-featured-v1 # INFO: SwanLab mode: cloud # INFO: SwanLab workspace: ml-research-team # # 2. Completion Logging (rank 0 only): # INFO: Registered SwanLab RLHF completion logging callback for DPOTrainer # (log_interval=100, max_buffer=256) # # 3. Lark Notifications (rank 0 only): # INFO: Registered Lark notification callback with HMAC authentication # # 4. Distributed Training Detection (if multi-GPU): # INFO: Distributed training detected (world_size=N) # INFO: Only rank 0 will initialize SwanLab # INFO: Other ranks will skip SwanLab to avoid conflicts # # 5. Training Start Notification (Lark): # Your team chat receives: "Training started: llama-3-dpo-full-featured-v1" # # 6. Periodic Completion Logging: # Every 100 steps, completion table is updated in SwanLab dashboard # # 7. Training Complete Notification (Lark): # Your team chat receives: "Training completed: llama-3-dpo-full-featured-v1" # With link to SwanLab dashboard and final metrics # # 8. SwanLab Dashboard Shows: # - Training metrics (loss, learning rate, etc.) # - Completion table (rlhf_completions) # - Profiling metrics (profiling/Time taken: ...) 
# - Hyperparameters and configuration # - System resource usage # ============================================================================ # Production Checklist # ============================================================================ # Before deploying to production, verify: # ✅ SwanLab API key is set via environment variable (not in config) # ✅ Lark webhook secret is set (required for HMAC authentication) # ✅ Workspace is set to your team's workspace # ✅ Experiment name is descriptive and unique # ✅ Only SwanLab is enabled (other loggers disabled) # ✅ Completion logging buffer size is appropriate for your training duration # ✅ Private deployment hosts are set (if using enterprise SwanLab) # ✅ Test run completes successfully and shows up in SwanLab dashboard # ✅ Lark notifications are received in team chat # ✅ Profiling metrics are logged correctly # ============================================================================ # Troubleshooting # ============================================================================ # If SwanLab initialization fails: # 1. Check SWANLAB_API_KEY environment variable is set # 2. Verify swanlab_project is set in config # 3. Check swanlab_mode is valid (cloud/local/offline/disabled) # 4. Verify internet connectivity (for cloud mode) # If Lark notifications not received: # 1. Check SWANLAB_LARK_WEBHOOK_URL is set correctly # 2. Verify SWANLAB_LARK_SECRET matches your Lark bot settings # 3. Test webhook manually: curl -X POST "$SWANLAB_LARK_WEBHOOK_URL" ... # 4. Check training logs for "Registered Lark notification callback" # 5. Verify bot is added to the target Lark group chat # If completions not appearing in SwanLab: # 1. Verify you're using an RLHF trainer (DPO/KTO/ORPO/GRPO) # 2. Check swanlab_log_completions is true # 3. Wait for log_interval steps (default: 100) # 4. Check training logs for "Registered SwanLab RLHF completion logging" # If profiling metrics not appearing: # 1. Verify use_swanlab is true # 2. 
Check SwanLab is initialized (check logs) # 3. Look under "profiling/" namespace in dashboard # 4. Profiling may be disabled if DEFAULT_PROFILING_CONFIG.enabled = False # For more help: # - SwanLab docs: https://docs.swanlab.cn # - Axolotl SwanLab integration: src/axolotl/integrations/swanlab/README.md # - GitHub issues: https://github.com/axolotl-ai-cloud/axolotl/issues ================================================ FILE: examples/swanlab/lora-swanlab-profiling.yml ================================================ # SwanLab LoRA Training Example with Performance Profiling # # This example demonstrates standard LoRA fine-tuning with SwanLab integration # for performance profiling and optimization. # # Features enabled: # - SwanLab experiment tracking # - Performance profiling (training step, forward/backward pass timing) # - Real-time metrics visualization # # To run: # export SWANLAB_API_KEY=your-api-key # accelerate launch -m axolotl.cli.train examples/swanlab/lora-swanlab-profiling.yml # Model Configuration base_model: NousResearch/Llama-3.2-1B # Dataset Configuration datasets: - path: teknium/GPT4-LLM-Cleaned type: alpaca val_set_size: 0.1 output_dir: ./outputs/lora-swanlab-profiling-out # LoRA Configuration adapter: lora lora_r: 16 lora_alpha: 32 lora_dropout: 0.05 lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj # Training Configuration sequence_len: 2048 sample_packing: true eval_sample_packing: true micro_batch_size: 2 gradient_accumulation_steps: 2 num_epochs: 1 # Optimization optimizer: adamw_8bit lr_scheduler: cosine learning_rate: 0.0002 warmup_ratio: 0.1 weight_decay: 0.0 # Precision bf16: auto tf32: false # Performance gradient_checkpointing: true flash_attention: true # Checkpointing and Logging logging_steps: 1 evals_per_epoch: 4 saves_per_epoch: 1 # Loss Monitoring loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 special_tokens: pad_token: "<|end_of_text|>" # 
============================================================================ # SwanLab Integration # ============================================================================ plugins: - axolotl.integrations.swanlab.SwanLabPlugin # Basic SwanLab Configuration use_swanlab: true swanlab_project: lora-profiling swanlab_experiment_name: llama-3.2-1b-profiling-demo swanlab_description: "LoRA fine-tuning with performance profiling" swanlab_mode: cloud # Options: cloud, local, offline, disabled # SwanLab Authentication # Recommended: Set via environment variable # export SWANLAB_API_KEY=your-api-key # Or set in config (less secure): # swanlab_api_key: your-api-key # Optional: Team workspace # swanlab_workspace: my-ml-team # ============================================================================ # Performance Profiling # ============================================================================ # # SwanLab automatically profiles trainer methods when enabled. # Profiling metrics appear in SwanLab dashboard under "profiling/" namespace. # # Built-in profiling: # - Minimal overhead (< 0.1% per step) # - High-precision timing (microsecond accuracy) # - Exception-safe (logs duration even if method fails) # # View profiling metrics in SwanLab dashboard: # profiling/Time taken: AxolotlTrainer.training_step # profiling/Time taken: AxolotlTrainer.compute_loss # profiling/Time taken: AxolotlTrainer.prediction_step # # For custom profiling in your own trainer, see: # examples/swanlab/custom_trainer_profiling.py # Completion logging is disabled for non-RLHF trainers swanlab_log_completions: false # Only works with DPO/KTO/ORPO/GRPO # ============================================================================ # Optional: Compare with Multiple Runs # ============================================================================ # # To compare profiling metrics across different configurations: # # 1. 
Run baseline without flash attention: # swanlab_experiment_name: llama-3.2-1b-no-flash-attn # flash_attention: false # # 2. Run with gradient checkpointing: # swanlab_experiment_name: llama-3.2-1b-grad-checkpoint # gradient_checkpointing: true # # 3. Run with both: # swanlab_experiment_name: llama-3.2-1b-optimized # flash_attention: true # gradient_checkpointing: true # # Then compare profiling metrics in SwanLab dashboard to see performance impact # ============================================================================ # Optional: Lark (Feishu) Team Notifications # ============================================================================ # # Get notified when profiling experiments complete: # swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx # swanlab_lark_secret: your-webhook-secret # ============================================================================ # Profiling Best Practices # ============================================================================ # # 1. Run multiple epochs to see profiling trends over time # 2. Ignore first ~10 steps (warmup period, slower) # 3. Look for outliers (steps that take significantly longer) # 4. Compare profiling metrics before/after optimization changes # 5. 
Monitor per-rank profiling in distributed training # # Common bottlenecks to profile: # - training_step: Overall step time (should be consistent) # - compute_loss: Loss computation (scales with sequence length) # - prediction_step: Evaluation time (can be slow for large val sets) # # If you see inconsistent timing: # - Check for data loading bottlenecks # - Monitor GPU utilization (may be CPU-bound) # - Check for gradient accumulation effects # - Verify CUDA kernel synchronization # ============================================================================ # Disable WandB if you're migrating from it # ============================================================================ # wandb_project: # use_wandb: false ================================================ FILE: examples/trinity/README.md ================================================ # Finetune ArceeAI's Trinity with Axolotl [Trinity](https://huggingface.co/collections/arcee-ai/trinity) is a family of open weight MoE models trained by Arcee.ai. This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking. ## Getting started 1. Install Axolotl from `main` following the [installation guide](https://docs.axolotl.ai/docs/installation.html#sec-edge-build). 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage. 3. Run the finetuning example: ```bash axolotl train examples/trinity/trinity-nano-preview-qlora.yaml ``` This config uses about 24.9 GiB VRAM (w/o CCE). Let us know how it goes. Happy finetuning! 🚀 ### TIPS - For inference, the official Arcee.ai team recommends `top_p: 0.75`, `temperature: 0.15`, `top_k: 50`, and `min_p: 0.06`. - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). ## Optimization Guides Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html). ## Related Resources - [Trinity Blog](https://www.arcee.ai/blog/the-trinity-manifesto) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ================================================ FILE: examples/trinity/trinity-nano-preview-qlora.yaml ================================================ base_model: arcee-ai/Trinity-Nano-Preview revision_of_model: 2ee94b0 # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name # CCE - N/A as of now # plugins: # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: false load_in_4bit: true datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template dataset_prepared_path: last_run_prepared val_set_size: 0.1 output_dir: ./outputs/lora-out adapter: qlora lora_model_dir: sequence_len: 2048 sample_packing: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_target_modules: - gate_proj - down_proj - up_proj - q_proj - v_proj - k_proj - o_proj wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 # flash_attention: true # Not supported sdp_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 # save_first_step: true # uncomment this to validate checkpoint saving works with your config ================================================ FILE: examples/voxtral/README.md 
================================================ # Finetune Voxtral with Axolotl Voxtral is a [3B](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507)/[24B](https://huggingface.co/mistralai/Voxtral-Small-24B-2507) parameter opensource model from MistralAI found on HuggingFace. This guide shows how to fine-tune it with Axolotl. Thanks to the team at MistralAI for giving us early access to prepare for this release. ## Getting started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). Here is an example of how to install from pip: ```bash # Ensure you have Pytorch installed (Pytorch 2.6.0 min) pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0' ``` 2. Please install the below. ```bash # audio pip3 install librosa==0.11.0 pip3 install 'mistral_common[audio]==1.8.3' # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy python scripts/cutcrossentropy_install.py | sh ``` 3. Download sample dataset files ```bash # for text + audio only wget https://huggingface.co/datasets/Nanobit/text-audio-2k-test/resolve/main/En-us-African_elephant.oga ``` 4. Run the finetuning example: ```bash # text only axolotl train examples/voxtral/voxtral-mini-qlora.yml # text + audio axolotl train examples/voxtral/voxtral-mini-audio-qlora.yml ``` These configs use about 4.8 GB VRAM. Let us know how it goes. Happy finetuning! 🚀 ### TIPS - For inference, the official MistralAI team recommends `temperature: 0.2` and `top_p: 0.95` for audio understanding and `temperature: 0.0` for transcription. - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). 
- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). - The multimodal dataset format follows the OpenAI multi-content Messages format as seen [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format). ## Optimization Guides - [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) - [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) - [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html) ## Limitations We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only. In addition, we do not support overriding tokens yet. ## Related Resources - [MistralAI Magistral Blog](https://mistral.ai/news/magistral/) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl Website](https://axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) ## Future Work - Add parity to Preference Tuning, RL, etc. - Add parity to other tokenizer configs like overriding tokens. 
================================================ FILE: examples/voxtral/voxtral-mini-audio-qlora.yml ================================================ base_model: mistralai/Voxtral-Mini-3B-2507 processor_type: VoxtralProcessor # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name # Enable to use mistral-common tokenizer tokenizer_use_mistral_common: true plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin # for use with fft to only train on language model layers # unfrozen_parameters: # - language_model.model.* # - lm_head # - embed_tokens load_in_4bit: true # these 3 lines are needed for now to handle vision chat templates w images skip_prepare_dataset: true remove_unused_columns: false sample_packing: false # gemma3 doesn't seem to play nice with ddp ddp_find_unused_parameters: true eot_tokens: - # sample dataset below requires downloading audio/image in advance # wget https://huggingface.co/datasets/Nanobit/text-audio-2k-test/resolve/main/En-us-African_elephant.oga datasets: - path: NanoBit/text-audio-2k-test type: chat_template dataset_prepared_path: val_set_size: 0.01 output_dir: ./outputs/out adapter: qlora lora_model_dir: sequence_len: 2048 pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: true fp16: tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 ================================================ FILE: examples/voxtral/voxtral-mini-qlora.yml ================================================ base_model: 
mistralai/Voxtral-Mini-3B-2507 # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name # Enable to use mistral-common tokenizer tokenizer_use_mistral_common: true plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin load_in_8bit: false load_in_4bit: true # for use with fft to only train on language model layers # unfrozen_parameters: # - language_model.model.* # - lm_head # - embed_tokens eot_tokens: - datasets: - path: cgato/SlimOrcaDedupCleaned type: chat_template split: train[:1%] field_messages: conversations message_property_mappings: role: from content: value val_set_size: 0.0 output_dir: ./outputs/out adapter: qlora lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj' sequence_len: 2048 sample_packing: true eval_sample_packing: true pad_to_sequence_len: true wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 bf16: auto tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 flash_attention: true warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 special_tokens: ================================================ FILE: index.qmd ================================================ --- # toc-location: right-body # toc-title: Table Of Contents # toc-expand: 2 --- ```{python} #|output: asis #|echo: false # This cell steals the README as the home page for now, but excludes the table of contents (quarto adds its own) import re pattern = re.compile( r"\s*\s*\s*\s*
\s*## Table of Contents.*?
", re.DOTALL | re.IGNORECASE ) with open('README.md', 'r') as f: txt = f.read() cleaned = pattern.sub("", txt) print(cleaned) ``` ================================================ FILE: pyproject.toml ================================================ [build-system] requires = ["setuptools>=64", "wheel", "setuptools_scm>=8", "packaging==26.0"] build-backend = "setuptools.build_meta" [project] name = "axolotl" dynamic = ["version", "dependencies", "optional-dependencies"] description = "LLM Trainer" readme = "README.md" requires-python = ">=3.10" # license = "Apache-2.0" [project.scripts] axolotl = "axolotl.cli.main:main" [project.urls] Homepage = "https://axolotl.ai/" Documentation = "https://docs.axolotl.ai/" Repository = "https://github.com/axolotl-ai-cloud/axolotl.git" [tool.setuptools_scm] [tool.setuptools] py-modules = ["setuptools_axolotl_dynamic_dependencies"] include-package-data = true [tool.setuptools.dynamic] version = { file = "VERSION" } [tool.setuptools.cmdclass] build_py = "setuptools_axolotl_dynamic_dependencies.BuildPyCommand" [tool.ruff] line-length = 88 target-version = "py310" [tool.ruff.lint] select = ["E", "F", "W", "C90", "B", "I"] ignore = [ "E203", # Whitespace before ':' "E501", # Line too long "C901", # Too complex "B019", # Use of functools.cache on methods "E722", # Bare except "F821", # Undefined name (for dynamic exec) ] [tool.ruff.lint.isort] known-third-party = ["wandb", "comet_ml"] known-local-folder = ["src", "tests"] # Black-compatible isort settings force-single-line = false combine-as-imports = true split-on-trailing-comma = true [tool.ruff.format] # Use black's formatting style exactly quote-style = "double" indent-style = "space" skip-magic-trailing-comma = false line-ending = "auto" docstring-code-format = false [tool.uv.extra-build-dependencies] axolotl = ["huggingface_hub"] ================================================ FILE: requirements-dev.txt ================================================ black mypy pre-commit 
# NOTE: the notes below are `#` comments rather than a docstring on purpose;
# click would turn a function docstring into the command's --help text.
#
# Inspects a chat dataset and prints a `datasets:` YAML entry for it:
#   * the messages column is the first of conversation / conversations /
#     messages present in the dataset features,
#   * the role key is the first of from / role present in a message,
#   * the content key is the first of content / text / value present.
# Raises ValueError when any of the three cannot be found.
@click.command()
@click.argument("dataset", type=str)
@click.option("--split", type=str, default="train")
def parse_dataset(dataset=None, split="train"):
    loaded = load_dataset(dataset, split=split)
    schema = loaded.features
    column_names = schema.keys()

    # First candidate column that exists wins; None when nothing matches.
    messages_column = next(
        (
            name
            for name in ("conversation", "conversations", "messages")
            if name in column_names
        ),
        None,
    )
    if not messages_column:
        raise ValueError(
            f"No conversation field found in dataset: {', '.join(column_names)}"
        )

    # Keys of a single message, read from the feature spec of the column.
    turn_fields = schema[messages_column][0].keys()

    role_key = next((name for name in ("from", "role") if name in turn_fields), None)
    if not role_key:
        raise ValueError(
            f"No role field found in messages: {', '.join(turn_fields)}"
        )

    content_key = next(
        (name for name in ("content", "text", "value") if name in turn_fields),
        None,
    )
    if not content_key:
        raise ValueError(
            f"No content field found in messages: {', '.join(turn_fields)}"
        )

    entry = {
        "path": dataset,
        "split": split,
        "type": "chat_template",
        "chat_template": "<<>>",
        "field_messages": messages_column,
        "message_property_mappings": {"role": role_key, "content": content_key},
    }
    print(yaml.dump({"datasets": [entry]}))
# Append the public key(s) passed in $1 to ~/.ssh/authorized_keys, one per line.
#
# $1 may carry several keys flattened into a single string (separated by
# spaces or newlines). The string is split into words and a new entry is
# started at every word that looks like a key type; all following words
# (the base64 blob and the optional comment) are joined onto that entry.
add_keys_to_authorized() {
    local key_value=$1
    local key=""
    local word

    # Create the ~/.ssh directory and the authorized_keys file if missing
    mkdir -p ~/.ssh
    chmod 700 ~/.ssh
    touch ~/.ssh/authorized_keys

    # Intentionally unquoted: word splitting is what separates the keys.
    for word in $key_value; do
        # Besides ssh-* (rsa, ed25519, dss), also treat ECDSA and FIDO/U2F
        # key types as the start of a key; otherwise two such keys in a row
        # would be glued together into one invalid authorized_keys line.
        if [[ $word == ssh-* || $word == ecdsa-sha2-* || $word == sk-ssh-* || $word == sk-ecdsa-* ]]; then
            # Flush the key that was being built before starting a new one
            if [[ -n $key ]]; then
                printf '%s\n' "$key" >> ~/.ssh/authorized_keys
            fi
            key=$word
        else
            # ${key:+...} avoids a leading space when no key type was seen yet
            key="${key:+$key }$word"
        fi
    done

    # Add the last key; quoted so it is written verbatim (no globbing)
    if [[ -n $key ]]; then
        printf '%s\n' "$key" >> ~/.ssh/authorized_keys
    fi

    # Recursive 700 first, then 600 on the file: in the opposite order the
    # recursive chmod would put authorized_keys back to 700.
    chmod -R 700 ~/.ssh
    chmod 600 ~/.ssh/authorized_keys
}
# Append the public key(s) passed in $1 to ~/.ssh/authorized_keys, one per line.
#
# $1 may carry several keys flattened into a single string (separated by
# spaces or newlines). The string is split into words and a new entry is
# started at every word that looks like a key type; all following words
# (the base64 blob and the optional comment) are joined onto that entry.
add_keys_to_authorized() {
    local key_value=$1
    local key=""
    local word

    # Create the ~/.ssh directory and the authorized_keys file if missing
    mkdir -p ~/.ssh
    chmod 700 ~/.ssh
    touch ~/.ssh/authorized_keys

    # Intentionally unquoted: word splitting is what separates the keys.
    for word in $key_value; do
        # Besides ssh-* (rsa, ed25519, dss), also treat ECDSA and FIDO/U2F
        # key types as the start of a key; otherwise two such keys in a row
        # would be glued together into one invalid authorized_keys line.
        if [[ $word == ssh-* || $word == ecdsa-sha2-* || $word == sk-ssh-* || $word == sk-ecdsa-* ]]; then
            # Flush the key that was being built before starting a new one
            if [[ -n $key ]]; then
                printf '%s\n' "$key" >> ~/.ssh/authorized_keys
            fi
            key=$word
        else
            # ${key:+...} avoids a leading space when no key type was seen yet
            key="${key:+$key }$word"
        fi
    done

    # Add the last key; quoted so it is written verbatim (no globbing)
    if [[ -n $key ]]; then
        printf '%s\n' "$key" >> ~/.ssh/authorized_keys
    fi

    # Recursive 700 first, then 600 on the file: in the opposite order the
    # recursive chmod would put authorized_keys back to 700.
    chmod -R 700 ~/.ssh
    chmod 600 ~/.ssh/authorized_keys
}
-z "$SSH_PORT" ]; then sed -i "s/#Port 22/Port $SSH_PORT/" /etc/ssh/sshd_config fi if [[ $PUBLIC_KEY ]]; then # runpod, prime intellect add_keys_to_authorized "$PUBLIC_KEY" # Start the SSH service in the background service ssh start elif [[ $SSH_KEY ]]; then # latitude.sh add_keys_to_authorized "$SSH_KEY" # Start the SSH service in the background service ssh start else echo "No PUBLIC_KEY or SSH_KEY environment variable provided, not starting openSSH daemon" fi # Check if JUPYTER_PASSWORD is set and not empty if [ -n "$JUPYTER_PASSWORD" ]; then # Set JUPYTER_TOKEN to the value of JUPYTER_PASSWORD export JUPYTER_TOKEN="$JUPYTER_PASSWORD" fi if [ "$JUPYTER_DISABLE" != "1" ]; then # Run Jupyter Lab in the background jupyter lab --port=8888 --ip=* --allow-root --ServerApp.allow_origin=* & fi if [ ! -d "/workspace/data/axolotl-artifacts" ]; then mkdir -p /workspace/data/axolotl-artifacts fi if [ ! -L "/workspace/axolotl/outputs" ]; then ln -sf /workspace/data/axolotl-artifacts /workspace/axolotl/outputs fi # start the runpod slurm init SLURM_INIT="${SLURM_INIT:-/slurm-init.sh}" if [[ -f "$SLURM_INIT" ]]; then echo "[entrypoint] running $SLURM_INIT..." 
"""Script to output the correct installation command for cut-cross-entropy.

The command is printed to stdout so it can be piped into a shell. Pass
``--uv`` to have the printed command go through ``uv pip`` instead of ``pip``.
Nothing but an empty line is printed when the installed torch is older than
2.4.0.
"""

import importlib.util
import sys

try:
    import torch
except ImportError as exc:
    raise ImportError("Install torch via `pip install torch`") from exc
from packaging.version import Version as V

USE_UV = "--uv" in sys.argv[1:]

v = V(torch.__version__)

# no cut-cross-entropy support for torch < 2.4.0
if v < V("2.4.0"):
    print("")
    sys.exit(0)

UV_PREFIX = "uv " if USE_UV else ""

cce_spec = importlib.util.find_spec("cut_cross_entropy")

# A cut_cross_entropy install without the `transformers` subpackage is removed
# first so that the install below replaces it.
UNINSTALL_PREFIX = ""
if cce_spec:
    if not importlib.util.find_spec("cut_cross_entropy.transformers"):
        if USE_UV:
            # Uninstall through the same tool that installs; a uv-managed
            # environment may have no `pip` on PATH, and a failing uninstall
            # would short-circuit the `&&` chain. No `-y` is passed to uv.
            UNINSTALL_PREFIX = "uv pip uninstall cut-cross-entropy && "
        else:
            UNINSTALL_PREFIX = "pip uninstall -y cut-cross-entropy && "

print(
    UNINSTALL_PREFIX
    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@63b15e6"'
)
``` cd /workspace rm -rf /workspace/axolotl git clone https://github.com/axolotl-ai-cloud/axolotl.git cd axolotl pip install --no-build-isolation --no-deps -e . ``` ================================================ FILE: scripts/unsloth_install.py ================================================ # noqa import sys try: import torch except ImportError as error: raise ImportError("Install torch via `pip install torch`") from error from packaging.version import Version as V use_uv = "--uv" in sys.argv[1:] v = V(torch.__version__) cuda = str(torch.version.cuda) try: is_ampere = torch.cuda.get_device_capability()[0] >= 8 except RuntimeError: is_ampere = False if cuda != "12.1" and cuda != "11.8" and cuda != "12.4": raise RuntimeError(f"CUDA = {cuda} not supported!") if v <= V("2.1.0"): raise RuntimeError(f"Torch = {v} too old!") elif v <= V("2.1.1"): x = "cu{}{}-torch211" elif v <= V("2.1.2"): x = "cu{}{}-torch212" elif v < V("2.3.0"): x = "cu{}{}-torch220" elif v < V("2.4.0"): x = "cu{}{}-torch230" elif v < V("2.5.0"): x = "cu{}{}-torch240" elif v < V("2.6.0"): x = "cu{}{}-torch250" else: raise RuntimeError(f"Torch = {v} too new!") x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "") uv_prefix = "uv " if use_uv else "" print( f'{uv_prefix}pip install unsloth-zoo==2024.12.1 && {uv_prefix}pip install --no-deps "unsloth[{x}]==2024.12.4"' ) ================================================ FILE: setup.py ================================================ """setup.py for axolotl""" import os import platform import re from importlib.metadata import PackageNotFoundError, version from pathlib import Path from setuptools import find_packages, setup def parse_requirements(extras_require_map): _install_requires = [] _dependency_links = [] with open("./requirements.txt", encoding="utf-8") as requirements_file: lines = [r.strip() for r in requirements_file.readlines()] for line in lines: is_extras = "deepspeed" in line or "mamba-ssm" in line if 
line.startswith("--extra-index-url"): # Handle custom index URLs _, url = line.split() _dependency_links.append(url) elif not is_extras and line and line[0] != "#": # Handle standard packages _install_requires.append(line) try: xformers_version = [req for req in _install_requires if "xformers" in req][0] install_xformers = platform.machine() != "aarch64" if platform.machine() == "aarch64": # skip on ARM64 skip_packages = [ "torchao", "fla-core", "flash-linear-attention", ] _install_requires = [ req for req in _install_requires if re.split(r"[>=<]", req)[0].strip() not in skip_packages ] if "Darwin" in platform.system(): # skip packages not compatible with OSX skip_packages = [ "bitsandbytes", "triton", "mamba-ssm", "xformers", "liger-kernel", ] _install_requires = [ req for req in _install_requires if re.split(r"[>=<]", req)[0].strip() not in skip_packages ] print( _install_requires, [req in skip_packages for req in _install_requires] ) else: # detect the version of torch already installed # and set it so dependencies don't clobber the torch version try: torch_version = version("torch") except PackageNotFoundError: torch_version = "2.8.0" # default to torch 2.8.0 _install_requires.append(f"torch=={torch_version}") version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version) if version_match: major, minor, patch = version_match.groups() major, minor = int(major), int(minor) patch = ( int(patch) if patch is not None else 0 ) # Default patch to 0 if not present else: raise ValueError("Invalid version format") torch_parts = torch_version.split("+") if len(torch_parts) == 2: torch_cuda_version = torch_parts[1] _dependency_links.append( f"https://download.pytorch.org/whl/{torch_cuda_version}" ) if (major, minor) >= (2, 9): extras_require_map.pop("fbgemm-gpu") extras_require_map["fbgemm-gpu"] = [ "fbgemm-gpu==1.4.0", "fbgemm-gpu-genai==1.4.2", ] extras_require_map["vllm"] = ["vllm==0.11.1"] if not install_xformers: 
_install_requires.pop(_install_requires.index(xformers_version)) extras_require_map["vllm"] = ["vllm==0.13.0"] if patch == 0: extras_require_map["vllm"] = ["vllm==0.13.0"] else: extras_require_map["vllm"] = ["vllm==0.14.0"] elif (major, minor) >= (2, 8): extras_require_map.pop("fbgemm-gpu") extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.3.0"] extras_require_map["vllm"] = ["vllm==0.11.0"] if not install_xformers: _install_requires.pop(_install_requires.index(xformers_version)) elif (major, minor) >= (2, 7): _install_requires.pop(_install_requires.index(xformers_version)) if patch == 0: if install_xformers: _install_requires.append("xformers==0.0.30") # vllm 0.9.x is incompatible with latest transformers extras_require_map.pop("vllm") else: if install_xformers: _install_requires.append("xformers==0.0.31") extras_require_map["vllm"] = ["vllm==0.10.1"] elif (major, minor) >= (2, 6): _install_requires.pop(_install_requires.index(xformers_version)) if install_xformers: _install_requires.append("xformers==0.0.29.post3") # since we only support 2.6.0+cu126 _dependency_links.append("https://download.pytorch.org/whl/cu126") extras_require_map.pop("vllm") elif (major, minor) >= (2, 5): _install_requires.pop(_install_requires.index(xformers_version)) if install_xformers: if patch == 0: _install_requires.append("xformers==0.0.28.post2") else: _install_requires.append("xformers>=0.0.28.post3") extras_require_map.pop("vllm") elif (major, minor) >= (2, 4): extras_require_map.pop("vllm") if install_xformers: if patch == 0: _install_requires.pop(_install_requires.index(xformers_version)) _install_requires.append("xformers>=0.0.27") else: _install_requires.pop(_install_requires.index(xformers_version)) _install_requires.append("xformers==0.0.28.post1") else: raise ValueError("axolotl requires torch>=2.4") except PackageNotFoundError: pass return _install_requires, _dependency_links, extras_require_map def get_package_version(): with open( 
# Optional dependency groups, installable as `axolotl[<name>]`.
# This mapping is handed to `parse_requirements`, which may replace or remove
# the "vllm" and "fbgemm-gpu" entries depending on the detected torch version.
extras_require = {
    "flash-attn": ["flash-attn==2.8.3"],
    "ring-flash-attn": [
        "flash-attn==2.8.3",
        "ring-flash-attn>=0.1.7",
    ],
    "deepspeed": [
        "deepspeed==0.18.2",
        "deepspeed-kernels",
    ],
    "mamba-ssm": [
        "mamba-ssm==1.2.0.post1",
        "causal_conv1d",
    ],
    "auto-gptq": [
        "auto-gptq==0.5.1",
    ],
    "mlflow": [
        "mlflow",
    ],
    "galore": [
        "galore_torch",
    ],
    "apollo": [
        "apollo-torch",
    ],
    "optimizers": [
        "galore_torch",
        "apollo-torch",
        "lomo-optim==0.1.1",
        "torch-optimi==0.2.1",
        "came_pytorch==0.1.3",
    ],
    "ray": [
        "ray[train]>=2.52.1",
    ],
    # Default pin; overridden per torch version by `parse_requirements`.
    "vllm": [
        "vllm==0.10.0",
    ],
    "llmcompressor": [
        "llmcompressor==0.5.1",
    ],
    # Default pin; overridden per torch version by `parse_requirements`.
    "fbgemm-gpu": ["fbgemm-gpu-genai==1.3.0"],
    "opentelemetry": [
        "opentelemetry-api",
        "opentelemetry-sdk",
        "opentelemetry-exporter-prometheus",
        "prometheus-client",
    ],
}

# Resolve the platform/torch-specific requirement lists before calling setup().
install_requires, dependency_links, extras_require_build = parse_requirements(
    extras_require
)

# The package sources live under `src/`; the version string is read from the
# VERSION file by `get_package_version`.
setup(
    version=get_package_version(),
    package_dir={"": "src"},
    packages=find_packages("src"),
    install_requires=install_requires,
    dependency_links=dependency_links,
    entry_points={
        "console_scripts": [
            "axolotl=axolotl.cli.main:main",
        ],
    },
    extras_require=extras_require_build,
)
@dataclass
class PreprocessCliArgs:
    """Options accepted by the `axolotl preprocess` CLI command."""

    debug: bool = False
    debug_text_only: bool = False
    debug_num_examples: int = 1
    prompter: Optional[str] = None
    download: Optional[bool] = True
    # Deprecated flag; its help text carries the migration instructions.
    iterable: Optional[bool] = field(
        default=False,
        metadata={
            "help": (
                "Deprecated in v0.13.0, will be removed in v0.14.0. For streaming "
                "datasets, use 'axolotl train' and set 'streaming: true' in your YAML "
                "config, or pass --streaming instead in the CLI."
            )
        },
    )
}, ) host: Optional[str] = field( default=None, # nosec B104 metadata={"help": "Host address to run the server on."}, ) port: Optional[int] = field( default=None, metadata={"help": "Port to run the server on."}, ) gpu_memory_utilization: Optional[float] = field( default=None, metadata={ "help": "Ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV " "cache on the device dedicated to generation powered by vLLM. Higher values will increase the KV cache " "size and thus improve the model's throughput. However, if the value is too high, it may cause " "out-of-memory (OOM) errors during initialization." }, ) dtype: Optional[str] = field( default=None, metadata={ "help": "Data type to use for vLLM generation. If set to 'auto', the data type will be automatically " "determined based on the model configuration. Find the supported values in the vLLM documentation." }, ) max_model_len: Optional[int] = field( default=None, metadata={ "help": "If set, the `max_model_len` to use for vLLM. This can be useful when running with reduced " "`vllm_gpu_memory_utilization`, leading to a reduced KV cache size. If not set, vLLM will use the model " "context size, which might be much larger than the KV cache, leading to inefficiencies." }, ) enable_prefix_caching: Optional[bool] = field( default=None, metadata={ "help": "Whether to enable prefix caching in vLLM. If set to `True`, ensure that the model and the " "hardware support this feature." }, ) serve_module: Optional[str] = field( default=None, metadata={ "help": "Module to serve. If not set, the default module will be used." 
@dataclass
class QuantizeCliArgs:
    """Options accepted by the `axolotl quantize` CLI command.

    Every option defaults to ``None``.
    """

    base_model: Optional[str] = None
    weight_dtype: Optional[str] = None
    activation_dtype: Optional[str] = None
    quantize_embedding: Optional[bool] = None
    group_size: Optional[int] = None
    output_dir: Optional[str] = None
    hub_model_id: Optional[str] = None
def check_accelerate_default_config() -> None:
    """Log a warning when a default accelerate config file is present.

    The warning is emitted only if the path given by
    `config_args.default_yaml_config_file` exists; nothing is logged when the
    file is absent.
    """
    if Path(config_args.default_yaml_config_file).exists():
        LOG.warning(
            f"accelerate config file found at {config_args.default_yaml_config_file}. This can lead to unexpected errors"
        )
def do_cli_train(
    cloud_config: Path | str,
    config: Path | str,
    launcher: Literal["accelerate", "torchrun", "python"] = "accelerate",
    launcher_args: list[str] | None = None,
    cwd=None,
    **kwargs,
) -> None:
    """Launch a training run on the cloud provider named in the cloud config.

    Args:
        cloud_config: Path to the cloud configuration YAML.
        config: Path to the axolotl training config; its text is forwarded to
            the provider.
        launcher: Launcher to use for the training command.
        launcher_args: Extra arguments forwarded to the launcher.
        cwd: Local working directory. It is mounted at `/workspace/mounts`
            unless it contains a `src/axolotl` directory.
        **kwargs: Passed through to the provider's `train` method.

    Raises:
        ValueError: If the configured provider is neither "modal" nor "baseten".
    """
    cloud_cfg: DictDefault = load_cloud_cfg(cloud_config)
    # Modal is the provider used when none is configured.
    provider = cloud_cfg.provider or "modal"

    if provider not in ("modal", "baseten"):
        raise ValueError(f"Unsupported cloud provider: {provider}")
    cloud: Cloud
    if provider == "modal":
        cloud = ModalCloud(cloud_cfg)
    else:
        cloud = BasetenCloud(cloud_cfg.to_dict())

    with open(config, "r", encoding="utf-8") as config_file:
        config_yaml = config_file.read()

    mount_cwd = bool(cwd) and not Path(cwd).joinpath("src", "axolotl").exists()
    local_dirs = {"/workspace/mounts": cwd} if mount_cwd else {}

    cloud.train(
        config_yaml,
        launcher=launcher,
        launcher_args=launcher_args,
        local_dirs=local_dirs,
        **kwargs,
    )
class BasetenCloud(Cloud):
    """Baseten Cloud Axolotl CLI"""

    def __init__(self, config: dict):
        """Store the cloud configuration dictionary used when launching jobs."""
        self.config = config

    def preprocess(self, config_yaml: str, *args, **kwargs) -> None:
        """Not supported on Baseten.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "Separate preprocess function for Baseten is not "
            "implemented and will happen during the train step."
        )

    def train(
        self,
        config_yaml: str,
        launcher: Literal["accelerate", "torchrun", "python"] = "accelerate",
        launcher_args: list[str] | None = None,
        local_dirs: dict[str, str] | None = None,  # pylint: disable=unused-argument
        **kwargs,
    ):
        """Stage a Truss training project in a temp directory and push it.

        The staged directory contains `cloud.yaml` (the cloud config plus the
        launcher settings), `train.yaml` (the axolotl config) and the bundled
        `run.sh` / `train_sft.py` templates. `truss train push train_sft.py`
        is then run from that directory.

        Args:
            config_yaml: Text of the axolotl training config.
            launcher: Launcher recorded in `cloud.yaml`.
            launcher_args: Launcher arguments recorded in `cloud.yaml`.
            local_dirs: Unused.
            **kwargs: Unused.
        """
        template_dir = dirname(__file__) + "/template"
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Shallow copy so the launcher settings do not leak into self.config.
            config = self.config.copy()
            config["launcher"] = launcher
            config["launcher_args"] = launcher_args

            with open(tmp_dir + "/cloud.yaml", "w", encoding="utf-8") as cloud_fout:
                yaml.dump(config, cloud_fout)
            with open(tmp_dir + "/train.yaml", "w", encoding="utf-8") as config_fout:
                config_fout.write(config_yaml)

            for template_name in ("run.sh", "train_sft.py"):
                shutil.copyfile(
                    template_dir + "/" + template_name,
                    tmp_dir + "/" + template_name,
                )

            # check=False: a non-zero exit status from truss is not raised here.
            subprocess.run(  # nosec B603 B607
                ["truss", "train", "push", "train_sft.py"], cwd=tmp_dir, check=False
            )
cloud_config.get("launcher_args", []) script_name = "run.sh" launcher_args_str = "" if launcher_args: launcher_args_str = "-- " + " ".join(launcher_args) # 1. Define a base image for your training job BASE_IMAGE = "axolotlai/axolotl:main-py3.11-cu128-2.9.1" # 2. Define the Runtime Environment for the Training Job # This includes start commands and environment variables.a # Secrets from the baseten workspace like API keys are referenced using # `SecretReference`. env_vars = { "AXOLOTL_LAUNCHER": launcher, "AXOLOTL_LAUNCHER_ARGS": launcher_args_str, } for secret_name in secrets: env_vars[secret_name] = definitions.SecretReference(name=secret_name) training_runtime = definitions.Runtime( start_commands=[ # Example: list of commands to run your training script f"/bin/sh -c 'chmod +x ./{script_name} && ./{script_name}'" ], environment_variables=env_vars, ) # 3. Define the Compute Resources for the Training Job training_compute = definitions.Compute( node_count=node_count, accelerator=truss_config.AcceleratorSpec( accelerator=truss_config.Accelerator.H100, count=gpu_count, ), ) # 4. Define the Training Job # This brings together the image, compute, and runtime configurations. my_training_job = definitions.TrainingJob( image=definitions.Image(base_image=BASE_IMAGE), compute=training_compute, runtime=training_runtime, ) # This config will be pushed using the Truss CLI. # The association of the job to the project happens at the time of push. 
def run_cmd(cmd: str, run_folder: str, volumes=None):
    """Run a command inside a folder, with Modal Volume reloading before and commit on success.

    Args:
        cmd: Command line to execute; it is split on whitespace (no shell).
        run_folder: Working directory for the subprocess.
        volumes: Optional mapping whose values expose `reload()` and
            `commit()`. Each is reloaded before the command runs and committed
            only if the command exits with status 0.

    Raises:
        SystemExit: With the subprocess's exit code when it is non-zero.
    """
    volumes = volumes or {}

    # Ensure volumes contain latest files.
    for vol in volumes.values():
        vol.reload()

    # Work on a plain dict copy: a (deep)copy of os.environ is still an
    # os._Environ, whose item assignment calls putenv() and would therefore
    # change this process's real environment as well.
    new_env = dict(os.environ)

    # modal workaround so it doesn't use the automounted axolotl
    if "PYTHONPATH" in new_env:
        paths = ["/workspace/mounts"]
        for sub_python_path_str in new_env["PYTHONPATH"].split(":"):
            sub_python_path = Path(sub_python_path_str)
            if not sub_python_path.joinpath("src", "axolotl").exists():
                # we don't want to use the automounted axolotl or unexpected behavior happens
                paths.append(str(sub_python_path))
        new_env["PYTHONPATH"] = ":".join(paths)

    # Propagate errors from subprocess.
    if exit_code := subprocess.call(  # nosec B603
        cmd.split(), cwd=run_folder, env=new_env
    ):
        raise SystemExit(exit_code)

    # Commit writes to volume.
    for vol in volumes.values():
        vol.commit()
""" def __init__(self, config, app=None): self.config = config if not app: app = modal.App() self.app = app self.volumes = {} if config.volumes: for volume_config in config.volumes: _, mount, vol = self.create_volume(volume_config) self.volumes[mount] = (vol, volume_config) def get_env(self): res = { "HF_DATASETS_CACHE": "/workspace/data/huggingface-cache/datasets", "HF_HUB_CACHE": "/workspace/data/huggingface-cache/hub", } for key in self.config.get("env", []): if isinstance(key, str): if val := os.environ.get(key, ""): res[key] = val elif isinstance(key, dict): (key_, val) = list(key.items())[0] res[key_] = val return res def get_image(self): docker_tag = "main-py3.11-cu128-2.9.1" if self.config.docker_tag: docker_tag = self.config.docker_tag docker_image = f"axolotlai/axolotl:{docker_tag}" # grab the sha256 hash from docker hub for this image+tag # this ensures that we always get the latest image for this tag, even if it's already cached try: manifest = subprocess.check_output( # nosec ["docker", "manifest", "inspect", docker_image], ).decode("utf-8") sha256_hash = json.loads(manifest)["manifests"][0]["digest"] except subprocess.CalledProcessError: sha256_hash = None # create the image if sha256_hash: image = modal.Image.from_registry(f"axolotlai/axolotl@{sha256_hash}") else: image = modal.Image.from_registry(docker_image) dockerfile_commands = [] if self.config.dockerfile_commands: dockerfile_commands.extend(self.config.dockerfile_commands) # branch if self.config.branch: dockerfile_commands.extend( [ # Random id for cache busting of branch commits f"RUN echo '{str(randint(0, 1000000))}'", # nosec B311 f"RUN cd /workspace/axolotl && git fetch && git checkout {self.config.branch} && git pull", ] ) if dockerfile_commands: image = image.dockerfile_commands(dockerfile_commands) if env := self.get_env(): image = image.env(env) return image def get_secrets(self): res = [] if self.config.secrets: for key in self.config.get("secrets", []): if isinstance(key, str): if 
val := os.environ.get(key, ""): res.append(modal.Secret.from_dict({key: val})) elif isinstance(key, dict): (key_, val) = list(key.items())[0] res.append(modal.Secret.from_dict({key_: val})) return res def create_volume(self, volume_config): name = volume_config.name mount = volume_config.mount return name, mount, modal.Volume.from_name(name, create_if_missing=True) def get_ephemeral_disk_size(self): return 1000 * 525 # 1 TiB def get_preprocess_timeout(self): if self.config.timeout_preprocess: return int(self.config.timeout_preprocess) return 60 * 60 * 3 # 3 hours def get_preprocess_memory(self): memory = 128 # default to 128GiB if self.config.memory: memory = int(self.config.memory) if self.config.memory_preprocess: memory = int(self.config.memory_preprocess) return 1024 * memory def get_preprocess_env(self): return self.app.function( image=self.get_image(), volumes={k: v[0] for k, v in self.volumes.items()}, cpu=8.0, ephemeral_disk=self.get_ephemeral_disk_size(), memory=self.get_preprocess_memory(), timeout=self.get_preprocess_timeout(), secrets=self.get_secrets(), ) def preprocess(self, config_yaml: str, *args, **kwargs): modal_fn = self.get_preprocess_env()(_preprocess) with modal.enable_output(): with self.app.run(detach=True): modal_fn.remote( config_yaml, *args, volumes={k: v[0] for k, v in self.volumes.items()}, **kwargs, ) def get_train_timeout(self): if self.config.timeout: return int(self.config.timeout) return 60 * 60 * 24 # 24 hours def get_train_gpu(self): count = self.config.gpu_count or 1 family = self.config.gpu.lower() or "l40s" if family == "l40s": return modal.gpu.L40S(count=count) if family in ["a100", "a100-40gb"]: return modal.gpu.A100(count=count, size="40GB") if family == "a100-80gb": return modal.gpu.A100(count=count, size="80GB") if family in ["a10", "a10g"]: return modal.gpu.A10G(count=count) if family == "h100": return f"H100:{count}" if family == "t4": return modal.gpu.T4(count=count) if family == "l4": return modal.gpu.L4(count=count) 
raise ValueError(f"Unsupported GPU family: {family}") def get_train_memory(self): memory = 128 # default to 128GiB if self.config.memory: memory = int(self.config.memory) return 1024 * memory def get_train_env(self, local_dirs=None): image = self.get_image() for mount, local_dir in (local_dirs or {}).items(): image = image.add_local_dir(local_dir, mount) return self.app.function( image=image, volumes={k: v[0] for k, v in self.volumes.items()}, cpu=16.0, gpu=self.get_train_gpu(), memory=self.get_train_memory(), timeout=self.get_train_timeout(), secrets=self.get_secrets(), ) def train( self, config_yaml: str, launcher: Literal["accelerate", "torchrun", "python"] = "accelerate", launcher_args: list[str] | None = None, local_dirs: dict[str, str] | None = None, **kwargs, ): modal_fn = self.get_train_env(local_dirs)(_train) with modal.enable_output(): with self.app.run(detach=True): modal_fn.remote( config_yaml, launcher=launcher, launcher_args=launcher_args, volumes={k: v[0] for k, v in self.volumes.items()}, **kwargs, ) def lm_eval(self, config_yaml: str): modal_fn = self.get_train_env()(_lm_eval) with modal.enable_output(): with self.app.run(detach=True): if self.config.get("spawn", False): modal_fn_exec = modal_fn.spawn else: modal_fn_exec = modal_fn.remote modal_fn_exec( config_yaml, volumes={k: v[0] for k, v in self.volumes.items()}, ) def _preprocess(config_yaml: str, volumes=None): Path("/workspace/mounts").mkdir(parents=True, exist_ok=True) with open("/workspace/mounts/config.yaml", "w", encoding="utf-8") as f_out: f_out.write(config_yaml) run_folder = "/workspace/mounts" run_cmd( "axolotl preprocess /workspace/mounts/config.yaml --dataset-processes=8", run_folder, volumes, ) def _train( config_yaml: str, launcher: Literal["accelerate", "torchrun", "python"] = "accelerate", launcher_args: list[str] | None = None, volumes=None, **kwargs, ): Path("/workspace/mounts").mkdir(parents=True, exist_ok=True) with open("/workspace/mounts/config.yaml", "w", 
def _lm_eval(config_yaml: str, volumes=None):
    """Write the config under `/workspace/mounts` and run `axolotl lm-eval` on it.

    Args:
        config_yaml: Text of the axolotl config to evaluate.
        volumes: Forwarded unchanged to `run_cmd`.
    """
    mounts_dir = "/workspace/mounts"
    Path(mounts_dir).mkdir(parents=True, exist_ok=True)

    with open("/workspace/mounts/config.yaml", "w", encoding="utf-8") as config_file:
        config_file.write(config_yaml)

    # The command runs with the mounts directory as its working directory.
    run_cmd("axolotl lm-eval /workspace/mounts/config.yaml", mounts_dir, volumes)
import setup_wandb_env_vars LOG = get_logger(__name__) def _coerce_value(value: Any, existing: Optional[Any] = None) -> Any: """Coerce a string CLI value to its most likely Python type. If an existing value is present in the config, its type is used to guide casting. Otherwise, YAML-style inference is applied: booleans, ints, floats, and None literals are recognised automatically. Args: value: The raw value (typically a string from the CLI). existing: An optional existing config value whose type guides coercion. Returns: The value cast to the inferred or expected type. """ if not isinstance(value, str): return value # If the config already has a typed value, cast to match if existing is not None: if isinstance(existing, bool): return value.lower() in ("true", "1", "yes") if isinstance(existing, int): try: return int(value) except (ValueError, TypeError): return value if isinstance(existing, float): try: return float(value) except (ValueError, TypeError): return value # For other types (str, list, dict, etc.), return as-is return value # No existing value -- use YAML-style inference lower = value.lower() if lower in ("true", "yes"): return True if lower in ("false", "no"): return False if lower in ("null", "none", "~"): return None # Try int then float try: return int(value) except ValueError: pass try: return float(value) except ValueError: pass return value API_KEY_FIELDS = {"comet_api_key"} TELEMETRY_MANAGER = TelemetryManager.get_instance() def check_remote_config(config: Union[str, Path]) -> Union[str, Path]: """ First, determines if the passed config is a valid HTTPS URL. Then, attempts to query for it and parse its content, first as JSON, then as YAML (YAML is preferred). Finally, the parsed content is written to a local file and its path is returned. Args: config: HTTPS URL to a YAML or JSON file. Returns: Either the original `config` if it's not a valid HTTPS URL, or the path to the downloaded remote config. 
def choose_config(path: Path) -> str:
    """
    Picks an `axolotl` config YAML file from a directory. Only files directly inside
    `path` that match `*.yml` or `*.yaml` are considered. A single match is returned
    immediately; with several matches the user is prompted on stdin until a valid
    number is entered.

    Args:
        path: Directory in which config file(s) are stored.

    Returns:
        Path (as a string) to the sole YAML file, or to the YAML file the user
        selected.

    Raises:
        ValueError: If no YAML files are found in the given `path`.
    """
    candidates = [*path.glob("*.yml"), *path.glob("*.yaml")]

    if not candidates:
        raise ValueError(
            "No YAML config files found in the specified directory. Are you using a .yml extension?"
        )

    if len(candidates) == 1:
        only_candidate = candidates[0]
        LOG.info(f"Using default YAML file '{only_candidate}'")
        return str(only_candidate)

    LOG.info("Choose a YAML file:")
    for number, candidate in enumerate(candidates, start=1):
        LOG.info(f"{number}. {candidate}")

    # Keep asking until the answer is an integer within the listed range.
    while True:
        try:
            selection = int(input("Enter the number of your choice: "))
        except ValueError:
            LOG.info("Invalid input. Please enter a number.")
            continue

        if 1 <= selection <= len(candidates):
            return str(candidates[selection - 1])
        LOG.info("Invalid choice. Please choose a number from the list.")
""" if isinstance(config, (str, Path)): config = check_remote_config(config) if Path(config).is_dir(): config = choose_config(Path(config)) # Load the config from the yaml file with open(config, encoding="utf-8") as file: cfg: DictDefault = DictDefault(yaml.safe_load(file)) cfg.axolotl_config_path = config else: cfg = config with NamedTemporaryFile( mode="w", delete=False, suffix=".yml", prefix="axolotl_config_" ) as temp_file: temp_file.write(yaml.dump(config.to_dict())) temp_file.close() cfg.axolotl_config_path = temp_file.name TELEMETRY_MANAGER.send_event(event_type="config-loaded", properties=cfg) # If there are any options passed in the cli, if it is something that seems valid # from the yaml, then overwrite the value cfg_keys = cfg.keys() # Separate nested (dot-notation) kwargs from flat kwargs nested_kwargs: dict[str, dict[str, Any]] = {} flat_kwargs: dict[str, Any] = {} for key, value in kwargs.items(): if "__" in key: parent, child = key.split("__", 1) nested_kwargs.setdefault(parent, {})[child] = value else: flat_kwargs[key] = value # Apply flat kwargs for key, value in flat_kwargs.items(): # If not strict, allow writing to cfg even if it's not in the yml already if key in cfg_keys or not cfg.strict: cfg[key] = _coerce_value(value, cfg.get(key)) # Apply nested kwargs (e.g., trl__beta -> cfg.trl.beta) for parent, children in nested_kwargs.items(): if parent not in cfg_keys and cfg.strict: continue if cfg[parent] is None: cfg[parent] = {} if not isinstance(cfg[parent], dict): LOG.warning( "Overwriting non-dict value for '%s' with nested CLI overrides", parent ) cfg[parent] = {} for child_key, child_value in children.items(): existing_child = cfg[parent].get(child_key) cfg[parent][child_key] = _coerce_value(child_value, existing_child) try: device_props = torch.cuda.get_device_properties("cuda") gpu_version = "sm_" + str(device_props.major) + str(device_props.minor) except (RuntimeError, AssertionError): gpu_version = None prepare_plugins(cfg) cfg = 
def compute_supports_fp8() -> bool:
    """
    Reports whether the current CUDA device has a compute capability of at least 9.0,
    which this module uses as its FP8 capability flag.

    Returns:
        `True` if `torch.cuda.get_device_capability()` reports `(9, 0)` or higher,
        `False` if it reports a lower capability or CUDA cannot be queried at all.
    """
    try:
        compute_capability = torch.cuda.get_device_capability()
    except (RuntimeError, AssertionError):
        # RuntimeError: no usable CUDA device/driver. AssertionError: PyTorch build
        # without CUDA support ("Torch not compiled with CUDA enabled"). Both mean
        # "no FP8", matching how `load_cfg` guards its own CUDA property lookup.
        return False
    return compute_capability >= (9, 0)
def do_cli(model: Union[Path, str], output: Union[Path, str]) -> None:
    """
    Convert a patched HF format Llama4 model (with separated projections)
    back to the original HF format (with fused projections).

    The model is loaded in bfloat16 with the linearized-modeling patch applied, its
    state dict is passed through `iter_convert_patched_to_hf`, and the result is
    loaded into a freshly constructed `Llama4ForConditionalGeneration` (built after
    the patch has been reverted) which is then saved to `output`. The processor is
    saved to `output` as well.

    Args:
        model: Path to the patched HF model
        output: Path to save the converted model
    """
    print(f"Loading model from {model}")
    from axolotl.monkeypatch.models.llama4.modeling import (
        patch_llama4_linearized_modeling,
    )

    # Apply the patch before loading; the returned callable is invoked further down
    # to revert it before the target model is instantiated.
    unpatch_llama4 = patch_llama4_linearized_modeling()
    from transformers import Llama4ForConditionalGeneration

    model_ = Llama4ForConditionalGeneration.from_pretrained(model, dtype=torch.bfloat16)
    # NOTE(review): the processor is written to `output` before `os.makedirs(output)`
    # below runs — presumably `save_pretrained` creates the directory itself; confirm.
    processor = AutoProcessor.from_pretrained(model)
    processor.save_pretrained(output)

    device = model_.device.type
    if device == "cuda":
        print(
            f"peak memory allocated: {torch.cuda.max_memory_allocated() / 1024**2} MB"
        )
        print(f"peak memory reserved: {torch.cuda.max_memory_reserved() / 1024**2} MB")

    # Keep the full model config for re-instantiation; the text sub-config supplies
    # the dimensions used below.
    model_config = model_.config
    config = model_.config.get_text_config()

    # Get key dimensions from the config
    # (hidden_size and intermediate_size are only used in the summary print;
    # num_experts is also passed to the conversion generator).
    hidden_size = config.hidden_size
    intermediate_size = config.intermediate_size
    num_experts = config.num_local_experts

    print(
        f"Model dimensions: hidden_size={hidden_size}, intermediate_size={intermediate_size}, num_experts={num_experts}"
    )

    # Create output directory if it doesn't exist
    os.makedirs(output, exist_ok=True)

    # Get state dict, then drop the model reference so only the state dict is held
    # by this function.
    state_dict = model_.state_dict()
    del model_

    # Create a new state dict for the converted model
    converted_state_dict = {}

    # Collect every (key, tensor) pair yielded by the converter: untouched keys are
    # passed through, expert projections come back fused/stacked.
    for key, value in iter_convert_patched_to_hf(state_dict, num_experts):
        converted_state_dict[key] = value

    del state_dict
    if device == "cuda":
        torch.cuda.empty_cache()
        print("State dict converted.")
        print(
            f"peak memory allocated: {torch.cuda.max_memory_allocated() / 1024**2} MB"
        )
        print(f"peak memory reserved: {torch.cuda.max_memory_reserved() / 1024**2} MB")

    # Revert the linearized patch and build the target model under
    # `init_empty_weights()`; its parameters are supplied by `load_state_dict` below.
    with init_empty_weights():
        unpatch_llama4()
        model_ = Llama4ForConditionalGeneration(model_config)

    if device == "cuda":
        # NOTE(review): this message is printed before `load_state_dict` is actually
        # called on the next statement, so the reported memory figures precede the
        # load — confirm whether the ordering is intentional.
        print("State dict loaded into model.")
        print(
            f"peak memory allocated: {torch.cuda.max_memory_allocated() / 1024**2} MB"
        )
        print(f"peak memory reserved: {torch.cuda.max_memory_reserved() / 1024**2} MB")

    # strict=False tolerates key mismatches between the converted dict and the model;
    # assign=True assigns the given tensors instead of copying into existing ones.
    model_.load_state_dict(converted_state_dict, strict=False, assign=True)
    print(f"Saving converted model to {output}...")
    model_.save_pretrained(output)
    print(f"Model successfully converted and saved to {output}")
@send_errors
def do_inference(
    *,
    cfg: DictDefault,
    cli_args: InferenceCliArgs,
):
    """
    Runs inference on the command line in a loop. User input is accepted, a chat
    template is (optionally) applied, and the model specified in the `axolotl` config
    is used to generate completions according to a default generation config.

    The loop ends (and the function returns `None`) as soon as the user submits an
    empty instruction. When a `DiffusionPlugin` is registered, each prompt is handed
    to `diffusion_inference` instead of `model.generate`.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: Inference-specific CLI arguments.
    """
    model, tokenizer, _ = load_model_and_tokenizer(cfg=cfg, inference=True)
    prompter = cli_args.prompter

    # Prompt formatting precedence: explicit prompter class, then the config-level
    # chat template, then the chat template of the first dataset entry.
    prompter_module = None
    chat_template_str = None
    if prompter:
        prompter_module = getattr(
            importlib.import_module("axolotl.prompters"), prompter
        )
    elif cfg.chat_template:
        chat_template_str = get_chat_template_from_config(
            cfg, ds_cfg=None, tokenizer=tokenizer
        )
    elif cfg.datasets and cfg.datasets[0].type == "chat_template":
        chat_template_str = get_chat_template_from_config(
            cfg=cfg, ds_cfg=cfg.datasets[0], tokenizer=tokenizer
        )

    model = model.to(cfg.device, dtype=cfg.torch_dtype)

    # Detect diffusion mode
    plugin_manager = PluginManager.get_instance()
    is_diffusion = any(
        plugin.__class__.__name__ == "DiffusionPlugin"
        for plugin in plugin_manager.plugins.values()
    )

    if is_diffusion:
        print("=" * 80)
        print("Commands:")
        print(":complete N -> completion mode with N tokens (default 64)")
        print(":mask R -> random masking with ratio R (0.0–1.0)")

    while True:
        print("=" * 80)
        instruction = get_multi_line_input()
        if not instruction:
            return

        if prompter_module:
            prompt: str = next(
                prompter_module().build_prompt(instruction=instruction.strip("\n"))
            )
        else:
            prompt = instruction.strip()

        if chat_template_str:
            batch = tokenizer.apply_chat_template(
                [
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                return_tensors="pt",
                add_special_tokens=True,
                add_generation_prompt=True,
                chat_template=chat_template_str,
                tokenize=True,
                return_dict=True,
            )
        else:
            batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

        print("=" * 80)
        model.eval()
        with torch.no_grad():
            if is_diffusion:
                diffusion_inference(
                    model=model,
                    tokenizer=tokenizer,
                    cfg=cfg,
                    prompt=prompt,
                    chat_template_str=chat_template_str,
                )
                continue

            generation_config = GenerationConfig(
                repetition_penalty=1.1,
                max_new_tokens=1024,
                temperature=0.9,
                top_p=0.95,
                top_k=40,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=True,
                use_cache=True,
                return_dict_in_generate=True,
                output_attentions=False,
                output_hidden_states=False,
                output_scores=False,
            )
            streamer = TextStreamer(tokenizer)

            generate_kwargs = {
                "inputs": batch["input_ids"].to(cfg.device),
                "generation_config": generation_config,
                "streamer": streamer,
            }
            # Forward the tokenizer's attention mask, as the Gradio code path does,
            # so `generate` does not have to infer it. It is only added when the
            # tokenizer actually produced one.
            attention_mask = batch.get("attention_mask")
            if attention_mask is not None:
                generate_kwargs["attention_mask"] = attention_mask.to(cfg.device)

            generated = model.generate(**generate_kwargs)
        print("=" * 80)
        print(tokenizer.decode(generated["sequences"].cpu().tolist()[0]))
def do_cli(
    config: Union[Path, str] = Path("examples/"), gradio: bool = False, **kwargs
) -> None:
    """
    Loads the `axolotl` config for inference, parses the inference CLI args, and runs
    either `do_inference_gradio` or `do_inference`.

    Args:
        config: Path to `axolotl` config YAML file.
        gradio: If truthy, `do_inference_gradio` is run; otherwise `do_inference`.
        kwargs: Additional keyword arguments to override config file values.
    """
    parsed_cfg = load_cfg(config, inference=True, rl=None, **kwargs)
    # Sample packing is always switched off on the loaded config before inference.
    parsed_cfg.sample_packing = False

    cli_arg_parser = transformers.HfArgumentParser(InferenceCliArgs)
    parsed_cli_args, _remaining = cli_arg_parser.parse_args_into_dataclasses(
        return_remaining_strings=True
    )

    run_inference = do_inference_gradio if gradio else do_inference
    run_inference(cfg=parsed_cfg, cli_args=parsed_cli_args)
@cli.command(
    context_settings={"ignore_unknown_options": True, "allow_extra_args": True}
)
@click.argument("config", type=click.Path(exists=True, path_type=str))
@click.option(
    "--launcher",
    type=click.Choice(["accelerate", "torchrun", "python"]),
    default="accelerate",
    help="Launcher to use for multi-GPU training",
)
@click.option("--cloud", default=None, type=click.Path(exists=True, path_type=str))
@click.option(
    "--sweep",
    type=click.Path(exists=True, path_type=str),
    help="YAML config for sweeping hyperparameters",
)
@add_options_from_dataclass(TrainerCliArgs)
@add_options_from_config(AxolotlInputConfig)
@filter_none_kwargs
@click.pass_context
def train(
    ctx: click.Context,
    config: str,
    launcher: Literal["accelerate", "torchrun", "python"] = "accelerate",
    cloud: str | None = None,
    sweep: str | None = None,
    **kwargs,
):
    """
    Train or fine-tune a model.

    Args:
        ctx: Click context for extra args.
        config: Path to `axolotl` config YAML file.
        launcher: Launcher to use for multi-GPU training ("accelerate", "torchrun", or "python").
        cloud: Path to a cloud accelerator configuration file
        sweep: Path to YAML config for sweeping hyperparameters.
        kwargs: Additional keyword arguments which correspond to CLI args or `axolotl`
            config options.
    """
    # The docstring above is left as-is on purpose: click uses it as the help text.

    # Extra args that click did not consume (after --) are passed to the launcher.
    launcher_args = ctx.args or []

    # `use_ray` among the overrides means no launcher is passed on at all.
    if kwargs.get("use_ray"):
        _launcher = None
    else:
        _launcher = launcher

    # One run per generated config file (several when sweeping).
    for cfg_file, is_group in generate_config_files(config, sweep):
        try:
            launch_training(
                cfg_file,
                _launcher,
                cloud,
                kwargs,
                launcher_args,
                is_group is not True,  # use_exec
            )
        except subprocess.CalledProcessError as exc:
            LOG.error(f"Failed to train/fine-tune config '{cfg_file}': {exc}")
            # A failing run aborts a plain invocation; a sweep moves on to the next.
            if not sweep:
                raise
        finally:
            # Only delete temp files, not the original config
            if cfg_file != config:
                os.unlink(cfg_file)
@cli.command(
    context_settings={"ignore_unknown_options": True, "allow_extra_args": True}
)
@click.argument("config", type=click.Path(exists=True, path_type=str))
@click.option(
    "--launcher",
    type=click.Choice(["accelerate", "torchrun", "python"]),
    default="accelerate",
    help="Launcher to use for multi-GPU inference",
)
@click.option("--gradio", is_flag=True, help="Launch Gradio interface")
@add_options_from_dataclass(TrainerCliArgs)
@add_options_from_config(AxolotlInputConfig)
@filter_none_kwargs
@click.pass_context
def inference(ctx: click.Context, config: str, launcher: str, gradio: bool, **kwargs):
    """
    Run inference with a trained model.

    Args:
        ctx: Click context for extra args.
        config: Path to `axolotl` config YAML file.
        launcher: Launcher to use for multi-GPU inference ("accelerate", "torchrun", or "python").
        gradio: Whether to use Gradio browser interface or command line for inference.
        kwargs: Additional keyword arguments which correspond to CLI args or `axolotl`
            config options.
    """
    # The docstring above is left as-is on purpose: click uses it as the help text.

    # Launchers without an entry in the mapping (i.e. "python") run in-process.
    if launcher not in LAUNCHER_COMMAND_MAPPING:
        from axolotl.cli.inference import do_cli

        do_cli(config=config, gradio=gradio, **kwargs)
        return

    # Launcher prefix, then the extra args click left unparsed (after --), then the
    # module to execute.
    extra_launcher_args = ctx.args or []
    base_cmd = [
        *LAUNCHER_COMMAND_MAPPING[launcher],
        *extra_launcher_args,
        "-m",
        "axolotl.cli.inference",
    ]
    if config:
        base_cmd.append(config)
    if gradio:
        base_cmd.append("--gradio")

    subprocess.run(build_command(base_cmd, kwargs), check=True)  # nosec B603
@cli.command()
@click.argument("config", type=click.Path(exists=True, path_type=str))
@add_options_from_dataclass(TrainerCliArgs)
@add_options_from_config(AxolotlInputConfig)
@filter_none_kwargs
def merge_lora(config: str, **kwargs):
    """
    Merge trained LoRA adapters into a base model.

    Args:
        config: Path to `axolotl` config YAML file.
        kwargs: Additional keyword arguments which correspond to CLI args or `axolotl`
            config options.
    """
    # Unlike the launcher-based commands in this module, this one always runs
    # in-process. The implementation is imported at call time rather than at module
    # import — presumably to keep CLI start-up light; confirm before hoisting.
    from axolotl.cli.merge_lora import do_cli

    # Everything left in kwargs is forwarded as config overrides.
    do_cli(config=config, **kwargs)
""" fetch_from_github(f"{directory}/", dest) @cli.command() @click.argument("config", type=click.Path(exists=True, path_type=str)) @add_options_from_dataclass(VllmServeCliArgs) @filter_none_kwargs def vllm_serve(config: str, **cli_args: VllmServeCliArgs): from axolotl.cli.vllm_serve import do_vllm_serve do_vllm_serve(config, cli_args) @cli.command() @click.argument("config", type=click.Path(exists=True, path_type=str)) @add_options_from_dataclass(QuantizeCliArgs) @filter_none_kwargs def quantize(config: str, **cli_args: QuantizeCliArgs): from axolotl.cli.quantize import do_quantize do_quantize(config, cli_args) @cli.command() @click.argument("model", type=click.Path(exists=True, path_type=str)) @click.argument("output", type=click.Path(exists=False, path_type=str)) def delinearize_llama4(model: str, output: str): from axolotl.cli.delinearize_llama4 import do_cli as do_delinearize_llama4 do_delinearize_llama4(model, output) cli.add_command(lm_eval) def main(): cli() if __name__ == "__main__": main() ================================================ FILE: src/axolotl/cli/merge_lora.py ================================================ """CLI to merge a trained LoRA into a base model.""" from pathlib import Path from typing import Union import fire from axolotl.cli.config import load_cfg from axolotl.cli.utils import load_model_and_tokenizer from axolotl.telemetry.errors import send_errors from axolotl.utils.dict import DictDefault from axolotl.utils.logging import get_logger LOG = get_logger(__name__) @send_errors def do_merge_lora(*, cfg: DictDefault) -> None: """ Calls `transformers`' `merge_and_unload` on the model given in the `axolotl` config along with the LoRA adapters to combine them into a single base model. Args: cfg: Dictionary mapping `axolotl` config keys to values. 
""" model, tokenizer, processor = load_model_and_tokenizer(cfg=cfg) LOG.info("Running merge of LoRA with base model...") model = model.merge_and_unload(progressbar=True) try: model.to(dtype=cfg.torch_dtype) except ValueError as e: LOG.warning("Failed to convert model to dtype %s", cfg.torch_dtype) LOG.warning("Ignore this if the base_model is pre-quantized.") LOG.warning("Error raised: %s", e) model.generation_config.do_sample = True model.config.use_cache = True if cfg.local_rank == 0: LOG.info(f"Saving merged model to: {str(Path(cfg.output_dir) / 'merged')}...") model.save_pretrained( str(Path(cfg.output_dir) / "merged"), progressbar=True, ) tokenizer.save_pretrained( str(Path(cfg.output_dir) / "merged"), save_jinja_files=cfg.tokenizer_save_jinja_files, ) if processor: processor.save_pretrained(str(Path(cfg.output_dir) / "merged")) def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs) -> None: """ Parses `axolotl` config, CLI args, and calls `do_merge_lora`. Note that various config values will be overwritten to allow the LoRA merge logic to work as expected (`load_in_8bit=False`, `load_in4bit=False`, `flash_attention=False`, etc.). Args: config: Path to `axolotl` config YAML file. kwargs: Additional keyword arguments to override config file values. Raises: ValueError: If target directory for LoRA merged model does not exist. """ parsed_cfg = load_cfg( config, merge_lora=True, load_in_8bit=False, load_in_4bit=False, quantize_moe_experts=False, flash_attention=False, context_parallel_size=None, deepspeed=None, fsdp=None, fsdp_config=None, **kwargs, ) if not parsed_cfg.lora_model_dir and parsed_cfg.output_dir: parsed_cfg.lora_model_dir = parsed_cfg.output_dir if not Path(parsed_cfg.lora_model_dir).exists(): raise ValueError( f"Target directory for merge: `{parsed_cfg.lora_model_dir}` does not exist." 
class BFloat16CastPlanner(_EmptyStateDictLoadPlanner):
    """
    A custom planner to cast tensors to bfloat16 on the fly during loading.

    Subclasses `_EmptyStateDictLoadPlanner` and overrides only `commit_tensor`.
    """

    def commit_tensor(self, read_item, tensor):
        """
        Round-trip `tensor` through bfloat16 and write the result back in place.

        Args:
            read_item: Read item handed over by the load planner API; unused here.
            tensor: Tensor that has just been loaded; modified in place.
        """
        # NOTE(review): `copy_` writes into the existing tensor, so presumably the
        # tensor keeps its own dtype and only its values are rounded to bfloat16
        # precision -- confirm this is the intended effect.
        tensor.copy_(tensor.to(torch.bfloat16))
""" state_dict: Dict = {} save_path_ = Path(save_path) save_path_.mkdir(exist_ok=True) dist_cp_format_utils._load_state_dict( state_dict, storage_reader=dist_cp.FileSystemReader(checkpoint_dir), planner=BFloat16CastPlanner(), no_dist=True, ) # To handle if state is a dict like {model: {...}} if len(state_dict.keys()) == 1: state_dict = state_dict[list(state_dict)[0]] # Ensure all tensors are in bfloat16 for key, value in state_dict.items(): if isinstance(value, torch.Tensor) and value.dtype != torch.bfloat16: state_dict[key] = value.to(torch.bfloat16) filename_pattern = SAFE_WEIGHTS_NAME.replace(".safetensors", "{suffix}.safetensors") state_dict_split = split_torch_state_dict_into_shards( state_dict, filename_pattern=filename_pattern, max_shard_size=max_shard_size ) # Save index if sharded index = None if state_dict_split.is_sharded: index = { "metadata": state_dict_split.metadata, "weight_map": state_dict_split.tensor_to_filename, } # Save the model filename_to_tensors = state_dict_split.filename_to_tensors.items() for shard_file, tensors in filename_to_tensors: shard = {tensor: state_dict[tensor] for tensor in tensors} safe_save_file( shard, os.path.join(save_path_, shard_file), metadata={"format": "pt"} ) if index is not None: save_index_file = os.path.join(save_path_, SAFE_WEIGHTS_INDEX_NAME) # Save the index as well with open(save_index_file, "w", encoding="utf-8") as fout: content = json.dumps(index, indent=2, sort_keys=True) + "\n" fout.write(content) return save_path_ @send_errors def merge_fsdp_weights( checkpoint_dir: str, output_path: str, remove_checkpoint_dir: bool = False, ): """ Merge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if `SHARDED_STATE_DICT` was used for the model. Weights will be saved to `{output_path}/model.safetensors`. Note: this is a CPU-bound process. Args: checkpoint_dir (`str`): The directory containing the FSDP checkpoints (can be either the model or optimizer). 
output_path (`str`): The path to save the merged checkpoint. remove_checkpoint_dir (`bool`, *optional*, defaults to `False`): Whether to remove the checkpoint directory after merging. Raises: ValueError: If torch version < 2.3.0, or if `checkpoint_dir` does not exist. """ checkpoint_dir_ = Path(checkpoint_dir) if not is_torch_version(">=", "2.3.0"): raise ValueError("`merge_fsdp_weights` requires PyTorch >= 2.3.0`") # Verify that the checkpoint directory exists if not checkpoint_dir_.exists(): model_path_exists = (checkpoint_dir_ / "pytorch_model_fsdp_0").exists() optimizer_path_exists = (checkpoint_dir_ / "optimizer_0").exists() err = f"Tried to load from {checkpoint_dir_} but couldn't find a valid metadata file." if model_path_exists and optimizer_path_exists: err += ( " However, potential model and optimizer checkpoint directories exist." ) err += f"Please pass in either {checkpoint_dir_}/pytorch_model_fsdp_0 or {checkpoint_dir_}/optimizer_0" err += "instead." elif model_path_exists: err += " However, a potential model checkpoint directory exists." err += ( f"Please try passing in {checkpoint_dir_}/pytorch_model_fsdp_0 instead." ) elif optimizer_path_exists: err += " However, a potential optimizer checkpoint directory exists." err += f"Please try passing in {checkpoint_dir_}/optimizer_0 instead." raise ValueError(err) # To setup `save` to work state = PartialState() if state.is_main_process: LOG.info(f"Merging FSDP weights from {checkpoint_dir_}") save_path = _distributed_checkpoint_to_merged_weights( checkpoint_dir_, output_path ) LOG.info(f"Successfully merged FSDP weights and saved to {save_path}") if remove_checkpoint_dir: LOG.info(f"Removing old checkpoint directory {checkpoint_dir_}") shutil.rmtree(checkpoint_dir_) def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs): """ Parses `axolotl` config, CLI args, and calls `merge_fsdp_weights`. Args: config: Path to `axolotl` config YAML file. 
@send_errors
def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
    """
    Preprocesses the dataset specified in the `axolotl` config.

    Returns early (after logging an error) when `--iterable` is passed or when the
    config sets `skip_prepare_dataset` / `pretraining_dataset`. Falls back to
    `DEFAULT_DATASET_PREPARED_PATH` when `dataset_prepared_path` is unset.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: Preprocessing-specific CLI arguments.
    """
    check_accelerate_default_config()
    check_user_token()

    if cli_args.iterable:
        LOG.error(
            "The --iterable CLI argument for 'axolotl preprocess' is no longer "
            "supported. For training, set 'streaming: true' in your YAML config or "
            "pass '--streaming' in your 'axolotl train' command for on-the-fly "
            "preprocessing."
        )
        return

    for unneeded_key in ("skip_prepare_dataset", "pretraining_dataset"):
        if not cfg.get(unneeded_key):
            continue
        LOG.error(
            f"You have set `{unneeded_key}:`. `preprocess` is not needed. Run the 'axolotl "
            "train' CLI directly instead."
        )
        return

    if not cfg.dataset_prepared_path:
        LOG.warning(
            f"{Fore.RED}preprocess CLI called without dataset_prepared_path set, "
            f"using default path: {DEFAULT_DATASET_PREPARED_PATH}{Fore.RESET}"
        )
        cfg.dataset_prepared_path = DEFAULT_DATASET_PREPARED_PATH

    with disable_datasets_caching():
        plugins = PluginManager.get_instance()
        # Plugins get the first chance to load the datasets; the built-in
        # loaders run only when no plugin reports having done so.
        if not plugins.load_datasets(cfg, preprocess=True):
            loader = load_preference_datasets if cfg.rl else load_datasets
            loader(cfg=cfg, cli_args=cli_args)

    if cli_args.download:
        with warnings.catch_warnings():
            # there are a bunch of useless UserWarnings about
            # "copying from a non-meta parameter in the checkpoint to a meta parameter in the current model"
            warnings.simplefilter("ignore")
            with init_empty_weights(include_buffers=True):
                # Best-effort call: any failure here is deliberately ignored.
                try:
                    AutoModelForCausalLM.from_pretrained(
                        cfg.base_model, trust_remote_code=True
                    )
                except Exception:  # nosec B110
                    pass

    LOG.info(
        Fore.GREEN
        + f"Success! Preprocessed data path: `dataset_prepared_path: {cfg.dataset_prepared_path}`"
        + Fore.RESET
    )
def do_quantize(
    config: Union[Path, str],
    cli_args: dict,
):
    """
    Quantizes a model's weights and saves the result to `<output_dir>/quantized`.

    Quantization settings come from the `qat` or `quantization` section of the
    config; values present in `cli_args` take precedence over them. When a
    `hub_model_id` is available the quantized artifacts are also pushed to the hub.

    Args:
        config (Union[Path, str]): The path to the config file
        cli_args (dict): Additional command-line arguments

    Raises:
        ValueError: If both or neither of `qat` / `quantization` are configured.
    """
    cfg = load_cfg(config)

    if cfg.qat and cfg.quantization:
        raise ValueError(
            "QAT and quantization cannot be used together. Please specify only one of qat or quantization in your config file."
        )

    if cfg.qat:
        quantize_cfg = cfg.qat
    elif cfg.quantization:
        quantize_cfg = cfg.quantization
    else:
        raise ValueError(
            "No quantization configuration found. Please specify either qat or quantization in your config file."
        )

    model_path = cli_args.get("base_model") or cfg.output_dir

    if weight_dtype := cli_args.get("weight_dtype"):
        weight_dtype = TorchAOQuantDType.from_string(weight_dtype)
    else:
        weight_dtype = quantize_cfg.weight_dtype

    if activation_dtype := cli_args.get("activation_dtype"):
        activation_dtype = TorchAOQuantDType.from_string(activation_dtype)
    else:
        activation_dtype = quantize_cfg.activation_dtype

    group_size = cli_args.get("group_size") or quantize_cfg.group_size

    # `False` is a meaningful override for a boolean flag, so fall back to the
    # config only when the CLI did not supply a value at all.
    quantize_embedding = cli_args.get("quantize_embedding")
    if quantize_embedding is None:
        quantize_embedding = quantize_cfg.quantize_embedding

    output_dir = cli_args.get("output_dir") or cfg.output_dir
    hub_model_id = cli_args.get("hub_model_id") or cfg.hub_model_id
    quantized_dir = str(Path(output_dir) / "quantized")

    LOG.info(f"Loading model from {model_path}.")
    tokenizer = load_tokenizer(cfg)
    processor = None
    if cfg.is_multimodal:
        processor = load_processor(cfg, tokenizer)

    # Kept in its own name so the `config` argument is not silently rebound.
    model_config = AutoConfig.from_pretrained(model_path)
    torch_dtype = getattr(model_config, "torch_dtype", None)

    model = AutoModelForCausalLM.from_pretrained(
        model_path, device_map="auto", dtype=torch_dtype
    )

    LOG.info(
        f"Quantizing model with configuration: \n"
        f"\tweight_dtype: {weight_dtype}\n"
        f"\tactivation_dtype: {activation_dtype}\n"
        f"\tgroup_size: {group_size}\n"
        f"\tquantize_embedding: {quantize_embedding}"
    )

    quantize_model(
        model, weight_dtype, group_size, activation_dtype, quantize_embedding
    )

    quantization_config = get_quantization_config(
        weight_dtype, activation_dtype, group_size
    )

    ao_config = TorchAoConfig(
        quant_type=quantization_config,
        include_input_output_embeddings=quantize_embedding,
    )
    model.config.quantization_config = ao_config

    LOG.info(f"Saving quantized model to: {quantized_dir}.")
    model.save_pretrained(
        quantized_dir,
        progressbar=True,
    )
    tokenizer.save_pretrained(
        quantized_dir,
        progressbar=True,
        save_jinja_files=cfg.tokenizer_save_jinja_files,
    )

    if processor:
        LOG.info(f"Saving processor to: {quantized_dir}.")
        processor.save_pretrained(quantized_dir)

    if hub_model_id:
        # Suffix the repo id with the short name of the quantization scheme.
        hub_model_id = (
            hub_model_id.rstrip("-")
            + f"-{quantization_config_to_str[type(quantization_config)]}"
        )
        model.push_to_hub(hub_model_id)
        tokenizer.push_to_hub(hub_model_id)
        if processor:
            processor.push_to_hub(hub_model_id)
        LOG.info(f"Quantized model pushed to: {hub_model_id}.")

    LOG.info(f"Quantized model saved to: {quantized_dir}.")
""" check_accelerate_default_config() if int(os.getenv("LOCAL_RANK", "0")) == 0: check_user_token() plugin_manager = PluginManager.get_instance() dataset_meta = plugin_manager.load_datasets(cfg, preprocess=False) if not dataset_meta: if cfg.rl: dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args) else: dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) del model, tokenizer, trainer gc.collect() plugin_manager = PluginManager.get_instance() plugin_manager.post_train_unload(cfg) def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs): """ Parses `axolotl` config, CLI args, and calls `do_train`. Args: config: Path to `axolotl` config YAML file. kwargs: Additional keyword arguments to override config file values. """ parsed_cfg = load_cfg(config, **kwargs) parser = HfArgumentParser(TrainerCliArgs) parsed_cli_args, _ = parser.parse_args_into_dataclasses( return_remaining_strings=True ) if parsed_cfg.use_ray: from ray.train import RunConfig, ScalingConfig from ray.train.torch import TorchTrainer train_loop_config = {"cfg": parsed_cfg.to_dict(), "cli_args": parsed_cli_args} trainer = TorchTrainer( ray_train_func, train_loop_config=train_loop_config, scaling_config=ScalingConfig( num_workers=parsed_cfg.ray_num_workers, resources_per_worker=parsed_cfg.resources_per_worker.to_dict(), use_gpu=True, ), run_config=RunConfig( name=parsed_cfg.ray_run_name, storage_path=Path(parsed_cfg.output_dir).absolute().as_posix(), ), ) return trainer.fit() return do_train(parsed_cfg, parsed_cli_args) def ray_train_func(kwargs: dict): # cast `cfg` back to DictDefault (ray tune deepcopy has issues with DictDefault so needed it to be dict) # also renormalize the config now that TorchTrainer has spawned distributed workers cfg = DictDefault(kwargs["cfg"]) prepare_optim_env(cfg) normalize_config(cfg) # now that we are on the worker node, we can check `is_torch_bf16_gpu_available` to resolve dtype 
resolve_dtype(cfg) # ray serializing objects gets rid of frozen attribute - HF expects dict not DefaultDict if cfg.deepspeed and hasattr(cfg.deepspeed, "to_dict"): cfg.deepspeed = cfg.deepspeed.to_dict() # initialize accelerator before model instantiation Accelerator(gradient_accumulation_steps=cfg.gradient_accumulation_steps) # Register plugins in Ray workers if cfg.get("plugins"): from axolotl.cli.config import plugin_set_cfg, prepare_plugins prepare_plugins(cfg) plugin_set_cfg(cfg) kwargs["cfg"] = cfg do_train(**kwargs) if __name__ == "__main__": fire.Fire(do_cli) ================================================ FILE: src/axolotl/cli/utils/__init__.py ================================================ """Init for axolotl.cli.utils module.""" from .args import ( add_options_from_config, add_options_from_dataclass, filter_none_kwargs, ) from .fetch import fetch_from_github from .load import load_model_and_tokenizer from .sweeps import generate_sweep_configs from .train import build_command, generate_config_files, launch_training __all__ = [ "filter_none_kwargs", "add_options_from_dataclass", "add_options_from_config", "build_command", "generate_config_files", "generate_sweep_configs", "load_model_and_tokenizer", "launch_training", "fetch_from_github", ] ================================================ FILE: src/axolotl/cli/utils/args.py ================================================ """Utilities for axolotl CLI args.""" import dataclasses from functools import wraps from types import NoneType, UnionType from typing import Any, Callable, Type, Union, get_args, get_origin import click from pydantic import BaseModel def _strip_optional_type(field_type: type | str | None): """ Extracts the non-`None` type from an `Optional` / `Union` type. Args: field_type: Type of field for Axolotl CLI command. Returns: If the input type is `Union[T, None]` or `Optional[T]`, returns `T`. Otherwise returns the input type unchanged. 
""" is_union = get_origin(field_type) is Union or isinstance(field_type, UnionType) if is_union and type(None) in get_args(field_type): field_type = next( t for t in get_args(field_type) if not isinstance(t, NoneType) ) return field_type def filter_none_kwargs(func: Callable) -> Callable: """ Wraps function to remove `None`-valued `kwargs`. Args: func: Function to wrap. Returns: Wrapped function. """ @wraps(func) def wrapper(*args, **kwargs) -> Callable: """Filters out `None`-valued `kwargs`.""" filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None} return func(*args, **filtered_kwargs) return wrapper def add_options_from_dataclass(config_class: Type[Any]) -> Callable: """ Create Click options from the fields of a dataclass. Args: config_class: Dataclass with fields to parse from the CLI. Returns: Function decorator for Axolotl CLI command. """ def decorator(function: Callable) -> Callable: # Process dataclass fields in reverse order for correct option ordering for field in reversed(dataclasses.fields(config_class)): field_type = _strip_optional_type(field.type) if field_type is bool: field_name = field.name.replace("_", "-") option_name = f"--{field_name}/--no-{field_name}" function = click.option( option_name, default=field.default, help=field.metadata.get("description"), )(function) else: option_name = f"--{field.name.replace('_', '-')}" function = click.option( option_name, type=field_type, default=field.default, help=field.metadata.get("description"), )(function) return function return decorator def _is_pydantic_model(field_type: type) -> bool: """Check if a type is a Pydantic BaseModel subclass.""" try: return isinstance(field_type, type) and issubclass(field_type, BaseModel) except TypeError: return False def _get_field_description(field) -> str | None: """Get description from a Pydantic field, checking both .description and json_schema_extra.""" if field.description: return field.description if field.json_schema_extra and 
def add_options_from_config(config_class: Type[BaseModel]) -> Callable:
    """
    Create Click options from the fields of a Pydantic model.

    For fields whose type is itself a Pydantic BaseModel, dot-notation CLI options
    are generated for each sub-field (e.g., ``--trl.beta=0.1``). Boolean fields
    become ``--flag/--no-flag`` pairs. Every option defaults to ``None``.

    Args:
        config_class: PyDantic model with fields to parse from the CLI

    Returns:
        Function decorator for Axolotl CLI command.
    """

    def decorator(function: Callable) -> Callable:
        # Process model fields in reverse order for correct option ordering
        for name, field in reversed(config_class.model_fields.items()):
            field_type = _strip_optional_type(field.annotation)

            # Handle nested Pydantic models with dot-notation options
            if _is_pydantic_model(field_type):
                function = _add_nested_model_options(function, name, field_type)
                continue

            flag = name.replace("_", "-")
            option_name = f"--{flag}/--no-{flag}" if field_type is bool else f"--{flag}"

            # Same description lookup as the nested options, so help text kept
            # in `json_schema_extra` is shown for top-level fields as well.
            function = click.option(
                option_name, default=None, help=_get_field_description(field)
            )(function)

        return function

    return decorator
def _parse_commands(text: str):
    """
    Parse leading diffusion commands.

    Supported at start of input (can be chained):
        :complete N   -> completion mode with N tokens (default 64)
        :mask R       -> random masking with ratio R in [0, 1]

    Parsing stops at the first word that does not start with ":" or at the first
    unrecognised ":" command, which is left in the prompt.

    Returns:
        Tuple of `(mode, completion_tokens, target_mask_ratio, cleaned)`, where
        `cleaned` is the remaining words joined by single spaces.
    """
    words = text.strip().split()
    total = len(words)

    mode = "random"
    completion_tokens = 0
    target_mask_ratio = None

    pos = 0  # index of the first word not yet consumed by a command
    while pos < total and words[pos].startswith(":"):
        command = words[pos]
        argument = words[pos + 1] if pos + 1 < total else None

        if command == ":complete":
            mode = "completion"
            completion_tokens = 64
            pos += 1
            if argument is not None:
                try:
                    completion_tokens = int(argument)
                except ValueError:
                    # Not a count: keep the default and leave the word in place.
                    pass
                else:
                    pos += 1
        elif command == ":mask":
            mode = "random"
            pos += 1
            if argument is not None:
                try:
                    target_mask_ratio = float(argument)
                except ValueError:
                    # An unparsable ratio clears any ratio set earlier.
                    target_mask_ratio = None
                else:
                    pos += 1
        else:
            # Unknown command: it stays part of the prompt.
            break

    return mode, completion_tokens, target_mask_ratio, " ".join(words[pos:])
masked_text = result.get("masked") if isinstance(result, dict) else None mask_ratio = result.get("mask_ratio") if isinstance(result, dict) else None generated_ids = result.get("generated_ids") if isinstance(result, dict) else None masked_positions = ( set(result.get("masked_positions") or []) if isinstance(result, dict) else set() ) orig_ids = seq[0].detach().cpu().tolist() return { "masked_text": masked_text, "mask_ratio": mask_ratio, "generated_ids": generated_ids, "masked_positions": masked_positions, "orig_ids": orig_ids, } def render_html( *, generated_ids: list[int] | None, orig_ids: list[int], masked_positions: set[int], tokenizer, ) -> str: """Render HTML visualizing diffusion outputs.""" if not generated_ids: return "
Generated:\n(no output)
" def _style_for(i: int, tid: int) -> str: if i in masked_positions: if i < len(orig_ids) and tid == orig_ids[i]: return "green" if i < len(orig_ids): return "red" return "normal" same = i < len(orig_ids) and tid == orig_ids[i] return "dim" if same else "normal" # Group contiguous spans by style to reduce HTML size spans: list[tuple[str, int, int]] = [] if generated_ids: cur = _style_for(0, generated_ids[0]) start = 0 for i in range(1, len(generated_ids)): s = _style_for(i, generated_ids[i]) if s != cur: spans.append((cur, start, i)) cur, start = s, i spans.append((cur, start, len(generated_ids))) html_parts = [] for style_name, a, b in spans: txt = tokenizer.decode(generated_ids[a:b], skip_special_tokens=False) if style_name == "green": html_parts.append(f'{txt}') elif style_name == "red": html_parts.append(f'{txt}') elif style_name == "dim": html_parts.append(f'{txt}') else: html_parts.append(txt) legend = ( '
' 'correct, ' 'incorrect, ' 'unchanged' "
" ) return ( legend + '
Generated:\n'
        + "".join(html_parts)
        + "
" ) def launch_diffusion_gradio_ui( *, model, tokenizer, cfg: DictDefault, prompter_module=None, chat_template_str: str | None = None, ): """Build and launch a simple Gradio UI for diffusion inference.""" with gr.Blocks( title=cfg.get("gradio_title", "Axolotl Diffusion Interface") ) as demo: gr.Markdown( """ ## Axolotl Diffusion Inference - Mode "Random" masks tokens at a target ratio and fills them. - Mode "Completion" appends N masked tokens at the end and fills them. """ ) with gr.Row(): mode = gr.Radio( choices=["random", "completion"], value="random", label="Mode", ) mask_ratio = gr.Slider( minimum=0.0, maximum=1.0, step=0.05, value=0.4, label="Mask ratio (random mode)", interactive=True, ) completion_tokens = gr.Number( value=64, precision=0, label="Completion tokens (completion mode)", interactive=True, visible=False, ) instruction = gr.Textbox(label="Instruction", lines=6) run_btn = gr.Button("Generate") masked_preview = gr.Textbox(label="Masked preview", lines=6) html_out = gr.HTML(label="Generated") def _toggle_controls(selected_mode: str): return ( gr.update(visible=(selected_mode == "random")), gr.update(visible=(selected_mode == "completion")), ) mode.change( _toggle_controls, inputs=[mode], outputs=[mask_ratio, completion_tokens], ) def _gen(instruction_text: str, selected_mode: str, mratio: float, ctoks: int): if not instruction_text: return "", "
Generated:\n(no output)
def _download_file(
    file_info: tuple, raw_base_url: str, dest_path: Path, dir_prefix: str
) -> tuple[str, str]:
    """
    Download a single file and return its processing status.

    The local copy is compared against the remote git blob SHA first, so
    files that are already up to date are never re-downloaded.

    Args:
        file_info: Tuple of (file_path, remote_sha).
        raw_base_url: Base URL for raw GitHub content.
        dest_path: Local destination directory.
        dir_prefix: Directory prefix that was used to filter files; it is
            stripped from the front of `file_path` to build the local path.

    Returns:
        Tuple of (file_path, status) where status is 'new', 'updated',
        'unchanged', or 'error'.
    """
    file_path, remote_sha = file_info
    raw_url = f"{raw_base_url}/{file_path}"

    # Strip the prefix from the front only. Splitting on the prefix would also
    # cut at later occurrences (e.g. "examples/a/examples/b.yml"), and a
    # leftover leading "/" would turn the join below into an absolute path.
    relative_path = file_path.removeprefix(dir_prefix).lstrip("/")
    dest_file = dest_path / relative_path

    # Check if file exists and needs updating
    if dest_file.exists():
        content = dest_file.read_bytes()
        # Calculate git blob SHA ("blob <size>\0<content>")
        blob = b"blob " + str(len(content)).encode() + b"\0" + content
        local_sha = hashlib.sha1(blob, usedforsecurity=False).hexdigest()

        if local_sha == remote_sha:
            print(f"Skipping {file_path} (unchanged)")
            return file_path, "unchanged"

        print(f"Updating {file_path}")
        status = "updated"
    else:
        print(f"Downloading {file_path}")
        status = "new"

    # Download and save file; directory creation is inside the try so that a
    # filesystem failure is reported as an 'error' status like any other.
    try:
        dest_file.parent.mkdir(parents=True, exist_ok=True)
        response = requests.get(raw_url, timeout=30)
        response.raise_for_status()

        with open(dest_file, "wb") as file:
            file.write(response.content)
        return file_path, status
    except (requests.RequestException, OSError) as request_error:
        print(f"Error downloading {file_path}: {str(request_error)}")
        return file_path, "error"
""" api_url = "https://api.github.com/repos/axolotl-ai-cloud/axolotl/git/trees/main?recursive=1" raw_base_url = "https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main" # Get repository tree with timeout response = requests.get(api_url, timeout=30) response.raise_for_status() tree = json.loads(response.text) # Filter for files and get their SHA files = { item["path"]: item["sha"] for item in tree["tree"] if item["type"] == "blob" and item["path"].startswith(dir_prefix) } if not files: raise click.ClickException(f"No files found in {dir_prefix}") # Default destination directory is the last part of dir_prefix default_dest = Path(dir_prefix.rstrip("/")) dest_path = Path(dest_dir) if dest_dir else default_dest # Keep track of processed files for summary files_processed: dict[str, list[str]] = { "new": [], "updated": [], "unchanged": [], "error": [], } # Process files in parallel using ThreadPoolExecutor with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: future_to_file = { executor.submit( _download_file, (file_path, remote_sha), raw_base_url, dest_path, dir_prefix, ): file_path for file_path, remote_sha in files.items() } # Process completed tasks as they finish for future in concurrent.futures.as_completed(future_to_file): file_path = future_to_file[future] try: file_path, status = future.result() files_processed[status].append(file_path) except (requests.RequestException, IOError) as request_error: print(f"Error processing {file_path}: {str(request_error)}") files_processed["error"].append(file_path) # Log summary LOG.info("\nSync Summary:") LOG.info(f"New files: {len(files_processed['new'])}") LOG.info(f"Updated files: {len(files_processed['updated'])}") LOG.info(f"Unchanged files: {len(files_processed['unchanged'])}") if files_processed["error"]: LOG.info(f"Failed files: {len(files_processed['error'])}") ================================================ FILE: src/axolotl/cli/utils/load.py 
================================================ """Utilities for model, tokenizer, etc. loading.""" from typing import Any from transformers import ( PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast, ProcessorMixin, ) from axolotl.loaders import load_processor, load_tokenizer from axolotl.loaders.model import ModelLoader from axolotl.utils.dict import DictDefault from axolotl.utils.logging import get_logger LOG = get_logger(__name__) def load_model_and_tokenizer( *, cfg: DictDefault, inference: bool = False, ) -> tuple[ PreTrainedModel, PreTrainedTokenizer | PreTrainedTokenizerFast | Any, ProcessorMixin | None, ]: """ Helper function for loading a model, tokenizer, and processor specified in the given `axolotl` config. Args: cfg: Dictionary mapping `axolotl` config keys to values. inference: Boolean denoting inference mode. Returns: Tuple of (PreTrainedModel, PreTrainedTokenizer, ProcessorMixin). """ LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}") tokenizer = load_tokenizer(cfg) LOG.info("loading model...") model_loader = ModelLoader(cfg, tokenizer, inference=inference) model, _ = model_loader.load() processor = None if cfg.is_multimodal: LOG.info("loading processor...") processor = load_processor(cfg, tokenizer) return model, tokenizer, processor ================================================ FILE: src/axolotl/cli/utils/sweeps.py ================================================ """Utilities for handling sweeps over configs for axolotl train CLI command""" import random from copy import deepcopy from itertools import product from typing import Any def generate_sweep_configs( base_config: dict[str, list], sweeps_config: dict[str, list] ) -> list[dict[str, Any]]: """ Recursively generates all possible configurations by applying sweeps to the base config. 
Args: base_config (dict): The original configuration dictionary sweeps_config (dict): Dictionary where keys are parameters and values are either: - lists of values to sweep independently - or for paired values, a list of dicts under the '_' key Returns: list: List of all possible configuration dictionaries Example: sweeps_config = { 'learning_rate': [0.1, 0.01], '_': [ {'load_in_8bit': True, 'adapter': 'lora'}, {'load_in_4bit': True, 'adapter': 'qlora'} ] } """ # Separate paired values from regular sweeps paired_values = sweeps_config.get("_", []) regular_sweeps = {k: v for k, v in sweeps_config.items() if k != "_"} # Process regular sweeps param_names = list(regular_sweeps.keys()) param_values = list(regular_sweeps.values()) # Generate combinations for regular sweeps regular_combinations = list(product(*param_values)) if param_values else [()] # Combine regular sweeps with paired values all_combinations = [] for reg_combo in regular_combinations: if paired_values: for paired_set in paired_values: new_config = {} # new_config = deepcopy(base_config) # Combine regular parameters with paired parameters full_combo = { **dict(zip(param_names, reg_combo, strict=False)), **paired_set, } for param_name, param_value in full_combo.items(): new_config[param_name] = param_value print(new_config) all_combinations.append(new_config) else: # If no paired values, just use regular combinations # new_config = deepcopy(base_config) new_config = {} for param_name, param_value in zip(param_names, reg_combo, strict=False): new_config[param_name] = param_value print(new_config) all_combinations.append(new_config) # randomize the order of trials random.seed(42) random.shuffle(all_combinations) # Generate a new config for each combination result_configs = [] for combination in all_combinations: new_config = deepcopy(base_config) for param_name, param_value in combination.items(): new_config[param_name] = param_value result_configs.append(new_config) return result_configs 
def build_command(base_cmd: list[str], options: dict[str, Any]) -> list[str]:
    """
    Append `--key=value` flags for every non-None option to a copy of `base_cmd`.

    Underscores in option names are turned into dashes. Options whose value is
    `None` are omitted; other falsy values (0, False, "") are kept.

    Args:
        base_cmd: Command without options. Left unmodified.
        options: Options to parse and append to base command.

    Returns:
        List of strings giving shell command.
    """
    flags = (
        f"--{name.replace('_', '-')}={val}"
        for name, val in options.items()
        if val is not None
    )
    return [*base_cmd, *flags]
def launch_training(
    cfg_file: str,
    launcher: Literal["accelerate", "torchrun", "python"] | None,
    cloud: str | None,
    kwargs: dict,
    launcher_args: list[str] | None = None,
    use_exec: bool = False,
) -> None:
    """
    Dispatch a training run to the cloud, accelerate, torchrun or in-process path.

    Args:
        cfg_file: Path of the config file to train with.
        launcher: Which launcher to use; `None` runs training in-process.
        cloud: Cloud config; when truthy it takes precedence over `launcher`.
        kwargs: Extra options forwarded to the selected launcher.
        launcher_args: Extra arguments for the launcher itself.
        use_exec: Forwarded to the accelerate/torchrun launchers.
    """
    extra_args = launcher_args or []

    # Cloud runs win over any local launcher selection.
    if cloud:
        _launch_cloud_training(cloud, cfg_file, launcher, kwargs, extra_args)
        return

    # No launcher at all: handle ray train launch in the current process.
    if launcher is None:
        _launch_python_training(cfg_file, kwargs)
        return

    if launcher == "accelerate":
        _launch_accelerate_training(cfg_file, kwargs, extra_args, use_exec)
    elif launcher == "torchrun":
        _launch_torchrun_training(cfg_file, kwargs, extra_args, use_exec)
    elif launcher == "python":
        _launch_python_training(cfg_file, kwargs)
def _launch_accelerate_training(
    cfg_file: str,
    kwargs: dict,
    launcher_args: list[str] | None = None,
    use_exec: bool = False,
) -> None:
    """
    Execute training via accelerate launcher.

    Args:
        cfg_file: Config file passed to `axolotl.cli.train`; skipped if empty.
        kwargs: Training options. The legacy keys `main_process_port` and
            `num_processes` are routed to `accelerate launch` instead of the
            training module. The dict itself is not modified.
        launcher_args: Extra arguments placed after the legacy launcher args.
        use_exec: Replace the current process with the launcher instead of
            spawning a subprocess.

    Raises:
        subprocess.CalledProcessError: If the subprocess exits non-zero
            (only when `use_exec` is False).
    """
    launcher_args = launcher_args or []

    # Work on a copy: popping from the caller's dict would silently drop the
    # launcher options if the caller reuses `kwargs` for another launch
    # (e.g. one launch per generated sweep config).
    options = dict(kwargs)

    # Extract launcher-specific arguments from kwargs (legacy support)
    internal_launcher_args: list[str] = []
    for legacy_key in ("main_process_port", "num_processes"):
        if legacy_key in options:
            internal_launcher_args.extend(
                [f"--{legacy_key}", str(options.pop(legacy_key))]
            )

    # Combine internal args with user-provided launcher args
    base_cmd = [
        "accelerate",
        "launch",
        *internal_launcher_args,
        *launcher_args,
        "-m",
        "axolotl.cli.train",
    ]
    if cfg_file:
        base_cmd.append(cfg_file)

    cmd = build_command(base_cmd, options)
    if use_exec:
        # make sure to flush stdout and stderr before replacing the process
        sys.stdout.flush()
        sys.stderr.flush()
        os.execvpe(cmd[0], cmd, os.environ)  # nosec B606
    else:
        subprocess.run(cmd, check=True)  # nosec B603
def do_vllm_serve(
    config: Union[Path, str],
    cli_args: dict,
):
    """
    Load the config and run the vLLM serve entry point used for online RL.

    Every server setting is taken from `cli_args` when that value is truthy
    and from the `vllm` section of the loaded config otherwise.

    Args:
        config: Path to the YAML config to load.
        cli_args: Dict of additional command-line arguments.

    Returns:
        None. The call is delegated to the selected serve module's `main`.
    """
    cfg = load_cfg(config)
    model = cfg.base_model

    def _resolve(key: str):
        # CLI value wins when truthy, otherwise fall back to the config.
        return cli_args.get(key) or getattr(cfg.vllm, key)

    # Determine serve module: explicit CLI/config > auto-select from vllm_lora_sync > default
    serve_module = cli_args.get("serve_module") or getattr(
        cfg.vllm, "serve_module", None
    )
    if serve_module is None:
        if getattr(cfg, "trl", None) and getattr(cfg.trl, "vllm_lora_sync", False):
            serve_module = "axolotl.scripts.vllm_serve_lora"
        else:
            serve_module = "trl.scripts.vllm_serve"
    vllm_serve_main = __import__(serve_module, fromlist=["main"]).main

    base_kwargs = {
        "model": model,
        # Parallel sizes default to 1 when neither source provides a value.
        "tensor_parallel_size": _resolve("tensor_parallel_size") or 1,
        "data_parallel_size": _resolve("data_parallel_size") or 1,
        "host": _resolve("host"),
        "port": _resolve("port"),
        "gpu_memory_utilization": _resolve("gpu_memory_utilization"),
        "dtype": _resolve("dtype"),
        "max_model_len": _resolve("max_model_len"),
        "enable_prefix_caching": _resolve("enable_prefix_caching"),
    }
    reasoning_parser = _resolve("reasoning_parser") or ""
    enable_reasoning = _resolve("enable_reasoning") or False

    # Use LoRAScriptArguments when serving with native LoRA support
    if serve_module == "axolotl.scripts.vllm_serve_lora":
        from axolotl.scripts.vllm_serve_lora import LoRAScriptArguments

        lora_kwargs = {}
        if hasattr(cfg, "lora_r") and cfg.lora_r:
            lora_kwargs["max_lora_rank"] = cfg.lora_r
        vllm_script_args = LoRAScriptArguments(**base_kwargs, **lora_kwargs)
    else:
        vllm_script_args = AxolotlScriptArguments(
            **base_kwargs,
            reasoning_parser=reasoning_parser,
            enable_reasoning=enable_reasoning,
        )
    vllm_serve_main(vllm_script_args)
def sample_dataset(dataset: Dataset, num_samples: int) -> Dataset:
    """
    Randomly sample `num_samples` samples with replacement from `dataset`.

    Args:
        dataset: Dataset to draw from; must support `len()` and `.select()`.
        num_samples: Number of rows to draw.

    Returns:
        The result of `dataset.select` on the drawn indices.

    Raises:
        ValueError: If `dataset` is empty and `num_samples` is positive.
    """
    # `randrange(n)` already excludes `n`, so every index 0..n-1 is eligible.
    # Subtracting one would never pick the last row and would fail outright
    # on a single-row dataset.
    size = len(dataset)
    return dataset.select(
        [random.randrange(size) for _ in range(num_samples)]  # nosec
    )
@send_errors
def load_preference_datasets(
    *, cfg: DictDefault, cli_args: PreprocessCliArgs | TrainerCliArgs | None = None
) -> TrainDatasetMeta:
    """Loads one or more training or evaluation datasets for RL training using paired
    preference data, calling `axolotl.utils.data.rl.prepare_preference_datasets`.
    Optionally, logs out debug information.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: Command-specific CLI arguments.

    Returns:
        Dataclass with fields for training and evaluation datasets and the computed
        `total_num_steps` (left as `None` for GRPO).
    """
    tokenizer = load_tokenizer(cfg)
    train_dataset, eval_dataset = prepare_preference_datasets(cfg, tokenizer)

    total_num_steps: int | None = None
    # Compare by equality, like the ORPO check below, so the GRPO exclusion
    # does not depend on `cfg.rl` being the enum member object itself.
    if cfg.rl != RLType.GRPO:
        total_num_steps = int(
            math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
        )

    if ((cli_args and cli_args.debug) or cfg.debug) and cfg.rl != RLType.ORPO:
        LOG.info("check_dataset_labels...")

        num_examples = cli_args.debug_num_examples if cli_args else 1
        text_only = cli_args.debug_text_only if cli_args else False

        # Reuse the tokenizer loaded above rather than loading it a second time.
        train_samples = sample_dataset(train_dataset, num_examples)
        check_dataset_labels(
            dataset=train_samples,
            tokenizer=tokenizer,
            num_examples=num_examples,
            text_only=text_only,
            rl_mode=True,
        )

    return TrainDatasetMeta(
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        total_num_steps=total_num_steps,
    )
class JsonToJsonlConverter:
    """
    Reads a JSON file and writes its contents out as JSONL
    """

    def __init__(self, file_reader, file_writer, json_parser, jsonl_serializer):
        # The collaborators are duck-typed: convert() only calls
        # read / parse / serialize / write on them.
        self.file_reader, self.file_writer = file_reader, file_writer
        self.json_parser, self.jsonl_serializer = json_parser, jsonl_serializer

    def convert(self, input_file_path):
        """Read ``input_file_path``, parse it, serialize the result and write it out."""
        raw_text = self.file_reader.read(input_file_path)
        records = self.json_parser.parse(raw_text)
        # NOTE: records are passed through unfiltered; the vicuna cleaned
        # dataset is known to contain rows with empty conversations.
        self.file_writer.write(self.jsonl_serializer.serialize(records))
"""Base class for trainer builder""" import abc import importlib import logging import sys from abc import abstractmethod from contextlib import suppress from pathlib import Path from typing import Any import torch from transformers import TrainerCallback from transformers.trainer_pt_utils import AcceleratorConfig from axolotl.integrations.base import PluginManager from axolotl.monkeypatch.trainer.lr import patch_trainer_get_lr from axolotl.telemetry.callbacks import TelemetryCallback from axolotl.telemetry.manager import TelemetryManager from axolotl.utils import ( is_comet_available, is_mlflow_available, is_opentelemetry_available, is_trackio_available, ) from axolotl.utils.callbacks import ( GCCallback, SaveAxolotlConfigtoWandBCallback, SaveModelOnFirstStepCallback, ) from axolotl.utils.callbacks.profiler import PytorchProfilerCallback from axolotl.utils.distributed import build_parallelism_config from axolotl.utils.schemas.enums import CustomSupportedOptimizers LOG = logging.getLogger(__name__) with suppress(ImportError): import torch._dynamo class TrainerBuilderBase(abc.ABC): """Base class for trainer builder.""" def __init__(self, cfg, model, tokenizer, processor=None): self.cfg = cfg self.model = model self.tokenizer = tokenizer self.processor = processor self._train_dataset = None self._eval_dataset = None self._model_ref = None self._peft_config = None # If the model supports tagging, add the axolotl tag. # This makes sure the tag is correctly pushed even if a user calls # model.push_to_hub instead of trainer.push_to_hub. 
if hasattr(model, "add_model_tags"): model.add_model_tags(["axolotl"]) patch_trainer_get_lr() @property def model_ref(self): return self._model_ref @model_ref.setter def model_ref(self, model): self._model_ref = model @property def train_dataset(self): return self._train_dataset @train_dataset.setter def train_dataset(self, dataset): self._train_dataset = dataset @property def eval_dataset(self): return self._eval_dataset @eval_dataset.setter def eval_dataset(self, dataset): self._eval_dataset = dataset @property def peft_config(self): return self._peft_config @peft_config.setter def peft_config(self, peft_config): self._peft_config = peft_config @abstractmethod def build(self, total_num_steps): pass def get_callbacks(self) -> list[TrainerCallback]: callbacks = [] plugin_manager = PluginManager.get_instance() callbacks.extend( plugin_manager.add_callbacks_pre_trainer(cfg=self.cfg, model=self.model) ) if self.cfg.gc_steps: callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps)) if self.cfg.dynamic_checkpoint and self.cfg.dynamic_checkpoint.enabled: from axolotl.utils.callbacks.dynamic_checkpoint import ( DynamicCheckpointCallback, ) callbacks.append(DynamicCheckpointCallback(self.cfg)) if self.cfg.use_wandb: callbacks.append( SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path) ) if self.cfg.use_mlflow and is_mlflow_available(): from axolotl.utils.callbacks.mlflow_ import ( SaveAxolotlConfigtoMlflowCallback, ) callbacks.extend( [ SaveAxolotlConfigtoMlflowCallback(self.cfg.axolotl_config_path), ] ) if self.cfg.use_comet and is_comet_available(): from axolotl.utils.callbacks.comet_ import SaveAxolotlConfigtoCometCallback callbacks.append( SaveAxolotlConfigtoCometCallback(self.cfg.axolotl_config_path) ) if self.cfg.use_trackio and is_trackio_available(): from axolotl.utils.callbacks.trackio_ import ( SaveAxolotlConfigtoTrackioCallback, ) callbacks.append( SaveAxolotlConfigtoTrackioCallback(self.cfg.axolotl_config_path) ) if self.cfg.use_otel_metrics and 
is_opentelemetry_available(): from axolotl.utils.callbacks.opentelemetry import ( OpenTelemetryMetricsCallback, ) callbacks.append(OpenTelemetryMetricsCallback(self.cfg)) if self.cfg.save_first_step: callbacks.append(SaveModelOnFirstStepCallback()) if self.cfg.profiler_steps: callbacks.append( PytorchProfilerCallback( steps_to_profile=self.cfg.profiler_steps, profiler_steps_start=self.cfg.profiler_steps_start, ) ) telemetry_manager = TelemetryManager.get_instance() if telemetry_manager.enabled: callbacks.append(TelemetryCallback()) return callbacks def get_post_trainer_create_callbacks(self, trainer): """ Callbacks added after the trainer is created, usually b/c these need access to the trainer """ callbacks = [] if self.cfg.plugins: plugin_manager = PluginManager.get_instance() callbacks.extend( [ cb for cb in plugin_manager.add_callbacks_post_trainer( self.cfg, trainer ) if cb ] ) return callbacks def hook_pre_create_training_args(self, training_arguments_kwargs): # TODO return training_arguments_kwargs def hook_post_create_training_args(self, training_arguments): # TODO return training_arguments def hook_pre_create_trainer(self, trainer_kwargs, trainer_cls): # TODO return trainer_kwargs, trainer_cls def hook_post_create_trainer(self, trainer): # TODO return trainer def _configure_warmup_and_logging( self, total_num_steps: int, training_args_kwargs: dict ): warmup_steps: int | float = 0 warmup_ratio = 0.0 if self.cfg.warmup_steps is not None: warmup_steps = self.cfg.warmup_steps elif self.cfg.warmup_ratio is not None: if total_num_steps: warmup_steps = max(int(self.cfg.warmup_ratio * total_num_steps), 0) else: warmup_ratio = self.cfg.warmup_ratio elif total_num_steps: warmup_steps = min(int(0.03 * total_num_steps), 100) else: warmup_ratio = 0.03 # transformers v5 if warmup_ratio > 0.0 and warmup_steps == 0: warmup_steps = warmup_ratio if warmup_steps == 1: warmup_steps = 2 if self.cfg.logging_steps is not None: training_args_kwargs["logging_steps"] = 
self.cfg.logging_steps else: training_args_kwargs["logging_steps"] = ( 500 # transformers defaults to 500 if not total_num_steps else max(min(int(0.005 * total_num_steps), 10), 1) ) training_args_kwargs["warmup_steps"] = warmup_steps def _configure_precision_settings(self, training_args_kwargs: dict): training_args_kwargs["fp16"] = (self.cfg.fp16 and not self.cfg.bf16) or False training_args_kwargs["tf32"] = True if self.cfg.tf32 is True else False if self.cfg.bf16 == "full": training_args_kwargs["bf16_full_eval"] = True else: bf16 = self.cfg.bf16 or self.cfg.bfloat16 bf16 = bf16 if bf16 is not None else False training_args_kwargs["bf16"] = bf16 def _configure_scheduler(self, training_args_kwargs: dict): if self.cfg.lr_scheduler in ["one_cycle", "rex"]: training_args_kwargs["lr_scheduler_type"] = "cosine" training_args_kwargs["alternate_lr_scheduler_type"] = self.cfg.lr_scheduler else: training_args_kwargs["lr_scheduler_type"] = ( self.cfg.lr_scheduler if self.cfg.lr_scheduler else "cosine" ) training_args_kwargs["lr_scheduler_kwargs"] = ( self.cfg.lr_scheduler_kwargs if self.cfg.lr_scheduler_kwargs else {} ) def _configure_optimizer(self, training_args_kwargs: dict, trainer_kwargs: dict): def _configure_custom_optimizer( training_args_kwargs: dict, trainer_kwargs: dict ): # Common optimizer kwargs optimizer_kwargs = { "lr": training_args_kwargs["learning_rate"], "weight_decay": training_args_kwargs["weight_decay"], } # Adam-specific kwargs adam_kwargs: dict = {} if training_args_kwargs.get("adam_beta1") and training_args_kwargs.get( "adam_beta2" ): adam_kwargs["betas"] = ( training_args_kwargs.get("adam_beta1"), training_args_kwargs.get("adam_beta2"), ) if training_args_kwargs.get("adam_epsilon"): adam_kwargs["eps"] = training_args_kwargs.get("adam_epsilon") if self.cfg.optimizer == "muon": _, device_mesh = build_parallelism_config(self.cfg) if device_mesh is not None: from axolotl.contribs.mit.muon.dist_muon import ( DistMuonOptimizerFactory, ) optimizer_cls = 
DistMuonOptimizerFactory optimizer_kwargs["device_mesh"] = device_mesh else: from axolotl.contribs.mit.muon import ( MuonOptimizerFactory, ) optimizer_cls = MuonOptimizerFactory optimizer_kwargs.update(adam_kwargs) elif self.cfg.optimizer == "dion": from axolotl.contribs.mit.dion import ( DionOptimizerFactory, ) optimizer_cls = DionOptimizerFactory optimizer_kwargs["dion_lr"] = training_args_kwargs["dion_learning_rate"] optimizer_kwargs["dion_mu"] = training_args_kwargs["dion_momentum"] optimizer_kwargs.update(adam_kwargs) _, device_mesh = build_parallelism_config(self.cfg) if device_mesh is not None: optimizer_kwargs["device_mesh"] = device_mesh elif self.cfg.optimizer == "optimi_adamw": from optimi import AdamW optimizer_kwargs["foreach"] = False optimizer_cls = AdamW optimizer_kwargs.update(adam_kwargs) elif self.cfg.optimizer == "ao_adamw_fp8": from torchao.prototype.low_bit_optim import AdamWFp8 optimizer_cls = AdamWFp8 optimizer_kwargs.update(adam_kwargs) elif self.cfg.optimizer == "adopt_adamw": from axolotl.utils.optimizers.adopt import ADOPT optimizer_cls = ADOPT adam_kwargs["decouple"] = True optimizer_kwargs.update(adam_kwargs) elif self.cfg.optimizer == "came_pytorch": from came_pytorch import CAME optimizer_cls = CAME beta1 = training_args_kwargs.get("adam_beta1", 0.9) beta2 = training_args_kwargs.get("adam_beta2", 0.999) beta3 = training_args_kwargs.get("adam_beta3", 0.9999) eps1 = training_args_kwargs.get("adam_epsilon", 1e-30) eps2 = training_args_kwargs.get("adam_epsilon2", 1e-16) adam_kwargs["betas"] = (beta1, beta2, beta3) adam_kwargs["eps"] = (eps1, eps2) optimizer_kwargs.update(adam_kwargs) elif self.cfg.optimizer == "flash_adamw": from flashoptim import FlashAdamW optimizer_cls = FlashAdamW optimizer_kwargs.update(adam_kwargs) elif self.cfg.optimizer == "flash_adam": from flashoptim import FlashAdam optimizer_cls = FlashAdam optimizer_kwargs.update(adam_kwargs) elif self.cfg.optimizer == "flash_sgd": from flashoptim import FlashSGD 
optimizer_cls = FlashSGD elif self.cfg.optimizer == "flash_sgdw": from flashoptim import FlashSGDW optimizer_cls = FlashSGDW elif self.cfg.optimizer == "flash_lion": from flashoptim import FlashLion optimizer_cls = FlashLion if "betas" in adam_kwargs: optimizer_kwargs["betas"] = adam_kwargs["betas"] else: raise ValueError( f"Unhandled optimizer: {self.cfg.optimizer}. Please raise an Issue." ) # Parse any additional optimizer args from config if self.cfg.optim_args: if isinstance(self.cfg.optim_args, dict): optimizer_kwargs.update(self.cfg.optim_args) else: # Parse string format "key1=value1,key2=value2" for mapping in self.cfg.optim_args.replace(" ", "").split(","): key, value = mapping.split("=") optimizer_kwargs[key] = value # Note: This is not used in training_args_kwargs, but in trainer_kwargs trainer_kwargs["optimizer_cls_and_kwargs"] = ( optimizer_cls, optimizer_kwargs, ) # Handle custom optimizer custom_supported_optimizers = [opt.value for opt in CustomSupportedOptimizers] if self.cfg.optimizer in custom_supported_optimizers: _configure_custom_optimizer(training_args_kwargs, trainer_kwargs) else: # Use transformers' optimizer training_args_kwargs["optim"] = self.cfg.optimizer # Parse any additional optimizer args from config if self.cfg.optim_args: if isinstance(self.cfg.optim_args, dict): optim_args = ",".join( [f"{key}={value}" for key, value in self.cfg.optim_args.items()] ) else: optim_args = self.cfg.optim_args training_args_kwargs["optim_args"] = optim_args if ( self.cfg.optimizer == "adamw_anyprecision" and Path(self.cfg.torchdistx_path).exists() ): sys.path.append(self.cfg.torchdistx_path) importlib.import_module("torchdistx") def _configure_hub_parameters(self, training_args_kwargs: dict): if self.cfg.hub_model_id: training_args_kwargs["hub_model_id"] = self.cfg.hub_model_id training_args_kwargs["push_to_hub"] = True training_args_kwargs["hub_private_repo"] = True training_args_kwargs["hub_always_push"] = True if self.cfg.hub_strategy: 
training_args_kwargs["hub_strategy"] = self.cfg.hub_strategy if self.cfg.hub_revision: training_args_kwargs["hub_revision"] = self.cfg.hub_revision def _configure_save_and_eval_strategy(self, training_args_kwargs: dict): # save_strategy and save_steps if self.cfg.save_steps: training_args_kwargs["save_strategy"] = "steps" training_args_kwargs["save_steps"] = self.cfg.save_steps elif self.cfg.save_strategy: training_args_kwargs["save_strategy"] = self.cfg.save_strategy else: # default to saving each epoch if not defined training_args_kwargs["save_strategy"] = "epoch" training_args_kwargs["save_total_limit"] = ( self.cfg.save_total_limit if self.cfg.save_total_limit else 4 ) # eval_strategy and eval_steps if not self.eval_dataset and self.cfg.val_set_size == 0: # do not eval if no eval_dataset and val_set_size=0 training_args_kwargs["eval_strategy"] = "no" elif self.cfg.eval_steps: training_args_kwargs["eval_strategy"] = "steps" training_args_kwargs["eval_steps"] = self.cfg.eval_steps training_args_kwargs["eval_on_start"] = True elif self.cfg.eval_strategy: training_args_kwargs["eval_strategy"] = self.cfg.eval_strategy training_args_kwargs["eval_on_start"] = True def _configure_reporting(self, training_args_kwargs: dict): report_to = [] if self.cfg.use_wandb: report_to.append("wandb") if self.cfg.use_mlflow: report_to.append("mlflow") if self.cfg.use_tensorboard: report_to.append("tensorboard") if self.cfg.use_comet: report_to.append("comet_ml") if self.cfg.use_trackio: report_to.append("trackio") training_args_kwargs["report_to"] = report_to if self.cfg.use_wandb: training_args_kwargs["run_name"] = self.cfg.wandb_name elif self.cfg.use_mlflow: training_args_kwargs["run_name"] = self.cfg.mlflow_run_name elif self.cfg.use_trackio: training_args_kwargs["run_name"] = self.cfg.trackio_run_name else: training_args_kwargs["run_name"] = None def _configure_torch_compile(self, training_args_kwargs: dict): if self.cfg.torch_compile and getattr(torch, "_dynamo", None): 
torch._dynamo.config.suppress_errors = True torch._dynamo.config.accumulated_cache_size_limit = 256 training_args_kwargs["torch_compile"] = self.cfg.torch_compile if self.cfg.torch_compile_backend: training_args_kwargs["torch_compile_backend"] = ( self.cfg.torch_compile_backend ) if self.cfg.torch_compile_mode: training_args_kwargs["torch_compile_mode"] = self.cfg.torch_compile_mode def _configure_accelerator_config(self, training_args_kwargs: dict): if self.cfg.accelerator_config: training_args_kwargs["accelerator_config"] = AcceleratorConfig( **self.cfg.accelerator_config ) else: training_args_kwargs["accelerator_config"] = AcceleratorConfig() def _configure_gradient_checkpointing(self, training_args_kwargs: dict): if self.cfg.activation_offloading is True: # don't use the HF gradient checkpointing, manually wrap training_args_kwargs["gradient_checkpointing"] = False training_args_kwargs["activation_offloading"] = True elif self.cfg.gradient_checkpointing is not None: training_args_kwargs["gradient_checkpointing"] = ( self.cfg.gradient_checkpointing ) if self.cfg.gradient_checkpointing_kwargs is not None: training_args_kwargs["gradient_checkpointing_kwargs"] = ( self.cfg.gradient_checkpointing_kwargs ) else: training_args_kwargs["gradient_checkpointing_kwargs"] = { "use_reentrant": False } def _set_base_training_args( self, total_num_steps ) -> tuple[dict[str, Any], dict[str, Any]]: training_args_kwargs: dict[str, Any] = {} trainer_kwargs: dict[str, Any] = {} self._configure_warmup_and_logging(total_num_steps, training_args_kwargs) self._configure_precision_settings(training_args_kwargs) self._configure_save_and_eval_strategy(training_args_kwargs) self._configure_gradient_checkpointing(training_args_kwargs) # set arg into trainer_args_kwargs with same name if value not None for arg in [ # optim/scheduler "adam_beta1", "adam_beta2", "adam_beta3", "adam_epsilon", "adam_epsilon2", "cosine_min_lr_ratio", "cosine_constant_lr_ratio", "optim_target_modules", # trainer 
"max_grad_norm", "dataloader_num_workers", "dataloader_pin_memory", "dataloader_prefetch_factor", "gradient_accumulation_steps", "learning_rate", "embedding_lr", "embedding_lr_scale", "lr_groups", "loraplus_lr_ratio", "loraplus_lr_embedding", "output_dir", "save_only_model", "weight_decay", "seed", "dion_momentum", "dion_rank_fraction", "dion_rank_multiple_of", "dataset_num_proc", ]: if hasattr(self.cfg, arg) and getattr(self.cfg, arg) is not None: training_args_kwargs[arg] = getattr(self.cfg, arg) arg_map = { "dion_learning_rate": "dion_lr", "include_num_input_tokens_seen": "include_tokens_per_second", } for kwarg, cfg_arg in arg_map.items(): if hasattr(self.cfg, cfg_arg) and getattr(self.cfg, cfg_arg) is not None: training_args_kwargs[kwarg] = getattr(self.cfg, cfg_arg) training_args_kwargs["per_device_train_batch_size"] = self.cfg.micro_batch_size training_args_kwargs["average_tokens_across_devices"] = False if self.cfg.eval_batch_size: training_args_kwargs["per_device_eval_batch_size"] = ( self.cfg.eval_batch_size ) training_args_kwargs["include_tkps"] = self.cfg.include_tkps training_args_kwargs["max_steps"] = self.cfg.max_steps or total_num_steps or -1 training_args_kwargs["num_train_epochs"] = self.cfg.num_epochs # max_length is not used in CausalTrainer if self.cfg.reward_model or self.cfg.rl: training_args_kwargs["max_length"] = self.cfg.sequence_len if self.cfg.fsdp_config or self.cfg.fsdp: training_args_kwargs["fsdp_config"] = self.cfg.fsdp_config training_args_kwargs["fsdp"] = self.cfg.fsdp if self.cfg.fsdp else True self._configure_reporting(training_args_kwargs) self._configure_hub_parameters(training_args_kwargs) self._configure_scheduler(training_args_kwargs) self._configure_optimizer(training_args_kwargs, trainer_kwargs) self._configure_torch_compile(training_args_kwargs) self._configure_accelerator_config(training_args_kwargs) return training_args_kwargs, trainer_kwargs ================================================ FILE: 
src/axolotl/core/builders/causal.py ================================================ """Builder for causal trainers""" import inspect import math import os from pathlib import Path from typing import Type, Union import transformers from transformers import ( DataCollatorWithFlattening, EarlyStoppingCallback, Trainer, ) from trl.trainer.reward_trainer import DataCollatorForPreference from axolotl.core.builders.base import TrainerBuilderBase from axolotl.core.trainers import ( AxolotlMambaTrainer, AxolotlPRMTrainer, AxolotlRewardTrainer, AxolotlTrainer, ) from axolotl.integrations.base import PluginManager from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES from axolotl.monkeypatch.relora import ReLoRACallback from axolotl.processing_strategies import get_processing_strategy from axolotl.utils import is_comet_available, is_mlflow_available from axolotl.utils.callbacks import ( LossWatchDogCallback, bench_eval_callback_factory, causal_lm_bench_eval_callback_factory, colab_inference_post_train_callback, log_prediction_callback_factory, ) from axolotl.utils.callbacks.lisa import lisa_callback_factory from axolotl.utils.callbacks.qat import QATCallback from axolotl.utils.callbacks.tokens_per_second import TokensPerSecondCallback from axolotl.utils.chat_templates import get_chat_template_from_config from axolotl.utils.collators import ( BatchSamplerDataCollatorForSeq2Seq, DataCollatorForSeq2Seq, MambaDataCollator, V2BatchSamplerDataCollatorForSeq2Seq, ) from axolotl.utils.collators.mm_chat import MultiModalChatDataCollator from axolotl.utils.import_helper import get_cls_from_module_str from axolotl.utils.logging import get_logger LOG = get_logger(__name__) class HFCausalTrainerBuilder(TrainerBuilderBase): """ Build the HuggingFace training args/trainer for causal models and reward modeling using TRL. 
""" def get_callbacks(self): callbacks = super().get_callbacks() if self.cfg.relora: callbacks.append(ReLoRACallback(self.cfg)) # TODO: check if can move to base class if self.cfg.loss_watchdog_threshold is not None: callbacks.append(LossWatchDogCallback(self.cfg)) if self.cfg.qat: callbacks.append(QATCallback(self.cfg.qat)) if self.cfg.include_tkps: callbacks.append( TokensPerSecondCallback( self.cfg.tensor_parallel_size, self.cfg.context_parallel_size, resume_from_checkpoint=self.cfg.resume_from_checkpoint, ) ) return callbacks def get_post_trainer_create_callbacks(self, trainer): callbacks = [] if self.cfg.use_wandb and self.cfg.eval_table_size > 0: LogPredictionCallback = log_prediction_callback_factory( trainer, self.tokenizer, "wandb" ) callbacks.append(LogPredictionCallback(self.cfg)) if ( self.cfg.use_mlflow and is_mlflow_available() and self.cfg.eval_table_size > 0 ): LogPredictionCallback = log_prediction_callback_factory( trainer, self.tokenizer, "mlflow" ) callbacks.append(LogPredictionCallback(self.cfg)) if self.cfg.use_comet and is_comet_available() and self.cfg.eval_table_size > 0: LogPredictionCallback = log_prediction_callback_factory( trainer, self.tokenizer, "comet_ml" ) callbacks.append(LogPredictionCallback(self.cfg)) if self.cfg.do_bench_eval: callbacks.append(bench_eval_callback_factory(trainer, self.tokenizer)) if self.cfg.do_causal_lm_eval: CausalLMBenchEvalCallback = causal_lm_bench_eval_callback_factory( trainer, self.tokenizer ) callbacks.append(CausalLMBenchEvalCallback(self.cfg)) if self.cfg.early_stopping_patience: early_stop_cb = EarlyStoppingCallback( self.cfg.early_stopping_patience, ) callbacks.append(early_stop_cb) if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers: callbacks.append(lisa_callback_factory(trainer)) if any("COLAB_" in key for key in os.environ): ColabCallback = colab_inference_post_train_callback(trainer) callbacks.append(ColabCallback(self.cfg)) if getattr(self.cfg, "generate_samples", False): from 
axolotl.utils.callbacks.generation import SFTGenerationCallback callbacks.append(SFTGenerationCallback(trainer)) LOG.info("SFT sample generation enabled") callbacks.extend(super().get_post_trainer_create_callbacks(trainer=trainer)) return callbacks def _get_trainer_cls(self): """ Gets the trainer class for the given configuration. """ if self.cfg.plugins: plugin_manager = PluginManager.get_instance() trainer_cls = plugin_manager.get_trainer_cls(self.cfg) if trainer_cls: return trainer_cls if self.cfg.model_config_type == "mamba": return AxolotlMambaTrainer if self.cfg.reward_model: return AxolotlRewardTrainer if self.cfg.process_reward_model: return AxolotlPRMTrainer if self.cfg.trainer_cls: # override the trainer cls try: trainer_cls = get_cls_from_module_str(self.cfg.trainer_cls) LOG.debug(f"Using custom trainer class: {self.cfg.trainer_cls}") return trainer_cls except (ImportError, AttributeError, ValueError) as e: raise ValueError( f"Failed to load custom trainer class '{self.cfg.trainer_cls}': {e}" ) from e return AxolotlTrainer def build(self, total_num_steps): from axolotl.core.training_args import ( AxolotlPRMConfig, AxolotlRewardConfig, AxolotlTrainingArguments, ) training_arguments_kwargs, trainer_kwargs = self._set_base_training_args( total_num_steps ) if self.cfg.adapter == "qlora": training_arguments_kwargs["qlora"] = True # deepspeed if self.cfg.deepspeed: training_arguments_kwargs["deepspeed"] = self.cfg.deepspeed if self.cfg.lr_quadratic_warmup is not None: training_arguments_kwargs["lr_quadratic_warmup"] = ( self.cfg.lr_quadratic_warmup ) if self.cfg.dataloader_drop_last is not None: training_arguments_kwargs["dataloader_drop_last"] = ( self.cfg.dataloader_drop_last ) elif self.cfg.sample_packing and self.cfg.eval_sample_packing is False: training_arguments_kwargs["dataloader_drop_last"] = True if self.cfg.remove_unused_columns is not None: training_arguments_kwargs["remove_unused_columns"] = ( self.cfg.remove_unused_columns ) if 
self.cfg.do_bench_eval: training_arguments_kwargs["do_bench_eval"] = self.cfg.do_bench_eval if self.cfg.bench_dataset: training_arguments_kwargs["bench_dataset"] = self.cfg.bench_dataset if self.cfg.do_causal_lm_eval: training_arguments_kwargs["do_causal_lm_eval"] = self.cfg.do_causal_lm_eval if self.cfg.metric_for_best_model: training_arguments_kwargs["metric_for_best_model"] = ( self.cfg.metric_for_best_model ) if self.cfg.greater_is_better: training_arguments_kwargs["greater_is_better"] = self.cfg.greater_is_better # DDP Config if self.cfg.ddp_timeout: training_arguments_kwargs["ddp_timeout"] = self.cfg.ddp_timeout # see https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html if self.cfg.ddp_bucket_cap_mb: training_arguments_kwargs["ddp_bucket_cap_mb"] = self.cfg.ddp_bucket_cap_mb if self.cfg.ddp_broadcast_buffers is not None: training_arguments_kwargs["ddp_broadcast_buffers"] = ( self.cfg.ddp_broadcast_buffers ) # these are all the "standard" kwargs that are def used training_arguments_kwargs["max_seq_length"] = self.cfg.sequence_len if self.cfg.auto_find_batch_size is not None: training_arguments_kwargs["auto_find_batch_size"] = ( self.cfg.auto_find_batch_size ) training_arguments_kwargs["eval_accumulation_steps"] = ( self.cfg.gradient_accumulation_steps ) training_arguments_kwargs["load_best_model_at_end"] = ( ( self.cfg.load_best_model_at_end is not False or self.cfg.early_stopping_patience ) and ( (not self.cfg.test_datasets and self.cfg.val_set_size > 0) or (self.cfg.test_datasets and self.cfg.val_set_size == 0) ) and self.cfg.save_steps and self.cfg.eval_steps and self.cfg.save_steps % self.cfg.eval_steps == 0 ) or False # handle ddp ddp_find_unused_parameters = None if self.cfg.ddp: ddp_find_unused_parameters = bool(self.cfg.ddp_find_unused_parameters) training_arguments_kwargs["ddp_find_unused_parameters"] = ( ddp_find_unused_parameters ) if self.cfg.group_by_length: training_arguments_kwargs["train_sampling_strategy"] = 
"group_by_length" training_arguments_kwargs["curriculum_sampling"] = self.cfg.curriculum_sampling training_arguments_kwargs["sample_packing"] = bool(self.cfg.sample_packing) training_arguments_kwargs["sample_packing_drop_attention_mask"] = bool( self.cfg.flash_attention or self.cfg.xformers_attention or self.cfg.flex_attention ) training_arguments_kwargs["multipack_real_batches"] = ( self.cfg.multipack_real_batches if self.cfg.multipack_real_batches is not None else not ( self.cfg.flash_attention or self.cfg.flex_attention or self.cfg.xformers_attention ) ) training_arguments_kwargs["eval_sample_packing"] = bool( self.cfg.eval_sample_packing ) if self.cfg.sample_packing_sequentially is not None: training_arguments_kwargs["sample_packing_sequentially"] = ( self.cfg.sample_packing_sequentially ) if self.cfg.sample_packing_bin_size is not None: training_arguments_kwargs["sample_packing_bin_size"] = ( self.cfg.sample_packing_bin_size ) if self.cfg.sample_packing_group_size is not None: training_arguments_kwargs["sample_packing_group_size"] = ( self.cfg.sample_packing_group_size ) if self.cfg.sample_packing_eff_est: training_arguments_kwargs["sample_packing_efficiency"] = ( self.cfg.sample_packing_eff_est ) if self.cfg.relora and self.cfg.jagged_restart_steps: if self.cfg.relora_prune_ratio: training_arguments_kwargs["relora_prune_ratio"] = ( self.cfg.relora_prune_ratio ) if self.cfg.jagged_restart_steps: training_arguments_kwargs["jagged_restart_steps"] = ( self.cfg.jagged_restart_steps ) if self.cfg.jagged_restart_warmup_steps: training_arguments_kwargs["jagged_restart_warmup_steps"] = ( self.cfg.jagged_restart_warmup_steps ) if self.cfg.jagged_restart_anneal_steps: training_arguments_kwargs["jagged_restart_anneal_steps"] = ( self.cfg.jagged_restart_anneal_steps ) if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers: training_arguments_kwargs["lisa_n_layers"] = self.cfg.lisa_n_layers training_arguments_kwargs["lisa_step_interval"] = ( self.cfg.lisa_step_interval 
) training_arguments_kwargs["lisa_layers_attribute"] = ( self.cfg.lisa_layers_attribute ) training_arguments_kwargs = self.hook_pre_create_training_args( training_arguments_kwargs ) training_arguments_kwargs["model_type"] = self.cfg.model_config_type training_arguments_kwargs["pretraining"] = bool(self.cfg.pretraining_dataset) if self.cfg.chat_template: training_arguments_kwargs["chat_template"] = get_chat_template_from_config( cfg=self.cfg, tokenizer=self.tokenizer, ) if self.cfg.neftune_noise_alpha is not None: training_arguments_kwargs["neftune_noise_alpha"] = ( self.cfg.neftune_noise_alpha ) if self.cfg.image_size: training_arguments_kwargs["image_size"] = self.cfg.image_size if self.cfg.image_resize_algorithm: training_arguments_kwargs["image_resize_algorithm"] = ( self.cfg.image_resize_algorithm ) if self.cfg.plugins: plugin_manager = PluginManager.get_instance() plugin_training_args = plugin_manager.get_training_args(self.cfg) if plugin_training_args: training_arguments_kwargs.update(plugin_training_args) if self.cfg.reward_model: training_args_cls = AxolotlRewardConfig if self.cfg.center_rewards_coefficient is not None: training_arguments_kwargs["center_rewards_coefficient"] = ( self.cfg.center_rewards_coefficient ) elif self.cfg.process_reward_model: training_args_cls = AxolotlPRMConfig else: training_args_cls = AxolotlTrainingArguments training_args = training_args_cls( **training_arguments_kwargs, ) training_args = self.hook_post_create_training_args(training_args) # unset run_name so wandb sets up experiment names if self.cfg.use_wandb and training_args.run_name == training_args.output_dir: training_args.run_name = None data_collator_kwargs = { "padding": True, # True/"longest" is the default } multiple = 64 if self.cfg.pad_to_sequence_len: data_collator_kwargs["pad_to_multiple_of"] = multiple * math.ceil( self.cfg.sequence_len / multiple ) elif self.cfg.pad_to_sequence_len is None: # A100 is best at 64, while others at 8. 
Let's use the larger so we don't have to check # https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html data_collator_kwargs["pad_to_multiple_of"] = multiple if self.cfg.use_eaft: from functools import partial from axolotl.monkeypatch.loss.eaft import eaft_loss configured_eaft_loss = partial( eaft_loss, alpha=self.cfg.eaft_alpha if self.cfg.eaft_alpha is not None else 1.0, k=self.cfg.eaft_k if self.cfg.eaft_k is not None else 20, ) trainer_kwargs["compute_loss_func"] = configured_eaft_loss trainer_cls = self._get_trainer_cls() trainer_kwargs, trainer_cls = self.hook_pre_create_trainer( trainer_kwargs, trainer_cls ) if eval_data_collator := self.build_collator( training_args, is_eval=True, **data_collator_kwargs ): if not (self.cfg.reward_model or self.cfg.process_reward_model): trainer_kwargs["eval_data_collator"] = eval_data_collator if not (self.cfg.reward_model or self.cfg.process_reward_model): trainer_kwargs["bench_data_collator"] = transformers.DataCollatorForSeq2Seq( self.tokenizer, return_tensors="pt", **data_collator_kwargs, ) sig = inspect.signature(trainer_cls) if "processing_class" in sig.parameters or issubclass(trainer_cls, Trainer): trainer_kwargs["processing_class"] = self.tokenizer elif "tokenizer" in sig.parameters: trainer_kwargs["tokenizer"] = self.tokenizer if ( trainer_cls not in [AxolotlRewardTrainer, AxolotlPRMTrainer] and self.cfg.datasets is not None ): trainer_kwargs["dataset_tags"] = [ d["path"] for d in self.cfg.datasets if not Path(d["path"]).is_dir() ] # TRL's RewardTrainer validates num_labels=1 on pre-loaded models; ensure the # config reflects this regardless of how the model was instantiated. 
def build_collator(
    self,
    training_args,  # type: "AxolotlTrainingArguments"  # type: ignore
    is_eval=False,
    **kwargs,
):
    """
    Pick the collator class that matches the config and instantiate it.

    Args:
        training_args: Training arguments; ``pretraining``, ``sample_packing``,
            ``eval_sample_packing``, ``chat_template``, ``image_size`` and
            ``image_resize_algorithm`` are read from it.
        is_eval: Whether the collator is built for evaluation. It selects
            between ``sample_packing`` and ``eval_sample_packing`` and is
            forwarded to the plugin manager.
        **kwargs: Keyword arguments forwarded to the collator constructor.
            Individual branches add or remove entries before instantiation.

    Returns:
        The instantiated collator, or ``None`` on the pretraining path when
        none of the pretraining collator conditions apply.
    """
    if training_args.pretraining:
        if (
            self.cfg.pretraining_sample_concatenation is False
            or self.cfg.micro_batch_size > 1
        ):
            return DataCollatorForSeq2Seq(self.tokenizer, **kwargs)
        if not (self.cfg.sample_packing and self.cfg.pretrain_multipack_attn) or (
            self.cfg.micro_batch_size == 1 and is_eval is False
        ):
            return None

    if self.cfg.model_config_type == "mamba":
        return MambaDataCollator(tokenizer=self.tokenizer)

    # The batch-sampler collators are used only when packing is enabled for
    # the split (train vs. eval) this collator is built for.
    use_batch_sampler_collator = False
    if is_eval is False and training_args.sample_packing:
        use_batch_sampler_collator = True
    if is_eval and training_args.eval_sample_packing:
        use_batch_sampler_collator = True

    collator: Type[
        Union[
            V2BatchSamplerDataCollatorForSeq2Seq,
            BatchSamplerDataCollatorForSeq2Seq,
            DataCollatorForSeq2Seq,
            DataCollatorWithFlattening,
            DataCollatorForPreference,
        ]
    ]
    collator_args = [self.tokenizer]

    collator_cls_and_kwargs = None
    if self.cfg.plugins:
        plugin_manager = PluginManager.get_instance()
        collator_cls_and_kwargs = plugin_manager.get_collator_cls_and_kwargs(
            self.cfg, is_eval=is_eval
        )

    if collator_cls_and_kwargs:
        # A plugin-provided collator takes precedence over every built-in one.
        collator = collator_cls_and_kwargs[0]
        plugin_kwargs = collator_cls_and_kwargs[1]
        # Merge the plugin's kwargs even when the caller passed none;
        # previously they were dropped whenever ``kwargs`` was empty.
        if plugin_kwargs:
            kwargs.update(plugin_kwargs)
    elif self.cfg.reward_model:
        collator = DataCollatorForPreference
        # This collator is built from the pad token id instead of the
        # tokenizer, so the tokenizer is removed from the positional args.
        tokenizer = collator_args.pop(0)
        kwargs["pad_token_id"] = tokenizer.pad_token_id
        # Tolerate a missing ``padding`` entry, like the flattening branch.
        kwargs.pop("padding", None)
    elif use_batch_sampler_collator:
        # Use V2BatchSamplerDataCollatorForSeq2Seq for flex attention,
        # supported multipack models, or non-flash-attention llama
        if (
            self.cfg.flex_attention
            or self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES
            or (
                self.cfg.model_config_type in ["llama"]
                and self.cfg.flash_attention is not True
            )
        ):
            collator = V2BatchSamplerDataCollatorForSeq2Seq
        else:
            collator = BatchSamplerDataCollatorForSeq2Seq
    else:
        if self.cfg.processor_type and self.processor:
            collator = MultiModalChatDataCollator
            kwargs["processing_strategy"] = get_processing_strategy(
                self.processor,
                training_args.chat_template,
                self.cfg.chat_template,
                image_size=training_args.image_size,
                image_resize_algorithm=training_args.image_resize_algorithm,
            )
        elif self.cfg.batch_flattening:
            collator = DataCollatorWithFlattening
            # Neither the tokenizer nor the padding options are passed on
            # to the flattening collator.
            collator_args.pop(0)
            kwargs.pop("pad_to_multiple_of", None)
            kwargs.pop("padding", None)
        else:
            collator = DataCollatorForSeq2Seq

    kwargs["return_tensors"] = "pt"

    return collator(
        *collator_args,
        **kwargs,
    )
class HFRLTrainerBuilder(TrainerBuilderBase):
    """Trainer factory class for TRL-based RLHF trainers (e.g. DPO)"""

    def get_callbacks(self):
        """Return the base callbacks, plus a ``QATCallback`` when ``cfg.qat`` is set."""
        callbacks = super().get_callbacks()
        if self.cfg.qat:
            callbacks.append(QATCallback(self.cfg.qat))
        return callbacks

    def get_post_trainer_create_callbacks(self, trainer):
        """Return the callbacks the base builder registers after trainer creation."""
        callbacks = super().get_post_trainer_create_callbacks(trainer=trainer)
        return callbacks

    def _use_async_grpo(self) -> bool:
        """
        Tell whether the ``trl`` config section asks for the async GRPO variant.

        Shared by the trainer-class and training-args selection so both always
        agree on which variant is in use.
        """
        trl_cfg = self.cfg.trl
        return bool(
            trl_cfg
            and (
                getattr(trl_cfg, "async_prefetch", False)
                or getattr(trl_cfg, "use_data_producer", False)
            )
        )

    def _get_trainer_cls(self, trainer_kwargs: dict):
        """
        Returns trainer_cls and trainer_cls_args

        Args:
            trainer_kwargs: Keyword arguments for the trainer; updated in place
                with the GRPO strategy's kwargs on the GRPO/GDPO path.

        Raises:
            ValueError: If ``cfg.rl`` is not a supported RL type, or if the
                custom ``cfg.trainer_cls`` cannot be loaded.
        """
        if self.cfg.plugins:
            plugin_manager = PluginManager.get_instance()
            trainer_cls = plugin_manager.get_trainer_cls(self.cfg)
            # NOTE(review): a plugin-provided trainer class is returned with an
            # empty positional-args list (the model is not included) — confirm
            # this is what plugin trainers expect.
            trainer_cls_args = []  # type: ignore
            if trainer_cls is not None:
                return trainer_cls, trainer_cls_args

        trainer_cls = None
        trainer_cls_args = [self.model]

        if self.cfg.rl in {RLType.GRPO, RLType.GDPO}:
            from axolotl.core.trainers.grpo import GRPOStrategy

            trainer_cls = GRPOStrategy.get_trainer_class(
                sequence_parallel=self.cfg.context_parallel_size > 1,
                async_grpo=self._use_async_grpo(),
            )
            trainer_cls_args.extend(GRPOStrategy.set_trainer_args(self.cfg))
            trainer_kwargs.update(GRPOStrategy.set_trainer_kwargs(self.cfg))
        elif self.cfg.rl in [RLType.DPO, RLType.IPO]:
            trainer_cls = DPOStrategy.get_trainer_class()
            # DPO/IPO take the reference model as second positional argument.
            trainer_cls_args.append(self.model_ref)
        elif self.cfg.rl is RLType.ORPO:
            trainer_cls = AxolotlORPOTrainer
        elif self.cfg.rl is RLType.KTO:
            trainer_cls = AxolotlKTOTrainer
        elif self.cfg.rl is RLType.SIMPO:
            trainer_cls = AxolotlCPOTrainer
        else:
            raise ValueError(f"Unsupported RL: {self.cfg.rl}")

        if self.cfg.trainer_cls:
            # override the trainer cls
            try:
                trainer_cls = get_cls_from_module_str(self.cfg.trainer_cls)
                LOG.debug(f"Using custom trainer class: {self.cfg.trainer_cls}")
            except (ImportError, AttributeError, ValueError) as e:
                raise ValueError(
                    f"Failed to load custom trainer class '{self.cfg.trainer_cls}': {e}"
                ) from e

        return trainer_cls, trainer_cls_args

    def _build_training_arguments(self, total_num_steps):
        """
        Returns training_args and trainer_kwargs

        Starts from the base training args, layers the RL-type specific
        settings on top, removes the keys the selected config class is
        blocklisted from receiving, and finally applies plugin overrides.

        Raises:
            ValueError: If ``cfg.rl`` is not a supported RL type.
        """
        from axolotl.core.training_args import (
            AxolotlCPOConfig,
            AxolotlKTOConfig,
            AxolotlORPOConfig,
        )

        training_args_kwargs, trainer_kwargs = self._set_base_training_args(
            total_num_steps=total_num_steps
        )

        if self.cfg.remove_unused_columns is not None:
            training_args_kwargs["remove_unused_columns"] = (
                self.cfg.remove_unused_columns
            )
        else:
            training_args_kwargs["remove_unused_columns"] = False

        # beta precedence: trl.beta, then rl_beta, then orpo_alpha
        if self.cfg.trl and self.cfg.trl.beta is not None:
            training_args_kwargs["beta"] = self.cfg.trl.beta
        elif self.cfg.rl_beta is not None:
            training_args_kwargs["beta"] = self.cfg.rl_beta
        elif self.cfg.orpo_alpha is not None:
            # trl does some odd mapping of alpha to beta to reuse the beta parameter ???
            training_args_kwargs["beta"] = self.cfg.orpo_alpha

        if self.cfg.rpo_alpha is not None:
            training_args_kwargs["rpo_alpha"] = self.cfg.rpo_alpha

        if self.cfg.use_wandb:
            training_args_kwargs["run_name"] = self.cfg.wandb_name

        training_args_cls = None
        blocklist_args_kwargs = []
        if self.cfg.rl is RLType.SIMPO:
            training_args_cls = AxolotlCPOConfig
            training_args_kwargs["loss_type"] = "simpo"
            training_args_kwargs["simpo_gamma"] = self.cfg.simpo_gamma
            if self.cfg.cpo_alpha is not None:
                training_args_kwargs["cpo_alpha"] = self.cfg.cpo_alpha
            blocklist_args_kwargs.append("max_prompt_length")
        elif self.cfg.rl is RLType.ORPO:
            training_args_cls = AxolotlORPOConfig
            blocklist_args_kwargs.append("max_prompt_length")
        elif self.cfg.rl is RLType.KTO:
            training_args_cls = AxolotlKTOConfig
            # KTOConfig in TRL >= 0.27.0 no longer accepts max_prompt_length
            blocklist_args_kwargs.append("max_prompt_length")
            training_args_kwargs["desirable_weight"] = (
                self.cfg.kto_desirable_weight or 1.0
            )
            training_args_kwargs["undesirable_weight"] = (
                self.cfg.kto_undesirable_weight or 1.0
            )
        elif self.cfg.rl in {RLType.GRPO, RLType.GDPO}:
            from axolotl.core.trainers.grpo import GRPOStrategy

            training_args_cls = GRPOStrategy.get_training_args_class(
                async_grpo=self._use_async_grpo()
            )
            training_args_kwargs.update(GRPOStrategy.set_training_args_kwargs(self.cfg))
            blocklist_args_kwargs = GRPOStrategy.get_blocklist_args_kwargs()
            if self.cfg.rl is RLType.GDPO:
                # only applied when the strategy kwargs did not already set it
                training_args_kwargs.setdefault(
                    "multi_objective_aggregation", "normalize_then_sum"
                )
        elif self.cfg.rl in [RLType.DPO, RLType.IPO]:
            training_args_cls = AxolotlDPOConfig
            training_args_kwargs.update(DPOStrategy.set_training_args_kwargs(self.cfg))
        else:
            raise ValueError(f"Unsupported RL: {self.cfg.rl}")

        for blocklist_key in blocklist_args_kwargs:
            if blocklist_key in training_args_kwargs:
                del training_args_kwargs[blocklist_key]

        # Plugin-provided args are merged after the blocklist pass, so they
        # are not filtered by it.
        if self.cfg.plugins:
            plugin_manager = PluginManager.get_instance()
            plugin_training_args = plugin_manager.get_training_args(self.cfg)
            if plugin_training_args:
                training_args_kwargs.update(plugin_training_args)

        training_args = training_args_cls(
            logging_first_step=True,
            **training_args_kwargs,
        )

        # unset run_name so wandb sets up experiment names
        if self.cfg.use_wandb and training_args.run_name == training_args.output_dir:
            training_args.run_name = None

        return training_args, trainer_kwargs

    def build(self, total_num_steps):
        """
        Build the RL trainer for the configured ``cfg.rl`` type.

        Args:
            total_num_steps: Passed through to the training-args construction.

        Returns:
            The trainer instance, after the post-create hook has run and the
            post-create callbacks have been added.
        """
        training_args, trainer_kwargs = self._build_training_arguments(total_num_steps)

        if self.eval_dataset:
            trainer_kwargs["eval_dataset"] = self.eval_dataset
        # NOTE(review): GDPO is grouped with GRPO elsewhere in this class but
        # not here — confirm whether GDPO should also skip ``peft_config``.
        if self.cfg.adapter and self.peft_config and self.cfg.rl is not RLType.GRPO:
            trainer_kwargs["peft_config"] = self.peft_config
        if self.cfg.precompute_ref_log_probs is not None:
            trainer_kwargs["precompute_ref_log_probs"] = (
                self.cfg.precompute_ref_log_probs
            )

        trainer_cls, trainer_cls_args = self._get_trainer_cls(trainer_kwargs)

        # Pass the tokenizer under whichever keyword the trainer class accepts.
        sig = inspect.signature(trainer_cls)
        if "tokenizer" in sig.parameters:
            trainer_kwargs["tokenizer"] = self.tokenizer
        else:
            trainer_kwargs["processing_class"] = self.tokenizer

        if self.cfg.datasets is not None and (
            trainer_cls is DPOStrategy.get_trainer_class()
        ):
            # local directories are left out of the dataset tags
            trainer_kwargs["dataset_tags"] = [
                d["path"] for d in self.cfg.datasets if not Path(d["path"]).is_dir()
            ]

        trainer_kwargs, trainer_cls = self.hook_pre_create_trainer(
            trainer_kwargs, trainer_cls
        )

        # Allow FP8-quantized models to be fine-tuned with LoRA adapters.
        # transformers' validate_quantization_for_training blocks FP8 because
        # hf_quantizer.is_trainable is False, but LoRA only trains the adapters
        # (base weights stay frozen in FP8).
        _orig_validate_quant = None
        if (
            self.cfg.adapter
            and hasattr(self.model, "is_quantized")
            and self.model.is_quantized
        ):
            import transformers.trainer as _trainer_module

            _orig_validate_quant = _trainer_module.validate_quantization_for_training
            _trainer_module.validate_quantization_for_training = lambda model: None

        try:
            trainer = trainer_cls(
                *trainer_cls_args,
                args=training_args,
                train_dataset=self.train_dataset,
                callbacks=self.get_callbacks(),
                **trainer_kwargs,
            )
        finally:
            # Always restore the patched validator, even if construction fails.
            if _orig_validate_quant is not None:
                import transformers.trainer as _trainer_module

                _trainer_module.validate_quantization_for_training = (
                    _orig_validate_quant
                )

        if self.cfg.fsdp_config or self.cfg.fsdp:
            ensure_dtype(trainer.model, dtype=self.cfg.torch_dtype)
            if self.cfg.rl in [RLType.DPO, RLType.IPO] and trainer.ref_model:
                ensure_dtype(trainer.ref_model, dtype=self.cfg.torch_dtype)

        trainer = self.hook_post_create_trainer(trainer)
        for callback in self.get_post_trainer_create_callbacks(trainer):
            trainer.add_callback(callback)

        return trainer
def wrap_tools(message: Messages):
    """
    Surround each tool call / tool response part of ``message.content`` with
    newline text parts, and flag the tool part itself as newline-terminated.

    The content list is modified in place and the same message is returned.
    """
    wrapped_types = ("tool_call", "tool_response")

    # Walk the indices from last to first: inserting around position ``idx``
    # only shifts entries that have already been visited.
    for idx in reversed(range(len(message.content))):
        part = message.content[idx]
        if part.type not in wrapped_types:
            continue

        # make sure the actual tool content ends with a newline
        part.has_newline = True

        trailing_newline = MessageContents(
            type="text", value="\n", weight=message.weight
        )
        message.content.insert(idx + 1, trailing_newline)

        # inserting at ``idx`` pushes the tool part itself to ``idx + 1``
        leading_newline = MessageContents(
            type="text", value="\n", weight=message.weight
        )
        message.content.insert(idx, leading_newline)

    return message
class Messages(BaseModel):
    """
    A single chat message: a role, its ordered content parts, optional
    metadata, a training weight, and a flag recording chat formatting
    """

    role: Union[MessageRoles, str]  # arbitrary role names are accepted as well
    content: List["MessageContents"]
    meta: Optional[dict[str, Any]] = None  # free-form additional metadata
    weight: Optional[Union[int, float]] = None
    is_chat_formatted: bool = False

    def __str__(self) -> str:
        return "".join(map(str, self.content))

    def tokenized(
        self, tokenizer: PreTrainedTokenizer, ignore_index=-100
    ) -> dict[str, List[int]]:
        """
        Tokenize the message part by part and build the matching labels.

        After each supported part, the concatenated text seen so far is
        tokenized again. Tokens attributed to a part are only committed once
        the following part has been tokenized, using the ids found at that
        position in the newer tokenization. A part's tokens are copied into
        the labels when both the message weight and the part weight are
        non-zero; otherwise they are replaced by ``ignore_index``.

        Returns:
            A dict with ``input_ids``, ``attention_mask`` (all ones) and
            ``labels``.
        """
        tokenizable_types = (
            MessageContentTypes.text.value,
            MessageContentTypes.tool_call.value,
            MessageContentTypes.tool_response.value,
        )

        token_ids: List[int] = []
        label_ids: List[int] = []
        carried_ids: List[int] = []  # tokens of the previous part, not yet committed
        carried_trainable = self.weight
        text_so_far = ""

        for part in self.content:
            # TODO also handle non-text content types
            if part.type not in tokenizable_types:
                continue

            text_so_far += str(part)
            full_ids = tokenizer(text_so_far, add_special_tokens=False)["input_ids"]

            if carried_ids:
                # Re-read the carried span from the newest tokenization, then
                # commit it.
                start = len(token_ids)
                carried_ids = full_ids[start : start + len(carried_ids)]
                token_ids.extend(carried_ids)
                label_ids.extend(
                    carried_ids
                    if carried_trainable
                    else [ignore_index] * len(carried_ids)
                )

            # Everything past the committed prefix belongs to the current part.
            carried_ids = full_ids[len(token_ids) :]
            carried_trainable = self.weight and part.weight not in [0, 0.0]

        # Commit whatever is still carried from the last supported part.
        token_ids.extend(carried_ids)
        label_ids.extend(
            carried_ids if carried_trainable else [ignore_index] * len(carried_ids)
        )

        return {
            "input_ids": token_ids,
            "attention_mask": [1] * len(token_ids),
            "labels": label_ids,
        }
class TokenizedChatDataset(Dataset):
    """
    Tokenized chat dataset
    """

    def __init__(
        self,
        data: Dataset,
        model_transform: Union[PreTrainedTokenizer, Callable],
        *args,
        message_transform: Optional[Callable] = None,
        formatter=None,
        process_count: Optional[int] = None,
        keep_in_memory: Optional[bool] = False,
        **kwargs,
    ):
        """
        Map every row of ``data`` to its tokenized chat representation.

        Args:
            data: Source dataset; all of its original columns are removed
                from the mapped result.
            model_transform: Passed to ``ChatFormattedChats.tokenized`` as the
                tokenizer.
            *args: Forwarded to the parent ``Dataset`` constructor.
            message_transform: Optional callable applied to each row before
                it is turned into a ``ChatFormattedChats``.
            formatter: Optional formatter handed to ``ChatFormattedChats``.
            process_count: Forwarded to ``Dataset.map`` as ``num_proc``.
            keep_in_memory: Forwarded to ``Dataset.map``.
            **kwargs: Forwarded to the parent ``Dataset`` constructor.
        """

        def map_fn(ex):
            if message_transform is not None:
                ex = message_transform(ex)
            if formatter is not None:
                ex = ChatFormattedChats(
                    formatter=formatter,
                    **ex,
                )
            else:
                ex = ChatFormattedChats(
                    **ex,
                )
            return ex.tokenized(model_transform)

        # ``remove_columns`` is documented as ``str | list[str]``; pass a real
        # list instead of a ``dict_keys`` view, which cannot be pickled.
        features = list(data.features.keys())
        tokenized_data = data.map(
            map_fn,
            num_proc=process_count,
            keep_in_memory=keep_in_memory,
            remove_columns=features,
            desc="Tokenizing Chats",
        )
        super().__init__(tokenized_data.data, *args, **kwargs)
================================================ FILE: src/axolotl/core/datasets/transforms/__init__.py ================================================ ================================================ FILE: src/axolotl/core/datasets/transforms/chat_builder.py ================================================ """ This module contains a function that builds a transform that takes a row from the dataset and converts it to a Chat. """ from typing import Any, Mapping def chat_message_transform_builder( train_on_inputs=False, conversations_field: str = "messages", message_field_role: str | list[str] | None = None, # commonly "role" message_field_content: str | list[str] | None = None, # commonly "content" message_field_training: str | list[str] | None = None, # commonly "weight" ): """Builds a transform that takes a row from the dataset and converts it to a Chat Args: train_on_inputs (bool, optional): If True, the transform will train on the inputs. If False, the transform will train on the targets. Defaults to False. conversations_field (str, optional): The field name of the conversations. Defaults to "messages". message_field_role (str | list[str], optional): The field name of the role. message_field_content (str | list[str], optional): The field name of the message content. message_field_training (str | list[str], optional): The field name of the train/weight. Returns: Callable: A function that takes a list of conversations and returns a list of messages. 
""" if message_field_training is None: message_field_training = ["train", "weight"] if message_field_content is None: message_field_content = ["value", "text", "content"] if message_field_role is None: message_field_role = ["role", "from"] message_field_role = ( [message_field_role] if isinstance(message_field_role, str) else message_field_role ) message_field_content = ( [message_field_content] if isinstance(message_field_content, str) else message_field_content ) message_weight_fields = ( [message_field_training] if isinstance(message_field_training, str) else message_field_training ) role_value_mappings = { "system": "system", "user": "user", "human": "user", "assistant": "assistant", "gpt": "assistant", "tool": "tool", "ipython": "ipython", } if train_on_inputs: role_default_weights_mappings = { "system": 1, "user": 1, "assistant": 1, "tool": 1, "ipython": 1, } else: role_default_weights_mappings = { "system": 0, "user": 0, "assistant": 1, "tool": 0, "ipython": 0, } def transform_builder(sample: Mapping[str, Any]): if conversations_field not in sample: raise ValueError(f"Field '{conversations_field}' not found in sample.") # if none of the role fields are in the message, raise an error if not any( role in sample[conversations_field][0] for role in message_field_role ): raise ValueError("No role field found in message.") role_field = next( role for role in message_field_role if role in sample[conversations_field][0] ) if not any( field in sample[conversations_field][0] for field in message_field_content ): raise ValueError("No message_content field found in message.") message_content_field = next( field for field in message_field_content if field in sample[conversations_field][0] ) if not any( field in sample[conversations_field][0] for field in message_field_training ): message_weight_field = None else: message_weight_field = next( field for field in message_weight_fields if field in sample[conversations_field][0] ) messages = [] for message in 
sample[conversations_field]: role = role_value_mappings[message[role_field]] weight = ( int(message[message_weight_field]) if message_weight_field else role_default_weights_mappings[role] ) # TODO if "tool_calls" in message[message_content_field]: then convert tool call to ToolCallContents if isinstance(message[message_content_field], str): messages.append( { "role": role, "content": [ { "type": "text", "value": message[message_content_field], } ], "weight": weight, } ) else: messages.append( { "role": role, "content": message[message_content_field], "weight": weight, } ) return {"conversation": messages} return transform_builder ================================================ FILE: src/axolotl/core/trainers/__init__.py ================================================ """Init for axolotl.core.trainers""" # flake8: noqa from .base import AxolotlTrainer from .dpo.trainer import AxolotlDPOTrainer from .mamba import AxolotlMambaTrainer from .trl import ( AxolotlCPOTrainer, AxolotlKTOTrainer, AxolotlORPOTrainer, AxolotlPRMTrainer, AxolotlRewardTrainer, ) ================================================ FILE: src/axolotl/core/trainers/base.py ================================================ """Module for customized trainers""" from __future__ import annotations import json import math import os from collections import defaultdict from functools import partial, wraps from typing import Any, Callable, Literal, Optional import datasets import safetensors import torch from accelerate.state import AcceleratorState from datasets import Dataset from peft import PeftModel from torch.utils.data import ( BatchSampler, DataLoader, RandomSampler, Sampler, SequentialSampler, ) from transformers import PreTrainedModel, Trainer from transformers.trainer import TRAINING_ARGS_NAME from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, has_length, seed_worker from transformers.utils import SAFE_WEIGHTS_NAME, is_peft_available from trl.experimental.utils import pad_to_length from 
def _create_multipack_sampler(
    self, base_sampler: Sampler, dataset: Dataset
) -> MultipackBatchSampler:
    """
    Wrap ``base_sampler`` in a `MultipackBatchSampler` for sample packing.

    Args:
        base_sampler: Sampler to wrap with `MultipackBatchSampler`.
        dataset: Dataset to sample from; its lengths are passed to the
            batch sampler.

    Returns:
        Multipack (sample packing) batch sampler.
    """
    args = self.args
    if args.multipack_real_batches:
        # the per-device batch size stays as is, each slot is one max-length sequence
        batch_size = args.per_device_train_batch_size
        batch_max_len = args.max_seq_length
    else:
        # a single slot whose capacity is batch size times the max sequence length
        batch_size = 1
        effective_batch = (
            self.state.train_batch_size or args.per_device_train_batch_size
        )
        batch_max_len = effective_batch * args.max_seq_length

    multipack_sampler = MultipackBatchSampler(
        base_sampler,
        lengths=get_dataset_lengths(dataset),
        packing_efficiency_estimate=args.sample_packing_efficiency,
        batch_max_len=batch_max_len,
        batch_size=batch_size,
        group_size=args.sample_packing_group_size,
        bin_size=args.sample_packing_bin_size,
        sequential=args.sample_packing_sequentially,
        drop_last=True,
        num_processes=args.dataset_num_proc,
        mp_start_method=args.sample_packing_mp_start_method or "fork",
    )

    # The length is requested and discarded — presumably to make the sampler
    # compute its packing eagerly; verify against MultipackBatchSampler.__len__.
    len(multipack_sampler)

    return multipack_sampler
""" # from https://github.com/huggingface/transformers/blob/2166b6b4ff09f6dd3867ab982f262f66482aa968/src/transformers/trainer.py#L969C1-L972C24 if train_dataset is None: train_dataset = self.train_dataset if train_dataset is None or not has_length(train_dataset): return None use_sample_packing = self.args.sample_packing and not self.args.pretraining # Determine the base sampler first if self.args.curriculum_sampling: base_sampler = SequentialSampler(train_dataset) elif use_sample_packing: base_sampler = RandomSampler(train_dataset) else: # Default to parent class implementation for standard random sampling return super()._get_train_sampler(train_dataset) # Apply multipack wrapper if needed if use_sample_packing: return self._create_multipack_sampler( base_sampler=base_sampler, dataset=train_dataset, ) return base_sampler def _get_eval_sampler(self, eval_dataset: Dataset | None = None) -> Sampler | None: """ Helper method to get the sampler for evaluation. Handles sample packing case. Returns: If the dataset is non-empty, a sampler is returned, the type of which depends on the passed training args. 
""" # from https://github.com/huggingface/transformers/blob/2166b6b4ff09f6dd3867ab982f262f66482aa968/src/transformers/trainer.py#L1065C9-L1066C24 if eval_dataset is None or not has_length(eval_dataset): return None # Multipacking enabled if training is enabled and eval is not explicitly disabled use_multipack = ( self.args.sample_packing and self.args.eval_sample_packing is not False ) # Determine the base sampler if use_multipack: base_sampler = SequentialSampler(eval_dataset) else: return super()._get_eval_sampler(eval_dataset) # Apply multipack wrapper if needed if use_multipack: return self._create_multipack_sampler( base_sampler=base_sampler, dataset=eval_dataset, ) return base_sampler def _get_dataloader( self, dataset: Dataset, description: str, batch_size: int, sampler_fn: Optional[Callable[[Dataset], torch.utils.data.Sampler]] = None, is_training: bool = False, dataloader_key: Optional[str] = None, ) -> DataLoader: """Create a [`~torch.utils.data.DataLoader`] from the given dataset.""" data_collator = self.data_collator if is_training else self.eval_data_collator if isinstance(dataset, datasets.Dataset): if is_training: if not self.args.sample_packing or self.args.pretraining: dataset = self._remove_unused_columns( dataset, description="training" ) elif ( not is_training and self.args.sample_packing and self.args.eval_sample_packing is not False ): batch_size = ( batch_size if self.args.sample_packing else self.args.per_device_eval_batch_size ) else: dataset = self._remove_unused_columns(dataset, description=description) else: data_collator = self._get_collator_with_removed_columns( self.data_collator, description=description ) dataloader_params = { "batch_size": batch_size, "collate_fn": data_collator, "num_workers": self.args.dataloader_num_workers, "pin_memory": self.args.dataloader_pin_memory, "persistent_workers": self.args.dataloader_persistent_workers, } if not isinstance(dataset, torch.utils.data.IterableDataset): dataloader_params["drop_last"] = 
get_not_null( self.args.dataloader_drop_last, True ) if sampler_fn is not None: sampler = sampler_fn(dataset) if isinstance(sampler, BatchSampler): # batch_size and batch_sampler are mutually exclusive dataloader_params["batch_sampler"] = sampler del dataloader_params["batch_size"] del dataloader_params["drop_last"] else: dataloader_params["sampler"] = sampler dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor if is_training: dataloader_params["worker_init_fn"] = partial( seed_worker, num_workers=self.args.dataloader_num_workers, rank=self.args.process_index, ) if self.args.sample_packing and ( (is_training and not self.args.pretraining) or (not is_training and self.args.eval_sample_packing is not False) ): self.accelerator.even_batches = False if dataset.column_names and "length" in dataset.column_names: dataset = dataset.remove_columns(["length"]) if ( dataset.column_names and "position_ids" in dataset.column_names and "attention_mask" in dataset.column_names and self.args.sample_packing and self.args.sample_packing_drop_attention_mask ): dataset = dataset.remove_columns(["attention_mask"]) dataloader = DataLoader(dataset, **dataloader_params) # Accelerator.free_memory() will destroy the references, so # we need to store the non-prepared version for eval dataloaders. 
# fmt: off if dataloader_key is not None and self.args.dataloader_persistent_workers: if hasattr(self, "_eval_dataloaders"): self._eval_dataloaders[dataloader_key] = dataloader # type: ignore else: self._eval_dataloaders = {dataloader_key: dataloader} # fmt: on return self.accelerator.prepare(dataloader) def _get_bench_sampler( self, bench_dataset: Dataset ) -> torch.utils.data.Sampler | None: if self.args.world_size <= 1: return SequentialSampler(bench_dataset) return None def get_bench_dataloader( self, bench_dataset: Dataset, ) -> DataLoader: dataloader_params = { "batch_size": self.args.eval_batch_size, "collate_fn": self.bench_data_collator, "num_workers": self.args.dataloader_num_workers, "pin_memory": self.args.dataloader_pin_memory, } if self.args.dataloader_prefetch_factor: dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor if not isinstance(bench_dataset, torch.utils.data.IterableDataset): dataloader_params["sampler"] = self._get_bench_sampler(bench_dataset) dataloader_params["drop_last"] = self.args.dataloader_drop_last return DataLoader(bench_dataset, **dataloader_params) # return self.accelerator.prepare(DataLoader(bench_dataset, **dataloader_params)) @override def compute_loss( self, model, inputs, return_outputs=False, num_items_in_batch=None ): # use one's weighted cross entropy loss calc # if self.args.sample_packing: # labels = inputs.pop("labels") # outputs = model(**inputs) # loss = trainer_weighted_loss(outputs, labels, shift_labels=True) # return (loss, outputs) if return_outputs else loss # track number of tokens for tokens per second calculation if self.args.include_tkps and model.training: inputs_key = "labels" if "labels" in inputs else "input_ids" trainable_tokens = (inputs[inputs_key] != -100).sum() total_tokens = inputs[inputs_key].numel() total_tokens = torch.tensor(total_tokens, device=inputs[inputs_key].device) if is_distributed(): torch.distributed.all_reduce( trainable_tokens, 
op=torch.distributed.ReduceOp.SUM ) torch.distributed.all_reduce( total_tokens, op=torch.distributed.ReduceOp.SUM ) if not hasattr(self.state, "tokens"): self.state.tokens = { "trainable": torch.zeros(1), "total": torch.zeros(1), } # trainable tokens for throughput and total token slots for summaries self.state.tokens["trainable"] = ( self.state.tokens["trainable"] + trainable_tokens.detach().cpu() ) self.state.tokens["total"] = self.state.tokens["total"] + total_tokens.cpu() # Store per-step trainable tokens for throughput calculation self.state.tokens["trainable_tokens"] = trainable_tokens.detach().cpu() if self.args.orpo_alpha: return self.orpo_compute_loss( model, inputs, return_outputs=return_outputs, num_items_in_batch=num_items_in_batch, ) return super().compute_loss( model, inputs, return_outputs=return_outputs, num_items_in_batch=num_items_in_batch, ) @override def evaluate(self, *args, **kwargs): LOG.info("Running evaluation step...") return super().evaluate(*args, **kwargs) @staticmethod def orpo_concatenate_inputs(inputs, label_pad_token=-100, pad_token=0, device=None): concatenated_batch = {} max_length = max( inputs["input_ids"].shape[1], inputs["rejected_input_ids"].shape[1] ) # Concatenate positive and negative inputs concatenated_batch["input_ids"] = pad_to_length( inputs["input_ids"], max_length, pad_token ) concatenated_batch["rejected_input_ids"] = pad_to_length( inputs["rejected_input_ids"], max_length, pad_token ) concatenated_batch["labels"] = pad_to_length( inputs["labels"], max_length, label_pad_token ) concatenated_batch["rejected_labels"] = pad_to_length( inputs["rejected_labels"], max_length, label_pad_token ) concatenated_batch["attention_mask"] = pad_to_length( inputs["attention_mask"], max_length, 0 ) concatenated_batch["rejected_attention_mask"] = pad_to_length( inputs["rejected_attention_mask"], max_length, 0 ) concatenated_batch["prompt_attention_mask"] = pad_to_length( inputs["prompt_attention_mask"], max_length, 0 
).to(device=device) input_ids = torch.cat( [concatenated_batch["input_ids"], concatenated_batch["rejected_input_ids"]], dim=0, ).to(device=device) attention_mask = torch.cat( [ concatenated_batch["attention_mask"], concatenated_batch["rejected_attention_mask"], ], dim=0, ).to(device=device) labels = torch.cat( [concatenated_batch["labels"], concatenated_batch["rejected_labels"]], dim=0 ).to(device=device) return { "input_ids": input_ids, "labels": labels, "attention_mask": attention_mask, "prompt_attention_mask": concatenated_batch["prompt_attention_mask"], } def orpo_compute_custom_loss(self, logits, labels): logits = logits.contiguous() loss = 0.0 if labels is not None: # move labels to correct device to enable model parallelism labels = labels.to(logits.device) # Shift so that tokens < n predict n shift_logits = logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss = self.loss_fct(shift_logits.transpose(2, 1), shift_labels).mean( dim=-1 ) return loss def orpo_compute_logps( self, prompt_attention_mask, chosen_inputs, chosen_attention_mask, logits ): # Get the shape of chosen_attention_mask[:, :-1] chosen_shape = chosen_attention_mask[:, :-1].shape # Calculate the padding size pad_length = chosen_shape[1] - (prompt_attention_mask.shape[1] - 1) # Pad prompt_attention_mask with zeros to match the desired shape prompt_attention_mask_padded = torch.nn.functional.pad( prompt_attention_mask[:, 1:], (0, pad_length), mode="constant", value=0 ) # Perform the subtraction operation mask = chosen_attention_mask[:, :-1] > prompt_attention_mask_padded per_token_logps = torch.gather( logits[:, :-1, :].log_softmax(-1), dim=2, index=(mask * chosen_inputs[:, 1:]).unsqueeze(2), ).squeeze(2) return torch.mul(per_token_logps, mask).sum(dim=1) / mask.sum(dim=1) def orpo_compute_loss( self, model, inputs, return_outputs=False, num_items_in_batch=None, ): concat_inputs = AxolotlTrainer.orpo_concatenate_inputs( inputs, 
label_pad_token=-100, pad_token=self.tokenizer.pad_token_id, device=self.accelerator.device, ) # Perform a single forward pass outputs = model( **{ "input_ids": concat_inputs["input_ids"], "attention_mask": concat_inputs["attention_mask"], "labels": concat_inputs["labels"], }, output_hidden_states=True, ) # Split the outputs for positive and negative examples outputs_pos, outputs_neg = outputs.logits.chunk(2) # Calculate NLL loss pos_loss = self.orpo_compute_custom_loss( logits=outputs_pos, labels=concat_inputs["input_ids"].chunk(2)[0] ) # Calculate Log Probability pos_prob = self.orpo_compute_logps( prompt_attention_mask=concat_inputs["prompt_attention_mask"], chosen_inputs=concat_inputs["input_ids"].chunk(2)[0], chosen_attention_mask=concat_inputs["attention_mask"].chunk(2)[0], logits=outputs_pos, ) neg_prob = self.orpo_compute_logps( prompt_attention_mask=concat_inputs["prompt_attention_mask"], chosen_inputs=concat_inputs["input_ids"].chunk(2)[1], chosen_attention_mask=concat_inputs["attention_mask"].chunk(2)[1], logits=outputs_neg, ) # Calculate log odds log_odds = (pos_prob - neg_prob) - ( torch.log(1 - torch.exp(pos_prob)) - torch.log(1 - torch.exp(neg_prob)) ) sig_ratio = torch.nn.functional.sigmoid(log_odds) ratio = torch.log(sig_ratio) # Calculate the Final Loss loss = torch.mean(pos_loss - self.args.orpo_alpha * ratio).to( dtype=torch.bfloat16 ) metrics = {} metrics["chosen_geometric_mean"] = torch.mean(pos_prob).cpu().item() metrics["rejected_geometric_mean"] = torch.mean(neg_prob).cpu().item() metrics["log_odds_ratio"] = torch.mean(ratio).cpu().item() metrics["log_odds"] = torch.mean(log_odds).cpu().item() self.store_metrics(metrics, train_eval="train") return (loss, outputs_pos) if return_outputs else loss @wraps(Trainer.push_to_hub) def push_to_hub(self, *args, **kwargs) -> str: """ Overwrite the `push_to_hub` method in order to force-add the tags when pushing the model on the Hub. Please refer to `~transformers.Trainer.push_to_hub` for more details. 
""" kwargs = sanitize_kwargs_for_ds_tagging( dataset_tags=self.dataset_tags, kwargs=kwargs ) kwargs = sanitize_kwargs_for_tagging(tag_names=self.tag_names, kwargs=kwargs) return super().push_to_hub(*args, **kwargs) @wraps(Trainer.create_accelerator_and_postprocess) def create_accelerator_and_postprocess(self): # cleanup the PartialState states so Accelerate automatically configures everything from the env vars accelerator_config = self.args.accelerator_config.to_dict() use_configured_state = accelerator_config.get("use_configured_state", False) if not use_configured_state: AcceleratorState._reset_state(reset_partial_state=True) super().create_accelerator_and_postprocess() def additional_accelerator_args( self, fp8: bool = False, enable_fsdp_float8_all_gather: bool = False, **kwargs ) -> dict[str, Any]: ret_kwargs = {} if fp8: from accelerate.utils import AORecipeKwargs from torchao.float8 import Float8LinearConfig # By default, Float8LinearConfig is instantiated using the "tensorwise" # scaling strategy. See more details here: # https://github.com/pytorch/ao/tree/main/torchao/float8. config = Float8LinearConfig( enable_fsdp_float8_all_gather=enable_fsdp_float8_all_gather, force_recompute_fp8_weight_in_bwd=enable_fsdp_float8_all_gather is True, ) ret_kwargs["mixed_precision"] = "fp8" ret_kwargs["kwargs_handlers"] = [AORecipeKwargs(config=config)] # type: ignore os.environ["ACCELERATE_MIXED_PRECISION"] = "fp8" return ret_kwargs def log(self, logs: dict[str, float], start_time: float | None = None) -> None: """ Log `logs` on the various objects watching training, including stored metrics. Args: logs: The values to log. start_time: The start of training. 
""" # logs either has 'loss' or 'eval_loss' train_eval = "train" if "loss" in logs else "eval" metric_ndigits = int(os.getenv("AXOLOTL_METRIC_NDIGITS", "5")) for key, metric_data in self._stored_metrics[train_eval].items(): values = torch.tensor(metric_data["values"]) # type: ignore[arg-type] reduction_type = metric_data["reduction"] fn = REDUCTION_FNS.get(reduction_type) if fn is None: raise NotImplementedError( "Metric reduction must be one of [mean, min, max, sum]" ) logs[key] = round(fn(values).item(), metric_ndigits) if "loss" in logs: try: logs["ppl"] = round(math.exp(logs["loss"]), metric_ndigits) except OverflowError: logs["ppl"] = float("inf") if "eval_loss" in logs: try: logs["eval_ppl"] = round(math.exp(logs["eval_loss"]), metric_ndigits) except OverflowError: logs["eval_ppl"] = float("inf") if is_main_process(): # Add memory usage try: active, allocated, reserved = get_gpu_memory_usage() logs["memory/max_active (GiB)"] = round(active, 2) logs["memory/max_allocated (GiB)"] = round(allocated, 2) logs["memory/device_reserved (GiB)"] = round(reserved, 2) except (ValueError, TypeError, FileNotFoundError): pass if ( self.args.include_tkps and train_eval == "train" and hasattr(self.state, "tokens") ): # each rank will log its own tokens per second # for logging_steps > 1 we obtain a moving average of this metric logs["tokens/train_per_sec_per_gpu"] = round( self.state.last_tokens_per_second.item() / self.args.logging_steps, 2 ) if "total" in self.state.tokens: logs["tokens/total"] = int(self.state.tokens["total"].item()) if "trainable" in self.state.tokens: logs["tokens/trainable"] = int(self.state.tokens["trainable"].item()) del self._stored_metrics[train_eval] return super().log(logs, start_time) def store_metrics( self, metrics: dict[str, float] | dict[str, tuple[int | float, str]], train_eval: Literal["train", "eval"] = "train", reduction: Literal["mean", "min", "max", "sum"] = "mean", ) -> None: """ Store metrics with specified reduction type. 
Args: metrics: Dictionary of metric names to values, or metric names to (value, reduction_type) tuples. train_eval: Whether this is for training or evaluation. """ for key, value in metrics.items(): if isinstance(value, tuple): value, _reduction = value # type: ignore[assignment] else: value, _reduction = value, reduction self._stored_metrics[train_eval][key]["values"].append(value) self._stored_metrics[train_eval][key]["reduction"] = _reduction def _save_checkpoint(self, model, trial, **kwargs): # make sure the checkpoint dir exists, since trainer is flakey checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" run_dir = self._get_output_dir(trial=trial) output_dir = os.path.join(run_dir, checkpoint_folder) os.makedirs(output_dir, exist_ok=True) # Save total_tokens state if tracking is enabled if self.args.include_tkps and hasattr(self.state, "tokens"): tokens_state = { "total": int(torch.as_tensor(self.state.tokens.get("total", 0)).item()), "trainable": int( torch.as_tensor(self.state.tokens.get("trainable", 0)).item() ), } tokens_state_path = os.path.join(output_dir, TOKENS_STATE_FILE) with open(tokens_state_path, "w", encoding="utf-8") as f: json.dump(tokens_state, f) return super()._save_checkpoint(model, trial, **kwargs) # TODO(wing): remove once https://github.com/huggingface/transformers/pull/39866/files is merged def _save(self, output_dir: Optional[str] = None, state_dict=None): # If we are executing this function, we are the process zero, so we don't check for that. 
output_dir = output_dir if output_dir is not None else self.args.output_dir os.makedirs(output_dir, exist_ok=True) LOG.info(f"Saving model checkpoint to {output_dir}") # fix for Context Parallel save: CP eval invalidates tensor storage # pointers, so clone to CPU to get fresh valid storage for safetensors if ( state_dict is not None and self.axolotl_cfg and self.axolotl_cfg.context_parallel_size and self.axolotl_cfg.context_parallel_size > 1 ): state_dict = { k: v.detach().cpu() if isinstance(v, torch.Tensor) else v for k, v in state_dict.items() } supported_classes = ( (PreTrainedModel,) if not is_peft_available() else (PreTrainedModel, PeftModel) ) # Save a trained model and configuration using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` if not isinstance(self.model, supported_classes): if state_dict is None: state_dict = self.model.state_dict() if isinstance( self.accelerator.unwrap_model(self.model, keep_torch_compile=False), supported_classes, ): self.accelerator.unwrap_model( self.model, keep_torch_compile=False ).save_pretrained( output_dir, state_dict=state_dict, is_main_process=self.accelerator.is_main_process, ) else: LOG.info( "Trainer.model is not a `PreTrainedModel`, only saving its state dict." 
) safetensors.torch.save_file( state_dict, os.path.join(output_dir, SAFE_WEIGHTS_NAME), metadata={"format": "pt"}, ) else: self.model.save_pretrained( output_dir, state_dict=state_dict, is_main_process=self.accelerator.is_main_process, ) if self.processing_class is not None: self.processing_class.save_pretrained(output_dir) elif ( self.data_collator is not None and hasattr(self.data_collator, "tokenizer") and self.data_collator.tokenizer is not None ): LOG.info( "Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`" ) self.data_collator.tokenizer.save_pretrained(output_dir) # Good practice: save your training arguments together with the trained model torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) ================================================ FILE: src/axolotl/core/trainers/dpo/__init__.py ================================================ """DPO Specific Strategy for training""" from axolotl.core.trainers.dpo.trainer import AxolotlDPOTrainer from axolotl.utils.schemas.enums import RLType class DPOStrategy: """Strategy for DPO training""" @classmethod def get_trainer_class(cls): return AxolotlDPOTrainer @classmethod def get_training_args_class(cls): from axolotl.core.trainers.dpo.args import AxolotlDPOConfig return AxolotlDPOConfig @classmethod def set_training_args_kwargs(cls, cfg): training_args_kwargs = {} if cfg.rl is RLType.IPO: training_args_kwargs["loss_type"] = "ipo" # Label smoothing is not compatible with IPO if cfg.rl is RLType.DPO and cfg.dpo_label_smoothing: training_args_kwargs["label_smoothing"] = cfg.dpo_label_smoothing training_args_kwargs["max_length"] = cfg.sequence_len if cfg.dpo_use_weighting is not None: training_args_kwargs["use_weighting"] = cfg.dpo_use_weighting if cfg.dpo_padding_free is not None: training_args_kwargs["padding_free"] = cfg.dpo_padding_free if cfg.dpo_norm_loss is not None: training_args_kwargs["dpo_norm_loss"] = cfg.dpo_norm_loss if cfg.dpo_use_liger_kernel is not 
class AxolotlDPOTrainer(
    RngLoaderMixin,
    SchedulerMixin,
    OptimizerMixin,
    OptimizerInitMixin,
    DPOTrainer,
    DistributedParallelMixin,
):
    """Extend the base DPOTrainer for axolotl helpers."""

    tag_names = ["axolotl", "dpo"]

    def __init__(self, *args, dataset_tags=None, **kwargs):
        """
        Initialize the trainer and remember the dataset tags used for hub uploads.

        Args:
            *args: Forwarded to the parent initializers.
            dataset_tags: Dataset tags added to the model card on `push_to_hub`.
            **kwargs: Forwarded to the parent initializers.
        """
        super().__init__(*args, **kwargs)
        self.dataset_tags = dataset_tags
        self.optimizer = None
        self.model_accepts_loss_kwargs = False

    @wraps(DPOTrainer.push_to_hub)
    def push_to_hub(self, *args, **kwargs) -> str:
        """
        Overwrite the `push_to_hub` method in order to force-add the tags when pushing the
        model on the Hub. Please refer to `~transformers.Trainer.push_to_hub` for more details.
        """
        kwargs = sanitize_kwargs_for_ds_tagging(
            dataset_tags=self.dataset_tags, kwargs=kwargs
        )
        kwargs = sanitize_kwargs_for_tagging(tag_names=self.tag_names, kwargs=kwargs)

        return super().push_to_hub(*args, **kwargs)

    @staticmethod
    def tokenize_row(
        features,
        processing_class,
        max_prompt_length: int | None = None,
        max_completion_length: int | None = None,
        add_special_tokens: bool = True,
        is_chat: bool = False,
    ) -> Dict:
        """
        Tokenize one row via `DPOTrainer.tokenize_row` and fix up BOS handling.

        Two corrections are applied to the parent result:
        - if the tokenizer has no BOS token and the prompt starts with a `None`
          id, the first element of every entry is dropped;
        - if the tokenizer has a BOS token, a leading BOS id is removed from
          the chosen and rejected completions.

        Empty token lists are left untouched instead of raising `IndexError`.

        Returns:
            The (possibly adjusted) dict returned by `DPOTrainer.tokenize_row`.
        """
        res = DPOTrainer.tokenize_row(
            features,
            processing_class,
            max_prompt_length=max_prompt_length,
            max_completion_length=max_completion_length,
            add_special_tokens=add_special_tokens,
            is_chat=is_chat,
        )

        # fix when the tokenizer doesn't have a bos_token_id, e.g. Qwen
        if (
            processing_class.bos_token is None
            and res["prompt_input_ids"]
            and res["prompt_input_ids"][0] is None
        ):
            for key in res:
                res[key] = res[key][1:]

        if processing_class.bos_token and processing_class.bos_token_id is not None:
            # dpo trainer may incorrectly prepend the bos_token_id to the dpo outputs
            bos_token_id = processing_class.bos_token_id
            for key in ("chosen_input_ids", "rejected_input_ids"):
                # guard against empty completions before peeking at index 0
                if res[key] and res[key][0] == bos_token_id:
                    res[key] = res[key][1:]

        return res

    def training_step(
        self,
        model: nn.Module,
        inputs: Dict[str, Union[torch.Tensor, Any]],
        num_items_in_batch=None,
    ) -> torch.Tensor:
        """Run the parent training step, then free Python and CUDA cached memory."""
        loss: torch.Tensor = super().training_step(model, inputs, num_items_in_batch)
        gc.collect()
        torch.cuda.empty_cache()
        return loss

    def concatenated_forward(
        self,
        model: nn.Module,
        batch: dict[str, Union[list, torch.LongTensor]],
        is_ref_model: bool = False,
    ) -> dict[str, torch.Tensor]:
        """
        Run the parent `concatenated_forward`, optionally with normalized loss.

        When `args.dpo_norm_loss` is set, `self.loss_type` is temporarily
        switched to `["ipo"]` for the duration of the parent call and restored
        afterwards, even if the parent call raises.
        """
        if not self.args.dpo_norm_loss:
            return super().concatenated_forward(
                model, batch, is_ref_model=is_ref_model
            )

        original_loss_type: list[str] = self.loss_type  # type: ignore[has-type]
        # concatenated_forward handles avg token logprob for ipo case already
        self.loss_type = ["ipo"]
        try:
            return super().concatenated_forward(
                model, batch, is_ref_model=is_ref_model
            )
        finally:
            # always restore, otherwise a failed forward would leave the trainer
            # permanently configured with the temporary loss type
            self.loss_type = original_loss_type
) if sequence_parallel: return AxolotlGRPOSequenceParallelTrainer if async_grpo: return AxolotlAsyncGRPOTrainer return AxolotlGRPOTrainer @classmethod def get_training_args_class( cls, async_grpo: bool = False ) -> type[AxolotlGRPOConfig] | type[AxolotlAsyncGRPOConfig]: if async_grpo: return AxolotlAsyncGRPOConfig return AxolotlGRPOConfig @classmethod def set_training_args_kwargs(cls, cfg: DictDefault) -> dict[str, Any]: grpo_args_kwargs: dict[str, Any] = {} if not hasattr(cfg, "trl") or not cfg.trl: return grpo_args_kwargs trl: TRLConfig = cfg.trl # type: ignore vllm_cfg: VllmConfig = cfg.vllm # type: ignore if trl.use_vllm: grpo_args_kwargs["use_vllm"] = trl.use_vllm if trl.vllm_mode: grpo_args_kwargs["vllm_mode"] = trl.vllm_mode if trl.vllm_mode == "colocate": grpo_args_kwargs["vllm_enable_sleep_mode"] = trl.vllm_enable_sleep_mode # type: ignore[attr-defined] grpo_args_kwargs["vllm_gpu_memory_utilization"] = ( vllm_cfg.gpu_memory_utilization ) grpo_args_kwargs["vllm_tensor_parallel_size"] = ( vllm_cfg.tensor_parallel_size ) grpo_args_kwargs["vllm_server_host"] = trl.vllm_server_host or trl.vllm.host # type: ignore[attr-defined] grpo_args_kwargs["vllm_server_port"] = trl.vllm_server_port or trl.vllm.port # type: ignore[attr-defined] if trl.vllm_server_timeout: grpo_args_kwargs["vllm_server_timeout"] = trl.vllm_server_timeout if trl.vllm_guided_decoding_regex: grpo_args_kwargs["vllm_guided_decoding_regex"] = ( trl.vllm_guided_decoding_regex ) if trl.num_generations: grpo_args_kwargs["num_generations"] = trl.num_generations if trl.sync_ref_model: grpo_args_kwargs["sync_ref_model"] = trl.sync_ref_model if trl.ref_model_mixup_alpha: grpo_args_kwargs["ref_model_mixup_alpha"] = trl.ref_model_mixup_alpha if trl.ref_model_sync_steps: grpo_args_kwargs["ref_model_sync_steps"] = trl.ref_model_sync_steps grpo_args_kwargs["max_completion_length"] = trl.max_completion_length grpo_args_kwargs["log_completions"] = trl.log_completions grpo_args_kwargs["num_completions_to_print"] 
= trl.num_completions_to_print if cfg.context_parallel_size > 1: grpo_args_kwargs["context_parallel_size"] = cfg.context_parallel_size if trl.importance_sampling_level is not None: grpo_args_kwargs["importance_sampling_level"] = ( trl.importance_sampling_level ) if trl.reward_weights: grpo_args_kwargs["reward_weights"] = trl.reward_weights if trl.scale_rewards is not None: grpo_args_kwargs["scale_rewards"] = trl.scale_rewards if trl.loss_type is not None: grpo_args_kwargs["loss_type"] = trl.loss_type if trl.mask_truncated_completions is not None: grpo_args_kwargs["mask_truncated_completions"] = ( trl.mask_truncated_completions ) if trl.temperature is not None: grpo_args_kwargs["temperature"] = trl.temperature if trl.top_p is not None: grpo_args_kwargs["top_p"] = trl.top_p if trl.top_k is not None: grpo_args_kwargs["top_k"] = trl.top_k if trl.min_p is not None: grpo_args_kwargs["min_p"] = trl.min_p if trl.repetition_penalty is not None: grpo_args_kwargs["repetition_penalty"] = trl.repetition_penalty if trl.num_iterations is not None: grpo_args_kwargs["num_iterations"] = trl.num_iterations if trl.epsilon is not None: grpo_args_kwargs["epsilon"] = trl.epsilon if trl.epsilon_high is not None: grpo_args_kwargs["epsilon_high"] = trl.epsilon_high if trl.use_liger_loss is not None: grpo_args_kwargs["use_liger_kernel"] = trl.use_liger_loss if trl.multi_objective_aggregation is not None: grpo_args_kwargs["multi_objective_aggregation"] = ( trl.multi_objective_aggregation ) # Async GRPO fields if getattr(trl, "use_data_producer", None) is not None: grpo_args_kwargs["use_data_producer"] = trl.use_data_producer if getattr(trl, "async_prefetch", None) is not None: grpo_args_kwargs["async_prefetch"] = trl.async_prefetch if getattr(trl, "prefetch_depth", None) is not None: grpo_args_kwargs["prefetch_depth"] = trl.prefetch_depth if getattr(trl, "vllm_sync_interval", None) is not None: grpo_args_kwargs["vllm_sync_interval"] = trl.vllm_sync_interval if getattr(trl, 
"streaming_partial_batch", None) is not None: grpo_args_kwargs["streaming_partial_batch"] = trl.streaming_partial_batch if getattr(trl, "streaming_min_groups", None) is not None: grpo_args_kwargs["streaming_min_groups"] = trl.streaming_min_groups if getattr(trl, "vllm_importance_sampling_correction", None) is not None: grpo_args_kwargs["vllm_importance_sampling_correction"] = ( trl.vllm_importance_sampling_correction ) if getattr(trl, "vllm_importance_sampling_mode", None) is not None: grpo_args_kwargs["vllm_importance_sampling_mode"] = ( trl.vllm_importance_sampling_mode ) if getattr(trl, "vllm_importance_sampling_cap", None) is not None: grpo_args_kwargs["vllm_importance_sampling_cap"] = ( trl.vllm_importance_sampling_cap ) if getattr(trl, "off_policy_mask_threshold", None) is not None: grpo_args_kwargs["off_policy_mask_threshold"] = ( trl.off_policy_mask_threshold ) if getattr(trl, "use_bias_correction_kl", None) is not None: grpo_args_kwargs["use_bias_correction_kl"] = trl.use_bias_correction_kl # Fast Async GRPO fields if getattr(trl, "reward_num_workers", None) is not None: grpo_args_kwargs["reward_num_workers"] = trl.reward_num_workers if getattr(trl, "replay_buffer_size", None) is not None: grpo_args_kwargs["replay_buffer_size"] = trl.replay_buffer_size if getattr(trl, "replay_recompute_logps", None) is not None: grpo_args_kwargs["replay_recompute_logps"] = trl.replay_recompute_logps if getattr(trl, "reroll_start_fraction", None) is not None: grpo_args_kwargs["reroll_start_fraction"] = trl.reroll_start_fraction if getattr(trl, "reroll_max_groups", None) is not None: grpo_args_kwargs["reroll_max_groups"] = trl.reroll_max_groups if getattr(trl, "skip_zero_advantage_batches", None) is not None: grpo_args_kwargs["skip_zero_advantage_batches"] = ( trl.skip_zero_advantage_batches ) if getattr(trl, "vllm_lora_sync", None) is not None: grpo_args_kwargs["vllm_lora_sync"] = trl.vllm_lora_sync return grpo_args_kwargs @classmethod def set_trainer_args(cls, cfg: 
@classmethod
def get_reward_func(cls, reward_func_fqn: str) -> RewardFunc:
    """
    Resolve a reward function from a fully qualified name, or accept a reward model path.

    The string is first interpreted as ``<module path>.<attribute>`` and the
    module is imported dynamically. When no such module can be imported the
    string is treated as a reward model location instead: a non-empty local
    directory is accepted as-is, otherwise ``snapshot_download`` is asked to
    fetch it as a model repository.

    Args:
        reward_func_fqn (str): Fully qualified name of the reward function
            (e.g. r1_grpo.gsm8k_transform), or a local directory / HF hub path
            to the reward model.

    Returns:
        RewardFunc: The imported callable, or ``reward_func_fqn`` itself when it
            designates a reward model.

    Raises:
        ValueError: If the reward function accepts fewer than two arguments, or
            if the string is neither importable nor resolvable as a model.
    """
    module_name, _, func_name = reward_func_fqn.rpartition(".")

    # A string without any dot (e.g. "org/reward-model") yields an empty module
    # name, and importlib.import_module("") raises ValueError("Empty module
    # name") rather than ModuleNotFoundError. A name starting with "." (e.g.
    # "./models/rm.v2") is treated as a relative import and raises TypeError.
    # Neither can be a reward function module, so skip the import attempt and
    # go straight to the model-path handling instead of crashing.
    reward_func_module = None
    import_exc: Exception | None = None
    if module_name and not module_name.startswith("."):
        try:
            reward_func_module = importlib.import_module(module_name)
        except ModuleNotFoundError as exc:
            import_exc = exc

    if reward_func_module is not None:
        reward_func = getattr(reward_func_module, func_name)
        if len(inspect.signature(reward_func).parameters) < 2:
            raise ValueError(
                "Reward function must accept at least two arguments: prompts: list and completions: list"
            )
        return reward_func

    # the user has passed a string (ideally indicating the path of a reward model)
    pretrained_log_msg = f"Reward function {reward_func_fqn} is a pre-trained model path - if this is unexpected, please check the reward function path."

    # a local, non-empty directory is taken to be a saved reward model
    if os.path.isdir(reward_func_fqn) and os.listdir(reward_func_fqn):
        LOG.info(pretrained_log_msg)
        return reward_func_fqn

    try:
        snapshot_download(reward_func_fqn, repo_type="model")
    except HTTPError as http_exc:
        # Prefer the import failure as the cause when there was one, so the
        # traceback shows why the name was not treated as a function.
        cause = import_exc if import_exc is not None else http_exc
        raise ValueError(f"Reward function {reward_func_fqn} not found.") from cause

    LOG.info(pretrained_log_msg)
    return reward_func_fqn
my_module.my_rollout_func) Returns: Callable rollout function """ try: rollout_func_module_name = rollout_func_fqn.split(".")[-1] rollout_func_module = importlib.import_module( ".".join(rollout_func_fqn.split(".")[:-1]) ) rollout_func = getattr(rollout_func_module, rollout_func_module_name) if not callable(rollout_func): raise ValueError( f"Rollout function {rollout_func_fqn} must be callable" ) return rollout_func except ModuleNotFoundError as exc: raise ValueError(f"Rollout function {rollout_func_fqn} not found.") from exc ================================================ FILE: src/axolotl/core/trainers/grpo/args.py ================================================ """ Axolotl Specific Training Args """ from dataclasses import dataclass from trl import GRPOConfig from axolotl.core.trainers.grpo.fast_async_trainer import FastAsyncGRPOConfig from axolotl.core.training_args import AxolotlTrainingMixins @dataclass class AxolotlGRPOConfig(AxolotlTrainingMixins, GRPOConfig): """Axolotl GRPO Config for GRPO training""" context_parallel_size: int | None = None @dataclass class AxolotlAsyncGRPOConfig(AxolotlTrainingMixins, FastAsyncGRPOConfig): """Axolotl Async GRPO Config — adds async prefetch, streaming scoring, and IS correction.""" context_parallel_size: int | None = None ================================================ FILE: src/axolotl/core/trainers/grpo/async_trainer.py ================================================ """ Async GRPO training with streaming scoring and IS correction. Works on stock TRL v0.29.0 and transformers v5.3.0 — no custom branches needed. Features: - Async prefetch: background thread generates completions via vLLM while the main thread trains on the previous rollout. - Deferred scoring: rewards, advantages, and policy logprobs computed on the main thread (thread-safe with GPU forward passes). - Streaming group scoring: scores prompt groups incrementally so that reward computation overlaps with the next group's logprob computation. 
- Importance sampling (IS) correction: corrects for stale vLLM weights. - Off-Policy Sequence Mask (OPSM): drops sequences with high KL + negative advantage. - Configurable vLLM weight sync interval. Classes exported: - AsyncGRPOConfig: GRPOConfig extended with async/streaming/IS fields - AsyncGRPOTrainer: GRPOTrainer with async prefetch and IS correction - ProducerConfig, DataProducer, BaseDataProducer, AsyncDataProducer: data producer protocol """ import atexit import concurrent.futures import logging import queue import threading from abc import ABC, abstractmethod from collections import deque from contextlib import nullcontext from dataclasses import dataclass, field from typing import Any import torch from torch.utils.data import DataLoader, Dataset from trl.extras.profiling import profiling_decorator from trl.trainer import GRPOConfig, GRPOTrainer from trl.trainer.utils import ( RepeatSampler, entropy_from_logits, nanmax, nanmin, nanstd, pad, selective_log_softmax, shuffle_sequence_dict, split_pixel_values_by_grid, split_tensor_dict, unsplit_pixel_values_by_grid, ) try: from trl.data_utils import ( apply_chat_template, is_conversational, prepare_multimodal_messages, ) except ImportError: from trl.chat_template_utils import apply_chat_template from trl.data_utils import is_conversational, prepare_multimodal_messages try: from trl.models.utils import disable_gradient_checkpointing except ImportError: from contextlib import contextmanager @contextmanager def disable_gradient_checkpointing(model, kwargs): yield try: from accelerate.utils import gather_object except ImportError: gather_object = None try: from peft import PeftModel from trl.trainer.utils import use_adapter except ImportError: PeftModel = None use_adapter = nullcontext try: from liger_kernel.ops.grpo_loss import ( fused_selective_log_softmax as _fused_selective_log_softmax, ) except ImportError: _fused_selective_log_softmax = None # 
--------------------------------------------------------------------------- # Config # --------------------------------------------------------------------------- @dataclass class AsyncGRPOConfig(GRPOConfig): """GRPOConfig extended with async prefetch, streaming scoring, and IS correction fields. Fields already present in stock GRPOConfig (e.g. ``importance_sampling_level``, ``multi_objective_aggregation``) are listed here for safety: if the stock version does not define them, the defaults below ensure everything works. """ # --- Data producer --- use_data_producer: bool = field( default=False, metadata={ "help": "Use the GRPODataProducer protocol for online data generation." }, ) # --- Async data production --- async_prefetch: bool = field( default=False, metadata={ "help": "Generate rollouts in a background thread while training on the previous rollout." }, ) prefetch_depth: int = field( default=1, metadata={"help": "Number of rollouts to prefetch ahead of training."}, ) vllm_sync_interval: int = field( default=1, metadata={ "help": "Sync model weights to vLLM every N optimizer steps (async mode only)." }, ) # --- Streaming scoring --- streaming_partial_batch: bool = field( default=False, metadata={ "help": "Score prompt groups incrementally instead of the full batch at once." }, ) streaming_min_groups: int = field( default=1, metadata={"help": "Minimum prompt groups to score per streaming chunk."}, ) # --- vLLM importance sampling correction --- vllm_importance_sampling_correction: bool = field( default=True, metadata={ "help": "Apply IS correction for distribution mismatch between vLLM and training model." }, ) vllm_importance_sampling_mode: str = field( default="token_truncate", metadata={ "help": "IS mode: token_truncate, token_mask, sequence_truncate, or sequence_mask." 
}, ) vllm_importance_sampling_cap: float = field( default=3.0, metadata={"help": "Cap C for IS ratio clipping/masking."}, ) # --- Off-policy sequence mask (OPSM) --- off_policy_mask_threshold: float | None = field( default=None, metadata={"help": "KL threshold for OPSM (DeepSeek-V3.2). None = disabled."}, ) # --- Bias-corrected KL --- use_bias_correction_kl: bool = field( default=False, metadata={"help": "Apply IS correction to KL divergence term."}, ) # --------------------------------------------------------------------------- # Data Producer Protocol (standalone — no transformers branch needed) # --------------------------------------------------------------------------- logger = logging.getLogger(__name__) _dp_logger = logging.getLogger(__name__ + ".data_producer") @dataclass class ProducerConfig: """Configuration for a :class:`DataProducer`. Args: mini_epochs: Number of training passes over each produced dataset. max_rollouts: Maximum number of produce-then-train rounds (None = unlimited). steps_per_generation: Optimisation steps per produced dataset before regenerating. num_iterations: Number of times to reuse each generation across optimisation steps. async_prefetch: Produce the next dataset in a background thread. prefetch_depth: How many rollouts to queue ahead when async. sync_warmup_rollouts: Initial on-policy rollouts before switching to async. eval_during_produce: Switch model to eval() during produce(). empty_cache_before_produce: torch.cuda.empty_cache() before produce(). empty_cache_after_produce: torch.cuda.empty_cache() after produce(). 
""" mini_epochs: int = 1 max_rollouts: int | None = None steps_per_generation: int | None = None num_iterations: int = 1 async_prefetch: bool = False prefetch_depth: int = 1 sync_warmup_rollouts: int = 0 eval_during_produce: bool = True empty_cache_before_produce: bool = False empty_cache_after_produce: bool = False def __post_init__(self): if self.mini_epochs < 1: raise ValueError(f"mini_epochs must be >= 1, got {self.mini_epochs}") if self.max_rollouts is not None and self.max_rollouts < 1: raise ValueError( f"max_rollouts must be >= 1 or None, got {self.max_rollouts}" ) if self.num_iterations < 1: raise ValueError(f"num_iterations must be >= 1, got {self.num_iterations}") if self.steps_per_generation is not None and self.steps_per_generation < 1: raise ValueError( f"steps_per_generation must be >= 1 or None, got {self.steps_per_generation}" ) if self.prefetch_depth < 1: raise ValueError(f"prefetch_depth must be >= 1, got {self.prefetch_depth}") if self.sync_warmup_rollouts < 0: raise ValueError( f"sync_warmup_rollouts must be >= 0, got {self.sync_warmup_rollouts}" ) class DataProducer(ABC): """Abstract base class for online data producers. Subclass this and implement :meth:`produce` to supply fresh training data each rollout round. """ config: ProducerConfig @abstractmethod def produce( self, model: Any, global_step: int, *, processing_class: Any = None, accelerator: Any = None, args: Any = None, **kwargs, ) -> Dataset: """Generate a fresh training dataset.""" ... 
class BaseDataProducer(DataProducer):
    """Starting point for producers: holds a :class:`ProducerConfig` and supplies no-op lifecycle hooks."""

    def __init__(self, config: ProducerConfig | None = None):
        # Use a default-constructed config when none (or a falsy one) is given.
        if not config:
            config = ProducerConfig()
        self.config = config

    def on_rollout_begin(self, global_step: int) -> None:
        """Hook invoked ahead of every produce() call; does nothing here."""
        return None

    def on_rollout_end(self, dataset: Dataset, global_step: int) -> None:
        """Hook invoked after every produce() call with the dataset it produced; does nothing here."""
        return None
self._generate_lock = threading.Lock() # Detected at first produce() call self._num_processes: int | None = None self._is_main: bool | None = None @property def config(self) -> ProducerConfig: return self._inner.config def produce(self, model: Any, global_step: int, **kwargs) -> Dataset: """Return the next dataset, blocking if the prefetch hasn't finished.""" # Detect multi-process on first call if self._num_processes is None: accelerator = kwargs.get("accelerator") if accelerator is not None: self._num_processes = accelerator.num_processes self._is_main = accelerator.is_main_process else: self._num_processes = 1 self._is_main = True # During warmup, produce synchronously (on-policy) if self._warmup_remaining > 0: self._warmup_remaining -= 1 _dp_logger.info( f"AsyncDataProducer: sync warmup rollout (remaining={self._warmup_remaining})" ) return self._inner.produce(model, global_step, **kwargs) if not self._initialized: dataset = self._inner.produce(model, global_step, **kwargs) bg_kwargs = {**kwargs, **self._background_kwargs} # With FSDP (multi-process), only submit BG tasks on rank 0. # Non-rank-0 processes will receive data via broadcast. if self._num_processes > 1: bg_kwargs["_rank0_only"] = True for i in range(1, self._depth + 1): self._queue.append( self._executor.submit( self._locked_produce, model, global_step + i, **bg_kwargs ) ) self._initialized = True return dataset # Get the pre-generated dataset from the BG thread dataset = self._queue.popleft().result() # With FSDP: BG thread only ran on rank 0. Broadcast to all ranks. 
if self._num_processes > 1: dataset = self._broadcast_dataset(dataset) bg_kwargs = {**kwargs, **self._background_kwargs} if self._num_processes > 1: bg_kwargs["_rank0_only"] = True next_step = global_step + self._depth self._queue.append( self._executor.submit(self._locked_produce, model, next_step, **bg_kwargs) ) return dataset def _broadcast_dataset(self, dataset) -> Dataset: """Broadcast a prefetched dataset from rank 0 to all ranks (main thread). Rank 0 has a full RolloutDataset from BG generation; other ranks have None. After broadcast, tensors are moved to each rank's local device. """ import torch.distributed as dist if not dist.is_initialized(): return dataset # Rank 0 sends _data dict; others receive it obj_list = [dataset._data if self._is_main else None] dist.broadcast_object_list(obj_list, src=0) data: dict[str, Any] = obj_list[0] # type: ignore[assignment] # Move tensors to local device (broadcast_object_list deserializes to CPU) accelerator = self._inner._trainer.accelerator # type: ignore[attr-defined] device = accelerator.device for key, val in data.items(): if isinstance(val, torch.Tensor) and val.device != device: data[key] = val.to(device) if not self._is_main: from axolotl.core.trainers.grpo.async_trainer import RolloutDataset dataset = RolloutDataset(data) else: # Rank 0 already has the dataset, but update _data with device-moved tensors dataset._data = data return dataset def _locked_produce(self, model: Any, global_step: int, **kwargs) -> Dataset: """Run produce while holding the generate lock.""" with self._generate_lock: return self._inner.produce(model, global_step, **kwargs) def on_rollout_begin(self, global_step: int) -> None: if hasattr(self._inner, "on_rollout_begin"): self._inner.on_rollout_begin(global_step) def on_rollout_end(self, dataset: Dataset, global_step: int) -> None: if hasattr(self._inner, "on_rollout_end"): self._inner.on_rollout_end(dataset, global_step) def shutdown(self) -> None: """Shut down the background thread pool 
class RolloutDataset(Dataset):
    """A Dataset wrapping the output dict from _generate_and_score_completions.

    Each entry of the dict is classified once, at construction time:

    * *per-sample* entries are tensors with at least one dimension; indexing the
      dataset slices them along dimension 0;
    * *shared* entries are everything else (non-tensor values, 0-dim tensors and
      any key listed in ``_ALWAYS_SHARED``); they are passed through unchanged
      with every item.

    Args:
        data: Mapping from field name to value. The dict is stored by reference,
            not copied.

    Raises:
        ValueError: If the per-sample tensors disagree on their first dimension,
            or if there is no per-sample tensor with at least one row.
    """

    # Keys that are never sliced per sample, even when they hold tensors.
    _ALWAYS_SHARED = frozenset(
        {"num_items_in_batch", "_pending_policy_logps", "_rank0_only"}
    )

    def __init__(self, data: dict[str, Any]):
        self._data = data
        self._shared_keys: set[str] = set()
        self._sample_keys: set[str] = set()

        # None (not 0) marks "no per-sample tensor seen yet". Using 0 as the
        # sentinel let a zero-length tensor be silently overwritten by a later
        # non-empty one, and because the keys used to be validated while
        # iterating a set, whether that mismatch was detected depended on hash
        # ordering. Validating in the dict's own order makes it deterministic.
        num_samples: int | None = None
        for key, val in data.items():
            is_per_sample = (
                key not in self._ALWAYS_SHARED
                and isinstance(val, torch.Tensor)
                and val.dim() > 0
            )
            if not is_per_sample:
                self._shared_keys.add(key)
                continue

            self._sample_keys.add(key)
            n = val.size(0)
            if num_samples is None:
                num_samples = n
            elif n != num_samples:
                raise ValueError(
                    f"Inconsistent sample count: key '{key}' has {n}, expected {num_samples}"
                )

        # Covers both "no per-sample tensors at all" and "all of them are empty".
        if not num_samples:
            raise ValueError("No per-sample tensors found in rollout data")
        self._num_samples = num_samples

    def __len__(self) -> int:
        """Return the shared first-dimension size of the per-sample tensors."""
        return self._num_samples

    def __getitem__(self, idx: int) -> dict[str, Any]:
        """Return row ``idx`` of every per-sample tensor plus all shared entries."""
        item: dict[str, Any] = {key: self._data[key][idx] for key in self._sample_keys}
        for key in self._shared_keys:
            item[key] = self._data[key]
        return item
class GRPODataProducer(BaseDataProducer):
    """Builds GRPO training rollouts by driving the trainer's generation pipeline.

    Instances are constructed before ``Trainer.__init__`` has finished, so the
    trainer is not available at construction time; it is attached afterwards
    through :meth:`set_trainer`, which also builds the prompt DataLoader.
    """

    def __init__(
        self,
        config: ProducerConfig,
        prompt_dataset,
        *,
        num_generations: int,
        generation_batch_size: int,
        train_batch_size: int,
        steps_per_generation: int,
        shuffle_dataset: bool,
        seed: int,
    ):
        super().__init__(config)

        # Sampling / batching parameters, consumed by _init_prompt_dataloader.
        self._dataset = prompt_dataset
        self._num_generations = num_generations
        self._generation_batch_size = generation_batch_size
        self._train_batch_size = train_batch_size
        self._steps_per_generation = steps_per_generation
        self._shuffle_dataset = shuffle_dataset
        self._seed = seed

        # Filled in by set_trainer().
        self._trainer: Any = None
        self._prompt_dl: Any = None
        self._prompt_iter: Any = None

    def set_trainer(self, trainer) -> None:
        """Attach the live trainer and build the prompt DataLoader from it."""
        self._trainer = trainer
        self._init_prompt_dataloader()

    def _init_prompt_dataloader(self) -> None:
        """Create the prompt DataLoader, prepare it with the accelerator and open an iterator on it."""
        from functools import partial

        from transformers.trainer_utils import seed_worker

        trainer = self._trainer
        trainer_args = trainer.args

        unique_prompts_per_batch = self._generation_batch_size // self._num_generations
        prompt_sampler = RepeatSampler(
            data_source=self._dataset,
            mini_repeat_count=self._num_generations,
            batch_size=unique_prompts_per_batch,
            repeat_count=1,
            shuffle=self._shuffle_dataset,
            seed=self._seed,
        )

        # Use identity collator (same as stock GRPOTrainer)
        def _identity(x):
            return x

        worker_init = partial(
            seed_worker,
            num_workers=trainer_args.dataloader_num_workers,
            rank=trainer_args.process_index,
        )
        loader = DataLoader(
            self._dataset,
            batch_size=self._train_batch_size * self._steps_per_generation,
            sampler=prompt_sampler,
            collate_fn=_identity,
            num_workers=trainer_args.dataloader_num_workers,
            pin_memory=trainer_args.dataloader_pin_memory,
            persistent_workers=trainer_args.dataloader_persistent_workers,
            worker_init_fn=worker_init,
        )
        self._prompt_dl = trainer.accelerator.prepare(loader)

        # Don't let accelerator track this dataloader
        tracked = trainer.accelerator._dataloaders
        if self._prompt_dl in tracked:
            tracked.remove(self._prompt_dl)

        self._prompt_iter = iter(self._prompt_dl)

    def produce(
        self,
        model: Any,
        global_step: int,
        *,
        skip_policy_logps: bool = False,
        processing_class: Any = None,
        accelerator: Any = None,
        args: Any = None,
        _rank0_only: bool = False,
        **kwargs,
    ) -> RolloutDataset | None:
        """Generate one fresh GRPO rollout and wrap it in a :class:`RolloutDataset`.

        Args:
            model: Accepted for protocol compatibility; not used by this method.
            global_step: Accepted for protocol compatibility; not used by this method.
            skip_policy_logps: When true, only generation is run
                (``trainer._generate_only``); otherwise generation and scoring
                are run together (``trainer._generate_and_score_completions``).
            processing_class: Unused here.
            accelerator: Unused here; the trainer's own accelerator is consulted.
            args: Unused here.
            _rank0_only: When true, processes other than the main one return
                ``None`` immediately.
            **kwargs: Ignored.

        Returns:
            The rollout as a :class:`RolloutDataset`, or ``None`` on a non-main
            process when ``_rank0_only`` is set.
        """
        is_main = self._trainer.accelerator.is_main_process

        # FSDP rank0-only mode: non-rank-0 returns None (broadcast fills it later)
        if _rank0_only and not is_main:
            return None

        # Restart the prompt iterator transparently once it runs dry.
        try:
            prompts = next(self._prompt_iter)
        except StopIteration:
            self._prompt_iter = iter(self._prompt_dl)
            prompts = next(self._prompt_iter)

        if skip_policy_logps:
            # Async path: generation without scoring.
            output = self._trainer._generate_only(prompts, rank0_only=_rank0_only)
        else:
            # Sync path: full generation + scoring
            output = self._trainer._generate_and_score_completions(prompts)

        def _is_metadata(value) -> bool:
            # 0-dim tensors and anything that is neither a tensor nor a list
            # are not per-sequence data.
            if isinstance(value, torch.Tensor):
                return value.dim() == 0
            return not isinstance(value, list)

        # Pull non-sequence entries out so the shuffle only sees sequence data,
        # then put them back afterwards.
        metadata_keys = [name for name in list(output.keys()) if _is_metadata(output[name])]
        metadata = {name: output.pop(name) for name in metadata_keys}

        output = shuffle_sequence_dict(output)
        output.update(metadata)
        return RolloutDataset(output)
""" def __init__(self, *args, **kwargs): # When using native LoRA sync, skip the NCCL communicator init in VLLMGeneration. # The communicator is not needed because weight sync happens via filesystem + HTTP, # and it fails when vLLM and a trainer rank share the same CUDA device. training_args = kwargs.get("args") or (args[1] if len(args) > 1 else None) if training_args is not None and getattr( training_args, "vllm_lora_sync", False ): from trl.generation.vllm_generation import VLLMGeneration _orig_init_vllm = VLLMGeneration._init_vllm def _init_vllm_no_communicator(self_vllm): """Init vLLM client without NCCL communicator (LoRA sync uses filesystem).""" if self_vllm.mode == "server" and self_vllm.accelerator.is_main_process: from trl.generation.vllm_client import VLLMClient if self_vllm.server_base_url is not None: base_url = self_vllm.server_base_url else: base_url = ( f"http://{self_vllm.server_host}:{self_vllm.server_port}" ) self_vllm.vllm_client = VLLMClient( base_url=base_url, group_port=self_vllm.group_port, connection_timeout=self_vllm.server_timeout, ) # Deliberately skip init_communicator — no NCCL needed elif self_vllm.mode != "server": _orig_init_vllm(self_vllm) VLLMGeneration._init_vllm = _init_vllm_no_communicator super().__init__(*args, **kwargs) # FP8 models: zero out the pad token embedding so that padding # positions have zero hidden states throughout the network. # FP8 linear layers produce NaN on non-zero inputs at masked # positions (the Triton fp8 matmul kernel can't handle the # extreme values that accumulate at unattended positions). self._zero_pad_embedding_for_fp8() # Ensure custom attributes exist (stock GRPOTrainer.__init__ may not set them). 
for attr, cfg_key, default in [ ( "vllm_importance_sampling_correction", "vllm_importance_sampling_correction", True, ), ( "vllm_importance_sampling_mode", "vllm_importance_sampling_mode", "token_truncate", ), ("vllm_importance_sampling_cap", "vllm_importance_sampling_cap", 3.0), ("off_policy_mask_threshold", "off_policy_mask_threshold", None), ]: if not hasattr(self, attr): setattr(self, attr, getattr(self.args, cfg_key, default)) # Async state self._async_queue: queue.Queue | None = None self._executor: concurrent.futures.ThreadPoolExecutor | None = None self._prompt_iter = None self._last_synced_step = -1 self._buffered_inputs: list | None = None # override stock attr # Data producer (the proper architecture for async generation) self.data_producer = None if getattr(self.args, "use_data_producer", False): self.data_producer = self._create_data_producer( kwargs["args"], kwargs["train_dataset"] ) if self.args.async_prefetch and self.data_producer is None: # Legacy path: direct _prepare_inputs override without data producer self._setup_async() def _create_data_producer(self, args, train_dataset): """Create and return the GRPODataProducer (possibly wrapped in AsyncDataProducer).""" producer_config = ProducerConfig( mini_epochs=args.num_iterations, max_rollouts=None, eval_during_produce=False, empty_cache_before_produce=True, empty_cache_after_produce=True, async_prefetch=args.async_prefetch, prefetch_depth=args.prefetch_depth, ) data_producer = GRPODataProducer( config=producer_config, prompt_dataset=train_dataset, num_generations=self.num_generations, generation_batch_size=args.generation_batch_size, train_batch_size=args.per_device_train_batch_size, steps_per_generation=args.steps_per_generation, shuffle_dataset=getattr(self, "shuffle_dataset", True), seed=args.seed, ) data_producer.set_trainer(self) if args.async_prefetch: data_producer = AsyncDataProducer( data_producer, background_produce_kwargs={"skip_policy_logps": True}, ) return data_producer # 
def _submit_generation(self):
    """Submit the next background generation job.

    Takes the next prompt batch, schedules ``self._generate_only`` for it on
    the background executor and puts the resulting future on
    ``self._async_queue``.

    If the prompt iterator is exhausted it is re-created from
    ``self._prompt_dataloader`` and the first batch of the new pass is used,
    instead of letting ``StopIteration`` escape into the training loop. This
    is the same restart-on-exhaustion behaviour ``GRPODataProducer.produce``
    has.
    """
    try:
        batch = next(self._prompt_iter)
    except StopIteration:
        self._prompt_iter = iter(self._prompt_dataloader)
        batch = next(self._prompt_iter)
    future = self._executor.submit(self._generate_only, batch)
    self._async_queue.put(future)
Required for FP8 models where merge_adapter() fails (addmm not implemented for Float8), and also safe for concurrent use since it never modifies base weights in-place. """ model = self.vllm_generation.model accelerator = self.vllm_generation.accelerator vllm_client = self.vllm_generation.vllm_client fix_name = self.vllm_generation._fix_param_name_to_vllm if not (self.vllm_generation.mode == "server" and accelerator.is_main_process): return # Build lookup: module_path -> (A, B, scaling) for all active LoRA layers lora_info = {} for mod_name, module in model.base_model.model.named_modules(): if not hasattr(module, "lora_A") or not hasattr(module, "active_adapters"): continue active = module.active_adapters[0] if active not in module.lora_A: continue lora_info[mod_name] = ( module.lora_A[active].weight.data, module.lora_B[active].weight.data, module.scaling[active], ) # Build lookup for FP8 scale_inv parameters (needed for dequantization) scale_inv_lookup = {} for pname, pparam in model.named_parameters(): if "weight_scale_inv" in pname: # Map weight name -> scale_inv tensor weight_name = pname.replace(".weight_scale_inv", ".weight") scale_inv_lookup[weight_name] = pparam.data # Iterate all parameters, computing merged weights for LoRA layers. # Skip LoRA-specific params and FP8 scale params (scales will be # recomputed by vLLM when it receives the merged bf16 weight). 
params_to_sync = [] for name, param in model.named_parameters(): vllm_name = name.removeprefix("base_model.model.").replace( ".base_layer", "" ) if model.prefix in vllm_name: continue if "original_module" in vllm_name: continue # Skip FP8 quantization scale parameters - they are recomputed # on the vLLM side when we update the weight itself if "weight_scale_inv" in vllm_name or "input_scale" in vllm_name: continue vllm_name = fix_name(vllm_name, extra_prefixes=["modules_to_save.default."]) data = param.data compute_dtype = torch.bfloat16 if vllm_name.endswith(".weight"): # Dequantize FP8 weights before merging if data.dtype == torch.float8_e4m3fn and name in scale_inv_lookup: scale_inv = scale_inv_lookup[name] # Block dequantization: weight * scale_inv (with broadcasting) fp8_bf16 = data.to(compute_dtype) if scale_inv.dim() == 2 and fp8_bf16.dim() == 2: # Block-quantized: scale_inv shape (rows/block, cols/block) sr, sc = scale_inv.shape br = fp8_bf16.shape[0] // sr # block height bc = fp8_bf16.shape[1] // sc # block width # Reshape → multiply by block scale → reshape back data = ( fp8_bf16.reshape(sr, br, sc, bc) * scale_inv[:, None, :, None].to(compute_dtype) ).reshape(fp8_bf16.shape) elif scale_inv.dim() <= 1: # Per-tensor or per-channel scale data = fp8_bf16 * scale_inv.to(compute_dtype) else: data = fp8_bf16 elif data.dtype == torch.float8_e4m3fn: # FP8 but no scale found - just cast (lossy) data = data.to(compute_dtype) mod_path = vllm_name[: -len(".weight")] if mod_path in lora_info: A, B, s = lora_info[mod_path] merged = data.to(compute_dtype) + s * ( B.to(compute_dtype) @ A.to(compute_dtype) ) data = merged params_to_sync.append((vllm_name, data)) # Batch sync all params in one HTTP+NCCL call (vs individual calls) if params_to_sync: vllm_client.batch_update_named_params(params_to_sync) # Reset prefix cache after weight update vllm_client.reset_prefix_cache() def _sync_lora_adapter(self): """Sync LoRA adapter to vLLM via filesystem (native LoRA mode). 
Saves the PEFT adapter to a temp directory and POSTs the path to vLLM's /set_lora_adapter/ endpoint. vLLM loads the adapter natively using Punica kernels, avoiding the need to merge weights and NCCL-broadcast the full model. Syncs only the LoRA adapter weights via filesystem instead of the full merged model via NCCL. FSDP/DeepSpeed: All ranks must participate in the state_dict gather. accelerator.get_state_dict() handles this (FSDP uses FullStateDictConfig with rank0_only=True). Only rank 0 gets the full dict, writes files, and does the HTTP POST. """ import os import tempfile accelerator = self.vllm_generation.accelerator model = self.vllm_generation.model if self.vllm_generation.mode != "server": return is_main = accelerator.is_main_process # Increment adapter version (all ranks, kept in sync) if not hasattr(self, "_lora_sync_version"): self._lora_sync_version = 0 if is_main: self._lora_sync_dir = tempfile.mkdtemp(prefix="lora_sync_") else: self._lora_sync_dir = None # Broadcast sync dir from rank 0 to all ranks if accelerator.num_processes > 1: import torch.distributed as dist if dist.is_initialized(): obj_list = [self._lora_sync_dir] dist.broadcast_object_list(obj_list, src=0) self._lora_sync_dir = obj_list[0] self._lora_sync_version += 1 adapter_path = os.path.join(self._lora_sync_dir, f"v{self._lora_sync_version}") # Gather state dict from all ranks (FSDP/DeepSpeed gather, rank0_only) # All ranks must participate even though only rank 0 gets the result. # Use self.model_wrapped (the DeepSpeed/FSDP engine) for get_state_dict, # since it has the necessary hooks (e.g. zero_gather_16bit_weights_on_model_save). # self.vllm_generation.model is the unwrapped PEFT model which lacks these. 
def _maybe_sync_vllm_weights(self):
    """Push the current weights to vLLM when a sync is due.

    A sync is due when vLLM and async prefetch are both enabled, the current
    global step has not been synced yet, and the step is a multiple of
    ``args.vllm_sync_interval``. One of three strategies is used:

    - ``vllm_lora_sync``: write the adapter to disk and let vLLM load it
      (skipped at step 0, which is still recorded as synced)
    - PEFT model: no-merge sync that builds merged tensors and broadcasts them
    - anything else: the stock ``sync_weights`` (merge_adapter + NCCL)
    """
    if not self.use_vllm or not self.args.async_prefetch:
        return

    current_step = self.state.global_step
    sync_every = self.args.vllm_sync_interval
    if current_step == self._last_synced_step or current_step % sync_every != 0:
        return

    if getattr(self.args, "vllm_lora_sync", False):
        if current_step == 0:
            logger.info("Skipping LoRA sync at step 0 (no training yet)")
        else:
            # Native LoRA path: adapter goes through the filesystem.
            self._sync_lora_adapter()
    else:
        from accelerate.utils import is_peft_model

        if is_peft_model(self.vllm_generation.model):
            # Merged weights are produced as fresh tensors, so the base
            # weights are never modified in place and background generation
            # may keep running — no lock required.
            self._sync_peft_weights_no_merge()
        else:
            producer = self.data_producer
            if producer is not None and hasattr(producer, "_generate_lock"):
                # Stock sync must not overlap with generation.
                with producer._generate_lock:
                    self.vllm_generation.sync_weights()
            else:
                if self._async_queue is not None:
                    # Drain every queued generation future before syncing.
                    for queued in list(self._async_queue.queue):
                        if isinstance(queued, concurrent.futures.Future):
                            queued.result()
                self.vllm_generation.sync_weights()

    self._last_synced_step = current_step
Dispatches to one of three strategies: - vllm_lora_sync: saves adapter to filesystem, vLLM loads natively - PEFT no-merge: computes merged weights as new tensors, NCCL broadcast - Non-PEFT: stock sync_weights via merge_adapter + NCCL """ if not (self.use_vllm and self.args.async_prefetch): return step = self.state.global_step interval = self.args.vllm_sync_interval if step != self._last_synced_step and step % interval == 0: if getattr(self.args, "vllm_lora_sync", False): if step == 0: logger.info("Skipping LoRA sync at step 0 (no training yet)") self._last_synced_step = step return # Native LoRA sync: save adapter to filesystem, vLLM loads it directly self._sync_lora_adapter() else: from accelerate.utils import is_peft_model use_no_merge = is_peft_model(self.vllm_generation.model) if use_no_merge: # No-merge sync: computes merged weights as new tensors # (doesn't modify base weights in-place), so it's safe to # run concurrently with BG generation — no lock needed. self._sync_peft_weights_no_merge() else: # Non-PEFT: use stock sync (acquires lock to avoid overlap) if self.data_producer is not None and hasattr( self.data_producer, "_generate_lock" ): with self.data_producer._generate_lock: self.vllm_generation.sync_weights() elif self._async_queue is not None: pending = list(self._async_queue.queue) for f in pending: if isinstance(f, concurrent.futures.Future): f.result() self.vllm_generation.sync_weights() else: self.vllm_generation.sync_weights() self._last_synced_step = step def _zero_pad_embedding_for_fp8(self): """Zero out the pad token embedding for FP8 models. FP8 linear layers produce NaN when processing positions with attention_mask=0 (the hidden states at those positions have unconstrained values that overflow FP8 range during quantization). By setting the pad token embedding to zeros, padding positions start with zero hidden states and stay zero through masked attention, preventing NaN from FP8 matmul. 
""" model = self.accelerator.unwrap_model(self.model) # Check if model has FP8 weights has_fp8 = any( p.dtype == torch.float8_e4m3fn for p in model.parameters() if not p.requires_grad ) if not has_fp8: return # Find the embedding layer if hasattr(model, "model") and hasattr(model.model, "embed_tokens"): embed = model.model.embed_tokens elif hasattr(model, "base_model") and hasattr(model.base_model, "model"): m = model.base_model.model if hasattr(m, "model") and hasattr(m.model, "embed_tokens"): embed = m.model.embed_tokens else: return else: return pad_id = self.processing_class.pad_token_id if pad_id is not None and pad_id < embed.weight.shape[0]: with torch.no_grad(): embed.weight.data[pad_id].zero_() import logging logging.getLogger("async_grpo").info( f"Zeroed pad token embedding (id={pad_id}) for FP8 NaN prevention" ) # ------------------------------------------------------------------ # Background-thread generation (no scoring) # ------------------------------------------------------------------ def _generate_single_turn(self, prompts, **kwargs): """Override to prevent weight sync from background thread and to use no-merge sync for PEFT models (FP8 models can't merge_adapter).""" is_bg = threading.current_thread() is not threading.main_thread() saved_step = None if is_bg and self.use_vllm: # Trick: match _last_loaded_step so the stock sync check is a no-op saved_step = getattr(self, "_last_loaded_step", None) self._last_loaded_step = self.state.global_step # Permanently replace vllm_generation.sync_weights with our custom # sync to avoid merge_adapter (fails on FP8 / races with training). # For LoRA sync mode, make it a no-op here since _maybe_sync_vllm_weights # handles the sync with proper interval tracking. 
def _generate_single_turn(self, prompts, **kwargs):
    """Run the parent's single-turn generation with weight sync under our control.

    Two things happen before delegating to ``super()``:

    * On a background thread (with vLLM enabled) ``_last_loaded_step`` is
      temporarily set to the current global step so the stock sync check does
      nothing; the previous value is restored afterwards.
    * Once per trainer, ``vllm_generation.sync_weights`` is permanently
      replaced: by a no-op in LoRA-sync mode (``_maybe_sync_vllm_weights``
      drives that sync), or by the no-merge sync for PEFT models, since
      ``merge_adapter`` fails on FP8 and races with training.
    """
    running_in_background = (
        threading.current_thread() is not threading.main_thread()
    )

    restore_step = None
    if running_in_background and self.use_vllm:
        # Trick: match _last_loaded_step so the stock sync check is a no-op
        restore_step = getattr(self, "_last_loaded_step", None)
        self._last_loaded_step = self.state.global_step

    already_patched = getattr(self, "_patched_sync_weights", False)
    if not already_patched and self.use_vllm and hasattr(self, "vllm_generation"):
        replacement = None
        if getattr(self.args, "vllm_lora_sync", False):
            # No-op: LoRA sync is driven by _maybe_sync_vllm_weights

            def replacement():
                return None

        else:
            from accelerate.utils import is_peft_model

            if is_peft_model(self.vllm_generation.model):

                def replacement():
                    self._sync_peft_weights_no_merge()

        # Non-PEFT models keep the stock sync_weights and stay unpatched.
        if replacement is not None:
            self.vllm_generation.sync_weights = replacement
            self._patched_sync_weights = True

    try:
        result = super()._generate_single_turn(prompts, **kwargs)
    finally:
        if restore_step is not None:
            self._last_loaded_step = restore_step
    return result


def _generate_rank0_only(self, prompts):
    """Generate with vLLM directly, without any cross-rank collectives.

    Called from the background thread in FSDP mode. ``gather_object`` /
    ``broadcast_object_list`` are bypassed because the main thread may be
    running FSDP collectives at the same time.

    Returns the same tuple as ``_generate``: ``(prompt_ids, completion_ids,
    tool_mask, completions, total_completion_tokens, logprobs, extra_fields)``.
    """
    import copy

    prompts = copy.deepcopy(prompts)

    # Prompts arrive repeated num_generations times; keep one per group
    # (same as TRL's gather+unique pattern) and let vLLM sample n each.
    group_size = self.num_generations
    unique_prompts = prompts[::group_size]

    vg = self.vllm_generation
    min_p = vg.min_p
    if min_p is None:
        min_p = 0.0
    sampling_params = {
        "n": group_size,
        "repetition_penalty": vg.repetition_penalty,
        "temperature": vg.temperature,
        "top_p": vg.top_p,
        "top_k": vg.top_k,
        "min_p": min_p,
        "max_tokens": vg.max_completion_length,
        "logprobs": vg.logprobs,
        "structured_outputs_regex": vg.structured_outputs_regex,
        "generation_kwargs": vg.generation_kwargs,
    }

    # Call vLLM directly (no collectives)
    from trl.data_utils import is_conversational

    client = vg.vllm_client
    if is_conversational({"prompt": unique_prompts[0]}):
        output = client.chat(
            messages=unique_prompts,
            **sampling_params,
            chat_template_kwargs=vg.chat_template_kwargs,
            tools=vg.tools,
            chat_template=vg.chat_template,
        )
    else:
        output = client.generate(prompts=unique_prompts, **sampling_params)

    # vLLM returns one prompt_ids entry per unique prompt but num_generations
    # completions; repeat each prompt_ids entry so the two lists line up.
    prompt_ids = []
    for ids in output["prompt_ids"]:
        prompt_ids.extend([ids] * group_size)

    completion_ids = output["completion_ids"]
    logprobs_raw = output["logprobs"]

    reserved = {"prompt_ids", "completion_ids", "logprobs", "logprob_token_ids"}
    extra_fields = {}
    for key, value in output.items():
        if key not in reserved:
            extra_fields[key] = value

    # Keep only the top-1 logprob for every token.
    logprobs = []
    for sequence in logprobs_raw:
        logprobs.append([token_lps[0] for token_lps in sequence])

    # Decode completions; conversational prompts get assistant messages.
    if is_conversational({"prompt": prompts[0]}):
        decoded = self.processing_class.batch_decode(
            completion_ids, skip_special_tokens=True
        )
        completions = [
            [{"role": "assistant", "content": text}] for text in decoded
        ]
    else:
        completions = self.processing_class.batch_decode(
            completion_ids, skip_special_tokens=True
        )

    tool_mask = extra_fields.pop("env_mask", None)

    # Token total is computed locally (no gather).
    total_completion_tokens = 0
    for ids in completion_ids:
        total_completion_tokens += len(ids)

    return (
        prompt_ids,
        completion_ids,
        tool_mask,
        completions,
        total_completion_tokens,
        logprobs,
        extra_fields,
    )
def _generate_only(self, inputs, rank0_only=False):
    """Generate completions without scoring. Runs on background thread.

    Mirrors the first half of ``_generate_and_score_completions`` (prompt
    extraction → vLLM generation → tensor padding) and returns a deferred
    output dict for main-thread scoring.

    When ``rank0_only=True`` (FSDP mode), bypasses ``gather_object`` /
    ``broadcast_object_list`` collectives and calls vLLM directly on rank 0.
    Results are broadcast to other ranks on the main thread later.

    Args:
        inputs: list of dicts (one per sample), as yielded by the DataLoader
            with ``identity`` collate_fn. Each dict must have a ``"prompt"``
            key and may have ``"images"`` or ``"image"``. The dicts are
            updated in place with any ``extra_fields`` returned by generation.
        rank0_only: if true, generate through ``_generate_rank0_only``;
            otherwise through ``_generate`` followed by a ``gather_object``
            of ``inputs`` when more than one process is running.

    Returns:
        dict with padded ``prompt_ids`` / ``prompt_mask`` / ``completion_ids``
        / ``completion_mask`` tensors, ``num_items_in_batch``, a zero-filled
        ``advantages`` placeholder, and the ``_pending_policy_logps`` /
        ``_deferred_*`` / ``_rank0_only`` entries that the main-thread scorer
        consumes. ``sampling_per_token_logps``, ``tool_mask``, ``num_images``
        and multimodal tensors are added only when available.

    Raises:
        ValueError: if images are present but the prompts are not
            conversational.
    """
    device = self.accelerator.device
    prompts = [x["prompt"] for x in inputs]

    # --- Handle images (multimodal) ---
    # The first sample decides which key ("images" or "image") is read.
    if "images" in inputs[0]:
        images = [ex.get("images") for ex in inputs]
    elif "image" in inputs[0]:
        # Wrap a single image into a one-element list; missing image -> None.
        images = [
            [ex.get("image")] if ex.get("image") is not None else None
            for ex in inputs
        ]
    else:
        images = None
    # A batch where every sample has an empty image list is treated as text-only.
    if images is not None and all(img == [] for img in images):
        images = None

    if images is not None:
        if not is_conversational(inputs[0]):
            raise ValueError("Multimodal training requires conversational prompts.")
        prompts = [
            prepare_multimodal_messages(p, il)
            for p, il in zip(prompts, images, strict=True)
        ]

    # --- Generate completions ---
    if rank0_only:
        # FSDP mode: call vLLM directly without cross-rank collectives
        (
            prompt_ids_list,
            completion_ids_list,
            tool_mask_list,
            completions,
            num_items_in_batch,
            sampling_per_token_logps_list,
            extra_fields,
        ) = self._generate_rank0_only(prompts)
    else:
        (
            prompt_ids_list,
            completion_ids_list,
            tool_mask_list,
            completions,
            num_items_in_batch,
            sampling_per_token_logps_list,
            extra_fields,
        ) = self._generate(prompts)
        # _generate gathers prompts from all ranks internally. Gather inputs
        # to match the full-batch output size.
        if self.accelerator.num_processes > 1:
            from accelerate.utils import gather_object

            inputs = gather_object(inputs)
            prompts = [x["prompt"] for x in inputs]

    # --- Pad to tensors ---
    # Prompts are left-padded and completions right-padded, so the two can be
    # concatenated along dim=1 later with the real tokens adjacent.
    prompt_ids = [torch.tensor(ids, device=device) for ids in prompt_ids_list]
    prompt_mask = [torch.ones_like(ids, dtype=torch.long) for ids in prompt_ids]
    prompt_ids = pad(
        prompt_ids, padding_value=self.pad_token_id, padding_side="left"
    )
    prompt_mask = pad(prompt_mask, padding_value=0, padding_side="left")
    completion_ids = [
        torch.tensor(ids, device=device) for ids in completion_ids_list
    ]
    completion_mask = [
        torch.ones_like(ids, dtype=torch.long) for ids in completion_ids
    ]
    completion_ids = pad(
        completion_ids, padding_value=self.pad_token_id, padding_side="right"
    )
    completion_mask = pad(completion_mask, padding_value=0, padding_side="right")

    if sampling_per_token_logps_list is not None:
        sampling_logps = [
            torch.tensor(lp, device=device) for lp in sampling_per_token_logps_list
        ]
        sampling_per_token_logps = pad(
            sampling_logps, padding_value=0.0, padding_side="right"
        )
    else:
        sampling_per_token_logps = None

    if tool_mask_list is not None:
        tool_mask = [torch.tensor(m, device=device) for m in tool_mask_list]
        # Note the padding value is 1 here, unlike the 0 used for the masks above.
        tool_mask = pad(tool_mask, padding_value=1, padding_side="right")
    else:
        tool_mask = None

    # --- Mask truncated completions ---
    # A completion counts as truncated when its last token is neither EOS nor
    # pad; such rows get an all-zero completion mask (and tool mask).
    if self.mask_truncated_completions:
        eos_and_pad = [self.eos_token_id, self.pad_token_id]
        is_trunc = torch.tensor(
            [ids[-1] not in eos_and_pad for ids in completion_ids_list],
            device=device,
        )
        completion_mask = completion_mask * (~is_trunc).unsqueeze(1).int()
        if tool_mask is not None:
            tool_mask = tool_mask * (~is_trunc).unsqueeze(1).int()

    # --- Multimodal forward kwargs ---
    num_images = [len(il) for il in images] if images is not None else None
    if images is not None:
        prompts_text = [
            apply_chat_template(
                {"prompt": p},
                self.processing_class,
                tools=self.tools,
                **self.chat_template_kwargs,
            )["prompt"]
            for p in prompts
        ]
        prompt_inputs = self.processing_class(
            images=images, text=prompts_text, padding=True, return_tensors="pt"
        )
        # Keep everything the processor produced except the token ids and
        # attention mask; tensors are moved to the training device.
        forward_kwargs = {
            k: v.to(device) if isinstance(v, torch.Tensor) else v
            for k, v in prompt_inputs.items()
            if k not in ("input_ids", "attention_mask")
        }
    else:
        forward_kwargs = {}

    # Extend token_type_ids / mm_token_type_ids for completion tokens
    for ttid_key in ("token_type_ids", "mm_token_type_ids"):
        if ttid_key in forward_kwargs:
            tt = forward_kwargs[ttid_key]
            # Completion positions are appended as zeros along dim=1.
            forward_kwargs[ttid_key] = torch.cat(
                [tt, tt.new_zeros(completion_ids.shape)], dim=1
            )

    # Merge extra_fields from rollout_func into inputs
    # List values are distributed per sample by index (extra samples beyond
    # the list length are left untouched); non-list values go to every sample.
    if extra_fields:
        for i, inp in enumerate(inputs):
            for key, values in extra_fields.items():
                if isinstance(values, list) and i < len(values):
                    inp[key] = values[i]
                elif not isinstance(values, list):
                    inp[key] = values

    # No explicit CUDA sync needed here — both threads share the
    # default stream, so operations are naturally ordered.

    # --- Construct deferred output ---
    output = {
        "prompt_ids": prompt_ids,
        "prompt_mask": prompt_mask,
        "completion_ids": completion_ids,
        "completion_mask": completion_mask,
        "num_items_in_batch": num_items_in_batch,
        # Placeholder; the real advantages are filled in during scoring.
        "advantages": torch.zeros(completion_ids.size(0), device=device),
        # Sentinels for deferred scoring
        "_pending_policy_logps": True,
        "_deferred_inputs": inputs,
        "_deferred_prompts": prompts,
        "_deferred_completions": completions,
        "_deferred_completion_ids_list": completion_ids_list,
        "_rank0_only": rank0_only,
    }
    if sampling_per_token_logps is not None:
        output["sampling_per_token_logps"] = sampling_per_token_logps
    if tool_mask is not None:
        output["tool_mask"] = tool_mask
    if images is not None:
        output["num_images"] = num_images
    # forward_kwargs is empty for text-only batches, so nothing is copied then.
    for k in (
        "pixel_values",
        "image_grid_thw",
        "pixel_attention_mask",
        "image_sizes",
        "token_type_ids",
        "mm_token_type_ids",
    ):
        if k in forward_kwargs:
            output[k] = forward_kwargs[k]
    return output
------------------------------------------------------------------ def _compute_rewards_for_batch( self, inputs, prompts, completions, completion_ids_list ): """Compute rewards for a batch. Override for parallel workers, caching, etc.""" return self._calculate_rewards( inputs, prompts, completions, completion_ids_list ) def _launch_reward_workers(self, inputs, prompts, completions, completion_ids_list): """Launch reward computation in background. Override for parallel dispatch. Default: no-op (rewards computed synchronously in _collect_reward_workers). """ self._pending_reward_args = (inputs, prompts, completions, completion_ids_list) def _collect_reward_workers( self, inputs, prompts, completions, completion_ids_list ): """Collect reward results. Override to collect from parallel workers. Default: compute rewards synchronously now. """ args = getattr(self, "_pending_reward_args", None) if args is not None: self._pending_reward_args = None return self._compute_rewards_for_batch(*args) return self._compute_rewards_for_batch( inputs, prompts, completions, completion_ids_list ) def _post_advantage_hook( self, data: dict, rewards_per_func, advantages, inputs: list, num_generations: int, mode: str, s_start: int | None = None, s_end: int | None = None, is_last_chunk: bool = True, ) -> None: """Called after advantages are computed. Override for replay buffer, re-roll, etc.""" # ------------------------------------------------------------------ # Main-thread scoring # ------------------------------------------------------------------ @torch.no_grad() def _compute_deferred_scores(self, rollout: dict) -> dict: """Compute rewards, advantages, policy logprobs, and IS ratio on the main thread. Takes the deferred output from ``_generate_only`` and produces a fully scored dict ready for ``split_tensor_dict`` → micro-batches. 
""" device = self.accelerator.device batch_size = self.args.per_device_train_batch_size num_generations = self.num_generations mode = "train" # --- Extract deferred data --- data = rollout inputs = data.pop("_deferred_inputs") prompts = data.pop("_deferred_prompts") completions = data.pop("_deferred_completions") completion_ids_list = data.pop("_deferred_completion_ids_list") rank0_only = data.pop("_rank0_only", False) del data["_pending_policy_logps"] prompt_ids = data["prompt_ids"] completion_ids = data["completion_ids"] prompt_mask = data["prompt_mask"] completion_mask = data["completion_mask"] prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1) attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) logits_to_keep = completion_ids.size(1) # Multimodal forward kwargs forward_kwargs = {} for key in ( "pixel_values", "image_grid_thw", "pixel_attention_mask", "image_sizes", "token_type_ids", "mm_token_type_ids", ): if key in data: forward_kwargs[key] = data[key] num_images = data.get("num_images") # --- Launch rewards in parallel with logprobs --- self._launch_reward_workers(inputs, prompts, completions, completion_ids_list) # --- Policy logprobs --- logprob_batch_size = min(batch_size * 4, len(prompt_ids)) with disable_gradient_checkpointing( self.model, self.args.gradient_checkpointing_kwargs ): generate_every = self.args.steps_per_generation * self.num_iterations if self.args.gradient_accumulation_steps % generate_every != 0 or ( self.use_vllm and getattr(self, "vllm_importance_sampling_correction", False) ): old_per_token_logps, _ = self._get_per_token_logps_and_entropies( self.model, prompt_completion_ids, attention_mask, logits_to_keep, logprob_batch_size, num_images=num_images, **forward_kwargs, ) data["old_per_token_logps"] = old_per_token_logps else: old_per_token_logps = None # Reference model logprobs if self.beta != 0.0: if self.ref_model is not None: ref_logps, _ = self._get_per_token_logps_and_entropies( self.ref_model, 
@torch.no_grad()
def _compute_deferred_scores(self, rollout: dict) -> dict:
    """Compute rewards, advantages, policy logprobs, and IS ratio on the main thread.

    Takes the deferred output from ``_generate_only`` and produces a fully
    scored dict ready for ``split_tensor_dict`` → micro-batches.

    Args:
        rollout: deferred output dict. It is modified in place (the
            ``_deferred_*``, ``_rank0_only`` and ``_pending_policy_logps``
            entries are removed, scoring results are added) and the same
            object is returned.

    Returns:
        The scored dict. ``advantages`` is always written;
        ``old_per_token_logps``, ``ref_per_token_logps`` and
        ``importance_sampling_ratio`` are written only when their respective
        conditions below hold.

    Raises:
        KeyError: if a required ``_deferred_*`` / ``_pending_policy_logps``
            entry is missing from ``rollout``.
        ValueError: for an unknown ``scale_rewards`` or
            ``multi_objective_aggregation`` setting.

    Side effects: appends to ``self._metrics["train"]``, increases
    ``self.state.num_input_tokens_seen`` and extends ``self._logs``.
    """
    device = self.accelerator.device
    batch_size = self.args.per_device_train_batch_size
    num_generations = self.num_generations
    mode = "train"

    # --- Extract deferred data ---
    data = rollout
    inputs = data.pop("_deferred_inputs")
    prompts = data.pop("_deferred_prompts")
    completions = data.pop("_deferred_completions")
    completion_ids_list = data.pop("_deferred_completion_ids_list")
    rank0_only = data.pop("_rank0_only", False)
    del data["_pending_policy_logps"]

    prompt_ids = data["prompt_ids"]
    completion_ids = data["completion_ids"]
    prompt_mask = data["prompt_mask"]
    completion_mask = data["completion_mask"]
    prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1)
    attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
    # Only the completion part of each sequence is scored.
    logits_to_keep = completion_ids.size(1)

    # Multimodal forward kwargs
    forward_kwargs = {}
    for key in (
        "pixel_values",
        "image_grid_thw",
        "pixel_attention_mask",
        "image_sizes",
        "token_type_ids",
        "mm_token_type_ids",
    ):
        if key in data:
            forward_kwargs[key] = data[key]
    num_images = data.get("num_images")

    # --- Launch rewards in parallel with logprobs ---
    self._launch_reward_workers(inputs, prompts, completions, completion_ids_list)

    # --- Policy logprobs ---
    logprob_batch_size = min(batch_size * 4, len(prompt_ids))
    with disable_gradient_checkpointing(
        self.model, self.args.gradient_checkpointing_kwargs
    ):
        generate_every = self.args.steps_per_generation * self.num_iterations
        # Old-policy logprobs are computed when gradient accumulation is not a
        # multiple of the generation period, or when vLLM importance-sampling
        # correction is on; otherwise they are left as None.
        if self.args.gradient_accumulation_steps % generate_every != 0 or (
            self.use_vllm
            and getattr(self, "vllm_importance_sampling_correction", False)
        ):
            old_per_token_logps, _ = self._get_per_token_logps_and_entropies(
                self.model,
                prompt_completion_ids,
                attention_mask,
                logits_to_keep,
                logprob_batch_size,
                num_images=num_images,
                **forward_kwargs,
            )
            data["old_per_token_logps"] = old_per_token_logps
        else:
            old_per_token_logps = None

        # Reference model logprobs
        if self.beta != 0.0:
            if self.ref_model is not None:
                ref_logps, _ = self._get_per_token_logps_and_entropies(
                    self.ref_model,
                    prompt_completion_ids,
                    attention_mask,
                    logits_to_keep,
                    batch_size,
                    num_images=num_images,
                    **forward_kwargs,
                )
            else:
                # No separate reference model: run the policy model inside
                # use_adapter, selecting the "ref" adapter when the unwrapped
                # model has one configured and passing None otherwise.
                unwrapped = self.accelerator.unwrap_model(self.model)
                adapter_name = (
                    "ref"
                    if hasattr(unwrapped, "peft_config")
                    and "ref" in unwrapped.peft_config
                    else None
                )
                with use_adapter(unwrapped, adapter_name=adapter_name):
                    ref_logps, _ = self._get_per_token_logps_and_entropies(
                        self.model,
                        prompt_completion_ids,
                        attention_mask,
                        logits_to_keep,
                        batch_size,
                        num_images=num_images,
                        **forward_kwargs,
                    )
            data["ref_per_token_logps"] = ref_logps

    # --- IS ratio ---
    if (
        self.use_vllm
        and getattr(self, "vllm_importance_sampling_correction", False)
        and old_per_token_logps is not None
        and "sampling_per_token_logps" in data
    ):
        sampling_logps = data["sampling_per_token_logps"]
        is_mask = (
            completion_mask
            if "tool_mask" not in data
            else completion_mask * data["tool_mask"]
        )
        per_token_logps_diff = (old_per_token_logps - sampling_logps) * is_mask
        is_mode = getattr(self, "vllm_importance_sampling_mode", "token_truncate")
        is_cap = getattr(self, "vllm_importance_sampling_cap", 3.0)
        # Sequence-level modes sum the log-ratio over tokens (one ratio per
        # row); token-level modes keep one ratio per token.
        sequence_level_is = is_mode in ("sequence_mask", "sequence_truncate")
        if sequence_level_is:
            logps_diff = per_token_logps_diff.sum(dim=-1, keepdim=True)
        else:
            logps_diff = per_token_logps_diff
        is_ratio = torch.exp(logps_diff)
        # "*_truncate" clamps ratios at the cap; "*_mask" zeroes ratios above it.
        if is_mode in ("sequence_truncate", "token_truncate"):
            is_ratio = torch.clamp(is_ratio, max=is_cap)
        elif is_mode in ("sequence_mask", "token_mask"):
            is_ratio = is_ratio.masked_fill(is_ratio > is_cap, value=0.0)
        data["importance_sampling_ratio"] = is_ratio

    # --- Collect rewards (launched before logprobs, should be done) ---
    rewards_per_func = self._collect_reward_workers(
        inputs, prompts, completions, completion_ids_list
    )

    # In rank0_only mode, all ranks compute the same rewards on identical data.
    # _calculate_rewards / _collect_reward_workers always `gather()` across ranks,
    # which duplicates the rows (N_local * num_processes). De-duplicate so that
    # rewards_per_func matches the data dict (which has N_local rows).
    if rank0_only and rewards_per_func.size(0) > len(prompts):
        rewards_per_func = rewards_per_func[: len(prompts)]

    # --- Advantages ---
    if self.multi_objective_aggregation == "sum_then_normalize":
        # Weighted sum over reward functions first, then normalise per group.
        rewards = (
            rewards_per_func * self.reward_weights.to(device).unsqueeze(0)
        ).nansum(dim=1)
        mean_grouped = (
            rewards.view(-1, num_generations)
            .mean(dim=1)
            .repeat_interleave(num_generations)
        )
        if self.scale_rewards in ("group", "none"):
            # With "none" the group std is still computed; it is only used
            # for the zero-std metric, not for scaling.
            if num_generations > 1:
                std_rewards = (
                    rewards.view(-1, num_generations)
                    .std(dim=1)
                    .repeat_interleave(num_generations)
                )
            else:
                std_rewards = torch.zeros_like(rewards)
        elif self.scale_rewards == "batch":
            std_rewards = (
                rewards.std().expand_as(rewards)
                if rewards.numel() > 1
                else torch.zeros_like(rewards)
            )
        else:
            raise ValueError(f"Invalid scale_rewards: {self.scale_rewards}")
        advantages = rewards - mean_grouped
        if self.scale_rewards != "none":
            advantages = advantages / (std_rewards + 1e-4)
        is_std_zero = torch.isclose(std_rewards, torch.zeros_like(std_rewards))
    elif self.multi_objective_aggregation == "normalize_then_sum":
        # Normalise each reward function within its group first, then take
        # the weighted sum and standardise over the whole batch.
        grouped = rewards_per_func.view(-1, num_generations, len(self.reward_funcs))
        mean_k = torch.nanmean(grouped, dim=1, keepdim=True)
        std_k = (
            nanstd(grouped, dim=1, keepdim=True)
            if num_generations > 1
            else torch.zeros_like(mean_k)
        )
        reward_k = (grouped - mean_k) / (std_k + 1e-4)
        reward_k = reward_k.view(-1, len(self.reward_funcs))
        rewards = (reward_k * self.reward_weights.to(device).unsqueeze(0)).nansum(
            dim=1
        )
        std_rewards = (
            rewards.std().expand_as(rewards)
            if rewards.numel() > 1
            else torch.zeros_like(rewards)
        )
        advantages = (rewards - rewards.mean()) / (std_rewards + 1e-4)
        is_std_zero = torch.isclose(std_rewards, torch.zeros_like(std_rewards))
    else:
        raise ValueError(
            f"Invalid multi_objective_aggregation: {self.multi_objective_aggregation}"
        )

    # Slice for local process
    # In rank0_only mode, all ranks already have identical data from broadcast,
    # so no slicing needed. Otherwise, each rank takes its portion.
    if rank0_only:
        process_slice = slice(0, len(prompts))
    else:
        process_slice = slice(
            self.accelerator.process_index * len(prompts),
            (self.accelerator.process_index + 1) * len(prompts),
        )
    # Keep the unsliced advantages for the text/advantage logs below.
    all_advantages = advantages.clone()
    advantages = advantages[process_slice]
    data["advantages"] = advantages

    # --- Post-advantage hook (for replay buffer, re-roll, etc.) ---
    self._post_advantage_hook(
        data,
        rewards_per_func,
        advantages,
        inputs,
        num_generations,
        mode,
    )

    # --- Metrics ---
    for i, name in enumerate(self.reward_func_names):
        self._metrics[mode][f"rewards/{name}/mean"].append(
            torch.nanmean(rewards_per_func[:, i]).item()
        )
        self._metrics[mode][f"rewards/{name}/std"].append(
            nanstd(rewards_per_func[:, i]).item()
        )
    # NOTE(review): this aggregate is the unweighted nansum over reward
    # functions, while the advantages above use self.reward_weights — confirm
    # that the logged "reward" is meant to be unweighted.
    agg_rewards = rewards_per_func.nansum(dim=1)
    self._metrics[mode]["reward"].append(agg_rewards.mean().item())
    self._metrics[mode]["reward_std"].append(agg_rewards.std().item())
    self._metrics[mode]["frac_reward_zero_std"].append(
        is_std_zero.float().mean().item()
    )

    # Token counting
    total_prompt = self.accelerator.gather(prompt_mask.sum()).sum()
    total_completion = self.accelerator.gather(completion_mask.sum()).sum()
    self.state.num_input_tokens_seen += (total_prompt + total_completion).item()
    # Replaced (not appended): holds only the running total.
    self._metrics[mode]["num_tokens"] = [self.state.num_input_tokens_seen]

    # Completion length metrics
    comp_lengths = completion_mask.sum(dim=1)
    agg_lengths = self.accelerator.gather(comp_lengths)
    self._metrics[mode]["completions/mean_length"].append(
        agg_lengths.float().mean().item()
    )
    self._metrics[mode]["completions/min_length"].append(
        agg_lengths.float().min().item()
    )
    self._metrics[mode]["completions/max_length"].append(
        agg_lengths.float().max().item()
    )

    # A row counts as clipped when the last column of the padded completion
    # tensor is neither EOS nor pad, i.e. the completion fills the full width
    # without terminating.
    eos_and_pad = [self.eos_token_id, self.pad_token_id]
    is_trunc = torch.tensor(
        [ids[-1].item() not in eos_and_pad for ids in completion_ids], device=device
    )
    agg_trunc = self.accelerator.gather(is_trunc)
    self._metrics[mode]["completions/clipped_ratio"].append(
        agg_trunc.float().mean().item()
    )
    term_lengths = agg_lengths[~agg_trunc]
    # Avoid reducing over an empty tensor when every completion was clipped.
    if len(term_lengths) == 0:
        term_lengths = torch.zeros(1, device=device)
    self._metrics[mode]["completions/mean_terminated_length"].append(
        term_lengths.float().mean().item()
    )
    self._metrics[mode]["completions/min_terminated_length"].append(
        term_lengths.float().min().item()
    )
    self._metrics[mode]["completions/max_terminated_length"].append(
        term_lengths.float().max().item()
    )

    # IS metrics
    if "importance_sampling_ratio" in data and "sampling_per_token_logps" in data:
        old_lp = data["old_per_token_logps"]
        samp_lp = data["sampling_per_token_logps"]
        mask = completion_mask.bool()
        delta = torch.abs(old_lp - samp_lp)
        delta_m = delta[mask]
        # Fall back to 0.0 when the mask selects no tokens.
        md = (
            torch.mean(delta_m)
            if delta_m.numel() > 0
            else torch.tensor(0.0, device=device)
        )
        xd = (
            torch.max(delta_m)
            if delta_m.numel() > 0
            else torch.tensor(0.0, device=device)
        )
        self._metrics[mode]["sampling/sampling_logp_difference/mean"].append(
            self.accelerator.gather(md).mean().item()
        )
        self._metrics[mode]["sampling/sampling_logp_difference/max"].append(
            self.accelerator.gather(xd).max().item()
        )
        isr = data["importance_sampling_ratio"]
        is_mode = getattr(self, "vllm_importance_sampling_mode", "token_truncate")
        # Sequence-level ratios have one value per row, so they are flattened
        # rather than indexed with the per-token mask.
        if is_mode in ("sequence_mask", "sequence_truncate"):
            flat_isr = isr.flatten()
        else:
            flat_isr = isr[mask]
        if flat_isr.numel() > 0:
            self._metrics[mode]["sampling/importance_sampling_ratio/min"].append(
                nanmin(self.accelerator.gather(torch.min(flat_isr))).item()
            )
            self._metrics[mode]["sampling/importance_sampling_ratio/mean"].append(
                self.accelerator.gather(torch.mean(flat_isr)).nanmean().item()
            )
            self._metrics[mode]["sampling/importance_sampling_ratio/max"].append(
                nanmax(self.accelerator.gather(torch.max(flat_isr))).item()
            )

    # Log prompt/completion texts
    prompts_text = self.processing_class.batch_decode(
        prompt_ids, skip_special_tokens=True
    )
    completions_text = self.processing_class.batch_decode(
        completion_ids, skip_special_tokens=True
    )
    if gather_object is not None:
        self._logs["prompt"].extend(gather_object(prompts_text))
        self._logs["completion"].extend(gather_object(completions_text))
    for i, name in enumerate(self.reward_func_names):
        self._logs["rewards"][name].extend(rewards_per_func[:, i].tolist())
    self._logs["advantages"].extend(all_advantages.tolist())

    # Remove deferred keys
    # (Defensive: the known deferred keys were already popped at the top.)
    for k in list(data.keys()):
        if k.startswith("_deferred") or k == "_pending_policy_logps":
            data.pop(k, None)

    return data
completion_ids, skip_special_tokens=True ) if gather_object is not None: self._logs["prompt"].extend(gather_object(prompts_text)) self._logs["completion"].extend(gather_object(completions_text)) for i, name in enumerate(self.reward_func_names): self._logs["rewards"][name].extend(rewards_per_func[:, i].tolist()) self._logs["advantages"].extend(all_advantages.tolist()) # Remove deferred keys for k in list(data.keys()): if k.startswith("_deferred") or k == "_pending_policy_logps": data.pop(k, None) return data @torch.no_grad() def _compute_streaming_group_scores( self, data, s_start, s_end, inputs, prompts, completions, completion_ids_list, is_last_chunk, rank0_only=False, ): """Score a chunk of prompt groups: rewards, policy logprobs, advantages. Called during streaming scoring to incrementally score groups. Writes results directly into ``data`` at positions ``s_start:s_end``. """ device = self.accelerator.device batch_size = self.args.per_device_train_batch_size num_generations = self.num_generations mode = "train" chunk_size = s_end - s_start # --- Policy logprobs for this chunk --- chunk_prompt_ids = data["prompt_ids"][s_start:s_end] chunk_completion_ids = data["completion_ids"][s_start:s_end] chunk_prompt_mask = data["prompt_mask"][s_start:s_end] chunk_completion_mask = data["completion_mask"][s_start:s_end] prompt_completion_ids = torch.cat( [chunk_prompt_ids, chunk_completion_ids], dim=1 ) attention_mask = torch.cat([chunk_prompt_mask, chunk_completion_mask], dim=1) logits_to_keep = chunk_completion_ids.size(1) # Slice multimodal forward kwargs for this chunk forward_kwargs = {} for key in ( "pixel_values", "image_grid_thw", "pixel_attention_mask", "image_sizes", "token_type_ids", "mm_token_type_ids", ): if key in data: val = data[key] if ( isinstance(val, torch.Tensor) and val.dim() > 0 and val.size(0) == len(data["prompt_ids"]) ): forward_kwargs[key] = val[s_start:s_end] else: forward_kwargs[key] = val num_images = data.get("num_images") if ( num_images is 
not None and hasattr(num_images, "__getitem__") and len(num_images) == len(data["prompt_ids"]) ): num_images = num_images[s_start:s_end] # --- Launch rewards in parallel with logprobs --- self._launch_reward_workers(inputs, prompts, completions, completion_ids_list) # --- Policy logprobs for this chunk (GPU, overlaps with BG rewards) --- logprob_batch_size = min(batch_size * 2, chunk_size) with disable_gradient_checkpointing( self.model, self.args.gradient_checkpointing_kwargs ): generate_every = self.args.steps_per_generation * self.num_iterations if self.args.gradient_accumulation_steps % generate_every != 0 or ( self.use_vllm and getattr(self, "vllm_importance_sampling_correction", False) ): old_logps, _ = self._get_per_token_logps_and_entropies( self.model, prompt_completion_ids, attention_mask, logits_to_keep, logprob_batch_size, num_images=num_images, **forward_kwargs, ) if "old_per_token_logps" not in data: total = len(data["prompt_ids"]) data["old_per_token_logps"] = torch.zeros( total, old_logps.size(1), device=device, dtype=old_logps.dtype ) data["old_per_token_logps"][s_start:s_end] = old_logps # Compute IS ratio for this chunk if "sampling_per_token_logps" in data: samp_chunk = data["sampling_per_token_logps"][s_start:s_end] is_mask = ( chunk_completion_mask if "tool_mask" not in data else (chunk_completion_mask * data["tool_mask"][s_start:s_end]) ) diff = (old_logps - samp_chunk) * is_mask is_mode = getattr( self, "vllm_importance_sampling_mode", "token_truncate" ) is_cap = getattr(self, "vllm_importance_sampling_cap", 3.0) seq_is = is_mode in ("sequence_mask", "sequence_truncate") logps_diff = diff.sum(dim=-1, keepdim=True) if seq_is else diff is_ratio = torch.exp(logps_diff) if is_mode in ("sequence_truncate", "token_truncate"): is_ratio = torch.clamp(is_ratio, max=is_cap) elif is_mode in ("sequence_mask", "token_mask"): is_ratio = is_ratio.masked_fill(is_ratio > is_cap, value=0.0) if "importance_sampling_ratio" not in data: total = 
len(data["prompt_ids"]) shape = (total, 1) if seq_is else (total, is_ratio.size(1)) data["importance_sampling_ratio"] = torch.ones( *shape, device=device, dtype=is_ratio.dtype ) data["importance_sampling_ratio"][s_start:s_end] = is_ratio # Reference logprobs if self.beta != 0.0: if self.ref_model is not None: ref_logps, _ = self._get_per_token_logps_and_entropies( self.ref_model, prompt_completion_ids, attention_mask, logits_to_keep, batch_size, num_images=num_images, **forward_kwargs, ) else: unwrapped = self.accelerator.unwrap_model(self.model) adapter_name = ( "ref" if hasattr(unwrapped, "peft_config") and "ref" in unwrapped.peft_config else None ) with use_adapter(unwrapped, adapter_name=adapter_name): ref_logps, _ = self._get_per_token_logps_and_entropies( self.model, prompt_completion_ids, attention_mask, logits_to_keep, batch_size, num_images=num_images, **forward_kwargs, ) if "ref_per_token_logps" not in data: total = len(data["prompt_ids"]) data["ref_per_token_logps"] = torch.zeros( total, ref_logps.size(1), device=device, dtype=ref_logps.dtype ) data["ref_per_token_logps"][s_start:s_end] = ref_logps # --- Collect rewards (should already be done, ran in parallel with logprobs) --- rewards_per_func = self._collect_reward_workers( inputs, prompts, completions, completion_ids_list ) # De-duplicate gathered rewards when all ranks computed the same data. # _calculate_rewards always gather()s, which duplicates rows in rank0_only mode. 
if rewards_per_func.size(0) > chunk_size: rewards_per_func = rewards_per_func[:chunk_size] # --- Advantages (group-level normalization) --- if self.multi_objective_aggregation == "sum_then_normalize": rewards = ( rewards_per_func * self.reward_weights.to(device).unsqueeze(0) ).nansum(dim=1) mean_g = ( rewards.view(-1, num_generations) .mean(dim=1) .repeat_interleave(num_generations) ) if num_generations > 1: std_r = ( rewards.view(-1, num_generations) .std(dim=1) .repeat_interleave(num_generations) ) else: std_r = torch.zeros_like(rewards) advantages = rewards - mean_g if self.scale_rewards != "none": advantages = advantages / (std_r + 1e-4) is_std_zero = torch.isclose(std_r, torch.zeros_like(std_r)) elif self.multi_objective_aggregation == "normalize_then_sum": grouped = rewards_per_func.view(-1, num_generations, len(self.reward_funcs)) mean_k = torch.nanmean(grouped, dim=1, keepdim=True) std_k = ( nanstd(grouped, dim=1, keepdim=True) if num_generations > 1 else torch.zeros_like(mean_k) ) reward_k = ((grouped - mean_k) / (std_k + 1e-4)).view( -1, len(self.reward_funcs) ) rewards = (reward_k * self.reward_weights.to(device).unsqueeze(0)).nansum( dim=1 ) std_r = ( rewards.view(-1, num_generations) .std(dim=1) .repeat_interleave(num_generations) ) mean_r = ( rewards.view(-1, num_generations) .mean(dim=1) .repeat_interleave(num_generations) ) advantages = (rewards - mean_r) / (std_r + 1e-4) is_std_zero = torch.isclose(std_r, torch.zeros_like(std_r)) else: raise ValueError( f"Invalid multi_objective_aggregation: {self.multi_objective_aggregation}" ) if rank0_only: process_slice = slice(0, len(prompts)) else: process_slice = slice( self.accelerator.process_index * len(prompts), (self.accelerator.process_index + 1) * len(prompts), ) advantages = advantages[process_slice] if "advantages" not in data or not isinstance(data["advantages"], torch.Tensor): data["advantages"] = torch.zeros(len(data["prompt_ids"]), device=device) data["advantages"][s_start:s_end] = advantages # 
--- Post-advantage hook (for replay buffer, re-roll, etc.) --- self._post_advantage_hook( data, rewards_per_func, advantages, inputs, num_generations, mode, s_start=s_start, s_end=s_end, is_last_chunk=is_last_chunk, ) # --- Chunk metrics --- for i, name in enumerate(self.reward_func_names): self._metrics[mode][f"rewards/{name}/mean"].append( torch.nanmean(rewards_per_func[:, i]).item() ) self._metrics[mode][f"rewards/{name}/std"].append( nanstd(rewards_per_func[:, i]).item() ) agg_rewards = rewards_per_func.nansum(dim=1) self._metrics[mode]["reward"].append(agg_rewards.mean().item()) self._metrics[mode]["reward_std"].append(agg_rewards.std().item()) self._metrics[mode]["frac_reward_zero_std"].append( is_std_zero.float().mean().item() ) # --- Full-batch metrics on last chunk --- if is_last_chunk: all_prompt_mask = data["prompt_mask"] all_completion_mask = data["completion_mask"] all_completion_ids = data["completion_ids"] total_p = self.accelerator.gather(all_prompt_mask.sum()).sum() total_c = self.accelerator.gather(all_completion_mask.sum()).sum() self.state.num_input_tokens_seen += (total_p + total_c).item() self._metrics[mode]["num_tokens"] = [self.state.num_input_tokens_seen] comp_lengths = all_completion_mask.sum(dim=1) agg_lengths = self.accelerator.gather(comp_lengths) self._metrics[mode]["completions/mean_length"].append( agg_lengths.float().mean().item() ) self._metrics[mode]["completions/min_length"].append( agg_lengths.float().min().item() ) self._metrics[mode]["completions/max_length"].append( agg_lengths.float().max().item() ) eos_and_pad = [self.eos_token_id, self.pad_token_id] is_trunc = torch.tensor( [ids[-1].item() not in eos_and_pad for ids in all_completion_ids], device=device, ) agg_trunc = self.accelerator.gather(is_trunc) self._metrics[mode]["completions/clipped_ratio"].append( agg_trunc.float().mean().item() ) term = agg_lengths[~agg_trunc] if len(term) == 0: term = torch.zeros(1, device=device) 
def _score_streaming(self, rollout: dict) -> list[dict]:
    """Score a rollout chunk-by-chunk and cut each scored chunk into micro-batches.

    Args:
        rollout: Rollout dict holding the batched tensors plus the deferred
            scoring inputs under ``_deferred_inputs``, ``_deferred_prompts``,
            ``_deferred_completions`` and ``_deferred_completion_ids_list``.
            The dict is modified in place: the deferred entries and the
            ``_rank0_only`` / ``_pending_policy_logps`` flags are removed, and
            ``_compute_streaming_group_scores`` writes its results into it.

    Returns:
        The micro-batches of all chunks, in chunk order, with the whole list
        repeated ``self.num_iterations`` times (the repeats reference the same
        dict objects).
    """
    data = rollout
    num_gen = self.num_generations
    n_groups = len(data["prompt_ids"]) // num_gen
    batch_size = self.args.per_device_train_batch_size
    # A chunk always contains at least one full prompt group.
    min_groups = max(1, self.args.streaming_min_groups)

    # Extract deferred data
    inputs = data.pop("_deferred_inputs")
    prompts = data.pop("_deferred_prompts")
    completions = data.pop("_deferred_completions")
    completion_ids_list = data.pop("_deferred_completion_ids_list")
    rank0_only = data.pop("_rank0_only", False)
    # The flag is bookkeeping only. Callers such as the legacy async path do
    # not check for it before calling, so tolerate its absence instead of
    # raising KeyError.
    data.pop("_pending_policy_logps", None)

    all_micro_batches = []
    # Keys handed to every micro-batch unchanged, even when they are tensors.
    shared_keys = {"num_items_in_batch"}

    for chunk_start_g in range(0, n_groups, min_groups):
        chunk_end_g = min(chunk_start_g + min_groups, n_groups)
        s_start = chunk_start_g * num_gen
        s_end = chunk_end_g * num_gen

        self._compute_streaming_group_scores(
            data=data,
            s_start=s_start,
            s_end=s_end,
            inputs=inputs[s_start:s_end],
            prompts=prompts[s_start:s_end],
            completions=completions[s_start:s_end],
            completion_ids_list=completion_ids_list[s_start:s_end],
            is_last_chunk=(chunk_end_g == n_groups),
            rank0_only=rank0_only,
        )

        # Build micro-batches from this scored chunk. Samples are shuffled
        # within the chunk only, so a micro-batch never mixes chunks.
        chunk_size = s_end - s_start
        perm = torch.randperm(chunk_size)
        for mb_off in range(0, chunk_size, batch_size):
            abs_idx = perm[mb_off : mb_off + batch_size] + s_start
            mb = {}
            for key, val in data.items():
                if key.startswith("_"):
                    # Private bookkeeping entries never reach the loss.
                    continue
                if (
                    key not in shared_keys
                    and isinstance(val, torch.Tensor)
                    and val.dim() > 0
                ):
                    # Per-sample tensor: select this micro-batch's rows.
                    mb[key] = val[abs_idx]
                else:
                    mb[key] = val
            all_micro_batches.append(mb)

    # Repeat for num_iterations
    return all_micro_batches * self.num_iterations
def _prepare_inputs_data_producer(self, generation_batch):
    """Serve the next micro-batch, producing and scoring a new rollout when the buffer is empty.

    ``generation_batch`` is accepted for signature compatibility and is not
    read: prompts come from ``self.data_producer``.
    """
    # Micro-batches left over from the previous rollout are served first.
    pending = self._buffered_inputs
    if pending:
        return pending.pop(0)

    # Buffer exhausted: produce a fresh rollout.
    self._maybe_sync_vllm_weights()
    produced = self.data_producer.produce(
        self.model,
        self.state.global_step,
        processing_class=self.processing_class,
        accelerator=self.accelerator,
        args=self.args,
    )
    # The scoring/splitting helpers work on the plain dict behind the dataset.
    rollout = produced._data

    # A truthy flag means policy logps were skipped during (async) production
    # and still have to be computed here on the main thread.
    needs_scoring = rollout.get("_pending_policy_logps")

    if needs_scoring and self.args.streaming_partial_batch:
        micro_batches = self._score_streaming(rollout)
    else:
        if needs_scoring:
            staged = self._compute_deferred_scores(rollout)
            staged = split_pixel_values_by_grid(staged)
            staged = shuffle_sequence_dict(staged)
        else:
            # Sync path: the rollout arrives fully scored.
            staged = split_pixel_values_by_grid(rollout)
        pieces = split_tensor_dict(staged, self.args.steps_per_generation)
        micro_batches = [unsplit_pixel_values_by_grid(piece) for piece in pieces]
        micro_batches = micro_batches * self.num_iterations

    # Hand out the first micro-batch now and keep the rest for later calls.
    self._buffered_inputs = micro_batches[1:]
    return micro_batches[0]
self._maybe_sync_vllm_weights() future = self._async_queue.get() rollout = future.result() self._submit_generation() if self.args.streaming_partial_batch: micro_batches = self._score_streaming(rollout) else: scored = self._compute_deferred_scores(rollout) scored = split_pixel_values_by_grid(scored) scored = shuffle_sequence_dict(scored) batches = split_tensor_dict(scored, self.args.steps_per_generation) micro_batches = [unsplit_pixel_values_by_grid(b) for b in batches] micro_batches = micro_batches * self.num_iterations self._buffered_inputs = micro_batches[1:] # Release cached CUDA memory from scoring # before training allocations begin, reducing peak reserved memory. torch.cuda.empty_cache() return micro_batches[0] @profiling_decorator def _get_per_token_logps_and_entropies( self, model, input_ids, attention_mask, logits_to_keep, batch_size=None, compute_entropy=False, pixel_values=None, image_grid_thw=None, num_images=None, pixel_attention_mask=None, image_sizes=None, token_type_ids=None, mm_token_type_ids=None, ) -> tuple[Any, torch.Tensor | None]: """Compute log-probs and (optionally) entropies for each token. When running under no_grad (scoring path), bypasses accelerate's ConvertOutputsToFp32 wrapper to avoid a fp32 copy of the logits tensor. """ # Bypass accelerate's ConvertOutputsToFp32 wrapper which converts the # entire (B, L, V) logits tensor from bf16 to fp32 — unnecessary and # extremely wasteful for large vocabularies. # Skip unwrapping for FSDP — parameters are only valid inside FSDP's # forward context; unwrapping exposes flattened/sharded tensors. if not self.is_fsdp_enabled: model = self.accelerator.unwrap_model(model, keep_fp32_wrapper=False) autocast_ctx = torch.autocast( device_type=input_ids.device.type, dtype=torch.bfloat16 ) # Use Liger's Triton kernel in scoring path (no grad): fuses # temperature + log_softmax + gather into a single kernel pass. 
use_fused = ( self.use_liger_kernel and _fused_selective_log_softmax is not None and not torch.is_grad_enabled() ) batch_size = batch_size or input_ids.size(0) all_logps = [] all_entropies = [] with autocast_ctx: for start in range(0, input_ids.size(0), batch_size): input_ids_batch = input_ids[start : start + batch_size] attention_mask_batch = attention_mask[start : start + batch_size] # Build model inputs model_inputs = { "input_ids": input_ids_batch, "attention_mask": attention_mask_batch, } if image_grid_thw is not None and pixel_values is not None: rows_per_image = image_grid_thw.prod(dim=-1) rows_per_sample = torch.split(rows_per_image, num_images) rows_per_sample = torch.stack([s.sum() for s in rows_per_sample]) cum_rows = torch.cat( [ torch.tensor([0], device=rows_per_sample.device), rows_per_sample.cumsum(0), ] ) row_start, row_end = ( cum_rows[start].item(), cum_rows[start + batch_size].item(), ) model_inputs["pixel_values"] = pixel_values[row_start:row_end] cum_imgs = torch.tensor([0] + num_images).cumsum(0) img_start, img_end = cum_imgs[start], cum_imgs[start + batch_size] model_inputs["image_grid_thw"] = image_grid_thw[img_start:img_end] elif pixel_values is not None: model_inputs["pixel_values"] = pixel_values[ start : start + batch_size ] if pixel_attention_mask is not None: model_inputs["pixel_attention_mask"] = pixel_attention_mask[ start : start + batch_size ] if image_sizes is not None: model_inputs["image_sizes"] = image_sizes[ start : start + batch_size ] if token_type_ids is not None: model_inputs["token_type_ids"] = token_type_ids[ start : start + batch_size ] if mm_token_type_ids is not None: model_inputs["mm_token_type_ids"] = mm_token_type_ids[ start : start + batch_size ] if "logits_to_keep" in self.model_kwarg_keys: model_inputs["logits_to_keep"] = logits_to_keep + 1 model_inputs["use_cache"] = False logits = model(**model_inputs).logits completion_ids = input_ids_batch[:, -logits_to_keep:] # FP8 models produce NaN logits at positions 
@staticmethod
def get_off_policy_mask(
    advantages,
    per_token_logps,
    sampling_per_token_logps,
    mask,
    off_policy_threshold,
):
    """OPSM from DeepSeek-V3.2: drop sequences with negative advantage + high KL.

    Args:
        advantages: Per-sequence advantages, shaped ``(B, 1)`` or ``(B,)``.
        per_token_logps: ``(B, T)`` log-probs under the current policy;
            detached here, so no gradient flows through the mask.
        sampling_per_token_logps: ``(B, T)`` log-probs recorded when the
            completions were sampled.
        mask: ``(B, T)`` mask selecting the tokens that count.
        off_policy_threshold: Largest mean per-token log-prob gap
            (sampling minus current) a negative-advantage sequence may have
            and still be kept.

    Returns:
        A ``(B, 1)`` tensor of ``mask.dtype``: 1 where the sequence is kept
        (advantage >= 0, or mean gap <= threshold), 0 where it is dropped.
    """
    # A flat (B,) advantage vector would broadcast against the (B, 1) KL
    # tensor below into a (B, B) mask; normalize to one column per sequence.
    if advantages.dim() == 1:
        advantages = advantages.unsqueeze(1)

    kl_div = sampling_per_token_logps - per_token_logps.detach()
    # Mean over the masked tokens; the clamp guards fully-masked rows.
    token_counts = mask.sum(dim=1, keepdim=True).clamp(min=1.0)
    seq_kl = (kl_div * mask).sum(dim=1, keepdim=True) / token_counts

    is_pos_adv = advantages >= 0
    is_low_kl = seq_kl <= off_policy_threshold
    return (is_pos_adv | is_low_kl).to(dtype=mask.dtype)
completion_mask if "tool_mask" not in inputs else completion_mask * inputs["tool_mask"] ) per_token_logps, entropies = self._get_per_token_logps_and_entropies( model, input_ids, attention_mask, logits_to_keep, compute_entropy=True, pixel_values=inputs.get("pixel_values"), image_grid_thw=inputs.get("image_grid_thw"), num_images=inputs.get("num_images"), pixel_attention_mask=inputs.get("pixel_attention_mask"), image_sizes=inputs.get("image_sizes"), token_type_ids=inputs.get("token_type_ids"), mm_token_type_ids=inputs.get("mm_token_type_ids"), ) if self.top_entropy_quantile < 1.0: entropy_mask = self.get_high_entropy_mask( entropies, mask, 1 - self.top_entropy_quantile ) else: entropy_mask = None advantages = inputs["advantages"] if advantages.dim() == 1: advantages = advantages.unsqueeze(1) old_per_token_logps = inputs.get("old_per_token_logps") old_per_token_logps = ( per_token_logps.detach() if old_per_token_logps is None else old_per_token_logps ) # --- OPSM (off-policy sequence mask) --- off_policy_mask = None if getattr(self, "off_policy_mask_threshold", None) is not None: sampling_per_token_logps = inputs.get( "sampling_per_token_logps", old_per_token_logps ) off_policy_mask = self.get_off_policy_mask( advantages=advantages, per_token_logps=per_token_logps, sampling_per_token_logps=sampling_per_token_logps, mask=mask, off_policy_threshold=self.off_policy_mask_threshold, ) # --- Importance weights --- log_ratio = per_token_logps - old_per_token_logps is_level = getattr( self, "importance_sampling_level", getattr(self.args, "importance_sampling_level", "token"), ) if is_level == "token": log_importance_weights = log_ratio elif is_level == "sequence": log_importance_weights = (log_ratio * mask).sum(-1) / mask.sum(-1).clamp( min=1.0 ) log_importance_weights = log_importance_weights.unsqueeze(-1) else: raise ValueError(f"Unknown importance sampling level: {is_level}") coef_1 = torch.exp(log_importance_weights) # --- KL divergence --- if self.beta != 0.0: 
ref_per_token_logps = inputs["ref_per_token_logps"] per_token_kl = ( torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1 ) if getattr(self.args, "use_bias_correction_kl", False): per_token_kl = per_token_kl * coef_1 # --- Per-token loss --- if self.loss_type == "cispo": clamped = torch.clamp(coef_1, max=self.epsilon_high).detach() per_token_loss = -clamped * advantages * per_token_logps elif self.loss_type in ("grpo", "bnpo", "dr_grpo", "dapo", "luspo"): coef_2 = torch.clamp(coef_1, 1 - self.epsilon_low, 1 + self.epsilon_high) if self.args.delta is not None: coef_1_c = torch.clamp(coef_1, max=self.args.delta) else: coef_1_c = coef_1 per_token_loss = -torch.min(coef_1_c * advantages, coef_2 * advantages) elif self.loss_type == "sapo": temps = torch.where( advantages > 0, self.args.sapo_temperature_pos, self.args.sapo_temperature_neg, ) soft = torch.sigmoid(temps * (coef_1 - 1)) * 4 / temps per_token_loss = -soft * advantages else: raise ValueError(f"Unknown loss type: {self.loss_type}") # --- Apply masks --- if off_policy_mask is not None: per_token_loss = per_token_loss * off_policy_mask if entropy_mask is not None: per_token_loss = per_token_loss * entropy_mask # --- IS ratio correction (vLLM distribution mismatch) --- if ( self.use_vllm and getattr(self, "vllm_importance_sampling_correction", False) and "importance_sampling_ratio" in inputs ): per_token_loss = per_token_loss * inputs["importance_sampling_ratio"] if self.beta != 0.0: per_token_loss = per_token_loss + self.beta * per_token_kl # --- Aggregate loss --- mode = "train" if self.model.training else "eval" normalizer = ( self.current_gradient_accumulation_steps if mode == "train" else 1.0 ) if self.loss_type in ("grpo", "sapo"): loss = ( (per_token_loss * mask).sum(-1) / mask.sum(-1).clamp(min=1.0) ).mean() / normalizer elif self.loss_type == "bnpo": loss = ( (per_token_loss * mask).sum() / mask.sum().clamp(min=1.0) / normalizer ) elif self.loss_type == 
"dr_grpo": loss = ( (per_token_loss * mask).sum() / (per_token_loss.size(0) * self.max_completion_length) / normalizer ) elif self.loss_type in ("cispo", "dapo"): norm = inputs["num_items_in_batch"] / self.accelerator.num_processes loss = (per_token_loss * mask).sum() / norm elif self.loss_type == "luspo": loss = (per_token_loss * mask.sum(1, keepdim=True)).mean() / normalizer else: raise ValueError(f"Unknown loss type: {self.loss_type}") # --- Metrics --- completion_token_count = mask.sum().clamp(min=1.0) def masked_batch_mean(x): return ( x.mean() if x.shape[1] == 1 else (x * mask).sum() / completion_token_count ) if self.beta != 0.0: mean_kl = masked_batch_mean(per_token_kl) self._metrics[mode]["kl"].append( self.accelerator.gather(mean_kl).nanmean().item() ) mean_entropy = masked_batch_mean(entropies) self._metrics[mode]["entropy"].append( self.accelerator.gather(mean_entropy).nanmean().item() ) if self.loss_type in ("grpo", "bnpo", "dr_grpo", "dapo", "luspo"): is_low = (coef_1 < 1 - self.epsilon_low) & (advantages < 0) is_high = (coef_1 > 1 + self.epsilon_high) & (advantages > 0) is_region = is_low | is_high low_clip = masked_batch_mean(is_low.float()) high_clip = masked_batch_mean(is_high.float()) clip_ratio = masked_batch_mean(is_region.float()) g_low = self.accelerator.gather(low_clip) self._metrics[mode]["clip_ratio/low_mean"].append(g_low.nanmean().item()) self._metrics[mode]["clip_ratio/low_min"].append(nanmin(g_low).item()) g_high = self.accelerator.gather(high_clip) self._metrics[mode]["clip_ratio/high_mean"].append(g_high.nanmean().item()) self._metrics[mode]["clip_ratio/high_max"].append(nanmax(g_high).item()) g_clip = self.accelerator.gather(clip_ratio) self._metrics[mode]["clip_ratio/region_mean"].append( g_clip.nanmean().item() ) elif self.loss_type == "cispo": is_cispo = (coef_1 > self.epsilon_high) & (advantages > 0) cr = masked_batch_mean(is_cispo.float()) self._metrics[mode]["cispo_clip_ratio"].append( 
@dataclass
class FastAsyncGRPOConfig(AsyncGRPOConfig):
    """AsyncGRPOConfig extended with the experimental fast-async options.

    Each field below only adds a default and a ``help`` string on top of
    ``AsyncGRPOConfig``. The options are read by ``FastAsyncGRPOTrainer`` and
    ``RerollDataProducer`` in this module, mostly through
    ``getattr(args, name, default)`` so that a config lacking the field still
    works.
    """

    # Size of the reward subprocess pool. `_get_reward_workers` raises values
    # below 1 to 1, and `_launch_reward_workers` only dispatches to the pool
    # when this is > 1 and every reward function is a plain callable (not an
    # nn.Module, not a coroutine function); otherwise rewards are computed
    # synchronously.
    reward_num_workers: int = field(
        default=1,
        metadata={
            "help": "Number of persistent subprocess workers for parallel reward computation. Each worker has its "
            "own main thread so signal.alarm() (used by math_verify) works correctly. Work is sharded across "
            "workers by prompt groups. Only used with use_data_producer=True and non-nn.Module reward functions."
        },
    )
    # `FastAsyncGRPOTrainer.__init__` creates `ReplayBuffer(max_size=...)` only
    # when this is > 0; otherwise the trainer's replay buffer is None.
    replay_buffer_size: int = field(
        default=0,
        metadata={
            "help": "[Experimental, disabled by default] Size of the replay buffer for storing high-signal rollout "
            "groups. When > 0, groups with reward variance are cached and used to replace zero-signal groups "
            "(where all rewards are identical). Set to 0 to disable. Only used with use_data_producer=True."
        },
    )
    # Copied to `trainer._replay_recompute_logps` in `__init__` and checked in
    # `_post_advantage_hook` after zero-signal groups have been replaced.
    replay_recompute_logps: bool = field(
        default=True,
        metadata={
            "help": "When True (default), recompute old_per_token_logps for replayed groups using the current "
            "training model. This fixes the importance sampling mismatch that occurs when replaying stale data. "
            "Only relevant when replay_buffer_size > 0."
        },
    )
    # `RerollDataProducer._pre_produce_hook` starts injecting re-roll prompts at
    # step `max(1, int(max_steps * reroll_start_fraction))`. When `max_steps`
    # is not positive the start step is infinite, i.e. re-rolling never begins.
    reroll_start_fraction: float = field(
        default=0.5,
        metadata={
            "help": "Fraction of total training steps after which deferred re-rolling begins. Zero-signal prompts "
            "(where all rewards in a group are identical) are buffered and re-injected into later batches when the "
            "model is more likely to solve them. Set to 1.0 to disable. Only used with use_data_producer=True."
        },
    )
    # Upper bound on how many buffered prompts `_pre_produce_hook` pops from
    # the re-roll buffer per call.
    reroll_max_groups: int = field(
        default=1,
        metadata={
            "help": "Maximum number of prompt groups to replace with re-roll candidates per batch. Higher values "
            "increase data utilization but reduce prompt diversity. Only used with use_data_producer=True."
        },
    )
    # NOTE(review): the code that reads this flag is not in the visible part of
    # the file; the help text is the only description available here.
    skip_zero_advantage_batches: bool = field(
        default=True,
        metadata={
            "help": "When True, skip gradient computation for micro-batches where all advantages are zero (no learning "
            "signal). This avoids the forward/backward pass entirely when no learning signal is present. The step is "
            "logged with skipped_zero_adv_batches=1 for monitoring."
        },
    )
    # NOTE(review): the code that reads this flag is not in the visible part of
    # the file; the help text is the only description available here.
    vllm_lora_sync: bool = field(
        default=False,
        metadata={
            "help": "When True, sync LoRA adapter weights to vLLM via filesystem instead of merging into base model "
            "and NCCL-broadcasting all parameters. vLLM loads the adapter natively using Punica kernels. "
            "Requires vllm_serve_lora serve module (auto-selected when this is True). "
            "Syncs only LoRA adapter weights (much smaller) vs full merged model. Legacy merge behavior is used when False."
        },
    )
{len(reroll_prompts)}/{n_groups} prompt groups " f"with deferred re-roll candidates ({len(reroll_buf)} remaining)" ) return inputs # --------------------------------------------------------------------------- # Persistent reward subprocess pool # --------------------------------------------------------------------------- def _persistent_reward_worker(conn): """Long-lived reward worker. Receives work items, returns results.""" while True: try: msg = conn.recv() except EOFError: break if msg is None: # Shutdown signal break ( reward_funcs, prompts, completions, completion_ids_list, inputs, reward_func_names, ) = msg try: keys = [ key for key in inputs[0] if key not in ["prompt", "completion", "completion_ids"] ] reward_kwargs = {key: [example[key] for example in inputs] for key in keys} results = [] for reward_func, _reward_func_name in zip( reward_funcs, reward_func_names, strict=True ): output = reward_func( prompts=prompts, completions=completions, completion_ids=completion_ids_list, **reward_kwargs, ) results.append( [float(r) if r is not None else float("nan") for r in output] ) conn.send(results) except Exception: conn.send(None) # --------------------------------------------------------------------------- # Extended trainer # --------------------------------------------------------------------------- class FastAsyncGRPOTrainer(AsyncGRPOTrainer): """GRPOTrainer with experimental extensions. Adds: - Parallel reward subprocess workers (``reward_num_workers``) - Replay buffer for high-signal group reuse (``replay_buffer_size``) - Deferred re-roll of failed prompts (``reroll_start_fraction``) - Zero-advantage micro-batch skipping """ def __init__(self, *args, **kwargs): # These must be initialized before super().__init__() because # _create_data_producer (called during super().__init__) needs them. 
self._reroll_buffer: list = [] self._reroll_lock = threading.Lock() # Temporarily suppress the base class's Liger + OPSM validation check, # since this subclass supports it via a custom compute_liger_loss override. grpo_args = kwargs.get("args") if grpo_args is None: for a in args: if hasattr(a, "off_policy_mask_threshold"): grpo_args = a break saved_threshold = None if grpo_args is not None and getattr(grpo_args, "use_liger_kernel", False): saved_threshold = grpo_args.off_policy_mask_threshold grpo_args.off_policy_mask_threshold = None super().__init__(*args, **kwargs) if saved_threshold is not None: grpo_args.off_policy_mask_threshold = saved_threshold self.off_policy_mask_threshold = saved_threshold # Replay buffer if getattr(self.args, "replay_buffer_size", 0) > 0: self._replay_buffer = ReplayBuffer(max_size=self.args.replay_buffer_size) else: self._replay_buffer = None self._replay_recompute_logps = getattr( self.args, "replay_recompute_logps", True ) # Reward worker pool (lazy-initialized) self._reward_workers = None # -- Factory override: use RerollDataProducer ---------------------------- def _create_data_producer(self, args, train_dataset): """Override to use RerollDataProducer for re-roll prompt injection.""" from axolotl.core.trainers.grpo.async_trainer import ( AsyncDataProducer, ProducerConfig, ) producer_config = ProducerConfig( mini_epochs=args.num_iterations, max_rollouts=None, eval_during_produce=False, empty_cache_before_produce=True, empty_cache_after_produce=True, async_prefetch=args.async_prefetch, prefetch_depth=args.prefetch_depth, ) data_producer = RerollDataProducer( config=producer_config, prompt_dataset=train_dataset, num_generations=self.num_generations, generation_batch_size=args.generation_batch_size, train_batch_size=args.per_device_train_batch_size, steps_per_generation=args.steps_per_generation, shuffle_dataset=self.shuffle_dataset, seed=args.seed, ) data_producer.set_trainer(self) if args.async_prefetch: data_producer = 
def _shutdown_reward_workers(self):
    """Shut down all persistent reward workers.

    Best-effort and never raises. Each worker is sent the ``None`` shutdown
    message and given 5 seconds to exit. A worker that is still alive after
    that, or that could not be signalled because its pipe is broken, is
    terminated so it does not linger next to a replacement pool. Afterwards
    ``self._reward_workers`` is reset to ``None``.
    """
    if self._reward_workers is None:
        return

    for conn, proc in self._reward_workers:
        signalled = True
        try:
            conn.send(None)
        except Exception:
            # Pipe already closed or broken: the worker never sees the
            # shutdown message, so waiting for a clean exit is pointless.
            signalled = False

        try:
            if signalled:
                proc.join(timeout=5)
            if proc.is_alive():
                # Stuck in a reward function or unreachable: force it down.
                proc.terminate()
                proc.join(timeout=5)
        except Exception:
            pass

        try:
            conn.close()
        except Exception:
            pass

    self._reward_workers = None
def _collect_reward_workers(
    self, inputs, prompts, completions, completion_ids_list
):
    """Collect reward results from subprocess workers (blocks until done).

    Consumes the state left behind by ``_launch_reward_workers``
    (``self._reward_workers_used`` and ``self._pending_reward_args``) and
    clears both attributes.

    Three outcomes are possible:

    * No workers were used (``_reward_workers_used`` is ``None``): rewards
      are computed on the main thread via ``self._calculate_rewards``.
    * Every worker returned a result: results are written, in worker order,
      into a ``(num_prompts, len(self.reward_funcs))`` tensor which is then
      passed through ``accelerate.utils.gather``.
    * A worker returned ``None`` or its pipe failed: the connections that
      have not been read yet are drained so no stale result is left in a
      pipe, then rewards are recomputed on the main thread.

    Args:
        inputs, prompts, completions, completion_ids_list: only used when
            no pending arguments were stored by the launch step; otherwise
            the stored arguments take precedence.

    Returns:
        Whatever ``self._calculate_rewards`` returns on the fallback paths,
        otherwise the gathered rewards-per-function tensor.
    """
    workers_used = getattr(self, "_reward_workers_used", None)
    args = getattr(self, "_pending_reward_args", None)
    self._reward_workers_used = None
    self._pending_reward_args = None

    def _compute_on_main_thread():
        # Prefer the arguments captured at launch time.
        if args is not None:
            return self._calculate_rewards(*args)
        return self._calculate_rewards(
            inputs, prompts, completions, completion_ids_list
        )

    if workers_used is None:
        # Sync fallback -- nothing was dispatched to workers.
        return _compute_on_main_thread()

    all_worker_results = []
    any_failed = False
    for idx, conn in enumerate(workers_used):
        try:
            result = conn.recv()
        except (EOFError, OSError):
            # The worker died or the pipe broke: treat like a reported failure.
            result = None
        if result is None:
            any_failed = True
            # Drain only the connections that have NOT been read yet.
            # Re-reading an already consumed pipe would block forever.
            for remaining_conn in workers_used[idx + 1 :]:
                try:
                    remaining_conn.recv()
                except Exception:
                    pass
            break
        all_worker_results.append(result)

    if any_failed:
        return _compute_on_main_thread()

    # Imported here so the fallback paths do not depend on accelerate.
    from accelerate.utils import gather

    device = self.accelerator.device
    num_prompts = len(args[1]) if args else len(prompts)
    rewards_per_func = torch.zeros(
        num_prompts, len(self.reward_funcs), device=device
    )
    offset = 0
    for worker_result in all_worker_results:
        # Each worker result holds one sequence of rewards per reward
        # function; all of them cover the same chunk of prompts.
        chunk_size = len(worker_result[0])
        for i, result in enumerate(worker_result):
            rewards_per_func[offset : offset + chunk_size, i] = torch.tensor(
                result, dtype=torch.float32, device=device
            )
        offset += chunk_size
    return gather(rewards_per_func)
self._replay_buffer.add(replay_scores[gi].item(), group_data) # Replace zero-signal groups with high-signal replay buffer entries # Only in non-streaming path (s_start is None) — streaming scores # groups incrementally, so replacement + logprob recompute would be # too expensive per chunk. n_replaced = 0 if s_start is None: no_signal = ~has_signal replaced_ranges = [] if no_signal.any() and len(self._replay_buffer) > 0: for group_idx in no_signal.nonzero(as_tuple=True)[0]: sampled = self._replay_buffer.sample(1) if sampled is None: break sampled_group = sampled[0] gi = group_idx.item() start = offset + gi * num_generations end = start + num_generations for key, val in sampled_group.items(): if key in data and isinstance(data[key], torch.Tensor): src = val.to(data[key].device) tgt_seq_len = ( data[key].size(1) if data[key].dim() > 1 else None ) if start >= data[key].size(0) or end > data[key].size( 0 ): continue if tgt_seq_len is not None: if src.size(1) <= tgt_seq_len: data[key][start:end] = 0 data[key][start:end, : src.size(1)] = src else: data[key][start:end] = src[:, :tgt_seq_len] else: data[key][start:end] = src replaced_ranges.append((start, end)) n_replaced += 1 # Recompute old_per_token_logps for replayed groups if ( n_replaced > 0 and self._replay_recompute_logps and "old_per_token_logps" in data ): with ( torch.no_grad(), disable_gradient_checkpointing( self.model, self.args.gradient_checkpointing_kwargs ), ): for r_start, r_end in replaced_ranges: r_ids = torch.cat( [ data["prompt_ids"][r_start:r_end], data["completion_ids"][r_start:r_end], ], dim=1, ) r_mask = torch.cat( [ data["prompt_mask"][r_start:r_end], data["completion_mask"][r_start:r_end], ], dim=1, ) r_logits_to_keep = data["completion_ids"].size(1) r_fwd_kwargs = {} for fk in ( "pixel_values", "image_grid_thw", "pixel_attention_mask", "image_sizes", "token_type_ids", "mm_token_type_ids", ): if fk in data: r_fwd_kwargs[fk] = data[fk] r_logps, _ = self._get_per_token_logps_and_entropies( 
def compute_liger_loss(self, unwrapped_model, inputs):
    """Liger loss with zero-adv skipping and off-policy sequence masking (OPSM).

    The base class Liger path doesn't support OPSM because the fused kernel
    doesn't expose per-token logprobs needed for the KL computation. This
    override computes them via chunked lm_head matmul (no grad, low memory)
    and applies the OPSM to the loss mask before calling the kernel.

    Args:
        unwrapped_model: Model whose ``lm_head.weight`` / ``lm_head.bias`` are
            read directly for the log-prob computation and the fused kernel.
        inputs: Mapping that must contain ``advantages``, ``prompt_ids``,
            ``prompt_mask``, ``completion_ids`` and ``completion_mask``.
            Optional keys read here: ``tool_mask``, ``pixel_values``,
            ``image_grid_thw``, ``pixel_attention_mask``, ``image_sizes``,
            ``sampling_per_token_logps``, ``old_per_token_logps``,
            ``ref_per_token_logps`` and ``importance_sampling_ratio``.

    Returns:
        The loss tensor. On the OPSM path it is divided by
        ``self.current_gradient_accumulation_steps`` in train mode (by 1.0 in
        eval mode). When the batch is skipped, a zero scalar with
        ``requires_grad=True`` is returned instead.
    """
    # Skip path: every advantage is exactly zero, so only record the metric
    # and hand back a zero loss.
    if self.args.skip_zero_advantage_batches and torch.all(
        inputs["advantages"] == 0
    ):
        mode = "train" if self.model.training else "eval"
        self._metrics[mode]["skipped_zero_adv_batches"].append(1.0)
        return torch.tensor(
            0.0, device=inputs["advantages"].device, requires_grad=True
        )

    # Without an OPSM threshold the parent implementation is used unchanged.
    if self.off_policy_mask_threshold is None:
        return super().compute_liger_loss(unwrapped_model, inputs)

    # OPSM path: need per_token_logps for KL, which Liger kernel doesn't provide
    prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"]
    completion_ids, completion_mask = (
        inputs["completion_ids"],
        inputs["completion_mask"],
    )
    # Prompt and completion are concatenated along dim 1 (the token axis).
    input_ids = torch.cat([prompt_ids, completion_ids], dim=1)
    attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
    logits_to_keep = completion_ids.size(1)

    last_hidden_state = self._get_last_hidden_state(
        unwrapped_model,
        input_ids,
        attention_mask,
        logits_to_keep,
        inputs.get("pixel_values"),
        inputs.get("image_grid_thw"),
        inputs.get("pixel_attention_mask"),
        inputs.get("image_sizes"),
    )

    # The loss mask starts as the completion mask, optionally multiplied by
    # the tool mask when one is supplied.
    loss_mask = (
        completion_mask
        if "tool_mask" not in inputs
        else completion_mask * inputs["tool_mask"]
    )

    # Compute per_token_logps via chunked lm_head matmul (no grad, low memory)
    lm_weight = unwrapped_model.lm_head.weight
    lm_bias = unwrapped_model.lm_head.bias
    with torch.no_grad():
        per_token_logps_chunks = []
        # One row of the batch at a time, so only a single row of logits is
        # alive at once.
        for i in range(last_hidden_state.size(0)):
            chunk_logits = torch.matmul(last_hidden_state[i : i + 1], lm_weight.t())
            if lm_bias is not None:
                chunk_logits = chunk_logits + lm_bias
            # Log-softmax over the last axis, then pick the log-prob of each
            # token in completion_ids.
            chunk_lps = (
                chunk_logits.float()
                .log_softmax(-1)
                .gather(-1, completion_ids[i : i + 1].unsqueeze(-1))
                .squeeze(-1)
            )
            per_token_logps_chunks.append(chunk_lps)
            del chunk_logits
        per_token_logps = torch.cat(per_token_logps_chunks, dim=0)

    # get_off_policy_mask is given 2-D advantages; a 1-D tensor gets a
    # trailing axis.
    advantages = inputs["advantages"]
    if advantages.dim() == 1:
        advantages_2d = advantages.unsqueeze(1)
    else:
        advantages_2d = advantages

    # Reference log-probs for the off-policy test, in order of preference:
    # sampling logps, then old logps, then the log-probs just computed.
    sampling_per_token_logps = inputs.get("sampling_per_token_logps")
    if sampling_per_token_logps is None:
        sampling_per_token_logps = inputs.get("old_per_token_logps")
    if sampling_per_token_logps is None:
        sampling_per_token_logps = per_token_logps

    off_policy_mask = GRPOTrainer.get_off_policy_mask(
        advantages=advantages_2d,
        per_token_logps=per_token_logps,
        sampling_per_token_logps=sampling_per_token_logps,
        mask=loss_mask,
        off_policy_threshold=self.off_policy_mask_threshold,
    )
    loss_mask = loss_mask * off_policy_mask

    # Call the Liger fused kernel with OPSM-modified mask
    loss, metrics = self.liger_grpo_loss(
        _input=last_hidden_state,
        lin_weight=unwrapped_model.lm_head.weight,
        selected_token_ids=completion_ids,
        attention_mask=loss_mask,
        advantages=inputs["advantages"],
        bias=unwrapped_model.lm_head.bias,
        old_per_token_logps=inputs.get("old_per_token_logps"),
        ref_per_token_logps=inputs.get("ref_per_token_logps"),
        vllm_is_ratio=inputs.get("importance_sampling_ratio"),
    )

    # metrics[0] is read as the mean KL only when beta is non-zero; the last
    # entry is always read as the clip ratio.
    mean_kl = metrics[0] if self.beta != 0.0 else None
    clip_ratio = metrics[-1]

    mode = "train" if self.model.training else "eval"
    if self.beta != 0.0:
        self._metrics[mode]["kl"].append(
            self.accelerator.gather(mean_kl).mean().item()
        )
    self._metrics[mode]["clip_ratio"].append(
        self.accelerator.gather(clip_ratio).mean().item()
    )

    # Only training losses are scaled by the gradient-accumulation steps.
    normalizer = (
        self.current_gradient_accumulation_steps if mode == "train" else 1.0
    )
    return loss / normalizer
prompt_ids = inputs["prompt_ids"][:1, :1] # (1, 1) attn = torch.ones_like(prompt_ids) with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16): out = model(input_ids=prompt_ids, attention_mask=attn) return out.logits.sum() * 0 return super()._compute_loss(model, inputs) ================================================ FILE: src/axolotl/core/trainers/grpo/replay_buffer.py ================================================ """Simple replay buffer for storing and sampling high-signal rollout groups.""" import heapq import torch class ReplayBuffer: """Min-heap replay buffer that keeps the highest-scoring rollout groups. Groups are scored by signal quality (advantage magnitude * reward variance). When sampling, groups are drawn proportional to their scores. """ def __init__(self, max_size: int): self.max_size = max_size self._heap: list[tuple[float, int, dict]] = [] # min-heap of (score, id, data) self._counter = 0 # unique tiebreaker for heap def __len__(self): return len(self._heap) def add(self, score: float, data: dict): """Add a group to the buffer. If full, replaces lowest-scoring entry.""" if self.max_size <= 0: return self._counter += 1 if len(self._heap) < self.max_size: heapq.heappush(self._heap, (score, self._counter, data)) elif score > self._heap[0][0]: heapq.heapreplace(self._heap, (score, self._counter, data)) def sample(self, num_samples: int) -> list[dict] | None: """Sample groups weighted by their scores. 
class SequenceParallelRepeatRandomSampler(Sampler):
    """Sampler for GRPO training with sequence parallelism.

    This sampler ensures:
    - Ranks in the same sequence parallel (SP) group receive identical data.
    - Each index is repeated multiple times for sampling different completions.
    - Entire batches are repeated for reuse in multiple updates.
    - Data is properly distributed across SP groups.

    In the table below, the values represent dataset indices. Each SP group has
    `context_parallel_size = 2` GPUs working together on the same data. There are 2
    SP groups (SP0 and SP1), with `world_size = 4` total GPUs.

                                        Sequence Parallel Groups
                                    |       SP0       |       SP1       |
                                    |  GPU 0  | GPU 1 |  GPU 2  | GPU 3 |

                global_step  step    <---> mini_repeat_count=3
                                     <----------> batch_size=2 per SP group
    grad_accum=2  ▲  ▲  0     0     [0 0 0 1 1 1]     [2 2 2 3 3 3]   <- SP groups get different data
                  ▼  |  0     1     [0 0 0 1 1 1]     [2 2 2 3 3 3]   <- Same data for each SP group GPU
                     |
                     |  1     2     [0 0 0 1 1 1]     [2 2 2 3 3 3]   <- Repeat same indices for iterations
    num_iterations=2 ▼  1     3     [0 0 0 1 1 1]     [2 2 2 3 3 3]   <- When using gradient accumulation

                        2     4     [4 4 4 5 5 5]     [6 6 6 7 7 7]   <- New batch of data indices
                        2     5     [4 4 4 5 5 5]     [6 6 6 7 7 7]
                                        ...

    Every SP group always yields exactly ``len(sampler)`` indices. With
    ``drop_last=False`` the index list is padded by wrapping around to its
    start until it fills a whole number of rounds (one batch per SP group);
    with ``drop_last=True`` the trailing incomplete round is discarded.

    Args:
        dataset: Dataset to sample from.
        mini_repeat_count: How many times to repeat each sample immediately.
        world_size: Total number of processes.
        rank: Rank of current process.
        batch_size: Number of samples per batch.
        repeat_count: How many times to repeat the full sampling process.
        context_parallel_size: Number of ranks in a sequence parallel group.
        shuffle: Whether to shuffle the dataset.
        seed: Random seed for shuffling.
        drop_last: Whether to drop the last incomplete batch.
    """

    def __init__(
        self,
        dataset: Sized,
        mini_repeat_count: int,
        world_size: int,
        rank: int,
        batch_size: int = 1,
        repeat_count: int = 1,
        context_parallel_size: int = 1,
        shuffle: bool = True,
        seed: int = 0,
        drop_last: bool = False,
    ):
        self.dataset = dataset
        self.mini_repeat_count = mini_repeat_count
        self.batch_size = batch_size
        self.repeat_count = repeat_count
        self.shuffle = shuffle
        self.seed = seed
        self.drop_last = drop_last
        self.epoch = 0

        self.world_size = world_size
        self.rank = rank

        # Sequence parallelism parameters
        self.context_parallel_size = context_parallel_size
        self.num_sp_groups = world_size // context_parallel_size
        self.sp_group_id = rank // context_parallel_size

        # Adjust dataset size for distributed sampling
        self.num_samples = len(self.dataset)
        self.total_size = self.num_samples

        # One "round" hands one batch to every SP group.
        round_size = self.batch_size * self.num_sp_groups

        # Calculate effective number of samples per SP group
        if self.drop_last and self.total_size % round_size != 0:
            # Drop last incomplete batch if drop_last is True
            self.num_samples_per_sp_group = (
                self.total_size // self.batch_size // self.num_sp_groups
            ) * self.batch_size
        else:
            # Round up to include last batch if drop_last is False
            self.num_samples_per_sp_group = (
                (self.total_size + round_size - 1) // round_size * self.batch_size
            )

        if shuffle:
            # Seeded once here; every call to __iter__ advances its state.
            self.generator = torch.Generator()
            self.generator.manual_seed(seed)

    def __iter__(self) -> Iterator[int]:
        """Creates iterator over dataset indices.

        Returns:
            Iterator that yields indices into the dataset.
        """
        # Shuffle with the generator seeded in __init__. Ranks built with the
        # same seed that iterate the same number of times see the same
        # permutation. The stored epoch is not consulted here.
        if self.shuffle:
            indices = torch.randperm(
                self.num_samples, generator=self.generator
            ).tolist()
        else:
            indices = list(range(self.num_samples))

        round_size = self.batch_size * self.num_sp_groups
        num_rounds = self.num_samples_per_sp_group // self.batch_size
        needed = num_rounds * round_size

        # Pad by wrapping around so every SP group gets a full batch in
        # every round (only ever needed when drop_last is False).
        if indices and len(indices) < needed:
            repeats = -(-needed // len(indices))  # ceiling division
            indices = (indices * repeats)[:needed]

        # Subsample based on SP group ID
        # Each SP group gets distinct batches of data
        batch_indices: list[int] = []
        for round_idx in range(num_rounds):
            start_idx = round_idx * round_size + self.sp_group_id * self.batch_size
            batch_indices.extend(indices[start_idx : start_idx + self.batch_size])

        # Apply the GRPO repeat pattern: the whole per-group sequence is
        # replayed `repeat_count` times, each index `mini_repeat_count`
        # times in a row.
        final_indices = [
            idx
            for _ in range(self.repeat_count)
            for idx in batch_indices
            for _ in range(self.mini_repeat_count)
        ]

        return iter(final_indices)

    def __len__(self) -> int:
        """Returns the total length of the iterable including repetitions.

        Returns:
            Total number of samples.
        """
        # Total length including all repetitions
        return (
            self.num_samples_per_sp_group * self.mini_repeat_count * self.repeat_count
        )

    def set_epoch(self, epoch: int) -> None:
        """Sets the epoch for this sampler.

        The value is only stored on the instance; the shuffle order is driven
        by the generator seeded in ``__init__``.

        Args:
            epoch: Epoch number to record.
        """
        self.epoch = epoch
def __init__(
    self,
    model: str | PreTrainedModel,
    reward_funcs: RewardFunc | list[RewardFunc],
    args: GRPOConfig | None = None,
    train_dataset: Dataset | IterableDataset | None = None,
    eval_dataset: (
        Dataset | IterableDataset | dict[str, Dataset | IterableDataset] | None
    ) = None,
    processing_class: PreTrainedTokenizerBase | None = None,
    reward_processing_classes: (
        PreTrainedTokenizerBase | list[PreTrainedTokenizerBase] | None
    ) = None,
    callbacks: list[TrainerCallback] | None = None,
    optimizers: tuple[
        torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None
    ] = (None, None),
    peft_config: "PeftConfig | None" = None,
    optimizer_cls_and_kwargs: tuple[type, dict] | None = None,
):
    """Build the trainer and validate ``num_generations`` per SP group.

    All arguments are forwarded unchanged to the parent constructor. After
    that, the batch size per sequence-parallel (SP) group is computed as
    ``per_device_*_batch_size * (num_processes // context_parallel_size)``
    and ``self.num_generations`` must be a divisor (>= 2) of it. The eval
    batch size is only checked when ``eval_strategy`` is not ``"no"``.

    Finally the SP bookkeeping attributes are initialised: ``sp_group`` is
    left as ``None`` and ``local_rank`` / ``local_world_size`` get
    placeholder values (0 and 1).

    Raises:
        ValueError: If ``num_generations`` is not a valid divisor of the
            train (or eval) batch size per SP group.
    """
    # First call the superclass constructor with all arguments
    super().__init__(
        model=model,
        reward_funcs=reward_funcs,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        processing_class=processing_class,
        reward_processing_classes=reward_processing_classes,
        callbacks=callbacks,
        optimizers=optimizers,
        peft_config=peft_config,
        optimizer_cls_and_kwargs=optimizer_cls_and_kwargs,
    )

    # Get number of SP groups (number of processes divided by SP degree)
    num_processes = self.accelerator.num_processes
    num_sp_groups = num_processes // self.args.context_parallel_size

    # Calculate batch size per SP group (not per process)
    sp_group_batch_size = self.args.per_device_train_batch_size * num_sp_groups
    possible_values = [
        n_gen
        for n_gen in range(2, sp_group_batch_size + 1)
        if (sp_group_batch_size) % n_gen == 0
    ]

    if self.num_generations not in possible_values:
        raise ValueError(
            f"The batch size per SP group ({num_sp_groups} x "
            f"{self.args.per_device_train_batch_size}) must be evenly divisible by "
            f"the number of generations per prompt ({self.num_generations}). Given "
            "the current configuration, the valid values for the number of "
            f"generations are: {possible_values}."
        )

    if self.args.eval_strategy != "no":
        # If sequence parallelism is enabled, calculate batch size per SP group.
        # Read from ``self.args`` like the rest of this method: the ``args``
        # parameter is optional and may be None.
        sp_group_eval_batch_size = (
            self.args.per_device_eval_batch_size * num_sp_groups
        )
        possible_values = [
            n_gen
            for n_gen in range(2, sp_group_eval_batch_size + 1)
            if (sp_group_eval_batch_size) % n_gen == 0
        ]

        if self.num_generations not in possible_values:
            raise ValueError(
                f"With sequence parallelism (degree {self.args.context_parallel_size}), "
                f"the eval batch size per SP group ({num_sp_groups} x {self.args.per_device_eval_batch_size}) "
                f"must be evenly divisible by the number of generations per prompt "
                f"({self.num_generations}). Given the current eval batch size, "
                f"the valid values for the number of generations are: {possible_values}."
            )

    # SP bookkeeping: global rank/world size are read from torch.distributed
    # now, the remaining values are placeholders.
    self.sp_group = None
    self.rank = dist.get_rank()
    self.world_size = dist.get_world_size()
    self.local_rank = 0
    self.local_world_size = 1
def _prepare_dataloader(
    self, dataset, sampler, is_eval=False, custom_batch_size=None
):
    """Build a ``DataLoader`` for ``dataset`` driven by ``sampler``.

    Starts from ``self._create_dataloader_params`` and, for datasets that are
    not ``IterableDataset`` instances, wires in the sampler: a
    ``BatchSampler`` replaces ``batch_size``, any other sampler is passed as
    ``sampler`` together with ``drop_last``. Training loaders additionally
    get a seeded ``worker_init_fn``.

    When sample packing applies to this split, ``even_batches`` is switched
    off on the accelerator. The loader is returned as-is when
    ``context_parallel_size > 1``; otherwise it is passed through
    ``self.accelerator.prepare_data_loader``.
    """
    loader_kwargs = self._create_dataloader_params(is_eval, custom_batch_size)

    has_sampler_support = not isinstance(dataset, torch.utils.data.IterableDataset)
    if has_sampler_support:
        if isinstance(sampler, BatchSampler):
            # batch_size and batch_sampler are mutually exclusive
            loader_kwargs["batch_sampler"] = sampler
            del loader_kwargs["batch_size"]
        else:
            loader_kwargs["sampler"] = sampler
            loader_kwargs["drop_last"] = self.args.dataloader_drop_last

        if not is_eval:
            loader_kwargs["worker_init_fn"] = partial(
                seed_worker,
                num_workers=self.args.dataloader_num_workers,
                rank=self.args.process_index,
            )

    loader = DataLoader(dataset, **loader_kwargs)

    # Sample packing: disable even batches for the split it applies to.
    if self.args.sample_packing:
        if is_eval:
            packing_applies = self.args.eval_sample_packing is not False
        else:
            packing_applies = not self.args.pretraining
        if packing_applies:
            self.accelerator.even_batches = False

    # Return unprepared dataloader if using sequence parallelism
    # TODO(djsaunde): We might be able to use `accelerate`'s dataloader preparation
    # if we use `dispatch_batches` and `slice_fn_for_dispatch` properly (i.e.,
    # slice each batch along the sequence dimension).
    if self.args.context_parallel_size > 1:
        return loader

    # Otherwise prepare with accelerator
    return self.accelerator.prepare_data_loader(loader)
self._last_loaded_step: # type: ignore[has-type] self._move_model_to_vllm() self._last_loaded_step = self.state.global_step # Generate completions using vLLM: gather all prompts and use them in a single call in the main process all_prompts_text = gather_object(prompts_text) if self.accelerator.is_main_process: if self.args.context_parallel_size > 1: # Calculate sequence parallel group information world_size = self.accelerator.num_processes context_parallel_size = self.args.context_parallel_size num_sp_groups = world_size // context_parallel_size # Since processes in the same SP group have the same prompts, we need to ensure # we only take one copy of each prompt from each SP group ordered_set_of_prompts = [] for sp_group_id in range(num_sp_groups): # Get the first process from each SP group (typically the group leader) group_leader_rank = sp_group_id * context_parallel_size # Extract prompts from this SP group, accounting for num_generations duplicates # We only need prompts from one rank in each SP group group_prompts = all_prompts_text[ group_leader_rank * len(prompts_text) : ( group_leader_rank + 1 ) * len(prompts_text) : self.num_generations ] ordered_set_of_prompts.extend(group_prompts) else: # Since 'prompts' contains 'num_generations' duplicates, we first take unique prompts, and generate # num_generations outputs for each one. This is faster than generating outputs for each duplicate # prompt individually. 
ordered_set_of_prompts = all_prompts_text[ :: self.num_generations * self.args.context_parallel_size ] with profiling_context(self, "vLLM.generate"): completion_ids = self.vllm_client.generate( prompts=ordered_set_of_prompts, n=self.num_generations, repetition_penalty=self.repetition_penalty, temperature=self.temperature, top_p=self.top_p, top_k=-1 if self.top_k is None else self.top_k, min_p=0.0 if self.min_p is None else self.min_p, max_tokens=self.max_completion_length, guided_decoding_regex=self.guided_decoding_regex, ) else: completion_ids = [None] * ( len(all_prompts_text) // self.args.context_parallel_size ) # Broadcast the completions from the main process to all processes completion_ids = broadcast_object_list(completion_ids, from_process=0) # Determine the appropriate slice based on sequence parallelism if self.args.context_parallel_size > 1: # Calculate SP group ID (which group of ranks this rank belongs to) sp_group_id = self.accelerator.process_index // self.local_world_size # Calculate the start index for this SP group sp_group_start = sp_group_id * len(prompts) * self.local_world_size # All ranks in the same SP group get the same data slice process_slice = slice( sp_group_start, sp_group_start + len(prompts), ) completion_ids = completion_ids[process_slice] else: # Original behavior for non-sequence parallel case process_slice = slice( self.accelerator.process_index * len(prompts), (self.accelerator.process_index + 1) * len(prompts), ) completion_ids = completion_ids[process_slice] # Pad the completions, and concatenate them with the prompts completion_ids = [ torch.tensor(ids, device=device) for ids in completion_ids ] completion_ids = pad( completion_ids, padding_value=self.processing_class.pad_token_id ) prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1) else: # Regular generation path with unwrap_model_for_generation( self.model_wrapped, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation, ) as 
unwrapped_model: prompt_completion_ids = unwrapped_model.generate( prompt_ids, attention_mask=prompt_mask, generation_config=self.generation_config, ) # Compute prompt length and extract completion ids prompt_length = prompt_ids.size(1) prompt_ids = prompt_completion_ids[:, :prompt_length] completion_ids = prompt_completion_ids[:, prompt_length:] # Mask everything after the first EOS token is_eos = completion_ids == self.processing_class.eos_token_id eos_idx = torch.full( (is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device ) eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)] sequence_indices = torch.arange(is_eos.size(1), device=device).expand( is_eos.size(0), -1 ) completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int() # If mask_truncated_completions is enabled, zero out truncated completions in completion_mask if self.args.mask_truncated_completions: truncated_completions = ~is_eos.any(dim=1) completion_mask = ( completion_mask * (~truncated_completions).unsqueeze(1).int() ) # Concatenate prompt_mask with completion_mask for logit computation attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) # (B, P+C) logits_to_keep = completion_ids.size( 1 ) # we only need to compute the logits for the completion tokens batch_size = ( self.args.per_device_train_batch_size if mode == "train" else self.args.per_device_eval_batch_size ) with torch.no_grad(): # When using num_iterations == 1, old_per_token_logps == per_token_logps, so we can skip it's # computation here, and use per_token_logps.detach() instead. 
if self.num_iterations > 1: old_per_token_logps = self._get_per_token_logps( self.model, prompt_completion_ids, attention_mask, logits_to_keep, batch_size, ) else: old_per_token_logps = None if self.beta == 0.0: ref_per_token_logps = None elif self.ref_model is not None: ref_per_token_logps = self._get_per_token_logps( self.ref_model, prompt_completion_ids, attention_mask, logits_to_keep, batch_size, ) else: with self.accelerator.unwrap_model(self.model).disable_adapter(): ref_per_token_logps = self._get_per_token_logps( self.model, prompt_completion_ids, attention_mask, logits_to_keep, batch_size, ) # Decode the generated completions completions_text = self.processing_class.batch_decode( completion_ids, skip_special_tokens=True ) if is_conversational(inputs[0]): completions = [] for prompt, completion in zip(prompts, completions_text, strict=False): bootstrap = ( prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else "" ) completions.append( [{"role": "assistant", "content": bootstrap + completion}] ) else: completions = completions_text rewards_per_func = torch.zeros( len(prompts), len(self.reward_funcs), device=device ) for i, (reward_func, reward_processing_class, reward_func_name) in enumerate( zip( self.reward_funcs, self.reward_processing_classes, self.reward_func_names, strict=False, ) ): with profiling_context(self, reward_func_name): if isinstance( reward_func, nn.Module ): # Module instead of PretrainedModel for compat with compiled models if is_conversational(inputs[0]): messages = [ {"messages": p + c} for p, c in zip(prompts, completions, strict=False) ] texts = [ apply_chat_template(x, reward_processing_class)["text"] for x in messages ] else: texts = [ p + c for p, c in zip(prompts, completions, strict=False) ] reward_inputs = reward_processing_class( text=texts, return_tensors="pt", padding=True, padding_side="right", add_special_tokens=False, ) reward_inputs = Trainer._prepare_inputs(self, reward_inputs) with torch.inference_mode(): 
rewards_per_func[:, i] = reward_func(**reward_inputs).logits[ :, 0 ] # Shape (B*G,) else: # Repeat all input columns (but "prompt" and "completion") to match the number of generations keys = [ key for key in inputs[0] if key not in ["prompt", "completion"] ] reward_kwargs = { key: [example[key] for example in inputs] for key in keys } output_reward_func = reward_func( prompts=prompts, completions=completions, **reward_kwargs ) # Convert None values to NaN output_reward_func = [ reward if reward is not None else torch.nan for reward in output_reward_func ] rewards_per_func[:, i] = torch.tensor( output_reward_func, dtype=torch.float32, device=device ) # If all reward functions return None for a given row, issue a detailed warning if torch.isnan(rewards_per_func).all(dim=1).any(): nan_row_idx = ( torch.isnan(rewards_per_func).all(dim=1).nonzero(as_tuple=True)[0][0] ) row_reward_kwargs = { key: value[nan_row_idx] for key, value in reward_kwargs.items() } row_reward_kwargs["prompt"] = prompts[nan_row_idx] row_reward_kwargs["completion"] = completions[nan_row_idx] warnings.warn( f"All reward functions returned None for the following kwargs: {row_reward_kwargs}. 
" "Please ensure that at least one reward function returns a valid reward.", stacklevel=2, ) # Gather the reward per function: this part is crucial, because the rewards are normalized per group and the # completions may be distributed across processes rewards_per_func = gather(rewards_per_func) # Apply weights to each reward function's output and sum rewards = ( rewards_per_func * self.reward_weights.to(device).unsqueeze(0) ).nansum(dim=1) # Compute grouped-wise rewards mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1) std_grouped_rewards = rewards.view(-1, self.num_generations).std(dim=1) # Normalize the rewards to compute the advantages mean_grouped_rewards = mean_grouped_rewards.repeat_interleave( self.num_generations, dim=0 ) std_grouped_rewards = std_grouped_rewards.repeat_interleave( self.num_generations, dim=0 ) advantages = rewards - mean_grouped_rewards if self.args.scale_rewards: advantages = advantages / (std_grouped_rewards + 1e-4) # Slice to keep only the local part of the data if self.args.context_parallel_size > 1: # Calculate SP group ID (which group of ranks this rank belongs to) sp_group_id = self.accelerator.process_index // self.local_world_size # Calculate the start index for this SP group sp_group_start = sp_group_id * len(prompts) * self.local_world_size # All ranks in the same SP group get the same data slice process_slice = slice( sp_group_start, sp_group_start + len(prompts), ) else: # Original behavior for non-sequence parallel case process_slice = slice( self.accelerator.process_index * len(prompts), (self.accelerator.process_index + 1) * len(prompts), ) advantages = advantages[process_slice] # Log the metrics if mode == "train": self._total_train_tokens += ( self.accelerator.gather_for_metrics(attention_mask.sum()).sum().item() ) self._metrics[mode]["num_tokens"] = [self._total_train_tokens] # log completion lengths, mean, min, max agg_completion_mask = self.accelerator.gather_for_metrics( completion_mask.sum(1) 
) self._metrics[mode]["completions/mean_length"].append( agg_completion_mask.float().mean().item() ) self._metrics[mode]["completions/min_length"].append( agg_completion_mask.float().min().item() ) self._metrics[mode]["completions/max_length"].append( agg_completion_mask.float().max().item() ) # identify sequences that terminated with EOS and log their lengths agg_terminated_with_eos = self.accelerator.gather_for_metrics(is_eos.any(dim=1)) term_completion_mask = agg_completion_mask[agg_terminated_with_eos] clipped_completions_ratio = 1 - len(term_completion_mask) / len( agg_completion_mask ) self._metrics[mode]["completions/clipped_ratio"].append( clipped_completions_ratio ) if len(term_completion_mask) == 0: # edge case where no completed sequences are found term_completion_mask = torch.zeros(1, device=device) self._metrics[mode]["completions/mean_terminated_length"].append( term_completion_mask.float().mean().item() ) self._metrics[mode]["completions/min_terminated_length"].append( term_completion_mask.float().min().item() ) self._metrics[mode]["completions/max_terminated_length"].append( term_completion_mask.float().max().item() ) # Calculate mean reward per function, but only for samples where the function was applied (non-NaN values) for i, reward_func_name in enumerate(self.reward_func_names): mean_rewards = torch.nanmean(rewards_per_func[:, i]).item() self._metrics[mode][f"rewards/{reward_func_name}/mean"].append(mean_rewards) std_rewards = nanstd(rewards_per_func[:, i]).item() self._metrics[mode][f"rewards/{reward_func_name}/std"].append(std_rewards) self._metrics[mode]["reward"].append(mean_grouped_rewards.mean().item()) self._metrics[mode]["reward_std"].append(std_grouped_rewards.mean().item()) # Log prompt and completion texts self._textual_logs["prompt"].extend(gather_object(prompts_text)) self._textual_logs["completion"].extend(gather_object(completions_text)) for i, name in enumerate(self.reward_func_names): 
class AxolotlMambaTrainer(AxolotlTrainer):
    """Mamba specific trainer to handle loss calculation.

    The model is called with ``input_ids`` only, and the next-token targets are
    derived by shifting those same ids, so no ``labels`` or ``attention_mask``
    entries of the batch are consulted.
    """

    tag_names = ["axolotl", "mamba"]

    def compute_loss(
        self,
        model,
        inputs,
        return_outputs=False,
        num_items_in_batch=None,
    ):
        """Compute the shifted next-token cross-entropy loss.

        Args:
            model: Callable taking ``input_ids`` and returning an object with a
                ``logits`` attribute.
            inputs: Batch mapping; ``"input_ids"`` is popped from it.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss, following the ``Trainer.compute_loss`` contract.
            num_items_in_batch: Accepted for signature compatibility; unused,
                the loss is the mean over all shifted token positions.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs`` is set.
        """
        input_ids = inputs.pop("input_ids")
        outputs = model(input_ids)
        lm_logits = outputs.logits

        # Targets are the inputs themselves, moved next to the logits.
        labels = input_ids.to(lm_logits.device)
        # Position t predicts token t + 1: drop the last logit and first label.
        shift_logits = lm_logits[:, :-1, :].contiguous()
        labels = labels[:, 1:].contiguous()

        loss_fct = torch.nn.CrossEntropyLoss()
        lm_loss = loss_fct(
            shift_logits.view(-1, shift_logits.size(-1)), labels.view(-1)
        )

        # Previously `return_outputs` was ignored and a bare loss was always
        # returned, so callers unpacking `(loss, outputs)` could not work.
        return (lm_loss, outputs) if return_outputs else lm_loss
def get_lora_act_offloading_ctx_manager(
    model: nn.Module,
    use_pin_memory: bool = True,
    use_streams: bool = True,
    min_offload_size: int = 1024,
    max_fwd_stash_size: int = 5,
    warn_if_no_head: bool = True,
) -> OffloadActivations:
    """
    Build the activation offloading context manager for a (LoRA) model.

    An `OffloadActivations` manager is always created and returned. In addition,
    forward hooks are registered on selected modules so that a shared
    `NoOpManager` is entered before, and exited after, their forward pass. This
    covers the detected output head, any module whose name contains "liger",
    and every submodule below a `._checkpoint_wrapped_module`.

    Args:
        model (`nn.Module`):
            Model to inspect. A `.module` wrapper and a PEFT wrapper (detected via
            `base_model` + `peft_config`) are unwrapped before the search.
        use_pin_memory (`bool`, *optional*, defaults to `True`):
            Forwarded to `OffloadActivations`; whether offloaded tensors are placed
            in pinned CPU memory.
        use_streams (`bool`, *optional*, defaults to `True`):
            Forwarded to `OffloadActivations`; whether streams are used to overlap
            communication with computation.
        min_offload_size (`int`, *optional*, defaults to `1024`):
            Forwarded to `OffloadActivations`; minimum tensor size in bytes that
            qualifies for offloading.
        max_fwd_stash_size (`int`, *optional*, defaults to `5`):
            Forwarded to `OffloadActivations`; maximum number of consecutive
            activations kept alive during the forward pass.
        warn_if_no_head (`bool`, *optional*, defaults to `True`):
            Whether to log a warning when no output head could be detected.

    Returns:
        `OffloadActivations`: the activation offloading context manager.
    """
    offload_ctx = OffloadActivations(
        use_pin_memory=use_pin_memory,
        use_streams=use_streams,
        min_offload_size=min_offload_size,
        max_fwd_stash_size=max_fwd_stash_size,
    )

    # Offloading the last output Linear only to bring it straight back is
    # expensive, so its forward pass is wrapped in a no-op manager instead.
    head_found = False
    noop_ctx = NoOpManager()

    def _bypass_offloading(target) -> None:
        """Enter `noop_ctx` before and leave it after `target`'s forward pass."""
        target.register_forward_pre_hook(lambda *args: noop_ctx.__enter__())
        target.register_forward_hook(
            lambda *args: noop_ctx.__exit__(), always_call=True
        )

    # Peel off a `.module` wrapper first, then a PEFT wrapper.
    unwrapped_model = model
    if hasattr(unwrapped_model, "module"):
        unwrapped_model = unwrapped_model.module
    if hasattr(unwrapped_model, "base_model") and hasattr(
        unwrapped_model, "peft_config"
    ):
        unwrapped_model = unwrapped_model.base_model

    # Only the first matching attribute family is considered; a model that has
    # an `output` attribute of an unsupported kind does not fall through.
    if hasattr(unwrapped_model, "output"):
        if isinstance(unwrapped_model.output, nn.Module):
            _bypass_offloading(unwrapped_model.output)
            head_found = True
        elif hasattr(unwrapped_model.output, "linear") and isinstance(
            unwrapped_model.output.linear, nn.Module
        ):
            _bypass_offloading(unwrapped_model.output.linear)
            head_found = True
    # HuggingFace-style output head
    elif hasattr(unwrapped_model, "lm_head"):
        _bypass_offloading(unwrapped_model.lm_head)
        head_found = True
    # Decoder-based models: the head may live on the decoder
    elif hasattr(unwrapped_model, "decoder"):
        decoder = unwrapped_model.decoder
        if hasattr(decoder, "output"):
            _bypass_offloading(decoder.output)
            head_found = True
        elif hasattr(decoder, "lm_head"):
            _bypass_offloading(decoder.lm_head)
            head_found = True
    # Transformer models that end in a final layer norm
    elif hasattr(unwrapped_model, "final_layer_norm") or hasattr(
        unwrapped_model, "ln_f"
    ):
        final_norm = (
            getattr(unwrapped_model, "final_layer_norm", None) or unwrapped_model.ln_f
        )
        _bypass_offloading(final_norm)
        head_found = True
    # Models exposing a generic `head` module
    elif hasattr(unwrapped_model, "head") and isinstance(
        unwrapped_model.head, nn.Module
    ):
        _bypass_offloading(unwrapped_model.head)
        head_found = True

    if warn_if_no_head and not head_found:
        LOG.warning(
            "During activation offloading, no output head was detected. If your model has an output head, it will be "
            "offloaded. This usually greatly slows training, given the large vocabulary size. To change this "
            "behavior, set your output head as model.output and make it an nn.Module. You can disable this warning by "
            "passing `warn_if_no_head=False`."
        )

    for name, module in unwrapped_model.named_modules():
        # Liger modules are never offloaded.
        if "liger" in name.lower():
            _bypass_offloading(module)
        # Everything beneath a checkpoint wrapper is exempted to fix LoRA training.
        if name.endswith("._checkpoint_wrapped_module"):
            for wrapped_child in module.modules():
                _bypass_offloading(wrapped_child)

    return offload_ctx
class OptimizerMixin(Trainer):
    """Trainer mixin that builds optimizers from axolotl-specific parameter groups."""

    args = None  # type: "AxolotlTrainingArguments"  # type: ignore[name-defined]

    def create_optimizer_grouped_parameters(
        self, opt_model, optimizer_kwargs
    ) -> list[dict]:
        """Split the trainable parameters of ``opt_model`` into optimizer groups.

        Groups are emitted in a fixed order, and only when non-empty: decayed
        parameters, embedding-like parameters, non-decayed parameters, then one
        decayed group per entry of ``args.lr_groups``.

        Args:
            opt_model: Model whose ``named_parameters()`` are grouped.
            optimizer_kwargs: Optimizer keyword arguments; ``"lr"`` is read for
                every group that does not carry its own learning rate.

        Returns:
            A list of ``{"params", "weight_decay", "lr"}`` dictionaries.
        """
        decay_names = self.get_decay_parameter_names(opt_model)

        # Parameters are collected per bucket, keyed by parameter name.
        buckets: dict = {
            "to_weight_decay": {},
            "embeddings": {},
            "no_weight_decay": {},
        }

        # module-name fragment -> lr group name, and lr group name -> lr
        fragment_to_group = {}
        group_lrs = {}
        for lr_group in self.args.lr_groups or ():
            group_name = lr_group["name"]
            for fragment in lr_group["modules"]:
                fragment_to_group[fragment] = group_name
            group_lrs[group_name] = lr_group["lr"]
            buckets[f"to_weight_decay_{group_name}"] = {}

        for name, param in opt_model.named_parameters():
            if not param.requires_grad:
                continue

            is_embedding = name.endswith("modules_to_save.default.weight") or any(
                embed_name in name for embed_name in ["embed_tokens", "lm_head"]
            )
            if is_embedding:
                bucket_key = "embeddings"
            elif name in decay_names:
                matching = [
                    fragment for fragment in fragment_to_group if fragment in name
                ]
                # Custom lr groups only apply to decayed parameters; the first
                # matching fragment (in registration order) wins.
                if any(matching):
                    bucket_key = f"to_weight_decay_{fragment_to_group[matching[0]]}"
                else:
                    bucket_key = "to_weight_decay"
            else:
                bucket_key = "no_weight_decay"
            buckets[bucket_key][name] = param

        grouped_parameters = []

        if buckets["to_weight_decay"]:
            grouped_parameters.append(
                {
                    "params": list(buckets["to_weight_decay"].values()),
                    "weight_decay": self.args.weight_decay,
                    "lr": optimizer_kwargs["lr"],
                }
            )

        if buckets["embeddings"]:
            # A scale factor takes precedence over an absolute embedding lr.
            lr = optimizer_kwargs["lr"]
            if self.args.embedding_lr_scale:
                lr *= self.args.embedding_lr_scale
            elif self.args.embedding_lr:
                lr = self.args.embedding_lr
            grouped_parameters.append(
                {
                    "params": list(buckets["embeddings"].values()),
                    "weight_decay": 0.0,
                    "lr": lr,
                }
            )

        if buckets["no_weight_decay"]:
            grouped_parameters.append(
                {
                    "params": list(buckets["no_weight_decay"].values()),
                    "weight_decay": 0.0,
                    "lr": optimizer_kwargs["lr"],
                }
            )

        for group_name, group_lr in group_lrs.items():
            group_bucket = buckets[f"to_weight_decay_{group_name}"]
            if group_bucket:
                grouped_parameters.append(
                    {
                        "params": list(group_bucket.values()),
                        "weight_decay": self.args.weight_decay,
                        "lr": group_lr,
                    }
                )

        return grouped_parameters

    def create_optimizer(self, model=None):
        """Create ``self.optimizer`` (if unset) and return it.

        Defers to the parent implementation when none of the axolotl-specific
        options (LoRA+ ratio, embedding lr / lr scale, lr groups, explicit
        optimizer class) is configured.

        Args:
            model: Model to optimize; ``self.model`` is used when ``None``.

        Returns:
            The optimizer stored on the trainer.
        """
        nothing_custom = (
            self.args.loraplus_lr_ratio is None
            and self.args.embedding_lr_scale is None
            and self.args.embedding_lr is None
            and self.args.lr_groups is None
            and self.optimizer_cls_and_kwargs is None
        )
        if nothing_custom:
            return super().create_optimizer(model=model)

        opt_model = self.model if model is None else model

        # An optimizer factory builds the optimizer straight from the model.
        if (
            not self.optimizer
            and self.optimizer_cls_and_kwargs is not None
            and issubclass(self.optimizer_cls_and_kwargs[0], BaseOptimizerFactory)
        ):
            factory_cls, factory_kwargs = self.optimizer_cls_and_kwargs
            self.optimizer = factory_cls()(opt_model, self.args, **factory_kwargs)

        if not self.optimizer:
            if self.optimizer_cls_and_kwargs is None:
                optimizer_cls, optimizer_kwargs = self.get_optimizer_cls_and_kwargs(
                    self.args, opt_model
                )
            else:
                optimizer_cls, optimizer_kwargs = self.optimizer_cls_and_kwargs

            optimizer_grouped_parameters = self.create_optimizer_grouped_parameters(
                opt_model, optimizer_kwargs
            )

            if self.args.loraplus_lr_ratio is None:
                # `get_optimizer_cls_and_kwargs` may have supplied the first
                # positional argument itself: `params` (e.g. GaLore), `model`
                # (e.g. LOMO) or `optimizer_dict` (layer-wise dummy
                # optimizers). The last key present wins.
                for override_key in ("params", "model", "optimizer_dict"):
                    if override_key in optimizer_kwargs:
                        optimizer_grouped_parameters = optimizer_kwargs.pop(
                            override_key
                        )

                self.optimizer = optimizer_cls(
                    optimizer_grouped_parameters, **optimizer_kwargs
                )
            else:
                loraplus_lr_ratio = getattr(self.args, "loraplus_lr_ratio", None)
                loraplus_lr_embedding = getattr(
                    self.args, "loraplus_lr_embedding", 1e-6
                )
                self.optimizer = create_loraplus_optimizer(
                    opt_model,
                    optimizer_cls,
                    loraplus_lr_ratio=loraplus_lr_ratio,
                    loraplus_lr_embedding=loraplus_lr_embedding,
                    **optimizer_kwargs,
                )

            if optimizer_cls.__name__ == "Adam8bit":
                import bitsandbytes

                # Register a 32-bit override for every embedding weight.
                manager = bitsandbytes.optim.GlobalOptimManager.get_instance()

                skipped = 0
                for module in opt_model.modules():
                    if not isinstance(module, nn.Embedding):
                        continue
                    # Keyed by data pointer so shared tensors count once.
                    unique_sizes = {
                        p.data_ptr(): p.numel() for p in module.parameters()
                    }
                    skipped += sum(unique_sizes.values())
                    LOG.info(f"skipped {module}: {skipped / 2**20}M params")
                    manager.register_module_override(
                        module, "weight", {"optim_bits": 32}
                    )
                    LOG.debug(f"bitsandbytes: will optimize {module} in fp32")
                LOG.info(f"skipped: {skipped / 2**20}M params")

        if is_sagemaker_mp_enabled():
            self.optimizer = smp.DistributedOptimizer(self.optimizer)

        return self.optimizer
class PackingMixin(Trainer):
    """
    Trainer mixin to support packing
    """

    def _set_signature_columns_if_needed(self):
        """Drop ``attention_mask`` from the signature columns when packing asks for it.

        After the parent has populated ``self._signature_columns``, and only if
        both ``args.sample_packing`` and ``args.sample_packing_drop_attention_mask``
        are set, ``"attention_mask"`` is filtered out of the column list.
        """
        super()._set_signature_columns_if_needed()
        if (
            self._signature_columns
            and self.args.sample_packing
            and self.args.sample_packing_drop_attention_mask
        ):
            # Filter rather than `set.remove`: a missing "attention_mask" column
            # is not an error (`remove` raised KeyError), and `dict.fromkeys`
            # de-duplicates like the old set round-trip did while keeping the
            # original column order deterministic.
            self._signature_columns = list(
                dict.fromkeys(
                    column
                    for column in self._signature_columns
                    if column != "attention_mask"
                )
            )
class SchedulerMixin(Trainer):
    """
    Trainer mixin that selects and builds the learning-rate scheduler.
    """

    args = None  # type: "AxolotlTrainingArguments"  # type: ignore[name-defined]

    def create_scheduler(
        self, num_training_steps: int, optimizer: None | torch.optim.Optimizer = None
    ) -> LRScheduler:
        """
        Set up the scheduler, unless one is already attached to the trainer.

        When no scheduler exists yet, the first applicable option is used, in
        this order: a plugin-provided scheduler, one-cycle, REX, cosine with
        quadratic warmup, cosine with min lr and a constant tail, cosine with
        min lr, and finally the parent Trainer's scheduler. If
        `args.jagged_restart_steps` is set, the result is wrapped in a
        `JaggedLRRestartScheduler`.

        Args:
            num_training_steps (int): The number of training steps to do.
            optimizer (torch.optim.Optimizer): The training optimizer; falls
                back to `self.optimizer` when omitted.

        Returns:
            The scheduler stored on `self.lr_scheduler`.

        Raises:
            ValueError: If no optimizer is passed and the trainer has none.
        """
        wants_cosine = self.args.lr_scheduler_type == "cosine"
        quadratic_cosine = wants_cosine and self.args.lr_quadratic_warmup is True
        min_lr_cosine = wants_cosine and self.args.cosine_min_lr_ratio is not None

        if optimizer is None:
            optimizer = self.optimizer
            if optimizer is None:
                raise ValueError(
                    "Optimizer must be set before calling create_scheduler or passed as an argument."
                )

        if self.lr_scheduler is not None:  # type: ignore
            # A scheduler was installed elsewhere; only report what is skipped.
            if quadratic_cosine:
                LOG.warning(
                    "axolotl's cosine scheduler with quadratic warmup not used (e.g., because of deepspeed)."
                )
            if min_lr_cosine:
                LOG.warning(
                    "axolotl's cosine scheduler with min lr not used (e.g., because of deepspeed)."
                )
        else:
            plugin_scheduler: LRScheduler | None = (
                PluginManager.get_instance().create_lr_scheduler(
                    trainer=self,
                    optimizer=optimizer,
                    num_training_steps=num_training_steps,
                )
            )
            if plugin_scheduler is not None:
                LOG.info(f"Using plugin-created lr_scheduler: {plugin_scheduler}")
                self.lr_scheduler = plugin_scheduler
            elif self.args.alternate_lr_scheduler_type == "one_cycle":
                num_warmup_steps = self.args.get_warmup_steps(num_training_steps)
                # Defaults apply only where the user did not set the key.
                one_cycle_kwargs = {
                    "pct_start": num_warmup_steps / num_training_steps,
                    "anneal_strategy": "cos",
                    **self.args.lr_scheduler_kwargs,
                }
                self.lr_scheduler = OneCycleLR(
                    optimizer,
                    max_lr=self.args.learning_rate,
                    total_steps=num_training_steps,
                    **one_cycle_kwargs,
                )
            elif self.args.alternate_lr_scheduler_type == "rex":
                rex_min_lr = 0
                if min_lr_cosine:
                    assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
                    rex_min_lr = (
                        self.args.learning_rate * self.args.cosine_min_lr_ratio
                    )
                self.lr_scheduler = RexLR(
                    optimizer=optimizer,
                    max_lr=self.args.learning_rate,
                    min_lr=rex_min_lr,
                    total_steps=num_training_steps,
                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                )
            elif quadratic_cosine:
                if min_lr_cosine:
                    LOG.warning(
                        "Both cosine quadratic warmup and min lr detected. Using quadratic warmup."
                    )
                self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup(
                    optimizer,
                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                    num_training_steps=num_training_steps,
                )
            elif (
                self.args.cosine_min_lr_ratio
                and self.args.cosine_constant_lr_ratio
                and min_lr_cosine
            ):
                assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
                assert 0 <= self.args.cosine_constant_lr_ratio <= 1.0, "cosine_constant_lr_ratio must be between 0.0 and 1.0"
                self.lr_scheduler = get_cosine_schedule_with_warmup_decay_constant(
                    optimizer,
                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                    num_training_steps=num_training_steps,
                    min_lr_ratio=self.args.cosine_min_lr_ratio,
                    constant_lr_ratio=self.args.cosine_constant_lr_ratio,
                )
            elif self.args.cosine_min_lr_ratio and min_lr_cosine:
                assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
                self.lr_scheduler = get_cosine_schedule_with_min_lr(
                    optimizer,
                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                    num_training_steps=num_training_steps,
                    min_lr_ratio=self.args.cosine_min_lr_ratio,
                )
            else:
                super().create_scheduler(num_training_steps, optimizer=optimizer)

        if self.args.jagged_restart_steps:
            restart_warmup_steps = self.args.jagged_restart_warmup_steps or 10
            restart_anneal_steps = self.args.jagged_restart_anneal_steps or 1
            if not self.lr_scheduler:
                super().create_scheduler(num_training_steps, optimizer)
            # The scheduler chosen above becomes the inner schedule.
            self.lr_scheduler = JaggedLRRestartScheduler(
                optimizer,
                self.lr_scheduler,
                self.args.jagged_restart_steps,
                restart_warmup_steps,
                restart_anneal_steps,
                min_lr_scale=self.args.cosine_min_lr_ratio or 0.001,
            )

        return self.lr_scheduler  # type: ignore
def _merge_tag_kwarg(key, new_tags, kwargs):
    """Merge ``new_tags`` into ``kwargs[key]`` without mutating caller-owned lists.

    Args:
        key: Name of the kwargs entry to update (e.g. ``"tags"``).
        new_tags: A single tag, an iterable of tags, or ``None``.
        kwargs: Keyword-argument dict to update in place, or ``None``.

    Returns:
        The same ``kwargs`` object that was passed in (``None`` stays ``None``).
    """
    if isinstance(new_tags, str):
        new_tags = [new_tags]
    if new_tags is None or kwargs is None:
        return kwargs

    # Always work on a copy: callers pass class-level lists such as
    # ``Trainer.tag_names``; appending to them (or storing them in kwargs,
    # where later calls extend them) would leak tags across calls.
    new_tags = list(new_tags)

    if key not in kwargs:
        kwargs[key] = new_tags
        return kwargs

    existing = kwargs[key]
    if isinstance(existing, list):
        # Existing tags first, then ours.
        kwargs[key] = [*existing, *new_tags]
    elif isinstance(existing, str):
        # Ours first, then the single pre-existing tag.
        kwargs[key] = [*new_tags, existing]
    # Any other type (e.g. None) is deliberately left untouched.
    return kwargs


def sanitize_kwargs_for_tagging(tag_names, kwargs=None):
    """Add ``tag_names`` to ``kwargs["tags"]``.

    Neither ``tag_names`` nor a list already stored under ``kwargs["tags"]``
    is mutated; the merged result is always a fresh list.

    Args:
        tag_names: A tag or list of tags to add. ``None`` adds nothing.
        kwargs: Keyword arguments to update in place. ``None`` is returned
            unchanged.

    Returns:
        ``kwargs`` with ``"tags"`` set to a list when it was missing, a list,
        or a string; otherwise ``kwargs`` unchanged.
    """
    return _merge_tag_kwarg("tags", tag_names, kwargs)


def sanitize_kwargs_for_ds_tagging(dataset_tags, kwargs=None):
    """Add ``dataset_tags`` to ``kwargs["dataset_tags"]``.

    Neither ``dataset_tags`` nor a list already stored under
    ``kwargs["dataset_tags"]`` is mutated; the merged result is a fresh list.

    Args:
        dataset_tags: A dataset tag or list of dataset tags. ``None`` adds
            nothing.
        kwargs: Keyword arguments to update in place. ``None`` is returned
            unchanged.

    Returns:
        ``kwargs`` with ``"dataset_tags"`` set to a list when it was missing,
        a list, or a string; otherwise ``kwargs`` unchanged.
    """
    return _merge_tag_kwarg("dataset_tags", dataset_tags, kwargs)
@dataclass
class AxolotlTrainingMixins:
    """
    Mixin class for the Axolotl training args.

    Declares the axolotl-specific fields that are shared across the various
    trainer configs. Every field carries a default so the class can be mixed
    into other training-argument dataclasses; the `help` metadata on each
    field is the authoritative description of its purpose.
    """

    # --- model / run mode ---------------------------------------------------
    model_type: Optional[str] = field(
        default=None, metadata={"help": "HF model configuration model_type."}
    )
    lr_quadratic_warmup: bool = field(
        default=False,
        metadata={"help": "Use quadratic warmup for cosine scheduling."},
    )
    pretraining: bool = field(
        default=False,
        metadata={
            "help": "Indicates to trainer whether we are doing continued pretraining."
        },
    )

    # --- sample packing -----------------------------------------------------
    sample_packing: bool = field(
        default=False,
        metadata={"help": "Use sample packing for efficient training."},
    )
    sample_packing_sequentially: bool = field(
        default=False,
        metadata={
            "help": "Use next-fit sample packing that preserves the order of samples coming from the sampler. Use in combination with curriculum_sampling for fully sequential packing."
        },
    )
    sample_packing_mp_start_method: str | None = field(
        default=None,
        metadata={"help": "The multiprocessing start method to use."},
    )
    sample_packing_drop_attention_mask: bool = field(
        default=False,
        metadata={"help": "Drop attention mask from inputs when using packing."},
    )
    multipack_real_batches: bool = field(
        default=False,
        metadata={"help": "Use real batches for efficient training."},
    )
    # Note: unlike most boolean flags here, this one defaults to True.
    include_tkps: bool = field(
        default=True,
        metadata={
            "help": "Whether to include tokens per second in the training metrics."
        },
    )
    # Tri-state: None means "not specified" (how None is resolved is decided
    # by the consumer of these args, which is outside this file).
    eval_sample_packing: Optional[bool] = field(
        default=None,
        metadata={"help": "Use sample packing for efficient evals."},
    )
    sample_packing_efficiency: float = field(
        default=1.0,
        metadata={"help": "Sample packing efficiency for calculating batch length."},
    )
    sample_packing_bin_size: int = field(
        default=200,
        metadata={
            "help": "The max number of samples that packed sample can contain after packing. Increase for better packing."
        },
    )
    sample_packing_group_size: int = field(
        default=100000,
        metadata={
            "help": "The number of samples to group together for packing. Increase for better packing."
        },
    )

    # --- sequence length / data processing ----------------------------------
    max_seq_length: int = field(
        default=2048,
        metadata={"help": "The maximum sequence length the model can handle"},
    )
    dataset_num_proc: int | None = field(
        default=None,
        metadata={"help": "The number of processes to use for data processing"},
    )

    # --- ReLoRA and jagged restarts -----------------------------------------
    relora_steps: Optional[int] = field(
        default=None,
        metadata={"help": "how often to reset for ReLoRA"},
    )
    relora_prune_ratio: Optional[float] = field(
        default=0.9,
        metadata={"help": "prune ratio for magnitude pruning of the optimizer"},
    )
    jagged_restart_steps: Optional[int] = field(
        default=None,
        metadata={"help": "how often to reset for jagged restarts"},
    )
    jagged_restart_warmup_steps: Optional[int] = field(
        default=None,
        metadata={
            "help": "how many warmup steps to take after reset for jagged restarts"
        },
    )
    jagged_restart_anneal_steps: Optional[int] = field(
        default=None,
        metadata={
            "help": "how many anneal steps to take before reset for jagged restarts"
        },
    )

    # --- benchmark / causal-LM evaluation -----------------------------------
    bench_split: Optional[str] = field(
        default="eval", metadata={"help": "The benchmark split to run on"}
    )
    bench_dataset: Optional[str] = field(
        default="pharaouk/dharma-1/dharma_1_mini.json",
        metadata={
            "help": "Benchmark dataset to use: options are `mmlu-zs`, `mmlu-fs`, or the full path to the dataset file"
        },
    )
    do_bench_eval: Optional[bool] = field(
        default=False, metadata={"help": "Whether to run the Benchmark evaluation."}
    )
    do_causal_lm_eval: Optional[bool] = field(
        default=False, metadata={"help": "Whether to run the Causal LM evaluation."}
    )
    max_bench_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "If set, only evaluates on `max_bench_samples` of the benchmark dataset."
        },
    )
    bench_source_max_len: int = field(
        default=2048, metadata={"help": "Maximum source sequence length for bench."}
    )

    # --- dataloader ---------------------------------------------------------
    dataloader_prefetch_factor: Optional[int] = field(
        default=None,
        metadata={"help": "prefetch_factor argument to the dataloader"},
    )

    # --- learning-rate schedule and per-group learning rates ----------------
    cosine_min_lr_ratio: Optional[float] = field(
        default=None,
        metadata={"help": "Minimum learning rate is min_lr_ratio * learning_rate"},
    )
    cosine_constant_lr_ratio: Optional[float] = field(
        default=None,
        metadata={
            "help": "Starting constant learning rate step is cosine_constant_lr_ratio * max_steps"
        },
    )
    loraplus_lr_ratio: Optional[float] = field(
        default=None, metadata={"help": "loraplus learning rate ratio lr_B / lr_A."}
    )
    loraplus_lr_embedding: Optional[float] = field(
        default=1e-6,
        metadata={"help": "loraplus learning rate for lora embedding layers."},
    )
    embedding_lr_scale: Optional[float] = field(
        default=None,
        metadata={"help": "Scale the learning rate for the embedding layers."},
    )
    # NOTE(review): the schema of each dict is not defined in this file --
    # confirm against the optimizer code that reads `lr_groups`.
    lr_groups: Optional[list[dict]] = field(
        default=None,
        metadata={"help": "Specify learning rate groups for with different LRs."},
    )
    embedding_lr: Optional[float] = field(
        default=None,
        metadata={"help": "absolute learning rate for the embedding layers."},
    )

    # --- adapter / RL specifics ---------------------------------------------
    qlora: bool = field(
        default=False,
        metadata={"help": "whether this is a qlora training"},
    )
    # No help text is provided for this field.
    # NOTE(review): presumably the ORPO loss weighting -- confirm with the
    # ORPO trainer/config that consumes it.
    orpo_alpha: Optional[float] = field(
        default=None,
    )

    # --- LISA ---------------------------------------------------------------
    lisa_n_layers: Optional[int] = field(
        default=None,
        metadata={"help": "the number of activate layers in LISA"},
    )
    lisa_step_interval: Optional[int] = field(
        default=None,
        metadata={"help": "how often to switch layers in LISA"},
    )
    lisa_layers_attribute: Optional[str] = field(
        default=None,
        metadata={"help": "path under the model to access the layers"},
    )

    # --- sampling, scheduler override, chat template ------------------------
    curriculum_sampling: Optional[bool] = field(
        default=None,
        metadata={"help": "whether to use sequential sampling for curriculum learning"},
    )
    alternate_lr_scheduler_type: Optional[str] = field(
        default=None,
        metadata={
            "help": "workaround to pass an alternate lr scheduler to the HF trainer"
        },
    )
    chat_template: Optional[str] = field(
        default=None,
        metadata={"help": "Chat template converting chat messages to text"},
    )

    # NOTE(review): the knowledge-distillation fields below are disabled
    # (commented out); the reason is not visible in this file.
    # kd_ce_alpha: Optional[float] = field(
    #     default=None,
    #     metadata={
    #         "help": "The alpha scaling parameter for SFT cross entropy loss when using KD"
    #     },
    # )
    #
    # kd_alpha: Optional[float] = field(
    #     default=1.0,
    #     metadata={"help": "The alpha scaling parameter for KD loss"},
    # )
    #
    # kd_temperature: Optional[float] = field(
    #     default=1.0,
    #     metadata={
    #         "help": "the temperature parameter for KL divergence loss when using KD"
    #     },
    # )

    # --- extra optimizer hyperparameters ------------------------------------
    adam_beta3: Optional[float] = field(
        default=None,
        metadata={
            "help": "The beta3 hyperparameter used in some optimizers such as CAME"
        },
    )
    adam_epsilon2: Optional[float] = field(
        default=None,
        metadata={
            "help": "The epsilon2 hyperparameter used in some optimizers such as CAME"
        },
    )

    activation_offloading: bool | None = field(
        default=None,
        metadata={"help": "Use activation offloading with CUDA streams for training."},
    )

    # multi-modal section

    # Either a single int or a pair of ints, per the annotation.
    # NOTE(review): whether the pair is (width, height) or (height, width) is
    # not stated here -- confirm with the image processing code.
    image_size: int | tuple[int, int] | None = field(
        default=None,
        metadata={"help": "The size of the image to resize to"},
    )
    # A PIL `Resampling` enum member.
    image_resize_algorithm: Resampling | None = field(
        default=None,
        metadata={"help": "The algorithm to use for image resizing"},
    )

    # end of multi-modal section

    # --- Dion optimizer -----------------------------------------------------
    dion_learning_rate: float | None = field(
        default=None,
        metadata={"help": "The learning rate for Dion"},
    )
    dion_momentum: float | None = field(
        default=None,
        metadata={"help": "The momentum for Dion"},
    )
    # No help text is provided for the two rank fields below.
    # NOTE(review): presumably Dion low-rank settings -- confirm with the
    # optimizer integration.
    dion_rank_fraction: float | None = field(
        default=None,
    )
    dion_rank_multiple_of: int | None = field(
        default=None,
    )
class TokenizedPromptDataset(Dataset):
    """Dataset whose rows are the tokenized form of another dataset's rows.

    The source dataset is tokenized eagerly during construction and the
    resulting table backs this dataset.

    Args:
        prompt_tokenizer: The prompt tokenizing method for processing the data.
        dataset: Dataset with text files.
        process_count: Number of processes to use for tokenizing.
        keep_in_memory: Whether to keep the tokenized dataset in memory.
    """

    def __init__(
        self,
        prompt_tokenizer: PromptTokenizingStrategy,
        dataset: Dataset,
        process_count: int | None = None,
        keep_in_memory: bool | None = False,
        **kwargs,
    ):
        # These attributes must be set before `process` runs, since it reads
        # all three of them.
        self.prompt_tokenizer = prompt_tokenizer
        self.process_count = process_count
        self.keep_in_memory = keep_in_memory

        tokenized = self.process(dataset)
        super().__init__(tokenized.data, **kwargs)

    def process(self, dataset):
        """Optionally filter `dataset`, then tokenize it.

        Every original column is removed from the mapped result, so only the
        columns produced by the tokenizing strategy remain.
        """
        strategy = self.prompt_tokenizer
        original_columns = dataset.features.keys()

        # Batched strategies are fed 1,000 rows at a time.
        batching = (
            {"batched": True, "batch_size": 1_000}
            if strategy.supports_batched
            else {}
        )

        # Strategies may expose an optional (truthy) row filter.
        row_filter = getattr(strategy, "filter_rows", None)
        if row_filter:
            dataset = dataset.filter(
                row_filter,
                num_proc=self.process_count,
                desc="Strategy Filtering Rows",
            )

        return dataset.map(
            strategy.tokenize_prompt,
            num_proc=self.process_count,
            remove_columns=original_columns,
            keep_in_memory=self.keep_in_memory,
            desc="Tokenizing Prompts",
            **batching,
        )


def wrap_dataset_for_tokenized_prompt(
    prompt_tokenizer: PromptTokenizingStrategy,
    dataset: Dataset | IterableDataset,
    **kwargs,
):
    """Tokenize `dataset` with `prompt_tokenizer`.

    A regular dataset is wrapped in a `TokenizedPromptDataset` (receiving
    `kwargs`). An `IterableDataset` is instead mapped directly, and `kwargs`
    are not used on that path.
    """
    if not isinstance(dataset, IterableDataset):
        return TokenizedPromptDataset(prompt_tokenizer, dataset, **kwargs)

    streaming_kwargs = {"batched": True} if prompt_tokenizer.supports_batched else {}
    column_names = list(dataset.features.keys())
    return dataset.map(
        prompt_tokenizer.tokenize_prompt,
        remove_columns=column_names,
        **streaming_kwargs,
    )
def evaluate_dataset(
    trainer: Trainer, dataset: Dataset, dataset_type: str, flash_optimum: bool = False
) -> Optional[Dict[str, float]]:
    """Helper function to evaluate a single dataset.

    Args:
        trainer: The trainer instance.
        dataset: Dataset to evaluate.
        dataset_type: Type of dataset ('train' or 'eval'). Also passed to the
            trainer as `metric_key_prefix`.
        flash_optimum: Whether to run the evaluation inside the
            `torch.backends.cuda.sdp_kernel` context.

    Returns:
        Dictionary of metrics or None if dataset is None.
    """
    if dataset is None:
        return None

    LOG.info(f"Starting {dataset_type} set evaluation...")

    if flash_optimum:
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True,
            enable_math=True,
            enable_mem_efficient=True,
        ):
            metrics = trainer.evaluate(dataset, metric_key_prefix=dataset_type)
    else:
        metrics = trainer.evaluate(dataset, metric_key_prefix=dataset_type)

    LOG.info(f"{dataset_type.capitalize()} set evaluation completed!")
    LOG.info(f"{dataset_type.capitalize()} Metrics:")
    for key, value in metrics.items():
        LOG.info(f"{key}: {value}")

    return metrics


def _strip_metric_prefix(key: str, prefix: str) -> str:
    """Return `key` without a leading `prefix`; later occurrences are kept."""
    return key[len(prefix) :] if key.startswith(prefix) else key


def _write_eval_summary(
    metrics_file: Path,
    train_metrics: Optional[Dict[str, float]],
    eval_metrics: Optional[Dict[str, float]],
) -> None:
    """Write one CSV row per metric with its training and validation values."""
    train_metrics = train_metrics or {}
    eval_metrics = eval_metrics or {}

    # Map the un-prefixed metric name back to the original key. Only the
    # leading prefix is removed: a blanket `str.replace` would also eat
    # occurrences inside the name (`train_pretrain_loss` -> `preloss`).
    train_metric_names = {_strip_metric_prefix(k, "train_"): k for k in train_metrics}
    eval_metric_names = {_strip_metric_prefix(k, "eval_"): k for k in eval_metrics}

    with metrics_file.open("w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["metric", "training", "validation"])

        for metric_name in sorted(set(train_metric_names) | set(eval_metric_names)):
            # A metric missing on one side is written as an empty cell.
            train_value = train_metrics.get(train_metric_names.get(metric_name, ""), "")
            eval_value = eval_metrics.get(eval_metric_names.get(metric_name, ""), "")
            writer.writerow([metric_name, train_value, eval_value])


@send_errors
def evaluate(*, cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> Dict[str, float]:
    """
    Evaluate a model on training and validation datasets.

    When `cfg.output_dir` is set and at least one dataset produced metrics,
    a summary is also written to `<output_dir>/eval_summary.csv`.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        dataset_meta: Dataset metadata containing training and evaluation datasets.

    Returns:
        Dictionary mapping metric names to their values.
    """
    # Load tokenizer, processor and model
    LOG.debug("loading model for evaluation...")
    model, tokenizer, _, processor = setup_model_and_tokenizer(cfg)

    # Get datasets
    train_dataset = dataset_meta.train_dataset
    eval_dataset = dataset_meta.eval_dataset
    total_num_steps = dataset_meta.total_num_steps

    # Set up trainer
    trainer = setup_trainer(
        cfg=cfg,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        model=model,
        tokenizer=tokenizer,
        processor=processor,
        total_num_steps=total_num_steps,
    )

    # Evaluate datasets; a missing dataset yields None.
    all_metrics = {}
    train_metrics = evaluate_dataset(trainer, train_dataset, "train", cfg.flash_optimum)
    eval_metrics = evaluate_dataset(trainer, eval_dataset, "eval", cfg.flash_optimum)

    if train_metrics:
        all_metrics.update(train_metrics)
    if eval_metrics:
        all_metrics.update(eval_metrics)

    # Save metrics to CSV if output directory is specified and we have metrics
    if cfg.output_dir and (train_metrics or eval_metrics):
        output_dir = Path(cfg.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        metrics_file = output_dir / "eval_summary.csv"
        _write_eval_summary(metrics_file, train_metrics, eval_metrics)

        LOG.info(f"Evaluation results saved to {metrics_file}")

    # Drop the local references before tearing down the distributed state.
    del model
    del tokenizer

    cleanup_distributed()

    return all_metrics
3.2 Licensee shall not: - Use the Software for any illegal or unauthorized purpose. - Reverse engineer, decompile, or disassemble the Software. - Remove or modify any copyright, trademark, or other proprietary notices contained in the Software. - Use the Software in a way that could damage, disable, overburden, or impair the functionality of the Software or interfere with any third-party use of the Software. 3.3 Axolotl reserves the right to restrict certain Plugin Integrations for use with the Software. To the extent Licensee integrates a permitted, applicable Plugin Integration with the Software, Licensee shall comply with any additional terms and conditions imposed by the licensors of such Plugin Integration for use of such Plugin Integrations. Licensee shall contact Axolotl if it has questions about whether its use of the Software falls beyond the scope of this Agreement. 4. Intellectual Property Rights 4.1 Axolotl and its contributors retain all intellectual property rights in and to the Software. Licensee acknowledges that this Agreement does not transfer any ownership rights or intellectual property rights to Licensee. 5. Disclaimer of Warranty 5.1 THE SOFTWARE IS PROVIDED “AS IS,” WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 6. Termination 6.1 Axolotl may terminate this Agreement at any time if Licensee fails to comply with any of the terms and conditions set forth herein. Upon termination, Licensee shall cease all use of the Software and destroy any copies in its possession. 7. 
Governing Law 7.1 This Agreement shall be governed by and construed in accordance with the laws of the State of California, without regards to conflicts of laws provisions thereof. 8. Entire Agreement 8.1 This Agreement constitutes the entire agreement between Axolotl and Licensee with respect to the subject matter hereof and supersedes all prior or contemporaneous understandings or agreements between the parties concerning the Software, whether written or oral. Axolotl may update the terms of this Agreement from time to time, and Licensee’s continued use of the Software after any such updates shall constitute acceptance of updated terms on a go-forward basis. Axolotl will use commercially reasonable efforts to provide Licensee notice of any material updates. By using the Software, Licensee acknowledges that it has read, understood, and agrees to be bound by the terms and conditions of this Agreement. This Agreement was last updated on August 23, 2024. ================================================ FILE: src/axolotl/integrations/__init__.py ================================================ ================================================ FILE: src/axolotl/integrations/base.py ================================================ # Copyright 2024 Axolotl AI. All rights reserved. # # This software may be used and distributed according to # the terms of the Axolotl Community License Agreement (the "License"); # you may not use this file except in compliance with the License. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations under # the License. """Base class for all plugins. A plugin is a reusable, modular, and self-contained piece of code that extends the functionality of Axolotl. 
class BasePlugin:
    """Base class for all plugins. Defines the interface for plugin methods.

    A plugin is a reusable, modular, and self-contained piece of code that
    extends the functionality of Axolotl. Plugins can be used to integrate
    third-party models, modify the training process, or add new features.

    To create a new plugin, inherit from `BasePlugin` and override the hooks
    you need. Every hook defined here is a no-op: it returns `None`, except
    the two `add_callbacks_*` hooks, which return an empty list.

    Note:
        Plugin methods include:
        - register(cfg): Registers the plugin with the given configuration.
        - load_datasets(cfg): Loads and preprocesses the dataset for training.
        - pre_model_load(cfg): Performs actions before the model is loaded.
        - post_model_build(cfg, model): Performs actions after the model is
          loaded, but before LoRA adapters are applied.
        - pre_lora_load(cfg, model): Performs actions before LoRA weights are
          loaded.
        - post_lora_load(cfg, model): Performs actions after LoRA weights are
          loaded.
        - post_model_load(cfg, model): Performs actions after the model is
          loaded, inclusive of any adapters.
        - post_trainer_create(cfg, trainer): Performs actions after the
          trainer is created.
        - create_optimizer(cfg, trainer): Creates and returns an optimizer for
          training.
        - create_lr_scheduler(cfg, trainer, optimizer, num_training_steps):
          Creates and returns a learning rate scheduler.
        - add_callbacks_pre_trainer(cfg, model): Returns callbacks to add
          before the trainer is created.
        - add_callbacks_post_trainer(cfg, trainer): Returns callbacks to add
          after the trainer is created.
        - post_train(cfg, model): Performs actions after training is complete.
        - post_train_unload(cfg): Performs actions after training is complete
          and the model is unloaded.
    """

    def __init__(self):
        """Initializes the BasePlugin."""

    def register(self, cfg: dict):
        """Registers the plugin with the given configuration as an unparsed dict.

        Args:
            cfg: The configuration for the plugin.
        """

    def get_input_args(self) -> str | None:
        """Returns a reference to the pydantic model for the plugin's input arguments.

        The return value is typed as `str`.
        NOTE(review): presumably a dotted import path to the model class --
        confirm against the code that consumes it.
        """

    def get_training_args_mixin(self) -> str | None:
        """Returns a reference to the dataclass for the plugin's training arguments.

        The return value is typed as `str`.
        NOTE(review): presumably a dotted import path to the dataclass --
        confirm against the code that consumes it.
        """

    def load_datasets(
        self, cfg: DictDefault, preprocess: bool = False
    ) -> Union["TrainDatasetMeta", None]:
        """Loads and preprocesses the dataset for training.

        Args:
            cfg: The configuration for the plugin.
            preprocess: Whether this is the preprocess step of the datasets.

        Returns:
            dataset_meta: The metadata for the training dataset, or `None`
                when the plugin does not provide datasets.
        """

    def pre_model_load(self, cfg: DictDefault):
        """Performs actions before the model is loaded.

        Args:
            cfg: The configuration for the plugin.
        """

    def post_model_build(self, cfg: DictDefault, model: PreTrainedModel):
        """Performs actions after the model is built/loaded, but before any adapters are applied.

        Args:
            cfg: The configuration for the plugin.
            model: The loaded model.
        """

    def pre_lora_load(self, cfg: DictDefault, model: PreTrainedModel):
        """Performs actions before LoRA weights are loaded.

        Args:
            cfg: The configuration for the plugin.
            model: The loaded model.
        """

    def post_lora_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
        """Performs actions after LoRA weights are loaded.

        Args:
            cfg: The configuration for the plugin.
            model: The loaded model.
        """

    def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
        """Performs actions after the model is loaded.

        Args:
            cfg: The configuration for the plugin.
            model: The loaded model.
        """

    def get_trainer_cls(self, cfg: DictDefault) -> type[Trainer] | None:
        """Returns a custom class for the trainer.

        Args:
            cfg: The global axolotl configuration.

        Returns:
            The trainer class to use, or `None` when the plugin does not
            provide one.
        """

    def post_trainer_create(self, cfg: DictDefault, trainer: Trainer):
        """Performs actions after the trainer is created.

        Args:
            cfg: The configuration for the plugin.
            trainer: The trainer object for training.
        """

    def get_training_args(self, cfg: DictDefault):
        """
        Returns custom training arguments to set on TrainingArgs.

        Args:
            cfg: The global axolotl configuration.

        Returns:
            object: dict containing the training arguments.
        """

    def get_collator_cls_and_kwargs(self, cfg: DictDefault, is_eval: bool = False):
        """
        Returns a custom class for the collator.

        Args:
            cfg: The global axolotl configuration.
            is_eval: Whether this is an eval split.

        Returns:
            class: The class for the collator.
        """

    def create_optimizer(self, cfg: DictDefault, trainer: Trainer) -> Optimizer | None:
        """Creates and returns an optimizer for training.

        Args:
            cfg: The configuration for the plugin.
            trainer: The trainer object for training.

        Returns:
            The created optimizer.
        """

    def create_lr_scheduler(
        self,
        cfg: DictDefault,
        trainer: Trainer,
        optimizer: Optimizer,
        num_training_steps: int,
    ) -> LRScheduler | None:
        """Creates and returns a learning rate scheduler.

        Args:
            cfg: The configuration for the plugin.
            trainer: The trainer object for training.
            optimizer: The optimizer for training.
            num_training_steps: Total number of training steps

        Returns:
            The created learning rate scheduler.
        """

    def add_callbacks_pre_trainer(
        self, cfg: DictDefault, model: PreTrainedModel
    ) -> list[Callable]:
        """Set up callbacks before creating the trainer.

        Args:
            cfg: The configuration for the plugin.
            model: The loaded model.

        Returns:
            A list of callback functions to be added to the `TrainingArgs`.
        """
        return []

    def add_callbacks_post_trainer(
        self, cfg: DictDefault, trainer: Trainer
    ) -> list[Callable]:
        """Adds callbacks to the trainer after creating the trainer. This is useful for
        callbacks that require access to the model or trainer.

        Args:
            cfg: The configuration for the plugin.
            trainer: The trainer object for training.

        Returns:
            A list of callback functions to be added
        """
        return []

    def post_train(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
        """Performs actions after training is complete.

        Args:
            cfg: The axolotl configuration.
            model: The loaded model.
        """

    def post_train_unload(self, cfg: DictDefault):
        """Performs actions after training is complete and the model is unloaded.

        Args:
            cfg: The configuration for the plugin.
        """


def load_plugin(plugin_name: str) -> BasePlugin:
    """Loads a plugin based on the given plugin name.

    The plugin name should be in the format "module_name.class_name". The
    module is imported as given; if it cannot be found and the name is not
    already under `axolotl.integrations.`, the import is retried with that
    prefix. The class is then looked up on the module and instantiated with
    no arguments.

    Args:
        plugin_name: The name of the plugin to be loaded. The name should be in
            the format "module_name.class_name".

    Returns:
        An instance of the loaded plugin.

    Raises:
        ValueError: If `plugin_name` contains no "." separator.
        ImportError: If the plugin module cannot be imported. The error for
            the name as given is raised, chained to the fallback failure when
            a fallback was attempted.
        AttributeError: If the module does not define `class_name`.
    """
    # split the plugin name into module and class
    module_name, separator, class_name = plugin_name.rpartition(".")
    if not separator:
        raise ValueError(
            f"Invalid plugin name {plugin_name!r}: expected the format "
            '"module_name.class_name"'
        )

    integrations_prefix = "axolotl.integrations."

    # import the module
    try:
        module = importlib.import_module(module_name)
    except ModuleNotFoundError as orig_exc:
        if module_name.startswith(integrations_prefix):
            # Nothing to fall back to: surface the original error as-is
            # rather than chaining it to itself.
            raise
        try:
            module = importlib.import_module(integrations_prefix + module_name)
        except ModuleNotFoundError as exc:
            # Report the failure for the name the user actually wrote.
            raise orig_exc from exc

    # look up the class and create an instance of it
    plugin_class = getattr(module, class_name)
    return plugin_class()
It should be a singleton so it can be accessed from anywhere in the codebase. Attributes: plugins: A list of loaded plugins. Note: Key methods include: - get_instance(): Static method to get the singleton instance of `PluginManager`. - register(plugin_name: str): Registers a new plugin by its name. - pre_model_load(cfg): Calls the pre_model_load method of all registered plugins. """ plugins: OrderedDict[str, BasePlugin] = collections.OrderedDict() _instance: PluginManager | None = None _cfg: DictDefault | None = None def __new__(cls): """Creates a new instance of PluginManager if it doesn't exist yet.""" if cls._instance is None: cls._instance = super(PluginManager, cls).__new__(cls) cls._instance.plugins: OrderedDict[str, BasePlugin] = ( collections.OrderedDict() ) return cls._instance @staticmethod def get_instance() -> "PluginManager": """Returns the singleton instance of PluginManager. If the instance doesn't exist, it creates a new one. """ if PluginManager._instance is None: PluginManager() return PluginManager._instance # type: ignore @property def cfg(self): return self._cfg @cfg.setter def cfg(self, cfg): self._cfg = cfg def register(self, plugin_name: str): """Registers a new plugin by its name. Args: plugin_name: The name of the plugin to be registered. Raises: ImportError: If the plugin module cannot be imported. """ try: LOG.info(f"Attempting to load plugin: {plugin_name}") plugin = load_plugin(plugin_name) self.plugins[plugin_name] = plugin LOG.info(f"Plugin loaded successfully: {plugin_name}") except ImportError as exc: LOG.error(f"Failed to load plugin: {plugin_name}") # print stacktrace traceback.print_exc() print(f"Error: {exc}") def get_input_args(self) -> list[str]: """Returns a list of Pydantic classes for all registered plugins' input arguments.' Returns: A list of Pydantic classes for all registered plugins' input arguments.' 
""" input_args = [] for plugin in self.plugins.values(): input_args_from_plugin = plugin.get_input_args() if input_args_from_plugin is not None: input_args.append(input_args_from_plugin) return input_args def get_training_args_mixin(self): """ Returns a list of dataclasses for all registered plugins' training args mixins' Returns: list[str]: A list of dataclsses """ training_args = [] for plugin in self.plugins.values(): training_args_from_plugin = plugin.get_training_args_mixin() if training_args_from_plugin is not None: training_args.append(training_args_from_plugin) return training_args def load_datasets( self, cfg: DictDefault, preprocess: bool = False ) -> Union["TrainDatasetMeta", None]: """Calls the load_datasets method of each registered plugin. Args: cfg: The configuration for the plugins. preprocess: Whether this is preprocess step of the datasets. Returns: The dataset metadata loaded from all registered plugins. """ return_ds_meta = None for plugin in self.plugins.values(): dataset_meta = plugin.load_datasets(cfg, preprocess) if dataset_meta is not None: if return_ds_meta is None: return_ds_meta = dataset_meta else: raise RuntimeError("Multiple plugins loaded datasets") return return_ds_meta def pre_model_load(self, cfg: DictDefault): """Calls the pre_model_load method of all registered plugins. Args: cfg: The configuration for the plugins. """ for plugin in self.plugins.values(): plugin.pre_model_load(cfg) def post_model_build(self, cfg: DictDefault, model: PreTrainedModel): """Calls the `post_model_build` method of all registered plugins after the model has been built / loaded, but before any adapters have been applied. Args: cfg: The configuration for the plugins. model: The loaded model. """ for plugin in self.plugins.values(): plugin.post_model_build(cfg, model) def pre_lora_load(self, cfg: DictDefault, model: PreTrainedModel): """Calls the `pre_lora_load` method of all registered plugins. Args: cfg: The configuration for the plugins. 
model: The loaded model. """ for plugin in self.plugins.values(): plugin.pre_lora_load(cfg, model) def post_lora_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel): """Calls the `post_lora_load` method of all registered plugins. Args: cfg: The configuration for the plugins. model: The loaded model. """ for plugin in self.plugins.values(): plugin.post_lora_load(cfg, model) def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel): """Calls the `post_model_load` method of all registered plugins after the model has been loaded inclusive of any adapters. Args: cfg: The configuration for the plugins. model: The loaded model. """ for plugin in self.plugins.values(): plugin.post_model_load(cfg, model) def get_trainer_cls(self, cfg: DictDefault) -> Trainer | None: """Calls the `get_trainer_cls` method of all registered plugins and returns the first non-`None` trainer class. Args: cfg: The configuration for the plugins. Returns: The first non-`None` trainer class returned by a plugin. """ for plugin in self.plugins.values(): trainer_cls = plugin.get_trainer_cls(cfg) if trainer_cls is not None: return trainer_cls return None def get_training_args(self, cfg): """ Calls the get_training_args method of all registered plugins and returns the combined training arguments. Parameters: cfg (dict): The configuration for the plugins. Returns: object: The training arguments """ training_args_kwargs = {} for plugin in self.plugins.values(): training_args = plugin.get_training_args(cfg) if training_args is not None: training_args_kwargs.update(training_args) return training_args_kwargs def get_collator_cls_and_kwargs(self, cfg, is_eval=False): """ Calls the get_collator_cls_and_kwargs method of all registered plugins and returns the first non-None collator class. Parameters: cfg (dict): The configuration for the plugins. is_eval (bool): Whether this is an eval split. Returns: object: The collator class, or None if none was found. 
""" for plugin in self.plugins.values(): collator = plugin.get_collator_cls_and_kwargs(cfg, is_eval=is_eval) if collator is not None: collator_cls, collator_kwargs = collator return collator_cls, collator_kwargs return None def post_trainer_create(self, cfg: DictDefault, trainer: Trainer): """Calls the `post_trainer_create` method of all registered plugins. Args: cfg: The configuration for the plugins. trainer: The trainer object for training. """ for plugin in self.plugins.values(): plugin.post_trainer_create(cfg, trainer) def create_optimizer(self, trainer: Trainer) -> Optimizer | None: """Calls the `create_optimizer` method of all registered plugins and returns the first non-`None` optimizer. Args: trainer: The trainer object for training. Returns: The created optimizer, or `None` if none was found. """ for plugin in self.plugins.values(): optimizer = plugin.create_optimizer(self.cfg, trainer) if optimizer is not None: return optimizer return None def create_lr_scheduler( self, trainer: Trainer, optimizer: Optimizer, num_training_steps: int ) -> LRScheduler | None: """Calls the `create_lr_scheduler` method of all registered plugins and returns the first non-`None` scheduler. Args: trainer: The trainer object for training. optimizer: The optimizer for training. Returns: The created learning rate scheduler, or `None` if not found. """ for plugin in self.plugins.values(): scheduler: LRScheduler | None = plugin.create_lr_scheduler( self.cfg, trainer=trainer, optimizer=optimizer, num_training_steps=num_training_steps, ) if scheduler is not None: return scheduler return None def add_callbacks_pre_trainer( self, cfg: DictDefault, model: PreTrainedModel ) -> list[Callable]: """Calls the add_callbacks_pre_trainer method of all registered plugins. Args: cfg: The configuration for the plugins. model: The loaded model. Returns: A list of callback functions to be added to the `TrainingArgs`. 
""" callbacks = [] for plugin in self.plugins.values(): plugin_callbacks = plugin.add_callbacks_pre_trainer(cfg, model) if plugin_callbacks: # if the plugin returned a list of callbacks callbacks.extend(plugin_callbacks) return callbacks def add_callbacks_post_trainer( self, cfg: DictDefault, trainer: Trainer ) -> list[Callable]: """Calls the `add_callbacks_post_trainer` method of all registered plugins. Args: cfg: The configuration for the plugins. trainer: The trainer object for training. Returns: A list of callback functions to be added to the `TrainingArgs`. """ callbacks = [] for plugin in self.plugins.values(): plugin_callbacks = plugin.add_callbacks_post_trainer(cfg, trainer) if plugin_callbacks: callbacks.extend(plugin_callbacks) return callbacks def post_train(self, cfg: DictDefault, model: PreTrainedModel | PeftModel): """Calls the post_train method of all registered plugins. Args: cfg: The configuration for the plugins. model: The loaded model. """ for plugin in self.plugins.values(): plugin.post_train(cfg, model) def post_train_unload(self, cfg: DictDefault): """Calls the post_train_unload method of all registered plugins. Args: cfg: The configuration for the plugins. """ for plugin in self.plugins.values(): plugin.post_train_unload(cfg) class BaseOptimizerFactory: """Base class for factories to create custom optimizers""" def __call__( self, opt_model, training_args, **optimizer_kwargs ) -> Optimizer | None: pass # duplicated from transformers def get_decay_parameter_names(self, model) -> list[str]: """ Get all parameter names that weight decay will be applied to. This function filters out parameters in two ways: 1. By layer type (instances of layers specified in ALL_LAYERNORM_LAYERS) 2. 
By parameter name patterns (containing 'bias', or variation of 'norm')
        """
        forbidden_name_patterns = [
            r"bias",
            r"layernorm",
            r"rmsnorm",
            r"(?:^|\.)norm(?:$|\.)",
            r"_norm(?:$|\.)",
        ]
        decay_parameters = get_parameter_names(
            model, [nn.LayerNorm], forbidden_name_patterns
        )
        return decay_parameters


================================================
FILE: src/axolotl/integrations/config.py
================================================
# Copyright 2024 Axolotl AI. All rights reserved.
#
# This software may be used and distributed according to
# the terms of the Axolotl Community License Agreement (the "License");
# you may not use this file except in compliance with the License.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

"""
Module to handle merging the plugins' input arguments with the base configurations.

This was moved here to prevent circular imports.
"""

from typing import Any, Dict, List, Type

from axolotl.utils.schemas.config import (
    AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase,
    AxolotlInputConfig as AxolotlInputConfigBase,
)


def merge_input_args():
    """
    Merges input arguments from registered plugins with the base configurations.

    Each entry returned by `PluginManager.get_input_args()` is treated as a dotted
    "module.ClassName" path. When at least one plugin contributes an entry, this
    function generates source code that imports every such class and defines two
    new classes, AxolotlConfigWCapabilities and AxolotlInputConfig, which inherit
    from the corresponding base configuration followed by all plugin classes.

    Returns:
        tuple: `(AxolotlConfigWCapabilities, AxolotlInputConfig)`. These are the
        newly created classes when plugins contributed input arguments, otherwise
        the unmodified base classes.
    """
    # Imported here rather than at module level (the module docstring notes this
    # code was moved to prevent circular imports).
    from axolotl.integrations.base import PluginManager

    plugin_manager = PluginManager.get_instance()
    input_args: List[str] = plugin_manager.get_input_args()
    plugin_classes = []
    dynamic_input = ""
    for plugin_args in input_args:
        # "pkg.mod.Cls" -> ("pkg.mod", "Cls")
        plugin_module, plugin_cls = plugin_args.rsplit(".", 1)
        dynamic_input += f"from {plugin_module} import {plugin_cls}\n"
        plugin_classes.append(plugin_cls)
    if dynamic_input:
        dynamic_input += f"class AxolotlConfigWCapabilities(AxolotlConfigWCapabilitiesBase, {', '.join(plugin_classes)}):\n pass\n"
        dynamic_input += f"class AxolotlInputConfig(AxolotlInputConfigBase, {', '.join(plugin_classes)}):\n pass\n"

        # The generated code resolves the *Base names through this module's
        # globals; everything it defines lands in `namespace`.
        # NOTE(review): the executed text is built from plugin-provided strings,
        # so it is only as trustworthy as the registered plugins.
        namespace: Dict[Any, Any] = {}
        exec(dynamic_input, globals(), namespace)  # nosec B102
        AxolotlInputConfig = namespace["AxolotlInputConfig"]
        AxolotlConfigWCapabilities = namespace["AxolotlConfigWCapabilities"]
        return AxolotlConfigWCapabilities, AxolotlInputConfig
    return AxolotlConfigWCapabilitiesBase, AxolotlInputConfigBase


def merge_training_args() -> Type:
    """
    Merges training arguments from registered plugins with the base TrainingArguments.

    Each entry returned by `PluginManager.get_training_args_mixin()` is treated as
    a dotted "module.ClassName" path. When at least one plugin contributes an
    entry, this function generates source code that imports every such class and
    defines a new AxolotlTrainingMixins class inheriting from the base mixins
    followed by all plugin mixins.

    Returns:
        The newly created AxolotlTrainingMixins class when plugins contributed
        mixins, otherwise the unmodified base AxolotlTrainingMixins class.
    """
    from axolotl.core.training_args_base import (
        AxolotlTrainingMixins as AxolotlTrainingMixinsBase,
    )
    from axolotl.integrations.base import PluginManager

    plugin_manager = PluginManager.get_instance()
    training_args_mixins: List[str] = plugin_manager.get_training_args_mixin()
    mixin_classes = []
    dynamic_input = ""
    for plugin_args in training_args_mixins:
        # "pkg.mod.Cls" -> ("pkg.mod", "Cls")
        plugin_module, plugin_cls = plugin_args.rsplit(".", 1)
        dynamic_input += f"from {plugin_module} import {plugin_cls}\n"
        mixin_classes.append(plugin_cls)
    if dynamic_input:
        dynamic_input += f"class AxolotlTrainingMixins(AxolotlTrainingMixinsBase, {', '.join(mixin_classes)}):\n pass\n"

        namespace: Dict[Any, Any] = {}
        # The base class is imported locally above, so it is not in globals();
        # it is added to the globals handed to the generated code explicitly.
        local_vars = {"AxolotlTrainingMixinsBase": AxolotlTrainingMixinsBase}
        exec(dynamic_input, {**globals(), **local_vars}, namespace)  # nosec B102
        AxolotlTrainingMixins = namespace["AxolotlTrainingMixins"]
        return AxolotlTrainingMixins
    return AxolotlTrainingMixinsBase


================================================
FILE: src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.md
================================================
Acknowledgements
Portions of this Cut Cross Entropy Software may utilize the following copyrighted
material, the use of which is hereby acknowledged.

------

PyTorch

From PyTorch:

Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)

From Caffe2:

Copyright (c) 2016-present, Facebook Inc. All rights reserved.
All contributions by Facebook: Copyright (c) 2016 Facebook Inc. All contributions by Google: Copyright (c) 2015 Google Inc. All rights reserved. All contributions by Yangqing Jia: Copyright (c) 2015 Yangqing Jia All rights reserved. All contributions by Kakao Brain: Copyright 2019-2020 Kakao Brain All contributions by Cruise LLC: Copyright (c) 2022 Cruise LLC. All rights reserved. All contributions by Arm: Copyright (c) 2021, 2023-2024 Arm Limited and/or its affiliates All contributions from Caffe: Copyright(c) 2013, 2014, 2015, the respective contributors All rights reserved. All other contributions: Copyright(c) 2015, 2016 the respective contributors All rights reserved. Caffe2 uses a copyright model similar to Caffe: each contributor holds copyright over their contributions to Caffe2. The project versioning records all such contribution and copyright details. If a contributor wants to further mark their specific copyright on a particular contribution, they should indicate their copyright solely in the commit message of the change when it is committed. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America and IDIAP Research Institute nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Triton /* * Copyright 2018-2020 Philippe Tillet * Copyright 2020-2022 OpenAI * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files * (the "Software"), to deal in the Software without restriction, * including without limitation the rights to use, copy, modify, merge, * publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, * subject to the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ Transformers Copyright 2018- The Hugging Face team. All rights reserved. 
Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. 
For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: src/axolotl/integrations/cut_cross_entropy/LICENSE ================================================ Copyright (C) 2024 Apple Inc. All Rights Reserved. IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") in consideration of your agreement to the following terms, and your use, installation, modification or redistribution of this Apple software constitutes acceptance of these terms. If you do not agree with these terms, please do not use, install, modify or redistribute this Apple software. 
In consideration of your agreement to abide by the following terms, and subject to these terms, Apple grants you a personal, non-exclusive license, under Apple's copyrights in this original Apple software (the "Apple Software"), to use, reproduce, modify and redistribute the Apple Software, with or without modifications, in source and/or binary forms; provided that if you redistribute the Apple Software in its entirety and without modifications, you must retain this notice and the following text and disclaimers in all such redistributions of the Apple Software. Neither the name, trademarks, service marks or logos of Apple Inc. may be used to endorse or promote products derived from the Apple Software without specific prior written permission from Apple. Except as expressly stated in this notice, no other rights or licenses, express or implied, are granted by Apple herein, including but not limited to any patent rights that may be infringed by your derivative works or by other works in which the Apple Software may be incorporated. The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS. IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE), STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
------------------------------------------------------------------------------- SOFTWARE DISTRIBUTED WITH CUT CROSS ENTROPY: The Cut Cross Entropy software includes a number of subcomponents with separate copyright notices and license terms - please see the file ACKNOWLEDGEMENTS.md. ------------------------------------------------------------------------------- ================================================ FILE: src/axolotl/integrations/cut_cross_entropy/README.md ================================================ # Cut Cross Entropy Cut Cross Entropy (CCE) reduces VRAM usage through optimization on the cross-entropy operation during loss calculation. See https://github.com/apple/ml-cross-entropy ## Requirements - PyTorch 2.4.0 or higher ## Installation Run the following command to install `cut_cross_entropy[transformers]` if you don't have it already. - If you are in dev environment ```bash python scripts/cutcrossentropy_install.py | sh ``` - If you are installing from pip ```bash pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@63b15e6" ``` ## Usage ```yaml plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin ``` ## Supported Models - afmoe - apertus - arcee - cohere - cohere2 - deepseek_v3 - exaone4 - gemma - gemma2 - gemma3 - gemma3_text - gemma3n - gemma3n_text - glm - glm4 - glm4_moe - glm4_moe_lite - glm46v - glm4v - glm4v_moe - glm_image - glm_moe_dsa - gpt_oss - granite - granitemoe - granitemoehybrid - granitemoeshared - hunyuan_v1_dense - hunyuan_v1_moe - internvl - kimi_linear - lfm2 - lfm2_moe - lfm2_vl - llama - llama4 - llama4_text - llava - ministral - ministral3 - mistral - mistral3 - mistral4 - mixtral - mllama - nemotron_h - olmo - olmo2 - olmo3 - olmoe - phi - phi3 - phi4_multimodal - qwen2 - qwen2_5_vl - qwen2_moe - qwen2_vl - qwen3 - qwen3_5 - qwen3_5_text - qwen3_5_moe - qwen3_5_moe_text - qwen3_moe - qwen3_next - qwen3_vl - 
class CutCrossEntropyPlugin(BasePlugin):
    """
    Plugin for Cut Cross Entropy integration with Axolotl.

    When `cfg.cut_cross_entropy` is truthy, `pre_model_load` validates the
    environment, registers a generic patch for model types that the
    `cut_cross_entropy` package has no dedicated patch for, and then calls
    `cce_patch` for the configured model type.
    """

    def get_input_args(self):
        """Return the dotted path of the pydantic model holding this plugin's args."""
        return "axolotl.integrations.cut_cross_entropy.CutCrossEntropyArgs"

    def _check_requirements(self):
        """
        Check if all requirements are met.

        Raises:
            ImportError: If PyTorch is older than 2.4, if `cut_cross_entropy` or
                its `transformers` subpackage cannot be found, or if the
                installed package does not expose a truthy `AXOLOTL_CCE_FORK`.
        """
        # Import the submodule explicitly: a bare `import importlib` does not
        # guarantee that `importlib.util` has been loaded as an attribute.
        from importlib.util import find_spec

        # Check PyTorch version
        major, minor, _ = get_pytorch_version()
        if (major, minor) < (2, 4):
            raise ImportError(
                "Cut Cross Entropy requires PyTorch >= 2.4.0. "
                f"Current version: {torch.__version__}"
            )

        # Check if cut_cross_entropy is installed
        if find_spec("cut_cross_entropy") is None:
            raise ImportError(_CCE_INSTALL_MESSAGE)

        if find_spec("cut_cross_entropy.transformers") is None:
            raise ImportError(
                "Transformers support is not installed. " + _CCE_INSTALL_MESSAGE
            )

        # Check if Axolotl's cce fork is installed
        try:
            from cut_cross_entropy.transformers.patch import AXOLOTL_CCE_FORK

            if not AXOLOTL_CCE_FORK:
                # Funnel a falsy marker into the same error path as a missing one.
                raise ImportError
        except ImportError as e:
            raise ImportError(
                "Axolotl's fork of cut_cross_entropy is not installed. "
                + _CCE_INSTALL_MESSAGE
            ) from e

    def pre_model_load(self, cfg):
        """Apply cut cross entropy before model loading if enabled."""
        if not cfg.cut_cross_entropy:
            return

        self._check_requirements()
        self.patch_llama_like(cfg.model_config_type)

        from cut_cross_entropy.transformers.patch import cce_patch

        LOG.info(f"Applying Cut Cross Entropy to model type: {cfg.model_config_type}")

        # The patch checks model_type internally
        cce_patch(
            cfg.model_config_type,
            remote_model_id=cfg.base_model if cfg.trust_remote_code else None,
        )

    def patch_llama_like(
        self,
        model_type_to_patch: str,
    ) -> None:
        """
        Generic patch for model architectures with causal lm similar to llama.

        If `PATCH_FNS` has no entry for `model_type_to_patch`, register one that
        swaps the `forward` of the model's `...ForCausalLM` class for the llama
        `cce_forward`. Existing entries are left untouched.
        """
        from cut_cross_entropy.transformers.patch import PATCH_FNS

        def patch_generic(
            maybe_model,
            patch_options,
            remote_model_id: str | None,
            model_type: str,
        ):
            from importlib import import_module

            import cut_cross_entropy.transformers.llama
            from cut_cross_entropy.transformers.llama import cce_forward

            try:
                # Dynamically import the module and CausalLM class
                module_path = f"transformers.models.{model_type}.modeling_{model_type}"
                model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type)
                module = import_module(module_path)
                model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM")

                cut_cross_entropy.transformers.llama._PATCH_OPTS = patch_options

                model_cls.forward = cce_forward

            except (ImportError, AttributeError) as e:
                raise RuntimeError(
                    f"Could not import ForCausalLM class for model_type: {model_type}. "
                    f"Error: {str(e)}"
                ) from e

        if model_type_to_patch not in PATCH_FNS:
            LOG.warning_once(
                "Setting up generic cce patch for model type: %s", model_type_to_patch
            )
            LOG.warning_once(
                f"Generic Cut Cross Entropy + {model_type_to_patch} support is experimental and may not work as expected."
            )
            PATCH_FNS[model_type_to_patch] = partial(
                patch_generic, model_type=model_type_to_patch
            )
""" cut_cross_entropy: Optional[bool] = True @model_validator(mode="before") @classmethod def check_dtype_is_half(cls, data): if data.get("cut_cross_entropy") and not (data.get("bf16") or data.get("fp16")): raise ValueError( "Cut Cross Entropy requires fp16/bf16 training for backward pass. " "Please set `bf16` or `fp16` to `True`." ) return data @model_validator(mode="before") @classmethod def check_chunked_cross_entropy_not_set(cls, data): if data.get("chunked_cross_entropy"): raise ValueError( "Cut Cross Entropy does not support chunked cross entropy. " "Please set `chunked_cross_entropy` to `False` or disable Cut Cross Entropy." ) return data ================================================ FILE: src/axolotl/integrations/densemixer/README.md ================================================ # DenseMixer See [DenseMixer](https://github.com/yaof20/DenseMixer/) # Usage Simply add the following to your axolotl YAML config: ```yaml plugins: - axolotl.integrations.densemixer.DenseMixerPlugin ``` ================================================ FILE: src/axolotl/integrations/densemixer/__init__.py ================================================ """Integration entry point for the DenseMixer plugin.""" from .plugin import DenseMixerPlugin __all__ = ["DenseMixerPlugin"] ================================================ FILE: src/axolotl/integrations/densemixer/args.py ================================================ """Pydantic models for DenseMixer plugin""" from pydantic import BaseModel class DenseMixerArgs(BaseModel): """ Args for DenseMixer """ dense_mixer: bool = True ================================================ FILE: src/axolotl/integrations/densemixer/plugin.py ================================================ """DenseMixer plugin for Axolotl""" import importlib from axolotl.integrations.base import BasePlugin from axolotl.utils.logging import get_logger LOG = get_logger(__name__) class DenseMixerPlugin(BasePlugin): """ Plugin for DenseMixer """ def 
def pre_model_load(self, cfg):
    """
    Apply densemixer patches before model loading if enabled.

    Does nothing when `cfg.dense_mixer` is falsy. Otherwise applies the
    DenseMixer patch matching `cfg.model_config_type` ("olmoe", "qwen2_moe"
    or "qwen3_moe"); other model types are left unpatched with a warning.

    Raises:
        RuntimeError: If the `densemixer` package cannot be found.
    """
    if not cfg.dense_mixer:
        return

    # Import the submodule explicitly: a bare `import importlib` does not
    # guarantee that `importlib.util` has been loaded as an attribute.
    from importlib.util import find_spec

    if find_spec("densemixer") is None:
        raise RuntimeError(
            "DenseMixer is not installed. Install it with `pip install densemixer`"
        )

    from densemixer.patching import (
        apply_olmoe_patch,
        apply_qwen2_moe_patch,
        apply_qwen3_moe_patch,
    )

    patchers = {
        "olmoe": apply_olmoe_patch,
        "qwen2_moe": apply_qwen2_moe_patch,
        "qwen3_moe": apply_qwen3_moe_patch,
    }
    apply_patch = patchers.get(cfg.model_config_type)
    if apply_patch is None:
        # Previously this case logged "Applying ..." although nothing was patched.
        LOG.warning(
            f"DenseMixer has no patch for model type: {cfg.model_config_type}; "
            "no patch applied"
        )
        return

    LOG.info(f"Applying DenseMixer patches for model type: {cfg.model_config_type}")
    apply_patch()
Add the following to your Axolotl config: ```yaml # Enable diffusion LM training plugin plugins: - axolotl.integrations.diffusion.DiffusionPlugin ``` Then configure the nested `diffusion` block (defaults shown): ```yaml diffusion: noise_schedule: linear # or "cosine" min_mask_ratio: 0.1 max_mask_ratio: 0.9 num_diffusion_steps: 128 eps: 1e-3 importance_weighting: true # Mask token (training auto-adds if missing, avoid pad/eos) mask_token_str: "<|diffusion_mask|>" # Or use an existing special token id (e.g., 128002 for Llama-3.x) # mask_token_id: 128002 # Sample generation during training (optional) generate_samples: true generation_interval: 100 num_generation_samples: 3 generation_steps: 128 generation_temperature: 0.0 generation_max_length: 100 ``` ## Supported Models Any model that supports 4D attention masks should work out of the box. If not, please create an [issue](https://github.com/axolotl-ai-cloud/axolotl/issues) or open a [PR](https://github.com/axolotl-ai-cloud/axolotl/compare)! ## How It Works ### Random Masking During training, tokens are randomly masked: - Sample timestep `t` uniformly from [0, 1] - Calculate masking probability: `p = (1 - eps) * t + eps` - Randomly mask tokens with probability `p` ### Diffusion Loss Loss is computed only on masked tokens with (optional) importance weighting: ```python loss = sum(cross_entropy(pred, target) / p_mask) / total_tokens ``` ## Sample Generation When `diffusion.generate_samples: true`, the plugin generates samples during training: ``` Sample 1: Original (45 tokens): The quick brown fox jumps over the lazy dog... Masked (18/45 tokens, 40.0%): The [MASK] [MASK] fox [MASK] over [MASK] lazy [MASK]... Generated: The quick brown fox jumps over the lazy dog... ``` Samples are logged to the console and to wandb (if enabled). ## Inference Diffusion inference is integrated into the standard Axolotl CLI. 
Use the same config you trained with and run: ``` axolotl inference path/to/your-config.yaml ``` Optionally, pass `--gradio` to use a simple web interface. Interactive controls (prefix the prompt with commands): - `:complete N` → completion mode with N new masked tokens appended (default 64) - `:mask R` → random masking mode with target mask ratio R in [0.0, 1.0] Example session: ``` ================================================================================ Commands: :complete N -> completion mode with N tokens (default 64) :mask R -> random masking with ratio R (0.0–1.0) ================================================================================ Give me an instruction (Ctrl + D to submit): :mask 0.4 The quick brown fox jumps over the lazy dog Masked (40.0%): The [MASK] brown [MASK] jumps over the [MASK] dog Generated: The quick brown fox jumps over the loud dog ``` ## Metrics and Monitoring The plugin adds (or modifies) several metrics to track diffusion training: - `train/loss`: Weighted diffusion loss - `train/accuracy`: Accuracy on masked tokens - `train/mask_ratio`: Average fraction of tokens masked - `train/num_masked_tokens`: Number of tokens masked - `train/avg_p_mask`: Average masking probability - `train/ce_loss`: Unweighted cross-entropy loss - `train/importance_weight_avg`: Average importance weight ## Limitations - No flash attention support - No RL training support ## References - [LLaDA Paper](https://arxiv.org/abs/2404.10406) - [Axolotl Documentation](https://docs.axolotl.ai/) - [API reference for plugin](https://docs.axolotl.ai/docs/api/integrations.diffusion.args.html#axolotl.integrations.diffusion.args) ================================================ FILE: src/axolotl/integrations/diffusion/__init__.py ================================================ """Diffusion LM training plugin init.""" from .args import DiffusionArgs, DiffusionConfig from .callbacks import DiffusionGenerationCallback from .generation import generate from .plugin 
class DiffusionConfig(BaseModel):
    """Nested diffusion configuration available under the `diffusion` key.

    Fields are grouped into noise-schedule settings, training settings
    (importance weighting and mask-token selection) and settings for sample
    generation during training. Numeric bounds are enforced through the
    `ge`/`le` constraints on each field, and an after-validator rejects
    configurations where `min_mask_ratio` exceeds `max_mask_ratio`.
    """

    # Noise schedule config
    noise_schedule: Literal["linear", "cosine"] = Field(
        default="linear", description="Type of noise schedule for diffusion training"
    )
    # Both ratios are constrained to [0, 1]; their ordering is checked in
    # `_validate_mask_ratios` below.
    min_mask_ratio: float = Field(
        default=0.1,
        ge=0.0,
        le=1.0,
        description="Minimum masking ratio for diffusion noise schedule",
    )
    max_mask_ratio: float = Field(
        default=0.9,
        ge=0.0,
        le=1.0,
        description="Maximum masking ratio for diffusion noise schedule",
    )
    num_diffusion_steps: int = Field(
        default=128, ge=1, description="Number of diffusion timesteps"
    )
    eps: float = Field(
        default=1e-3,
        ge=0.0,
        le=1.0,
        description="Epsilon value for minimum masking probability in forward process",
    )

    # Training config
    importance_weighting: bool = Field(
        default=True,
        description="Apply importance weighting to loss based on masking probability",
    )
    # Mask token selection: an explicit id and/or a token string; both are
    # unset (None) by default.
    mask_token_id: int | None = Field(
        default=None,
        description=(
            "Token ID to use for masking. Unset by default; can use one of the "
            "tokenizer's special tokens here."
        ),
    )
    mask_token_str: str | None = Field(
        default=None,
        description=(
            "Token string to use as a mask. If `mask_token_id` is invalid or unset, "
            "this token will be ensured to exist as an additional special token and "
            "used. If absent, a default '<|diffusion_mask|>' will be added."
        ),
    )

    # Sample generation config
    generate_samples: bool = Field(
        default=True, description="Enable sample generation during training"
    )
    generation_interval: int = Field(
        default=100, ge=1, description="Generate samples every N steps"
    )
    num_generation_samples: int = Field(
        default=3, ge=1, description="Number of samples to generate each time"
    )
    generation_steps: int = Field(
        default=128, ge=1, description="Number of diffusion steps for generation"
    )
    generation_temperature: float = Field(
        default=0.0,
        ge=0.0,
        description="Temperature for generation sampling (0.0 = deterministic)",
    )
    generation_max_length: int = Field(
        default=100, ge=1, description="Maximum sequence length for generation"
    )

    @model_validator(mode="after")
    def _validate_mask_ratios(self) -> "DiffusionConfig":
        """Reject configurations whose minimum mask ratio exceeds the maximum.

        Raises:
            ValueError: If `min_mask_ratio` is greater than `max_mask_ratio`.
        """
        if self.min_mask_ratio > self.max_mask_ratio:
            raise ValueError("min_mask_ratio must be ≤ max_mask_ratio")
        return self
class DiffusionGenerationCallback(TrainerCallback):
    """Trainer callback that periodically generates and logs diffusion samples."""

    def __init__(self, trainer):
        self.trainer = trainer

    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Generate and log samples every `generation_interval` steps on rank zero."""
        current_step = state.global_step
        if (
            current_step <= 0
            or current_step % self.trainer.cfg.diffusion.generation_interval != 0
        ):
            return
        if not self.trainer.state.is_world_process_zero:
            return

        settings = self.trainer.cfg.diffusion
        samples = generate_samples(
            model=self.trainer.model,
            tokenizer=self.trainer.processing_class,
            dataloader=self._select_dataloader(),
            num_generation_samples=settings.num_generation_samples,
            max_length=settings.generation_max_length,
            num_diffusion_steps=settings.generation_steps,
            temperature=settings.generation_temperature,
            mask_token_id=settings.mask_token_id,
        )
        self._log_samples(samples, current_step)

    def _select_dataloader(self):
        """Return the eval dataloader when one can be built, else the train one."""
        loader = None
        try:
            if getattr(self.trainer, "eval_dataset", None) is not None:
                loader = self.trainer.get_eval_dataloader()
        except Exception:
            # Best effort: any failure here falls back to the train dataloader.
            loader = None
        if loader is None:
            loader = self.trainer.get_train_dataloader()
        return loader

    def _render_generated(self, sample_data):
        """
        Build the color-coded generated text for one sample.

        Returns None when the sample lacks list-typed `generated_ids` /
        `orig_ids`, so the caller can fall back to the plain generated string.
        Masked positions are green when the generated token equals the original
        and red when it differs; unmasked tokens equal to the original are
        dimmed; everything else is printed unstyled.
        """
        gen_ids = sample_data.get("generated_ids")
        orig_ids = sample_data.get("orig_ids")
        masked_positions = set(sample_data.get("masked_positions") or [])
        if not (isinstance(gen_ids, list) and isinstance(orig_ids, list)):
            return None

        def classify(pos, token_id):
            has_original = pos < len(orig_ids)
            unchanged = has_original and token_id == orig_ids[pos]
            if pos in masked_positions:
                if unchanged:
                    return "green"
                return "red" if has_original else "normal"
            return "dim" if unchanged else "normal"

        styles = [classify(pos, token_id) for pos, token_id in enumerate(gen_ids)]

        # Collapse consecutive tokens sharing a style into (style, start, end) runs.
        runs: list[tuple[str, int, int]] = []
        run_start = 0
        for pos in range(1, len(styles) + 1):
            if pos == len(styles) or styles[pos] != styles[run_start]:
                runs.append((styles[run_start], run_start, pos))
                run_start = pos

        prefixes = {"green": Fore.GREEN, "red": Fore.RED, "dim": Style.DIM}
        pieces = []
        for style_name, lo, hi in runs:
            chunk_text = self.trainer.processing_class.decode(
                gen_ids[lo:hi], skip_special_tokens=False
            )
            prefix = prefixes.get(style_name)
            if prefix is None:
                pieces.append(chunk_text)
            else:
                pieces.append(prefix + chunk_text + Style.RESET_ALL)
        return "".join(pieces)

    def _log_samples(self, samples: list, step: int):
        """Log generated samples to the console and, if enabled, to wandb."""
        if not samples:
            return

        banner = "=" * 60
        logger.info(banner)
        logger.info("GENERATED SAMPLES")
        logger.info(banner)

        for sample_no, sample_data in enumerate(samples, 1):
            original = sample_data["original"]
            masked = sample_data["masked"]
            generated = sample_data["generated"]
            mask_ratio = sample_data["mask_ratio"]
            masked_tokens = sample_data["masked_tokens"]
            total_tokens = sample_data["total_tokens"]

            logger.info(f"\nSample {sample_no}:")
            logger.info(f"\tOriginal ({total_tokens} tokens): {original}")
            logger.info(
                f"\tMasked ({masked_tokens}/{total_tokens} tokens, "
                f"{mask_ratio:.1%}): {masked}"
            )

            try:
                rendered = self._render_generated(sample_data)
                if rendered is None:
                    logger.info(f"\tGenerated: {generated}")
                else:
                    logger.info("\tGenerated:\n%s", rendered)
            except Exception:
                # Rendering is cosmetic; fall back to the plain generated text.
                logger.info(f"\tGenerated: {generated}")

        logger.info(banner)

        if self.trainer.cfg.use_wandb and wandb.run is not None:
            rows = [
                [
                    step,
                    sample["original"],
                    sample["masked"],
                    sample["generated"],
                    f"{sample['mask_ratio']:.1%}",
                    sample["masked_tokens"],
                    sample["total_tokens"],
                ]
                for sample in samples
            ]
            table = wandb.Table(
                columns=[
                    "step",
                    "original",
                    "masked",
                    "generated",
                    "mask_ratio",
                    "masked_tokens",
                    "total_tokens",
                ],
                data=rows,
            )
            wandb.log({"generated_samples": table}, step=step)
Args: model: The wrapped or unwrapped model tokenizer: Tokenizer for encoding/decoding dataloader: Validation dataloader (for sampling sequences) num_generation_samples: Number of samples to generate max_length: Maximum length of sequences to use num_diffusion_steps: Number of diffusion steps for generation temperature: Temperature for sampling (0.0 = deterministic) mask_token_id: Token ID used for masking Returns: List of dictionaries with original text, masked text, and generated text """ if dataloader is None: LOG.warning("No validation dataloader provided, cannot generate samples") return [] unwrapped_model = model.module if hasattr(model, "module") else model training = unwrapped_model.training unwrapped_model.eval() # Resolve device robustly (some modules don't expose `.device`) device = getattr(unwrapped_model, "device", None) if device is None: try: device = next(unwrapped_model.parameters()).device except StopIteration: device = torch.device("cpu") generations = [] # Sample sequences from validation dataset sampled_sequences = _sample_sequences_from_dataloader( dataloader, num_generation_samples, max_length, device ) LOG.info(f"Sampled {len(sampled_sequences)} sequences from validation dataset") # Generate samples using reverse diffusion process with torch.no_grad(): for sample in sampled_sequences: if isinstance(sample, dict): original_sequence = sample.get("input_ids") labels_seq = sample.get("labels") attn_seq = sample.get("attention_mask") else: original_sequence = sample labels_seq = None attn_seq = None generation_result = generate( unwrapped_model, tokenizer, original_sequence, num_diffusion_steps, temperature, mask_token_id, mode=mode, completion_tokens=completion_tokens, target_mask_ratio=target_mask_ratio, labels=labels_seq, attention_mask=attn_seq, ) generations.append(generation_result) # Restore prior training state if training: unwrapped_model.train() else: unwrapped_model.eval() return generations def _sample_sequences_from_dataloader( 
dataloader: Any, num_samples: int, max_length: int, device: torch.device ) -> List[Any]: """Sample sequences from validation dataloader.""" sampled_sequences: list[dict[str, torch.Tensor] | torch.Tensor] = [] sample_count = 0 # Skip a random number of batches (we could be more clever about this) skip_batches = torch.randint(0, 10, (1,)).item() batch_count = 0 for batch in dataloader: # Skip some batches for variety if batch_count < skip_batches: batch_count += 1 continue if sample_count >= num_samples: break batch_count += 1 input_ids = batch["input_ids"] attention_mask = batch.get("attention_mask") labels = batch.get("labels") # Randomly sample from sequences in this batch batch_indices = torch.randperm(input_ids.size(0)).tolist() for i in batch_indices: if sample_count >= num_samples: break # Get actual sequence length (non-padded) if attention_mask is not None: seq_len = attention_mask[i].sum().item() else: seq_len = input_ids.size(1) if seq_len < 10: continue # Determine truncation length max_total = min(seq_len, max_length) if labels is not None: labels_i = labels[i][:seq_len] answer_mask = labels_i != -100 if not answer_mask.any(): # No answer tokens; skip for SFT masking continue first_ans_idx = int( torch.nonzero(answer_mask, as_tuple=False)[0].item() ) prompt_len = first_ans_idx if prompt_len >= max_total: # Prompt alone reaches cap; cannot include any answer continue remaining_answer = int(answer_mask[prompt_len:].sum().item()) allowed_answer = max_total - prompt_len take_answer = min(remaining_answer, allowed_answer) if take_answer <= 0: continue actual_length = prompt_len + take_answer else: actual_length = max_total # Extract the (possibly truncated) sequence sequence = input_ids[i][:actual_length].unsqueeze(0).to(device) attn_seq = ( attention_mask[i][:actual_length].unsqueeze(0).to(device) if attention_mask is not None else None ) if labels is not None: labels_seq = labels[i][:actual_length].unsqueeze(0).to(device) sampled_sequences.append( { 
"input_ids": sequence, "labels": labels_seq, "attention_mask": attn_seq, } ) else: if attn_seq is not None: sampled_sequences.append( {"input_ids": sequence, "attention_mask": attn_seq} ) else: sampled_sequences.append(sequence) sample_count += 1 return sampled_sequences def generate( model: torch.nn.Module, tokenizer: Any, original_sequence: torch.Tensor, num_diffusion_steps: int, temperature: float, mask_token_id: int, *, mode: Literal["random", "completion"] = "random", completion_tokens: int = 0, target_mask_ratio: Optional[float] = None, labels: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, ) -> dict: """Generate a single sample using reverse diffusion.""" # Get original text for comparison original_text = tokenizer.decode( original_sequence[0].cpu(), skip_special_tokens=True ) # Build masked sequence if ( labels is not None and labels.numel() > 0 and (labels == -100).any() and (labels != -100).any() ): # SFT case: completely mask all answer tokens (labels != -100) total_tokens = original_sequence.size(1) masked_indices = (labels != -100).to(dtype=torch.bool) masked_sequence = original_sequence.clone() masked_sequence[masked_indices] = mask_token_id masked_tokens = int(masked_indices.sum().item()) mask_ratio = masked_tokens / max(int(total_tokens), 1) elif mode == "completion" and completion_tokens > 0: # Append mask tokens to the right for completion total_tokens = original_sequence.size(1) + int(completion_tokens) masked_indices = torch.zeros( 1, total_tokens, dtype=torch.bool, device=original_sequence.device ) masked_indices[0, -int(completion_tokens) :] = True append = torch.full( (1, int(completion_tokens)), mask_token_id, device=original_sequence.device ) masked_sequence = torch.cat([original_sequence, append], dim=1) masked_tokens = int(completion_tokens) mask_ratio = masked_tokens / total_tokens else: # Apply random masking with optional fixed ratio total_tokens = original_sequence.size(1) if target_mask_ratio is None: 
min_ratio, max_ratio = 0.1, 0.7 target_mask_ratio = ( torch.rand(1).item() * (max_ratio - min_ratio) + min_ratio ) target_masked_tokens = max(1, int(total_tokens * float(target_mask_ratio))) # Create random mask indices mask_positions = torch.randperm(total_tokens)[:target_masked_tokens] masked_indices = torch.zeros( 1, total_tokens, dtype=torch.bool, device=original_sequence.device ) masked_indices[0, mask_positions] = True # Create masked sequence masked_sequence = original_sequence.clone() masked_sequence[masked_indices] = mask_token_id # Calculate actual mask ratio masked_tokens = masked_indices.sum().item() mask_ratio = masked_tokens / total_tokens # Get masked text for comparison masked_text = tokenizer.decode(masked_sequence[0].cpu(), skip_special_tokens=False) masked_text = _clean_masked_text(masked_text, tokenizer, mask_token_id) # Run reverse diffusion process sequence = masked_sequence.clone() attention_mask = create_bidirectional_attention_mask( sequence, attention_mask, sample_packing=attention_mask is not None ) for step in range(num_diffusion_steps): sequence = _diffusion_step( model, sequence, step, num_diffusion_steps, temperature, mask_token_id, attention_mask, ) generated_text = tokenizer.decode(sequence[0].cpu(), skip_special_tokens=True) # Collect diagnostic info final_ids = sequence[0].detach().cpu().tolist() orig_ids_for_render = original_sequence[0].detach().cpu().tolist() if masked_indices is not None: masked_positions = ( torch.where(masked_indices[0])[0].detach().cpu().tolist() if masked_indices.ndim == 2 else [] ) else: masked_positions = [] result = { "original": original_text, "masked": masked_text, "generated": generated_text, "mask_ratio": mask_ratio, "masked_tokens": masked_tokens, "total_tokens": total_tokens, "generated_ids": final_ids, "masked_positions": masked_positions, "orig_ids": orig_ids_for_render, "formatted": ( f"Original: '{original_text}' → Masked: '{masked_text}' " f"({mask_ratio:.1%}) → Generated: '{generated_text}'" 
), } return result def _clean_masked_text(masked_text: str, tokenizer: Any, mask_token_id: int) -> str: """Clean up masked text for display.""" mask_token_repr = tokenizer.decode([mask_token_id], skip_special_tokens=False) cleaned = masked_text.replace(mask_token_repr, "[MASK]") # Remove literal special token strings if hasattr(tokenizer, "special_tokens_map"): for token_value in tokenizer.special_tokens_map.values(): if token_value and isinstance(token_value, str): cleaned = cleaned.replace(token_value, "") # Normalize whitespace but preserve newlines cleaned = cleaned.replace("\r\n", "\n").replace("\r", "\n") cleaned = re.sub(r"[ \t]+", " ", cleaned) cleaned = "\n".join(line.rstrip() for line in cleaned.split("\n")).strip() return cleaned def _diffusion_step( model: torch.nn.Module, sequence: torch.Tensor, step: int, num_diffusion_steps: int, temperature: float, mask_token_id: int, attention_mask: torch.Tensor | None = None, ) -> torch.Tensor: """Perform a single diffusion step with remasking.""" # Only process if there are masked tokens remaining current_mask = sequence == mask_token_id if not current_mask.any(): return sequence # Create or use provided attention mask if attention_mask is None: batch_size, seq_len = sequence.shape attention_mask = torch.ones( batch_size, 1, seq_len, seq_len, dtype=torch.bool, device=sequence.device ) # Forward pass outputs = model(input_ids=sequence, attention_mask=attention_mask) logits = shift_logits_to_input_positions(outputs.logits) # Only sample at currently masked positions if current_mask.any(): masked_logits = logits[current_mask] # Apply temperature scaling if temperature > 0: scaled_logits = masked_logits / temperature else: scaled_logits = masked_logits # Suppress mask token in outputs scaled_logits[:, mask_token_id] = -float("inf") if temperature > 0: # Add Gumbel noise for sampling gumbel_noise = -torch.log( -torch.log(torch.rand_like(scaled_logits, dtype=torch.float32)) ) gumbel_logits = scaled_logits + 
class DiffusionPlugin(BasePlugin):
    """
    Axolotl plugin that switches training over to the diffusion LM trainer.

    It exposes the `DiffusionArgs` config model, supplies `DiffusionTrainer` as
    the trainer class, and hands the resolved config to the trainer once it has
    been created.
    """

    def __init__(self):
        super().__init__()
        # Filled in by `post_model_load`.
        self.cfg = None

    def get_input_args(self) -> str:
        """Return the dotted path of the pydantic args model for this plugin."""
        return "axolotl.integrations.diffusion.DiffusionArgs"

    def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
        """Remember the config once the model has been loaded."""
        self.cfg = cfg

    def get_trainer_cls(self, cfg: DictDefault) -> type[DiffusionTrainer] | None:
        """Return the trainer class used for diffusion training."""
        return DiffusionTrainer

    def post_trainer_create(self, cfg: DictDefault, trainer: DiffusionTrainer):
        """Pass the config on to the freshly created trainer."""
        trainer.set_config(cfg)
def _forward_process(
    self,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor | None = None,
    labels: torch.Tensor | None = None,
    eps: float = 1e-3,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Apply the forward (noising) step of the diffusion process.

    One timestep is drawn per sample and turned into a per-token masking
    probability; tokens selected by that probability are replaced with the
    configured mask token. Special tokens, positions where the attention mask
    is zero, and (when labels are given) positions labelled -100 are never
    selected.

    Args:
        input_ids: Input token ids [batch_size, seq_len].
        attention_mask: Attention mask [batch_size, seq_len].
        labels: Labels for SFT training [batch_size, seq_len].
        eps: Lower bound of the per-sample masking probability.

    Returns:
        noisy_batch: Copy of ``input_ids`` with the selected tokens masked.
        masked_indices: Boolean tensor marking the masked positions.
        p_mask: Per-token masking probability [batch_size, seq_len]; zero
            where the attention mask is zero.
    """
    num_rows, num_cols = input_ids.shape
    device = input_ids.device

    # One uniform draw per sample, mapped linearly onto [eps, 1).
    timesteps = torch.rand(num_rows, device=device)
    p_mask = ((1 - eps) * timesteps + eps).unsqueeze(1).repeat(1, num_cols)
    if attention_mask is not None:
        p_mask = p_mask * attention_mask.bool().float()

    # Positions holding any cached special token id are protected.
    is_special = torch.zeros_like(input_ids, dtype=torch.bool)
    for special_id in self._special_token_ids or ():
        is_special |= input_ids == special_id

    # Second (and last) random draw: the per-token Bernoulli selection.
    draws = torch.rand((num_rows, num_cols), device=device)
    masked_indices = (draws < p_mask) & ~is_special
    if attention_mask is not None:
        masked_indices = masked_indices & attention_mask.bool()
    if labels is not None:
        # SFT data: only positions with a real label may be masked.
        masked_indices = masked_indices & (labels != -100)

    fill = torch.full_like(input_ids, int(self.cfg.diffusion.mask_token_id))
    noisy_batch = torch.where(masked_indices, fill, input_ids)
    return noisy_batch, masked_indices, p_mask
attention_mask: Attention mask [batch_size, seq_len]. labels: Labels for SFT training [batch_size, seq_len]. Returns: loss: Cross-entropy loss. metrics: Dictionary of metrics. """ # Short-circuit empty sequences if input_ids is None or input_ids.numel() == 0 or input_ids.shape[1] == 0: zero = torch.tensor( 0.0, device=(input_ids.device if input_ids is not None else None), requires_grad=True, ) return zero, {} # If an attention_mask is provided and all positions are padding for every # sample in this batch, skip the step. if attention_mask is not None: if attention_mask.dim() == 2 and (attention_mask.sum(dim=1) == 0).all(): zero = torch.tensor(0.0, device=input_ids.device, requires_grad=True) return zero, {} # Apply forward process noisy_batch, masked_indices, p_mask = self._forward_process( input_ids, attention_mask, labels, self.cfg.diffusion.eps ) # Create bidirectional attention mask bidirectional_mask = create_bidirectional_attention_mask( input_ids, attention_mask, sample_packing=self.cfg.sample_packing ) # Forward pass outputs = model( input_ids=noisy_batch.long(), attention_mask=bidirectional_mask, ) logits = shift_logits_to_input_positions(outputs.logits) if masked_indices.sum() > 0: valid_indices = torch.where(masked_indices) batch_indices, seq_indices = valid_indices masked_logits = logits[batch_indices, seq_indices] masked_targets = input_ids[batch_indices, seq_indices] masked_p_mask = p_mask[batch_indices, seq_indices] # Compute cross-entropy loss without reduction token_loss = F.cross_entropy( masked_logits.float(), masked_targets, reduction="none" ) if self.cfg.diffusion.importance_weighting: masked_p_mask = masked_p_mask.float() weighted_loss = token_loss / masked_p_mask else: weighted_loss = token_loss if labels is not None: # For SFT data: normalize by answer token count per sample answer_mask = labels != -100 answer_lengths = answer_mask.sum(dim=1).float() # [batch_size] # Get batch indices for masked tokens masked_batch_indices = batch_indices # 
Sum losses per sample and divide by answer length batch_size = input_ids.shape[0] loss_per_sample = torch.zeros(batch_size, device=input_ids.device) for i in range(batch_size): sample_mask = masked_batch_indices == i if sample_mask.sum() > 0: sample_loss = weighted_loss[sample_mask].sum() denom = answer_lengths[i].clamp(min=1.0) loss_per_sample[i] = sample_loss / denom loss = loss_per_sample.mean() else: # Non-SFT: when importance weighting is enabled, use unbiased estimator # (sum(loss/p) / total_tokens). Otherwise, average over masked tokens # for stable scaling across varying mask ratios. if self.cfg.diffusion.importance_weighting: loss = weighted_loss.sum() / ( input_ids.shape[0] * input_ids.shape[1] ) else: loss = weighted_loss.mean() ce_loss = token_loss.mean() # Compute accuracy on masked tokens with torch.no_grad(): pred_tokens = masked_logits.argmax(dim=-1) accuracy = (pred_tokens == masked_targets).float().mean() else: loss = torch.tensor(0.0, device=input_ids.device, requires_grad=True) accuracy = torch.tensor(0.0, device=input_ids.device) ce_loss = torch.tensor(0.0, device=input_ids.device) masked_p_mask = torch.tensor(1.0, device=input_ids.device) avg_p_mask = ( p_mask[masked_indices].mean().item() if masked_indices.any() else 0.0 ) metrics = { "loss": loss.item(), "accuracy": accuracy.item(), "mask_ratio": masked_indices.float().mean().item(), "num_masked_tokens": (masked_indices.sum().item(), "sum"), "avg_p_mask": avg_p_mask, "ce_loss": ce_loss.item(), } # If doing SFT training, log answer-specific metrics if self.cfg.datasets is not None: with torch.no_grad(): answer_mask = labels != -100 answer_lengths = answer_mask.sum(dim=1).float() # type: ignore total_answer_tokens = answer_mask.sum().item() # type: ignore total_tokens = labels.numel() # type: ignore metrics["answer_ratio"] = total_answer_tokens / max(total_tokens, 1) metrics["avg_answer_length"] = answer_lengths.mean().item() if self.cfg.diffusion.importance_weighting: 
def resolve_mask_token_id(
    tokenizer: Any,
    cfg: DictDefault,
    *,
    allow_add: bool,
    model: Any | None = None,
    default_token: str = "<|diffusion_mask|>",
) -> int:
    """Resolve mask token id. Training may add a new special token; inference won't.

    Resolution order:
      1. An explicit non-negative integer id from the config, if it is inside
         the tokenizer's id range.
      2. An existing special token matching the configured mask token string,
         then ``default_token``.
      3. When ``allow_add`` is true, a newly added special token (the model's
         embeddings are resized when ``model`` supports it).
      4. The tokenizer's ``unk_token_id``, or 0.

    Steps 2 and 3 write the resolved id back to the config (best effort);
    steps 1 and 4 leave the config untouched.

    Args:
        tokenizer: Tokenizer-like object, or None.
        cfg: Config; ``cfg.diffusion.mask_token_id`` / ``mask_token_str`` are
            read, with top-level ``diffusion_mask_token_id`` /
            ``diffusion_mask_token_str`` as a fallback when ``cfg.diffusion``
            is missing.
        allow_add: Whether a new special token may be added to the tokenizer.
        model: Optional model whose embeddings are resized after adding.
        default_token: Token string used when none is configured.

    Returns:
        The resolved mask token id.
    """
    import logging

    log = logging.getLogger(__name__)

    def _as_int(getter):
        """Return ``int(getter())`` or None; sizes are advisory only."""
        try:
            return int(getter())
        except Exception:  # arbitrary tokenizer-like objects may misbehave
            return None

    # Upper bound for valid ids. ``vocab_size`` alone is not enough: for
    # Hugging Face tokenizers it excludes added tokens while ``len(tokenizer)``
    # includes them, so bounding by ``vocab_size`` rejected perfectly valid
    # added/special token ids. Use the larger of whatever is available.
    size_candidates = []
    if tokenizer is not None:
        if getattr(tokenizer, "vocab_size", None) is not None:
            size_candidates.append(_as_int(lambda: tokenizer.vocab_size))
        if hasattr(tokenizer, "__len__"):
            size_candidates.append(_as_int(lambda: len(tokenizer)))
    size_candidates = [size for size in size_candidates if size is not None]
    vocab_size = max(size_candidates) if size_candidates else None

    def _in_vocab(token_id: int) -> bool:
        return vocab_size is None or token_id < vocab_size

    diffusion_cfg = getattr(cfg, "diffusion", None)

    def _remember(token_id: int) -> None:
        """Best-effort write-back of the resolved id to the config."""
        try:
            if diffusion_cfg is None:
                cfg.diffusion_mask_token_id = token_id  # legacy fallback
            else:
                diffusion_cfg.mask_token_id = token_id
        except Exception:  # a read-only config must not abort resolution
            log.debug("Could not store mask_token_id=%s on the config", token_id)

    # 1. Explicit id from config. Top-level attr names are only consulted when
    # the nested section is missing (shouldn't happen).
    cfg_id = (
        getattr(diffusion_cfg, "mask_token_id", None)
        if diffusion_cfg is not None
        else getattr(cfg, "diffusion_mask_token_id", None)
    )
    if isinstance(cfg_id, int) and cfg_id >= 0 and _in_vocab(cfg_id):
        return int(cfg_id)

    def _existing_special_token_id(token_str: str | None) -> int | None:
        """Attempt to resolve an existing special token string to a real ID."""
        if not token_str or not hasattr(tokenizer, "convert_tokens_to_ids"):
            return None
        try:
            token_id = tokenizer.convert_tokens_to_ids(token_str)
        except Exception:
            return None
        if not isinstance(token_id, int) or token_id < 0:
            return None

        # Must be registered as special, must not be UNK, must be in range.
        unk_id = getattr(tokenizer, "unk_token_id", None)
        specials = set(getattr(tokenizer, "all_special_tokens", []) or [])
        addl = set(getattr(tokenizer, "additional_special_tokens", []) or [])
        is_special = token_str in specials or token_str in addl
        if unk_id is not None and token_id == unk_id:
            return None
        if not is_special or not _in_vocab(token_id):
            return None
        return token_id

    # 2. Configured mask token string, then the default token string.
    token_str = (
        getattr(diffusion_cfg, "mask_token_str", None)
        if diffusion_cfg is not None
        else getattr(cfg, "diffusion_mask_token_str", None)
    )
    for candidate in (token_str, default_token):
        token_id = _existing_special_token_id(candidate)
        if isinstance(token_id, int):
            _remember(int(token_id))
            return int(token_id)

    # 3. Optionally add a dedicated special token during training.
    if allow_add and hasattr(tokenizer, "add_special_tokens"):
        token_to_add = token_str or default_token
        try:
            tokenizer.add_special_tokens({"additional_special_tokens": [token_to_add]})

            # Resize embeddings if possible so the new id has a row.
            if (
                model is not None
                and hasattr(tokenizer, "__len__")
                and hasattr(model, "resize_token_embeddings")
            ):
                try:
                    model.resize_token_embeddings(len(tokenizer))
                except Exception:
                    log.warning(
                        "Could not resize token embeddings after adding %r",
                        token_to_add,
                        exc_info=True,
                    )

            new_id = tokenizer.convert_tokens_to_ids(token_to_add)
            if isinstance(new_id, int) and new_id >= 0:
                _remember(int(new_id))
                return int(new_id)
        except Exception:
            log.warning(
                "Could not add mask token %r to the tokenizer",
                token_to_add,
                exc_info=True,
            )

    # 4. Fallback to unk or 0 (do not update cfg).
    fallback = getattr(tokenizer, "unk_token_id", 0) or 0
    return int(fallback)
def shift_logits_to_input_positions(logits: torch.Tensor) -> torch.Tensor:
    """Shift next-token logits one step right along dim 1 for diffusion.

    Position 0 keeps its own logits and every later position ``i`` receives
    the logits from position ``i - 1``; the last position's logits are
    dropped. Tensors with at most one position on dim 1 are returned as-is
    (same object).
    """
    seq_len = logits.size(1)
    if seq_len > 1:
        first_position = logits[:, :1]
        all_but_last = logits[:, :-1]
        return torch.cat((first_position, all_but_last), dim=1)
    return logits
all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: src/axolotl/integrations/grokfast/README.md ================================================ # Grokfast Optimizer See https://github.com/ironjr/grokfast ## Usage ```yaml plugins: - axolotl.integrations.grokfast.GrokfastPlugin grokfast_alpha: 0.98 grokfast_lamb: 2.0 ``` ## Citation ```bib @article{lee2024grokfast, title={{Grokfast}: Accelerated Grokking by Amplifying Slow Gradients}, author={Lee, Jaerin and Kang, Bong Gyun and Kim, Kihoon and Lee, Kyoung Mu}, journal={arXiv preprint arXiv:2405.20233}, year={2024} } ``` ================================================ FILE: src/axolotl/integrations/grokfast/__init__.py ================================================ """ Grokfast plugin for Axolotl """ from transformers.trainer_callback import TrainerCallback from axolotl.utils.logging import get_logger from ..base import BasePlugin from .args import GrokfastArgs as GrokfastArgs from .optimizer import gradfilter_ema LOG = get_logger(__name__) class GrokfastCallbackHandler(TrainerCallback): """ Transformer trainer callbacks for Grokfast """ def __init__(self, *args_, alpha=0.98, lamb=2.0, **kwargs): super().__init__(*args_, **kwargs) self.grads = None self.alpha = alpha self.lamb = lamb def on_train_begin(self, *args_, **kwargs): self.grads = None def on_pre_optimizer_step(self, args_, state, control, **kwargs): model = kwargs.pop("model") self.grads = gradfilter_ema(model, self.grads, alpha=self.alpha,
class GrokfastPlugin(BasePlugin):
    """
    Axolotl plugin that attaches the Grokfast callback to the trainer.
    """

    def get_input_args(self):
        """Return the dotted path of the Grokfast config-args model."""
        return "axolotl.integrations.grokfast.GrokfastArgs"

    def add_callbacks_post_trainer(self, cfg, trainer):
        """Build the Grokfast callback from the config.

        Reads ``cfg.grokfast_alpha`` and ``cfg.grokfast_lamb``; ``trainer`` is
        not used. Returns a single-element list holding the callback.
        """
        LOG.info("Adding Grokfast callback to the trainer")
        filter_settings = {"alpha": cfg.grokfast_alpha, "lamb": cfg.grokfast_lamb}
        return [GrokfastCallbackHandler(**filter_settings)]
def gradfilter_ema(
    m: nn.Module,
    grads: Optional[Dict[str, torch.Tensor]] = None,
    alpha: float = 0.98,
    lamb: float = 2.0,
) -> Dict[str, torch.Tensor]:
    """Amplify the slow component of the gradients with an EMA filter.

    For every trainable parameter of ``m`` that currently has a gradient, the
    running average ``grads[name]`` is updated as
    ``alpha * grads[name] + (1 - alpha) * grad`` and ``lamb`` times that
    average is added to the parameter's gradient.

    Args:
        m: Module whose parameter gradients are filtered in place.
        grads: Running averages from the previous call, keyed by parameter
            name; None on the first call. The dict is updated in place.
        alpha: EMA momentum of the running average.
        lamb: Weight of the running average added to each gradient.

    Returns:
        The running averages to pass to the next call.
    """
    if grads is None:
        grads = {}

    for n, p in m.named_parameters():
        if not p.requires_grad or p.grad is None:
            continue

        grad = p.grad.data.detach()
        # A parameter seen for the first time is seeded with its own gradient,
        # exactly like the initial call. Seeding lazily also covers parameters
        # that only receive a gradient on a later step; those previously
        # raised KeyError because they were missing from the state dict.
        previous = grads.get(n)
        if previous is None:
            previous = grad

        grads[n] = previous * alpha + grad * (1 - alpha)
        p.grad.data = p.grad.data + grads[n] * lamb

    return grads
class KDPlugin(BasePlugin):
    """
    Axolotl plugin that wires knowledge-distillation (KD) support into training.
    """

    def get_input_args(self):
        """Return the dotted path of the KD config-args model."""
        return "axolotl.integrations.kd.KDArgs"

    def get_training_args_mixin(self):
        """Return the dotted path of the KD training-args mixin."""
        return "axolotl.integrations.kd.args.KDTrainingArgsMixin"

    def get_trainer_cls(self, cfg):
        """Return the KD trainer class when ``cfg.kd_trainer`` is truthy, else None."""
        if not cfg.kd_trainer:
            return None

        # Imported lazily: only needed when the KD trainer is requested.
        from .trainer import AxolotlKDTrainer

        return AxolotlKDTrainer

    def get_training_args(self, cfg):
        """Return the KD settings copied from ``cfg``, keyed by their own names."""
        forwarded = (
            "kd_ce_alpha",
            "kd_alpha",
            "kd_temperature",
            "kd_beta",
            "kd_normalize_topk",
        )
        return {name: getattr(cfg, name) for name in forwarded}

    def get_collator_cls_and_kwargs(self, cfg, is_eval=False):
        """Pick the data collator class and its kwargs for KD training.

        Returns ``(None, None)`` when ``cfg.kd_trainer`` is falsy. Otherwise an
        online-teacher collator is used when ``cfg.kd_online_server_base_url``
        is set; failing that, the batch-sampler collator when sample packing
        applies to the current split, and the plain KD collator as default.
        """
        if not cfg.kd_trainer:
            return None, None

        from .collator import DataCollatorForKD, KDBatchSamplerDataCollatorForSeq2Seq

        if cfg.kd_online_server_base_url:
            from .collator_online_teacher import OnlineTeacherCollator

            online_keys = (
                "kd_online_server_base_url",
                "kd_online_topk",
                "kd_temperature",
                "kd_online_server",
                "kd_online_timeout",
                "kd_normalize_topk",
            )
            return OnlineTeacherCollator, {key: getattr(cfg, key) for key in online_keys}

        # ``is_eval is False`` is an identity check on purpose: only the
        # literal False counts as the training split here.
        packed_train = is_eval is False and cfg.sample_packing
        packed_eval = cfg.eval_sample_packing and is_eval
        if packed_train or packed_eval:
            return KDBatchSamplerDataCollatorForSeq2Seq, {}
        return DataCollatorForKD, {}

    def pre_model_load(self, cfg):
        """Call ``apply_kernel`` with ``cfg.model_config_type`` before the model loads."""
        from .kernels.models import apply_kernel

        apply_kernel(cfg.model_config_type)

    def add_callbacks_post_trainer(self, cfg: Any, trainer: Trainer) -> list:
        """
        Add the KD temperature scheduler callback when it is configured.

        The callback is created only when ``cfg.kd_temperature_min`` is set and
        ``cfg.kd_online_server_base_url`` is truthy.

        Args:
            cfg (Any): Configuration object; ``kd_temperature``,
                ``kd_temperature_min`` and ``kd_online_server_base_url`` are read.
            trainer (Trainer): Huggingface Trainer instance handed to the callback.

        Returns:
            list: The configured callback in a one-element list, or an empty list.
        """
        schedule_enabled = (
            cfg.kd_temperature_min is not None and cfg.kd_online_server_base_url
        )
        if not schedule_enabled:
            return []
        return [
            KDTemperatureSchedulerCallback(
                cfg.kd_temperature,
                cfg.kd_temperature_min,
                trainer,
            )
        ]
""" kd_trainer: float | None = None # whether to use KD trainer kd_ce_alpha: float | None = ( None # loss coefficient for cross-entropy loss during KD ) kd_alpha: float | None = None # loss coefficient for KD loss kd_temperature: float | None = None # temperature for sampling during KD kd_beta: float | None = 0.0 # beta coefficient for ratio of fwd and reverse KL kd_normalize_topk: bool | None = ( None # whether to normalize student logits during KD ) # TODO online kd kd_online_server_base_url: str | None = None kd_online_topk: int | None = None kd_online_server: InferenceServerType | None = Field( default_factory=lambda: InferenceServerType.vllm ) kd_online_timeout: int | None = 120 kd_temperature_min: float | None = ( None # kd temperature scheduling during online kd ) @dataclass class KDTrainingArgsMixin: """ Additional args for KD training. """ kd_ce_alpha: float | None = ( None # loss coefficient for cross-entropy loss during KD ) kd_alpha: float | None = None # loss coefficient for KD loss kd_temperature: float | None = None # temperature for sampling during KD kd_beta: float | None = None # beta coefficient for ratio of fwd and reverse KL kd_normalize_topk: float | None = ( None # whether to normalize student logits during KD ) ================================================ FILE: src/axolotl/integrations/kd/callbacks.py ================================================ """ Transformers trainer callbacks to schedule the KD temperature during training """ import math from transformers.trainer_callback import TrainerCallback class KDTemperatureSchedulerCallback(TrainerCallback): """ KD temperature scheduler callback for the trainer. 
""" def __init__(self, temperature_start, temperature_min, trainer): self.temperature_start = temperature_start self.temperature_min = temperature_min self.temperature = temperature_start self.trainer = trainer def on_step_end(self, args, state, control, **kwargs): # cosine decay temperature over the max steps progress = state.global_step / state.max_steps # Cosine decay factor: 0.5 * (1 + cos(pi * progress)) # This factor goes from 1 (at progress=0) to 0 (at progress=1) decay_factor = 0.5 * (1.0 + math.cos(math.pi * progress)) self.temperature = self.temperature_start - ( (self.temperature_start - self.temperature_min) * (1.0 - decay_factor) ) if hasattr(self.trainer.data_collator, "kd_temperature"): self.trainer.data_collator.kd_temperature = self.temperature ================================================ FILE: src/axolotl/integrations/kd/chat_template.py ================================================ # Copyright 2024 Axolotl AI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
""" Chat template prompt strategy loader with KD support """ import logging from typing import Any, Dict import torch from axolotl.prompt_strategies.chat_template import ChatTemplateStrategy, StrategyLoader LOG = logging.getLogger(__name__) class ChatTemplateStrategyWithKD(ChatTemplateStrategy): """ Handle fields for logprob KD """ def __init__( self, prompter, tokenizer, train_on_inputs, sequence_len, roles_to_train=None, train_on_eos=None, train_on_eot=None, eot_tokens=None, split_thinking: bool | None = False, logprobs_field="logprobs", gen_temperature=1.0, kd_temperature=1.0, ): self.logprobs_field = logprobs_field self.gen_temperature = gen_temperature self.kd_temperature = kd_temperature super().__init__( prompter, tokenizer, train_on_inputs, sequence_len, roles_to_train=roles_to_train, train_on_eos=train_on_eos, train_on_eot=train_on_eot, eot_tokens=eot_tokens, split_thinking=split_thinking, ) @property def supports_batched(self) -> bool: # batching doesn't work well for logprob data return False def transform_logprobs(self, sample): """ Transform logprobs to target format for KD training """ logprobs = sample.pop(self.logprobs_field) target_seq_len = len(logprobs) input_seq_len = len(sample["input_ids"]) input_padding_len = input_seq_len - target_seq_len # get non-zero top-k (prune None logprobs from vllm data step) top_k_vals = [ len(logprobs[i]) for i in range(len(logprobs)) if logprobs[i] is not None and len(logprobs[i]) ] max_top_k = max(set(top_k_vals), key=top_k_vals.count) min_top_k = min(set(top_k_vals), key=top_k_vals.count) top_k = min(max_top_k, min_top_k) if top_k == 0: raise ValueError("No non-zero top-k logprobs found.") target_logprobs = [] target_token_ids = [] target_mask = [] if input_padding_len < 0: # logprobs is longer than target_seq_len, # so we need to slice from the left/beginning of logprobs logprobs = logprobs[:-input_seq_len] input_padding_len = 0 # target_seq_len = input_seq_len # truncate the second dimension of the logprobs to 
top_k logprobs = [row[:top_k] for row in logprobs] # fill with -inf for padding_len tokens for top_k tokens # extend target_logprobs with a padding_len x top_k 2D list filled with -inf # we shift for causal models in the trainer, so start the range from 0 for _ in range(0, input_padding_len): target_logprobs.append([-float("inf")] * top_k) target_token_ids.append(list(range(top_k))) target_mask.append([0] * top_k) for position in range(input_padding_len, input_seq_len): if sample["labels"][position] == -100: target_mask.append([0] * top_k) else: target_mask.append([1] * top_k) for _, token_pos_logprobs in enumerate(logprobs): # Initialize collections for logprobs and token_ids position_logprobs = [] position_token_ids = [] # Process each token probability entry for entry in token_pos_logprobs: # Extract logprob value logprob = entry["logprob"] # Parse token_id from the "token_id:###" format token_id = int(entry["token"].split(":")[1]) # Append to our collections position_logprobs.append(logprob) position_token_ids.append(token_id) # Convert to a tensor for easier manipulation position_logprobs_tensor = torch.tensor( position_logprobs, dtype=torch.float ) # Now we have distribution at T1 in log form, i.e. log p_{T1}(k). 
class ChatTemplateStrategyWithKDv2(ChatTemplateStrategyWithKD):
    """
    Strat for datasets with complete structured KD logprob data

    Each row of the logprobs field is a list of log-probabilities, and the
    matching token ids are read from ``sample["target_token_ids"]``.
    """

    def transform_logprobs(self, sample):
        """
        Transform logprobs to target format for KD training

        Pops ``self.logprobs_field`` from ``sample`` and writes
        ``target_logprobs``, ``target_token_ids`` and ``target_mask`` back,
        one row per input position and ``top_k`` entries per row.

        Raises:
            ValueError: If no position has any logprobs.
        """
        logprobs = sample.pop(self.logprobs_field)
        target_seq_len = len(logprobs)
        input_seq_len = len(sample["input_ids"])
        input_padding_len = input_seq_len - target_seq_len

        # get non-zero top-k (prune None logprobs from vllm data step)
        top_k_vals = [len(row) for row in logprobs if row is not None and len(row)]
        if not top_k_vals:
            # The old `top_k == 0` check could never fire (zero lengths are
            # filtered out above); empty data died inside max() instead.
            raise ValueError("No non-zero top-k logprobs found.")
        max_top_k = max(set(top_k_vals), key=top_k_vals.count)  # most common size
        min_top_k = min(set(top_k_vals), key=top_k_vals.count)  # least common size
        top_k = min(max_top_k, min_top_k)

        source_token_ids = sample["target_token_ids"]
        if input_padding_len < 0:
            # More logprob rows than input tokens. The padding branch below
            # aligns logprobs to the END of the input, so the surplus rows are
            # dropped from the beginning to keep that alignment. The previous
            # slice `[:-input_seq_len]` kept `target_seq_len - input_seq_len`
            # rows, leaving targets and mask with different lengths.
            # NOTE(review): confirm right-alignment is the intended behaviour.
            overflow = -input_padding_len
            logprobs = logprobs[overflow:]
            source_token_ids = source_token_ids[overflow:]
            input_padding_len = 0

        # truncate the second dimension of the logprobs to top_k
        logprobs = [row[:top_k] for row in logprobs]

        # Leading positions without teacher data: -inf logprobs, placeholder
        # ids and a zero mask. The trainer shifts for causal models, so the
        # padding starts at position 0.
        target_logprobs = [[-float("inf")] * top_k for _ in range(input_padding_len)]
        target_token_ids = [list(range(top_k)) for _ in range(input_padding_len)]
        target_mask = [[0] * top_k for _ in range(input_padding_len)]

        # Remaining positions are masked out wherever the label is ignored.
        for position in range(input_padding_len, input_seq_len):
            keep = 0 if sample["labels"][position] == -100 else 1
            target_mask.append([keep] * top_k)

        for row_logprobs, row_token_ids in zip(
            logprobs, source_token_ids, strict=False
        ):
            target_logprobs.append(self._rescale_to_kd_temperature(row_logprobs))
            # Keep ids the same width as the truncated logprobs row.
            target_token_ids.append(row_token_ids[:top_k])

        # Update sample with transformed logprobs
        sample["target_logprobs"] = target_logprobs
        sample["target_token_ids"] = target_token_ids
        sample["target_mask"] = target_mask

        return sample

    def _rescale_to_kd_temperature(self, row_logprobs):
        """Re-scale one row of teacher logprobs from T1 (generation) to T2 (KD).

        Uses p_{T2}(k) = [p_{T1}(k)]^(T1 / T2) / Z and returns the resulting
        log-probabilities as a plain list.
        """
        probs_t1 = torch.tensor(row_logprobs, dtype=torch.float).exp()

        # normalize probabilities to sum to 1 in case they aren't already
        total = probs_t1.sum(dim=0, keepdim=True)
        if total > 1e-9:
            probs_t1 = probs_t1 / total

        if self.kd_temperature != self.gen_temperature:
            # Exponentiate by factor (T1 / T2)
            probs_t2 = probs_t1 ** (self.gen_temperature / self.kd_temperature)
        else:
            probs_t2 = probs_t1

        # Re-normalize, then convert back to log space
        probs_t2 = probs_t2 / probs_t2.sum(dim=0, keepdim=True)
        return torch.log(probs_t2).tolist()

    def _prepare_kd_fields(self, tokenized_prompt, original_prompt):
        """
        Add pre-tokenized target_token_ids for v2 format

        Moves ``target_token_ids`` from the original prompt onto the tokenized
        prompt when it is present and not None.
        """
        target_token_ids = original_prompt.pop("target_token_ids", None)
        if target_token_ids is not None:
            tokenized_prompt["target_token_ids"] = target_token_ids
        return tokenized_prompt
""" def _get_strategy_cls(self, cfg): return ChatTemplateStrategyWithKD def _get_strategy_params(self, cfg, ds_cfg: Dict[str, Any]): strategy_params = super()._get_strategy_params(cfg, ds_cfg) if logprobs_field := ds_cfg.get("logprobs_field"): strategy_params["logprobs_field"] = logprobs_field if gen_temperature := ds_cfg.get("temperature"): strategy_params["gen_temperature"] = gen_temperature if kd_temperature := cfg.get("kd_temperature"): strategy_params["kd_temperature"] = kd_temperature return strategy_params class KDStrategyLoaderV2(KDStrategyLoader): """ Load KD chat template datasets with pre-tokenized logprob data """ def _get_strategy_cls(self, cfg): return ChatTemplateStrategyWithKDv2 load_legacy = KDStrategyLoader() load = KDStrategyLoaderV2() ================================================ FILE: src/axolotl/integrations/kd/collator.py ================================================ # Copyright 2024 Axolotl AI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ DataCollator for axolotl to handle KD fields without using -inf for padding, and with a teacher_mask to identify padded positions. 
""" from dataclasses import dataclass from typing import Any, Optional, Union import numpy as np import torch from transformers import PreTrainedTokenizerBase from transformers.utils import PaddingStrategy from axolotl.utils.collators.batching import DataCollatorForSeq2Seq @dataclass class DataCollatorForKD(DataCollatorForSeq2Seq): """ Data collator for KD, including handling KD-specific fields. This version avoids using -inf and instead uses a large negative value for padding target_logprobs. It also creates a teacher_mask to indicate which entries are valid. """ tokenizer: PreTrainedTokenizerBase model: Optional[Any] = None padding: Union[bool, str, PaddingStrategy] = True max_length: Optional[int] = None pad_to_multiple_of: Optional[int] = None label_pad_token_id: int = -100 position_pad_token_id: int = 0 return_tensors: str = "pt" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True def __call__(self, features, return_tensors=None): if return_tensors is None: return_tensors = self.return_tensors padding_side = self.tokenizer.padding_side max_len = 0 # Pad labels and position_ids first for feature_name, pad_token_id in [ ("labels", self.label_pad_token_id), ("position_ids", self.position_pad_token_id), ]: if feature_name in features[0]: feat = [f[feature_name] for f in features] max_len = max(len(x) for x in feat) if self.pad_to_multiple_of is not None: max_len = ( (max_len + self.pad_to_multiple_of - 1) // self.pad_to_multiple_of ) * self.pad_to_multiple_of for f in features: remainder = [pad_token_id] * (max_len - len(f[feature_name])) if isinstance(f[feature_name], list): f[feature_name] = ( f[feature_name] + remainder if padding_side == "right" else remainder + f[feature_name] ) else: # If they are numpy arrays if padding_side == "right": f[feature_name] = np.concatenate( [f[feature_name], remainder] ).astype(np.int64) else: f[feature_name] = np.concatenate( 
[remainder, f[feature_name]] ).astype(np.int64) # Handle target_logprobs and target_token_ids manually target_logprobs_list = [] target_token_ids_list = [] target_mask_list = [] has_teacher_data = ("target_logprobs" in features[0]) and ( "target_token_ids" in features[0] ) if has_teacher_data: # Extract and remove from features for f in features: target_logprobs_list.append(f.pop("target_logprobs")) target_token_ids_list.append(f.pop("target_token_ids")) target_mask_list.append(f.pop("target_mask")) # Determine max lengths max_teacher_seq_len = max_len or max( len(seq) for seq in target_logprobs_list ) max_k = max(len(seq_k) for seq in target_logprobs_list for seq_k in seq) padded_target_logprobs = [] padded_target_token_ids = [] padded_teacher_mask_list = [] for t_logprobs, t_ids, t_mask in zip( target_logprobs_list, target_token_ids_list, target_mask_list, strict=False, ): t_logprobs_padded = [] t_ids_padded = [] t_mask_padded = [] for lp, ids, mask in zip(t_logprobs, t_ids, t_mask, strict=False): lp_len = len(lp) if lp_len < max_k: # Use -1e9 for padding logprobs and 0 for token_ids pad_len = max_k - lp_len lp = lp + [-1e9] * pad_len ids = ids + [0] * pad_len mask = mask + [0] * pad_len else: lp = lp[:max_k] ids = ids[:max_k] mask = mask[:max_k] t_logprobs_padded.append(lp) t_ids_padded.append(ids) t_mask_padded.append(mask) seq_len_diff = max_teacher_seq_len - len(t_logprobs_padded) if seq_len_diff > 0: # Pad sequences fully if needed t_logprobs_padded.extend( [[-1e9] * max_k for _ in range(seq_len_diff)] ) t_ids_padded.extend([[0] * max_k for _ in range(seq_len_diff)]) t_mask_padded.extend([[0] * max_k for _ in range(seq_len_diff)]) padded_target_logprobs.append(t_logprobs_padded) padded_target_token_ids.append(t_ids_padded) padded_teacher_mask_list.append(t_mask_padded) # Convert to tensors padded_target_logprobs = torch.tensor( padded_target_logprobs, dtype=torch.float ) padded_target_token_ids = torch.tensor( padded_target_token_ids, dtype=torch.long ) 
padded_teacher_mask_list = torch.tensor( padded_teacher_mask_list, dtype=torch.int ) # Pad using tokenizer for regular fields features = self.tokenizer.pad( features, padding=self.padding, max_length=self.max_length, pad_to_multiple_of=self.pad_to_multiple_of, return_tensors=return_tensors, ) # Add back teacher data if present if has_teacher_data: features["target_logprobs"] = padded_target_logprobs features["target_token_ids"] = padded_target_token_ids features["target_mask"] = padded_teacher_mask_list # Prepare decoder_input_ids if the model supports it if ( "labels" in features and self.model is not None and hasattr(self.model, "prepare_decoder_input_ids_from_labels") ): decoder_input_ids = self.model.prepare_decoder_input_ids_from_labels( labels=features["labels"] ) features["decoder_input_ids"] = decoder_input_ids return features class KDBatchSamplerDataCollatorForSeq2Seq(DataCollatorForKD): """ Collator for multipack (batch of sub-batches) specifically for KD. Adapts DataCollatorForKD so it can pack multiple sequences in a single batch item. """ def __call__(self, features, return_tensors=None): """ Expects that `features` could be either: - a single list of dicts, OR - a list of lists of dicts (the "sub-batches" to be packed). """ # 1) If we are *not* dealing with multiple sequences per batch element, # just pass straight to parent. if not isinstance(features[0], list): return super().__call__(features, return_tensors=return_tensors) # 2) Otherwise, we *are* dealing with multiple sequences in each batch item. # We want to produce a single "merged" feature dict for each sub-batch. out_features = [{} for _ in features] for i, sub_features in enumerate(features): # sub_features is a list of dicts, each dict = one sequence’s features # We'll merge them into out_features[i]. # # NOTE: You can customize how you combine fields as needed (e.g. summation # or offset for attention_mask). Below is a straightforward concatenation/extension. 
for field_name in sub_features[0].keys(): # Some fields you might want to skip or treat specially: if field_name == "length": continue # If it’s a KD field that’s a list-of-lists (e.g. target_logprobs), # you typically just want to flatten them by extending. if field_name in ["target_logprobs", "target_token_ids", "target_mask"]: combined = [] for feat in sub_features: combined.extend(feat[field_name]) out_features[i][field_name] = combined elif field_name == "attention_mask": # Here we apply the (j+1) factor to differentiate each sub-sample # within this merged batch item. arrays = [] for j, feat in enumerate(sub_features): if field_name in feat: arrays.append((j + 1) * np.array(feat[field_name])) out_features[i][field_name] = np.concatenate(arrays) else: # By default, just concatenate them if they are arrays # or extend them if they are lists. # For example, input_ids or labels are often arrays. arrays = [] for feat in sub_features: if field_name in feat and isinstance( feat[field_name], (list, torch.Tensor) ): if isinstance(feat[field_name][0], (dict, str)): continue arr = np.array(feat[field_name]) arrays.append(arr) if arrays: out_features[i][field_name] = np.concatenate(arrays) # 3) Now call the parent collator, which will do: # - padding of labels/position_ids # - KD-specific padding for target_logprobs, target_token_ids, etc. # - final conversion to return_tensors return super().__call__(out_features, return_tensors=return_tensors) ================================================ FILE: src/axolotl/integrations/kd/collator_online_teacher.py ================================================ """ Packed data loader for online teacher training supporting vllm and sglang. 
""" import hashlib import hmac import logging from typing import Any, Dict, List, Optional import requests import torch from orjson import orjson from axolotl.integrations.kd.collator import KDBatchSamplerDataCollatorForSeq2Seq from axolotl.integrations.kd.utils import normalize_logprobs from axolotl.utils.data.utils import retry_on_request_exceptions LOG = logging.getLogger(__name__) def hmac_sha_from_int_list(int_list, key, hash_func=hashlib.sha256): """ Create HMAC-SHA hash from a list of integers Args: int_list: List of integers key: Secret key (string or bytes) hash_func: Hash function (default: sha256) Returns: HMAC digest as hex string """ # Convert key to bytes if it's a string if isinstance(key, str): key = key.encode("utf-8") # Convert list of ints to bytes # Method 1: Convert each int to bytes and concatenate data = b"".join(i.to_bytes(4, byteorder="big") for i in int_list) # Create HMAC h = hmac.new(key, data, hash_func) return h.hexdigest() class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq): """ Collator for online teacher training. 
""" DEFAULT_LABEL_PAD_TOKEN_ID: int = -100 def __init__( self, *args: Any, kd_online_server_base_url: Optional[str] = None, kd_online_topk: Optional[int] = None, kd_temperature: Optional[float] = 1.0, kd_online_server: Optional[str] = "vllm", kd_online_timeout: Optional[int] = 120, kd_cache_dir: Optional[str] = None, kd_normalize_topk: Optional[bool] = True, **kwargs: Any, ): super().__init__(*args, **kwargs) if kd_online_server_base_url is None: raise ValueError( "kd_online_server_base_url must be provided for OnlineTeacherDataloader" ) if kd_online_topk is None or kd_online_topk <= 0: raise ValueError( "kd_online_topk must be a positive integer for OnlineTeacherDataloader" ) self.kd_online_server_base_url = kd_online_server_base_url.rstrip("/") self.kd_online_topk = kd_online_topk self.kd_temperature = kd_temperature self.kd_online_server = kd_online_server self.http_session = requests.Session() self.kd_online_timeout = kd_online_timeout self.kd_cache_dir = kd_cache_dir self.kd_normalize_topk = kd_normalize_topk def _normalize_logprobs(self, raw_logprobs: List[float]) -> List[float]: """ Re-normalizes top-k raw logprobs as probabilities, and converts back to logprobs. """ if not raw_logprobs or self.kd_online_topk == 0: return ( [-float("inf")] * self.kd_online_topk if self.kd_online_topk > 0 else [] ) raw_logprobs_tensor = torch.tensor(raw_logprobs, dtype=torch.float32) return normalize_logprobs(raw_logprobs_tensor, self.kd_online_topk).tolist() @retry_on_request_exceptions(max_retries=10, delay=5) def fetch_online_logprobs_sglang( self, batch_input_ids: List[List[int]], labels: List[List[int]] ): """ Fetches logprobs from an online teacher served by sglang for a batch of input_ids. Assumes API returns token IDs as strings in logprob dictionary keys. 
""" api_endpoint = f"{self.kd_online_server_base_url}/generate" payload = { "input_ids": batch_input_ids, "return_logprob": True, "top_logprobs_num": self.kd_online_topk, "logprob_start_len": 0, "return_text_in_logprobs": True, "echo": True, "sampling_params": { "max_new_tokens": 0, "temperature": self.kd_temperature, "skip_special_tokens": False, }, } # Initialize with empty lists, so if API call fails, these are returned. ret_data_target_token_ids: List[List[List[int]]] = [] ret_data_target_logprobs: List[List[List[float]]] = [] ret_data_target_mask: List[List[List[int]]] = [] try: response = self.http_session.post( api_endpoint, json=payload, timeout=self.kd_online_timeout ) response.raise_for_status() api_data: list[dict] = response.json() # Ensure api_data is a list, and its length matches batch_input_ids if not isinstance(api_data, list) or len(api_data) != len(batch_input_ids): LOG.error( f"API response format error. Expected a list of {len(batch_input_ids)} " f"items, got {type(api_data)} with length {len(api_data) if isinstance(api_data, list) else 'N/A'}." ) # Return empty data; items processed later will get default empty KD fields return { "target_token_ids": ret_data_target_token_ids, "target_logprobs": ret_data_target_logprobs, "target_mask": ret_data_target_mask, } for sequence_data, seq_input_ids, seq_labels in zip( api_data, batch_input_ids, labels, strict=False ): current_target_logprobs = [] current_target_token_ids = [] current_target_mask = [] meta_info = sequence_data.pop("meta_info", {}) # Ensure input_top_logprobs is a list input_top_logprobs: Optional[list[None | list[tuple]]] = meta_info.pop( "input_top_logprobs", [] ) if not isinstance(input_top_logprobs, list): LOG.warning( f"Received non-list input_top_logprobs: {input_top_logprobs}. Skipping sequence." 
) input_top_logprobs = [] # Treat as empty # basic check that the logprob data len matches the input len, so no need to handle padding assert len(seq_input_ids) == len(input_top_logprobs) for i, _, label in zip( range(len(seq_input_ids)), seq_input_ids, seq_labels, strict=False ): if i < len(input_top_logprobs) and input_top_logprobs[i] is None: # this is always the case for the first token. # there is never logprob data for the first token since that's a true input # so we replace the None value with padding data current_target_logprobs.append( [-float("inf")] * self.kd_online_topk ) current_target_token_ids.append([0] * self.kd_online_topk) current_target_mask.append([0] * self.kd_online_topk) elif ( i < len(input_top_logprobs) and input_top_logprobs[i] is not None ): pos_top_logprobs_data = input_top_logprobs[i] # Ensure pos_top_logprobs_data is a list of lists as expected if not ( isinstance(pos_top_logprobs_data, list) and all( isinstance(item, list) for item in pos_top_logprobs_data ) and len(pos_top_logprobs_data) > 0 and len(pos_top_logprobs_data[0]) == 3 ): # [logprob, token_id, token_str] LOG.warning( f"Malformed pos_top_logprobs_data: {pos_top_logprobs_data}. Padding this position." 
) current_target_logprobs.append( [-float("inf")] * self.kd_online_topk ) current_target_token_ids.append([0] * self.kd_online_topk) current_target_mask.append([0] * self.kd_online_topk) continue # pos_top_logprobs: list of logprobs, pos_token_ids: list of token_ids pos_logprobs_raw, pos_token_ids, _ = [ list(row) for row in zip(*pos_top_logprobs_data, strict=False) ] # Ensure correct length (top_k) if len(pos_logprobs_raw) < self.kd_online_topk: pad_len = self.kd_online_topk - len(pos_logprobs_raw) pos_logprobs_raw.extend([-float("inf")] * pad_len) pos_token_ids.extend([0] * pad_len) # Pad with 0 token_id # truncate to top_k in case the response was longer current_target_token_ids.append( pos_token_ids[: self.kd_online_topk] ) if self.kd_normalize_topk: normalized_logprobs_for_position = self._normalize_logprobs( pos_logprobs_raw[: self.kd_online_topk] ) current_target_logprobs.append( normalized_logprobs_for_position ) else: current_target_logprobs.append( pos_logprobs_raw[: self.kd_online_topk] ) # Mask depends on the corresponding label for the student if label == self.DEFAULT_LABEL_PAD_TOKEN_ID: current_target_mask.append([0] * self.kd_online_topk) else: current_target_mask.append([1] * self.kd_online_topk) else: # Pad if no logprobs for this position (either due to length mismatch or None entry) current_target_logprobs.append( [-float("inf")] * self.kd_online_topk ) current_target_token_ids.append([0] * self.kd_online_topk) current_target_mask.append([0] * self.kd_online_topk) ret_data_target_token_ids.append(current_target_token_ids) ret_data_target_logprobs.append(current_target_logprobs) ret_data_target_mask.append(current_target_mask) except requests.exceptions.RequestException as e: LOG.error(f"Error fetching logprobs from online teacher: {e}") raise e # ret_logprobs_data will be returned with empty lists, handled by the caller. 
@retry_on_request_exceptions(max_retries=10, delay=5)
def fetch_online_logprobs_vllm(
    self, batch_input_ids: List[List[int]], labels: List[List[int]]
):
    """
    Fetches logprobs from an online teacher served by vllm for a batch of input_ids.
    Assumes API returns token IDs as strings in logprob dictionary keys.

    Args:
        batch_input_ids: one list of token ids per sequence, sent as the prompt.
        labels: per-sequence student labels; positions whose label equals
            `DEFAULT_LABEL_PAD_TOKEN_ID` are masked out.

    Returns:
        Dict with `target_token_ids`, `target_logprobs` and `target_mask`,
        each holding one `[seq_len][top_k]` nested list per sequence. Positions
        for which the server returned `None` are skipped, so the remaining rows
        move up and filler rows are appended at the end of the sequence. All
        three lists are empty when the response does not contain exactly one
        choice per input sequence.

    Raises:
        requests.exceptions.RequestException: on transport/HTTP errors (logged
            and re-raised).
        Exception: any other error while parsing the response is logged and
            re-raised unchanged.
    """
    api_endpoint = f"{self.kd_online_server_base_url}/v1/completions"
    top_k = self.kd_online_topk

    payload = {
        "prompt": batch_input_ids,
        "echo": True,
        "logprobs": True,
        "prompt_logprobs": top_k,
        "top_logprobs": top_k,
        "max_new_tokens": 0,
        "skip_special_tokens": False,
        "temperature": self.kd_temperature,
        "sampling_params": {
            "max_tokens": 0,
        },
    }

    # Accumulators for the per-sequence results.
    ret_data_target_token_ids: List[List[List[int]]] = []
    ret_data_target_logprobs: List[List[List[float]]] = []
    ret_data_target_mask: List[List[List[int]]] = []

    try:
        headers = {"Accept-Encoding": "deflate, gzip, br, zstd"}
        response = self.http_session.post(
            api_endpoint,
            json=payload,
            headers=headers,
            timeout=self.kd_online_timeout,
        )
        response.raise_for_status()
        api_data: dict = orjson.loads(response.content)
        choices: list[dict] = api_data["choices"]

        # Ensure choices is a list, and its length matches batch_input_ids
        if not isinstance(choices, list) or len(choices) != len(batch_input_ids):
            LOG.error(
                f"API response format error. Expected a list of {len(batch_input_ids)} "
                f"items, got {type(choices)} with length {len(choices) if isinstance(choices, list) else 'N/A'}."
            )
            # Return empty data; items processed later will get default empty KD fields
            return {
                "target_token_ids": ret_data_target_token_ids,
                "target_logprobs": ret_data_target_logprobs,
                "target_mask": ret_data_target_mask,
            }

        for sequence_data, seq_input_ids, seq_labels in zip(
            choices, batch_input_ids, labels, strict=False
        ):
            current_target_logprobs = []
            current_target_token_ids = []
            current_target_mask = []

            # Ensure input_top_logprobs is a list
            input_top_logprobs: Optional[list[None | dict[str, dict]]] = (
                sequence_data.pop("prompt_logprobs", [])
            )

            if not isinstance(input_top_logprobs, list):
                LOG.warning(
                    f"Received non-list input_top_logprobs: {input_top_logprobs}. Skipping sequence."
                )
                input_top_logprobs = []  # Treat as empty

            # basic check that the logprob data len matches the input len, so no need to handle padding
            assert len(seq_input_ids) == len(input_top_logprobs)

            seq_len = len(seq_input_ids)

            for i, label in zip(range(seq_len), seq_labels):
                if i >= len(input_top_logprobs):
                    # No entry at all for this position: pad it in place.
                    current_target_logprobs.append([-float("inf")] * top_k)
                    current_target_token_ids.append(list(range(top_k)))
                    current_target_mask.append([0] * top_k)
                    continue

                pos_top_logprobs_data = input_top_logprobs[i]
                if pos_top_logprobs_data is None:
                    # this is always the case for the first token.
                    # there is never logprob data for the first token since that's a true input.
                    # Nothing is appended here; the missing row is added at the
                    # end of the sequence by the padding loop below.
                    continue

                # Expect a non-empty dict mapping token-id strings to dicts
                if not (
                    isinstance(pos_top_logprobs_data, dict)
                    and all(
                        isinstance(item, dict)
                        for item in pos_top_logprobs_data.values()
                    )
                    and len(pos_top_logprobs_data) > 0
                ):
                    LOG.warning(
                        f"Malformed pos_top_logprobs_data: {pos_top_logprobs_data}. Padding this position."
                    )
                    current_target_logprobs.append([-float("inf")] * top_k)
                    current_target_token_ids.append(list(range(top_k)))
                    current_target_mask.append([0] * top_k)
                    continue

                pos_token_ids = [int(token_id) for token_id in pos_top_logprobs_data]
                pos_logprobs_raw = [
                    float(entry.get("logprob", -float("inf")))
                    for entry in pos_top_logprobs_data.values()
                ]

                # Ensure correct length (top_k)
                num_real = min(len(pos_logprobs_raw), top_k)
                pad_len = top_k - num_real
                if pad_len > 0:
                    LOG.warning(
                        f"Padding position {i} with {pad_len} top-k tokens and logprobs."
                    )
                    pos_logprobs_raw.extend([-float("inf")] * pad_len)
                    pos_token_ids.extend([0] * pad_len)  # Pad with 0 token_id

                # truncate to top_k in case the response was longer
                current_target_token_ids.append(pos_token_ids[:top_k])

                if self.kd_normalize_topk:
                    current_target_logprobs.append(
                        self._normalize_logprobs(pos_logprobs_raw[:top_k])
                    )
                else:
                    current_target_logprobs.append(pos_logprobs_raw[:top_k])

                # Mask depends on the corresponding label for the student.
                # Slots that were padded above carry no teacher data and stay
                # masked, so their -inf logprobs are never treated as valid.
                if label == self.DEFAULT_LABEL_PAD_TOKEN_ID:
                    current_target_mask.append([0] * top_k)
                else:
                    current_target_mask.append([1] * num_real + [0] * pad_len)

            # Bring the sequence back to seq_len rows (covers skipped None rows).
            for _ in range(max(0, seq_len - len(current_target_logprobs))):
                current_target_logprobs.append([-float("inf")] * top_k)
                current_target_token_ids.append(list(range(top_k)))
                current_target_mask.append([0] * top_k)

            ret_data_target_token_ids.append(current_target_token_ids)
            ret_data_target_logprobs.append(current_target_logprobs)
            ret_data_target_mask.append(current_target_mask)

            # TODO save and load targets to disk for caching for next epoch
            # (e.g. keyed by an HMAC over seq_input_ids under self.kd_cache_dir)

    except requests.exceptions.RequestException as e:
        LOG.error(f"Error fetching logprobs from online teacher: {e}")
        raise
    except Exception as e:  # Catch other potential errors during processing
        LOG.error(
            f"Unexpected error processing API response in fetch_online_logprobs: {e}",
            exc_info=True,
        )
        raise

    return {
        "target_token_ids": ret_data_target_token_ids,
        "target_logprobs": ret_data_target_logprobs,
        "target_mask": ret_data_target_mask,
    }
not None: # Ensure input_ids and labels are lists of ints for JSON serialization input_ids_list = ( current_input_ids.tolist() if hasattr(current_input_ids, "tolist") else list(current_input_ids) ) labels_list = ( current_labels.tolist() if hasattr(current_labels, "tolist") else list(current_labels) ) input_ids_for_api_call.append(input_ids_list) labels_for_api_call.append(labels_list) items_for_api_call.append(item_dict) else: # This item will not get teacher logprobs from the API. # Initialize KD fields to empty lists so downstream collators handle them uniformly. item_dict.setdefault("target_token_ids", []) item_dict.setdefault("target_logprobs", []) item_dict.setdefault("target_mask", []) # print(items_for_api_call) if items_for_api_call: # Only call API if there's something to process if self.kd_online_server == "sglang": api_responses_for_sub_batch = self.fetch_online_logprobs_sglang( input_ids_for_api_call, labels_for_api_call ) else: api_responses_for_sub_batch = self.fetch_online_logprobs_vllm( input_ids_for_api_call, labels_for_api_call ) # api_responses_for_sub_batch has keys: "target_token_ids", "target_logprobs", "target_mask" # Each value is a list, corresponding to items_for_api_call for i, item_to_update in enumerate(items_for_api_call): # TODO make sure to figure out which input in sub_batch_features to update the batch in the original `features` object so the super class can handle it properly. 
if api_responses_for_sub_batch and i < len( api_responses_for_sub_batch["target_token_ids"] ): # Check bounds assert len( api_responses_for_sub_batch["target_token_ids"][i] ) == len(item_to_update["input_ids"]) assert len( api_responses_for_sub_batch["target_logprobs"][i] ) == len(item_to_update["input_ids"]) assert len( api_responses_for_sub_batch["target_mask"][i] ) == len(item_to_update["labels"]) item_to_update["target_token_ids"] = ( api_responses_for_sub_batch["target_token_ids"][i] ) item_to_update["target_logprobs"] = api_responses_for_sub_batch[ "target_logprobs" ][i] item_to_update["target_mask"] = api_responses_for_sub_batch[ "target_mask" ][i] else: # API call failed for this item, or response was shorter than expected. # Ensure KD fields are initialized as empty lists. LOG.warning( f" (index {i}), or API response was too short. " f"API response keys: {list(api_responses_for_sub_batch.keys()) if api_responses_for_sub_batch else 'None'}" ) item_to_update.setdefault("target_token_ids", []) item_to_update.setdefault("target_logprobs", []) item_to_update.setdefault("target_mask", []) return super().__call__(features, return_tensors=return_tensors) ================================================ FILE: src/axolotl/integrations/kd/kernels/__init__.py ================================================ """ Liger Chunked loss optimizations module """ from .liger import LigerFusedLinearKLTopKLogprobLoss from .models import apply_kernel __all__ = ["LigerFusedLinearKLTopKLogprobLoss", "apply_kernel"] ================================================ FILE: src/axolotl/integrations/kd/kernels/liger.py ================================================ """ Liger Kernels for Chunked Top-K Log-Prob Distillation """ import torch import torch.nn.functional as F from liger_kernel.chunked_loss.fused_linear_distillation import ( LigerFusedLinearDistillationBase, ) from axolotl.integrations.kd.utils import normalize_logprobs class 
LigerFusedLinearKLTopKLogprobFunction(LigerFusedLinearDistillationBase): """ Chunked kl-div loss for top-k logprobs """ @staticmethod def distillation_loss_fn( student_logits_temp_scaled: torch.Tensor, # [chunk_size, vocab_size], already temp-scaled target_token_ids_chunk: torch.Tensor, # [chunk_size, top_k] target_logprobs_chunk: torch.Tensor, # [chunk_size, top_k], already temp-scaled and normalized logprobs target_mask_chunk: torch.Tensor, # [chunk_size, top_k] beta: float = 0.0, normalize_topk: bool = True, ) -> torch.Tensor: """ Compute Top-K KL divergence loss for a chunk. Args: student_logits_temp_scaled: Student logits, scaled by temperature. Shape: (N, V). target_token_ids_chunk: Top-k teacher token IDs. Shape: (N, K). target_logprobs_chunk: Top-k teacher log probabilities (temp-scaled, normalized). Shape: (N, K). target_mask_chunk: Mask for valid top-k tokens. Shape: (N, K). beta: Controls the type of KL divergence. 0.0 for Forward KL (P_teacher || P_student). 1.0 for Reverse KL (P_student || P_teacher). 0.5 for Symmetric KL (average of Forward and Reverse). normalize_topk: Whether to normalize the log probabilities Returns: Sum of KL divergence losses for the chunk. 
""" topk = target_token_ids_chunk.shape[-1] student_logits_temp_scaled = ( # [chunk_size, vocab_size] student_logits_temp_scaled.float() ) target_logprobs_chunk = target_logprobs_chunk.float() # Gather student logits for the top-k teacher token IDs # target_token_ids_chunk: [chunk_size, top_k] # student_logits_topk_temp_scaled: [chunk_size, top_k] student_logits_topk_temp_scaled = torch.gather( student_logits_temp_scaled, dim=-1, index=target_token_ids_chunk ) # Student log-probabilities for the gathered top-k tokens student_lse = torch.logsumexp( student_logits_temp_scaled, dim=-1, keepdim=True ) # [chunk_size, 1] student_logprobs_topk_temp_scaled = ( student_logits_topk_temp_scaled - student_lse ) # we have the top-k student logprobs, normalize them if normalize_topk: student_logprobs_topk_temp_scaled = normalize_logprobs( student_logprobs_topk_temp_scaled, topk ) valid_mask = target_mask_chunk.to(torch.bool) # [chunk_size, top_k] student_logprobs_topk_valid = student_logprobs_topk_temp_scaled[valid_mask] teacher_logprobs_valid = target_logprobs_chunk[valid_mask] # Teacher probabilities P(y|x_teacher) from logprobs # target_logprobs_valid are already normalized (log(softmax(teacher_logits/T))) teacher_probs_valid = teacher_logprobs_valid.exp() # Student probabilities P_student from log P_student student_probs_topk_valid = student_logprobs_topk_valid.exp() # kd_loss_per_token = torch.zeros_like(target_logprobs_valid) # KL divergence: sum(P_teacher * (log P_teacher - log P_student)) # = sum(P_teacher * log P_teacher) - sum(P_teacher * log P_student) # The distillation loss is often formulated as -sum(P_teacher * log P_student) # or as sum(P_teacher * (log_softmax_teacher - log_softmax_student)) # Here, target_logprobs_valid are log_softmax_teacher. # student_logprobs_topk_valid are log_softmax_student (for the selected K indices). 
if beta == 0.0: # Contribution from Forward KL fwd_kl_per_token = teacher_probs_valid * ( teacher_logprobs_valid - student_logprobs_topk_valid ) kd_loss = fwd_kl_per_token.sum() elif beta == 1.0: # Contribution from Reverse KL rev_kl_per_token = student_probs_topk_valid * ( student_logprobs_topk_valid - teacher_logprobs_valid ) kd_loss = rev_kl_per_token.sum() else: # JSD - Jensen-Shannon Divergence / Symmetric mean_probs = ( 1 - beta ) * student_probs_topk_valid + beta * teacher_probs_valid log_mean_probs = mean_probs.log() student_kl = F.kl_div( log_mean_probs, student_logprobs_topk_valid, reduction="sum", log_target=True, ) teacher_kl = F.kl_div( log_mean_probs, teacher_logprobs_valid, reduction="sum", log_target=True ) jsd_loss = beta * teacher_kl + (1 - beta) * student_kl kd_loss = jsd_loss return kd_loss @staticmethod def _compute_loss_kl_topk( student_input_chunk: torch.Tensor, student_weight: torch.Tensor, # Args for student_bias, target_token_ids_chunk etc. are passed to the lambda wrapped by grad_and_value # or through `partial`. Let's make them explicit here for clarity. 
target_token_ids_chunk: torch.Tensor, target_logprobs_chunk: torch.Tensor, target_mask_chunk: torch.Tensor, target_chunk: torch.Tensor, # For hard loss (true labels) student_bias: torch.Tensor = None, # This will be one of the grad targets # Other params passed via `partial` from `forward` distillation_loss_fn=None, ignore_index: int = -100, weight_hard_loss: float = 0.5, weight_soft_loss: float = 0.5, compute_ce_loss: bool = True, temperature: float = 1.0, beta: float = 0.0, normalize_topk: bool = True, ): # Compute student logits for the chunk from hidden states and LM head # student_input_chunk: [chunk_size, hidden_dim] # student_lm_head_weight: [vocab_size, hidden_dim] # student_logits_chunk: [chunk_size, vocab_size] student_logits_chunk = F.linear( student_input_chunk, student_weight, student_bias ) ce_loss = torch.tensor( 0.0, device=student_logits_chunk.device, dtype=student_logits_chunk.dtype ) if compute_ce_loss and weight_hard_loss > 0.0: ce_loss = F.cross_entropy( student_logits_chunk.view(-1, student_logits_chunk.shape[-1]), target_chunk.view(-1), reduction="sum", ignore_index=ignore_index, ) soft_loss = torch.tensor( 0.0, device=student_logits_chunk.device, dtype=student_logits_chunk.dtype ) if weight_soft_loss > 0.0: student_logits_chunk_temp_scaled = student_logits_chunk / temperature # Assuming student_weight.shape[0] (vocab_size) is adequate for target_token_ids_chunk.max() # No explicit padding here; user must ensure vocab alignment or pre-pad student_weight. 
soft_loss = distillation_loss_fn( student_logits_chunk_temp_scaled, target_token_ids_chunk, target_logprobs_chunk, target_mask_chunk, beta=beta, normalize_topk=normalize_topk, ) return soft_loss, ce_loss @classmethod def forward( cls, ctx, student_input: torch.Tensor, # [batch_size, seq_len, dim] student_lm_head_weight: torch.Tensor, # [dim, vocab_size] target_token_ids: torch.Tensor, # [batch_size, seq_len, top_k] target_logprobs: torch.Tensor, # [batch_size, seq_len, top_k] target_mask: torch.Tensor, # [batch_size, seq_len, top_k] true_labels: torch.Tensor, # [batch_size, seq_len] student_lm_head_bias: torch.Tensor = None, weight_hard_loss: float = 0.5, weight_soft_loss: float = 0.5, ignore_index: int = -100, temperature: float = 1.0, beta: float = 0.0, compiled: bool = False, chunk_size: int = 1024, compute_ce_loss: bool = True, normalize_topk: bool = True, ): CHUNK_SIZE = chunk_size grad_weight_acc = torch.zeros_like(student_lm_head_weight) grad_inputs_list = [] grad_bias_acc = ( torch.zeros_like(student_lm_head_bias) if student_lm_head_bias is not None else None ) kd_loss_acc = torch.zeros( (), device=student_input.device, dtype=student_input.dtype ) ce_loss_acc = torch.zeros( (), device=student_input.device, dtype=student_input.dtype ) # This function will be what torch.func.grad_and_value differentiates. # It takes student_input_chunk, student_weight (full), student_bias (full) as primals. # Other necessary data (target_*, etc.) are passed as non-differentiable arguments. 
def loss_fn_for_grad( _student_input_chunk, _student_lm_head_weight, # full weight _student_lm_head_bias, # full bias # Fixed arguments for a given chunk, not differentiated: _target_token_ids_chunk, _target_logprobs_chunk, _target_mask_chunk, _true_labels_chunk, ): return cls._compute_loss_kl_topk( student_input_chunk=_student_input_chunk, student_weight=_student_lm_head_weight, target_token_ids_chunk=_target_token_ids_chunk, target_logprobs_chunk=_target_logprobs_chunk, target_mask_chunk=_target_mask_chunk, target_chunk=_true_labels_chunk, student_bias=_student_lm_head_bias, distillation_loss_fn=cls.distillation_loss_fn, ignore_index=ignore_index, weight_hard_loss=weight_hard_loss, weight_soft_loss=weight_soft_loss, compute_ce_loss=compute_ce_loss, temperature=temperature, beta=beta, normalize_topk=normalize_topk, ) def accumulate_chunk_grads( student_input_chunk_ac, target_token_ids_chunk_ac, target_logprobs_chunk_ac, target_mask_chunk_ac, true_labels_chunk_ac, ): # student_weight and student_bias are closed over from the outer scope (full tensors) if student_lm_head_bias is not None: ( (chunk_grad_input, chunk_grad_weight, chunk_grad_bias), (chunk_kd_loss, chunk_ce_loss), ) = torch.func.grad_and_value( loss_fn_for_grad, argnums=(0, 1, 2), has_aux=True )( student_input_chunk_ac, student_lm_head_weight, student_lm_head_bias, # primals target_token_ids_chunk_ac, target_logprobs_chunk_ac, target_mask_chunk_ac, true_labels_chunk_ac, ) # non-primals grad_bias_acc.add_(chunk_grad_bias) else: argnums_for_grad = (0, 1) # Differentiate wrt input_chunk, weight ( (chunk_grad_input, chunk_grad_weight), # No grad for bias (chunk_kd_loss, chunk_ce_loss), ) = torch.func.grad_and_value( loss_fn_for_grad, argnums=argnums_for_grad, has_aux=True )( student_input_chunk_ac, student_lm_head_weight, None, # Pass None for student_bias primal target_token_ids_chunk_ac, target_logprobs_chunk_ac, target_mask_chunk_ac, true_labels_chunk_ac, ) grad_weight_acc.add_(chunk_grad_weight) 
kd_loss_acc.add_(chunk_kd_loss) ce_loss_acc.add_(chunk_ce_loss) return chunk_grad_input if compiled: accumulate_chunk_grads_compiled = torch.compile( accumulate_chunk_grads, dynamic=True, backend="inductor" ) # dynamic=True often helpful else: accumulate_chunk_grads_compiled = accumulate_chunk_grads # Use the same chunking logic as LigerFusedLinearDistillationBase.forward B, N, D = student_input.shape K = target_token_ids.shape[-1] student_input_flat = student_input.reshape(-1, student_input.shape[-1]) target_token_ids_flat = target_token_ids.reshape(-1, target_token_ids.shape[-1]) target_logprobs_flat = target_logprobs.reshape(-1, target_logprobs.shape[-1]) target_mask_flat = target_mask.reshape(-1, target_mask.shape[-1]) # pad and shift for cross entropy loss true_labels = torch.nn.functional.pad(true_labels, (0, 1), value=ignore_index) true_labels_flat = true_labels[:, 1:].contiguous().view(-1) num_chunks = max(1, student_input_flat.shape[0] // CHUNK_SIZE) _student_input_chunks = torch.chunk( student_input_flat, chunks=num_chunks, dim=0 ) _target_token_ids_chunks = torch.chunk( target_token_ids_flat, chunks=num_chunks, dim=0 ) _target_logprobs_chunks = torch.chunk( target_logprobs_flat, chunks=num_chunks, dim=0 ) _target_mask_chunks = torch.chunk(target_mask_flat, chunks=num_chunks, dim=0) _true_labels_chunks = torch.chunk(true_labels_flat, chunks=num_chunks, dim=0) for i in range(num_chunks): grad_input_chunk = accumulate_chunk_grads_compiled( _student_input_chunks[i], _target_token_ids_chunks[i], _target_logprobs_chunks[i], _target_mask_chunks[i], _true_labels_chunks[i], ) grad_inputs_list.append(grad_input_chunk) grad_inputs_combined = torch.cat(grad_inputs_list, dim=0) ctx.save_for_backward(grad_inputs_combined, grad_weight_acc, grad_bias_acc) # For matching None returns in backward for non-tensor/non-grad_requiring inputs ctx.hyperparams_count = 9 # Corresponds to number of hyperparams after main tensors in fwd signature ctx.bias_was_none = 
student_lm_head_bias is None ctx.orig_dims = (B, N, D, K) # since this is packed, there is simply a single batch, so batchmean reduction of kl-div is simply the accumulated sum # we still need to scale the kd_loss by the temp^2 kd_loss_acc = kd_loss_acc * (temperature**2) final_loss = weight_soft_loss * kd_loss_acc + weight_hard_loss * ce_loss_acc return final_loss @staticmethod def backward(ctx, grad_output): grad_input_flat, grad_weight, grad_bias_maybe = ( ctx.saved_tensors ) # grad_input_flat is (B*N, D) # Scale gradients by grad_output if it's not 1.0 if not torch.equal( grad_output, torch.tensor(1.0, device=grad_output.device, dtype=grad_output.dtype), ): grad_input_flat = grad_input_flat * grad_output grad_weight = grad_weight * grad_output if grad_bias_maybe is not None: grad_bias_maybe = grad_bias_maybe * grad_output # Reshape grad_input_flat to match original student_input shape (B, N, D) # ctx.orig_dims stores (B, N, D, K) # We need the first three dimensions for student_input's shape. 
# Ensure that orig_dims are not (0,0,0,K) for empty inputs leading to view errors if ( ctx.orig_dims[0] * ctx.orig_dims[1] * ctx.orig_dims[2] == 0 and grad_input_flat.numel() == 0 ): # If original input was empty, gradient should also be empty with correct shape grad_input_reshaped = torch.zeros( ctx.orig_dims[0], ctx.orig_dims[1], ctx.orig_dims[2], dtype=grad_input_flat.dtype, device=grad_input_flat.device, ) elif grad_input_flat.numel() == 0 and not ( ctx.orig_dims[0] * ctx.orig_dims[1] * ctx.orig_dims[2] == 0 ): # This case should ideally not happen if forward path is correct (non-empty input -> non-empty flat grad) # but as a safeguard: grad_input_reshaped = torch.zeros( ctx.orig_dims[0], ctx.orig_dims[1], ctx.orig_dims[2], dtype=grad_input_flat.dtype, device=grad_input_flat.device, ) else: grad_input_reshaped = grad_input_flat.view( ctx.orig_dims[0], ctx.orig_dims[1], ctx.orig_dims[2] ) nones_for_hyperparams = [None] * ctx.hyperparams_count grad_bias_return = grad_bias_maybe if not ctx.bias_was_none else None return ( grad_input_reshaped, # Gradient for student_input (reshaped) grad_weight, # Gradient for student_lm_head_weight None, # Gradient for target_token_ids None, # Gradient for target_logprobs None, # Gradient for target_mask None, # Gradient for true_labels grad_bias_return, # Gradient for student_lm_head_bias *nones_for_hyperparams, # Grads for weight_hard_loss, ..., compute_ce_loss ) class LigerFusedLinearKLTopKLogprobLoss(torch.nn.Module): """ wrapper for chunked top-k logprob kl-d """ def __init__( self, weight_hard_loss: float = 0.5, weight_soft_loss: float = 0.5, temperature: float = 1.0, # This is the kd_temperature beta: float = 1.0, ignore_index: int = -100, compiled: bool = True, chunk_size: int = 1024, compute_ce_loss: bool = True, normalize_topk: bool = True, ): super().__init__() if not (0.0 <= weight_hard_loss <= 1.0 and 0.0 <= weight_soft_loss <= 1.0): raise ValueError("Loss weights must be between 0.0 and 1.0.") if temperature <= 0: 
raise ValueError("Temperature must be positive.") self.weight_hard_loss = weight_hard_loss self.weight_soft_loss = weight_soft_loss self.temperature = temperature self.beta = beta self.ignore_index = ignore_index self.compiled = compiled self.chunk_size = chunk_size self.compute_ce_loss = compute_ce_loss self.normalize_topk = normalize_topk if not self.compute_ce_loss and self.weight_hard_loss > 0.0: print( f"Warning: compute_ce_loss is False, but weight_hard_loss ({self.weight_hard_loss}) > 0. Hard loss will effectively be zero." ) # self.weight_hard_loss = 0.0 # Or let user manage this if self.weight_soft_loss == 0.0: print( "Warning: weight_soft_loss is 0.0. Soft (KD) loss will not be computed." ) def forward( self, lm_head_weight: torch.Tensor, # Weights of the linear layer in the LM head student_hidden_states: torch.Tensor, # student_hidden_states before the lm_head target_token_ids: torch.Tensor, target_logprobs: torch.Tensor, target_mask: torch.Tensor, true_labels: torch.Tensor, student_bias: torch.Tensor = None, ) -> torch.Tensor: return LigerFusedLinearKLTopKLogprobFunction.apply( student_hidden_states, lm_head_weight, target_token_ids, target_logprobs, target_mask, true_labels, student_bias, self.weight_hard_loss, self.weight_soft_loss, self.ignore_index, self.temperature, self.beta, self.compiled, self.chunk_size, self.compute_ce_loss, self.normalize_topk, ) ================================================ FILE: src/axolotl/integrations/kd/kernels/models.py ================================================ """ model patcher for chunked top-k kl-div """ from typing import Optional, Union, Unpack import torch from transformers import Cache from transformers.modeling_outputs import CausalLMOutputWithPast try: from transformers.modeling_flash_attention_utils import FlashAttentionKwargs from transformers.utils import LossKwargs class TransformersKwargs(FlashAttentionKwargs, LossKwargs): """ placeholder kwargs for hf model classes """ except ImportError: from 
transformers.utils.generic import ( # type: ignore[no-redef] TransformersKwargs, ) from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix def kldiv_forward_llama_like( self, input_ids: Optional[torch.LongTensor] = None, target_logprobs: Optional[torch.Tensor] = None, target_token_ids: Optional[torch.LongTensor] = None, target_mask: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, logits_to_keep: Union[int, torch.Tensor] = 0, **kwargs: Unpack[TransformersKwargs], # type: ignore[misc] ) -> CausalLMOutputWithPast: output_attentions = ( output_attentions if output_attentions is not None else self.config.output_attentions ) output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, cache_position=cache_position, **kwargs, ) hidden_states = outputs.last_hidden_state # Only compute necessary logits, and do not upcast them to float if we are not computing the loss # TODO, we can optimize this further by filtering hidden_states on sequence dimension using labels != -100 # self._loss_function should be LigerFusedLinearKLTopKLogprobLoss loss = self._loss_function( self.lm_head.weight, hidden_states, target_token_ids, target_logprobs, target_mask, true_labels=labels, ) 
num_items_in_batch = kwargs.pop("num_items_in_batch", -1) if num_items_in_batch is not None and num_items_in_batch > 0: loss = loss / num_items_in_batch return CausalLMOutputWithPast( loss=loss, logits=None, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) def apply_kernel(model_type): # Dynamically import the module and attention class module_path = f"transformers.models.{model_type}.modeling_{model_type}" model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type) module = __import__(module_path, fromlist=[f"{model_cls_prefix}ForCausalLM"]) model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM") model_cls.forward = kldiv_forward_llama_like ================================================ FILE: src/axolotl/integrations/kd/topk_logprob/__init__.py ================================================ ================================================ FILE: src/axolotl/integrations/kd/topk_logprob/forward_kl.py ================================================ # Copyright 2024 Axolotl AI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
""" loss for top_k KL divergence """ import torch from torch import nn @torch.jit.script def loss( student_logits: torch.Tensor, target_token_ids: torch.Tensor, target_logprobs: torch.Tensor, target_mask: torch.Tensor, num_items_in_batch: int = -1, # Use -1 to indicate "None" kd_temperature: float = 1.0, ) -> torch.Tensor: """ A KD loss function that is TorchScript-friendly. Arguments: student_logits (torch.Tensor): The logits of the student model. Shape: [B, student_seq_len, vocab_size] target_token_ids (torch.Tensor): The top-k teacher/target token IDs Shape: [B, teacher_seq_len, top_k] target_logprobs (torch.Tensor): The top-k teacher/target logprobs, these should already be re-normalized. Shape: [B, teacher_seq_len, top_k] target_mask (torch.Tensor): The mask for valid tokens. Shape: [B, teacher_seq_len, top_k] num_items_in_batch (int, optional): The number of items in the batch. kd_temperature (float, optional): The temperature for KD. Default: 1.0 """ target_logprobs = target_logprobs.float() # Determine the teacher sequence length # target_token_ids shape: [B, teacher_seq_len, K] # student_logits shape: [B, student_seq_len, vocab_size] teacher_seq_len = target_token_ids.shape[1] # Slice student logits to match teacher-provided sequence length student_logits_for_kd = ( student_logits[:, :teacher_seq_len, :] / kd_temperature ) # [B, teacher_seq_len, vocab_size] # keep in full precision for numerical stability of loss student_logits_for_kd = student_logits_for_kd.float() # Gather student logits for teacher's top-K tokens student_logits_topk = torch.gather( student_logits_for_kd, dim=-1, index=target_token_ids ) # [B, teacher_seq_len, K] # Compute logsumexp across full vocabulary student_lse = torch.logsumexp(student_logits_for_kd, dim=-1, keepdim=True) # Convert just the top-k logits to logprobs student_logprobs_topk = student_logits_topk - student_lse # Convert teacher_mask to boolean for indexing # In TorchScript, .bool() is sometimes unsupported, so we do: 
valid_mask = target_mask.to(torch.bool) # Prune tensors to only keep valid tokens student_logprobs_topk = student_logprobs_topk[valid_mask] target_logprobs = target_logprobs[valid_mask] # Convert teacher logprobs to probabilities teacher_probs = target_logprobs.exp() # Compute forward KL kd_loss_per_token = teacher_probs * (target_logprobs - student_logprobs_topk) kd_loss = kd_loss_per_token.sum() # Normalize by number of items (if provided) or by valid tokens if num_items_in_batch > 0: kd_loss = kd_loss / float(num_items_in_batch) else: # Fall back to average over valid tokens kd_loss = kd_loss / float(kd_loss_per_token.size(0)) return kd_loss class ChunkedTopKKDLoss(nn.Module): """ A wrapper that chunks (splits) the student and teacher outputs along the time dimension to reduce peak memory usage when upcasting from bf16 to fp32, especially for large vocabularies. Usage is analogous to ForwardKLWithChunkedOutputLoss but adapted to top-K teacher logprobs. """ def __init__(self, num_output_chunks: int = 8, kd_temperature: float = 1.0): super().__init__() self.num_output_chunks = num_output_chunks self.kd_temperature = kd_temperature def forward( self, student_logits: torch.Tensor, # [B, seq_len, vocab_size] target_token_ids: torch.Tensor, # [B, seq_len, K] target_logprobs: torch.Tensor, # [B, seq_len, K] target_mask: torch.Tensor, # [B, seq_len, K] num_items_in_batch: int = -1, # optional batch size for normalization ) -> torch.Tensor: # 1. Split along the "token" dimension (dim=1). student_logits_chunks = student_logits.chunk(self.num_output_chunks, dim=1) token_ids_chunks = target_token_ids.chunk(self.num_output_chunks, dim=1) logprobs_chunks = target_logprobs.chunk(self.num_output_chunks, dim=1) mask_chunks = target_mask.chunk(self.num_output_chunks, dim=1) # We'll accumulate a global "sum of losses" and "sum of valid tokens" # so that our final average is consistent with the entire sequence/batch. total_loss = 0.0 total_valid_tokens = 0 # 2. 
Loop over each chunk and compute a chunk-specific loss. for st_chunk, tid_chunk, lp_chunk, msk_chunk in zip( student_logits_chunks, token_ids_chunks, logprobs_chunks, mask_chunks, strict=False, ): # We pass num_items_in_batch=-1 so that the kd_loss # will average over *this chunk's* valid tokens only. chunk_loss = loss( student_logits=st_chunk, target_token_ids=tid_chunk, target_logprobs=lp_chunk, target_mask=msk_chunk, num_items_in_batch=-1, # ensure per-chunk averaging by valid tokens kd_temperature=self.kd_temperature, ) # kd_loss returns an average over the chunk's valid tokens. # We want a global average in the end, so we need to re‐weight # by the number of valid tokens in this chunk and keep track of the total. chunk_valid_mask = msk_chunk.to(torch.bool) chunk_valid_count = chunk_valid_mask.sum() # scalar tensor # Re-scale "chunk average" back to "chunk sum" chunk_loss_sum = chunk_loss * chunk_valid_count total_loss += chunk_loss_sum total_valid_tokens += chunk_valid_count # 3. Normalize *once* at the end. if num_items_in_batch > 0: # If the user gave us a manual denominator (e.g. total items in batch), # we divide by it. Typically used if each item is of different length. final_loss = total_loss / float(num_items_in_batch) else: # Otherwise, divide by total valid tokens across all chunks. # to get the same result as a non-chunked approach. final_loss = total_loss / float(total_valid_tokens) return final_loss ================================================ FILE: src/axolotl/integrations/kd/trainer.py ================================================ # Copyright 2024 Axolotl AI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ KD trainer """ from typing_extensions import override from axolotl.core.trainers.base import AxolotlTrainer from .kernels.liger import LigerFusedLinearKLTopKLogprobLoss class AxolotlKDTrainer(AxolotlTrainer): """ Custom trainer subclass for Knowledge Distillation (KD) """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.model_accepts_loss_kwargs = True loss_fn = LigerFusedLinearKLTopKLogprobLoss( self.args.kd_ce_alpha, # hard label loss self.args.kd_alpha, # kd loss self.args.kd_temperature, self.args.kd_beta or 0.0, compute_ce_loss=bool(self.args.kd_ce_alpha), normalize_topk=self.args.kd_normalize_topk, ) target = self.model # Unwrap PEFT wrapper if hasattr(target, "get_base_model"): target = target.get_base_model() # Set on the actual model instance target._loss_function = loss_fn def _set_signature_columns_if_needed(self): super()._set_signature_columns_if_needed() columns_to_add = [] if self._signature_columns: if "target_logprobs" not in self._signature_columns: columns_to_add.append("target_logprobs") if "target_token_ids" not in self._signature_columns: columns_to_add.append("target_token_ids") if "target_mask" not in self._signature_columns: columns_to_add.append("target_mask") if columns_to_add: self._signature_columns += columns_to_add @override def compute_loss( self, model, inputs, return_outputs=False, num_items_in_batch=None, ): """ How the loss is computed by Trainer. By default, all models return the loss in the first element. Subclass and override for custom behavior. 
""" if ( self.args.sample_packing and hasattr(inputs, "attention_mask") and hasattr(inputs, "position_ids") ): del inputs["attention_mask"] if num_items_in_batch is None and "labels" in inputs: num_items_in_batch = (inputs["labels"] != -100).sum().item() if self.model_accepts_loss_kwargs: loss_kwargs = {} if num_items_in_batch is not None: loss_kwargs["num_items_in_batch"] = num_items_in_batch inputs = {**inputs, **loss_kwargs} outputs = model(**inputs) if isinstance(outputs, dict): loss = outputs["loss"] elif isinstance(outputs, tuple): loss = outputs[0] else: loss = outputs.loss if hasattr(outputs, "loss") else outputs return (loss, outputs) if return_outputs else loss ================================================ FILE: src/axolotl/integrations/kd/utils.py ================================================ """Helper KD utils""" import math from typing import List, Union import numpy as np import torch from torch import FloatTensor, Tensor def normalize_logprobs(logprobs: FloatTensor, topk: int) -> FloatTensor: """ Re-normalizes top-k raw logprobs as probabilities, and converts back to logprobs. 
""" # Ensure raw_logprobs matches kd_online_topk length for tensor operations # This should ideally be handled by the caller ensuring correct padding/truncation first if logprobs.shape[-1] != topk: # pad last dimension of logprobs to match topk length with -inf padding_len = topk - logprobs.shape[-1] padding_tensor = torch.full( ( *logprobs.shape[:-1], padding_len, ), # Takes all dimensions of logprobs except the last, then appends padding_needed float("-inf"), dtype=logprobs.dtype, device=logprobs.device, ) logprobs = torch.cat((logprobs, padding_tensor), dim=-1) # Convert logprobs at T_online to probabilities # use log sum exp trick to avoid underflow position_logprobs_lse = torch.logsumexp(logprobs, dim=-1, keepdim=True) teacher_probs_t_online = torch.exp(logprobs - position_logprobs_lse) # Normalize probabilities (sum to 1) # This is important if the top-k from server aren't a full distribution teacher_probs_t_online_sum = teacher_probs_t_online.sum(dim=-1, keepdim=True) teacher_probs_t_online = teacher_probs_t_online / teacher_probs_t_online_sum final_logprobs_tensor = torch.log(teacher_probs_t_online) return final_logprobs_tensor def strided_chunk_views( tensor: Union[np.ndarray, torch.Tensor], chunks: int, dim: int = 0, stride: int = 1, chunk_size: int | None = None, ) -> List[Union[np.ndarray, torch.Tensor]]: """ Split a tensor into chunks along a dimension with striding, prioritizing views over copies. Args: tensor: Input tensor (numpy array or torch tensor) chunks: Number of chunks to create dim: Dimension along which to chunk (default: 0) stride: Stride between chunk starting positions (default: 1) chunk_size: Size of each chunk. 
If None, calculated automatically (default: None) Returns: List of tensor chunks (views when possible, copies when necessary) """ # Get the size of the specified dimension dim_size = tensor.shape[dim] # Calculate chunk size if not provided if chunk_size is None: chunk_size = (dim_size + chunks - 1) // chunks # Ceiling division chunks_list = [] for i in range(chunks): start_idx = i * stride end_idx = min(start_idx + chunk_size, dim_size) # Break if we've gone beyond the tensor if start_idx >= dim_size: break # Create slice objects for all dimensions slices = [slice(None)] * tensor.ndim slices[dim] = slice(start_idx, end_idx) chunk = tensor[tuple(slices)] chunks_list.append(chunk) return chunks_list def chunk_overlap(input_tensor: Tensor, chunks: int, dim: int = 0, overlap: int = 1): dim_size = input_tensor.shape[dim] stride = math.ceil(dim_size / chunks) return strided_chunk_views( input_tensor, chunks, dim, stride=stride, chunk_size=stride + overlap ) ================================================ FILE: src/axolotl/integrations/kernels/README.md ================================================ # Kernels Integration MoE (Mixture of Experts) kernels speed up training for MoE layers and reduce VRAM costs. In transformers v5, `batched_mm` and `grouped_mm` were integrated as built-in options via the `experts_implementation` config kwarg: ```python class ExpertsInterface(GeneralInterface): _global_mapping = { "batched_mm": batched_mm_experts_forward, "grouped_mm": grouped_mm_experts_forward, } ``` In our custom integration, we add support for **ScatterMoE** and **SonicMoE**, which are more efficient and faster than `grouped_mm`. ## Usage Add the following to your axolotl YAML config: ```yaml plugins: - axolotl.integrations.kernels.KernelsPlugin use_kernels: true # Choose one (mutually exclusive): use_scattermoe: true # OR use_sonicmoe: true ``` **Important:** Setting `experts_implementation` is incompatible with custom kernel options. 
### SonicMoE installation

**Prerequisites:**
- NVIDIA Hopper (H100, H200) or Blackwell (B200, GB200) GPU
- CUDA 12.9+ (13.0+ for B300)
- PyTorch 2.7+ (2.9.1 recommended)
- For B300: Triton 3.6.0

```bash
pip install --ignore-requires-python --no-deps "sonic-moe @ git+https://github.com/Dao-AILab/sonic-moe.git@116e2df0a41874f77fa0ad269ce7df3f0cfcb956" && pip install nvidia-cutlass-dsl==4.4.0 quack-kernels==0.2.5
```

See the [SonicMoE installation guide](https://github.com/Dao-AILab/sonic-moe?tab=readme-ov-file#-installation) for the latest prerequisite details.

**Note:** Blackwell support is in upstream beta. On Blackwell GPUs, Axolotl automatically sets `USE_QUACK_GEMM=1` to enable the Blackwell kernels.

## How It Works

The `KernelsPlugin` runs before model loading and does the following, depending on the selected kernel:

### ScatterMoE
1. Registers the ScatterMoE kernel from the local `libs/scattermoe_lora` package (includes fused LoRA support via Triton kernels).
2. Patches the model's `SparseMoeBlock` forward method with the optimized ScatterMoE implementation.

### SonicMoE
1. Resolves the model's MoE block class(es) from `constants.py`.
2. Patches the forward method with SonicMoE's optimized kernels and registers a weight converter for the interleaved gate/up projection format.
3. Supports both softmax->topk and sigmoid->topk routing strategies.

Both paths use the shared `resolve_moe_block_classes` utility in `constants.py` for model-type-to-class resolution.

#### Supported Models

See `constants.py` for the full list of supported model types (Qwen2-MoE, Qwen3-MoE, OLMoE, Mixtral, DeepSeek-V3, GLM-MoE, MiniMax, etc.).

## Limitations

ScatterMoE uses softmax -> topk routing, so its results may differ from the baseline for some model architectures (GPT-OSS, etc.). It is currently incompatible with `GLM_MOE_DSA` (GLM 5) and `GLM4_MOE_LITE` (GLM 4.7 Flash).

SonicMoE supports both softmax->topk and sigmoid->topk routing, covering a wider range of architectures.

ScatterMoE does not currently work with GLM 4.7 Flash (`glm4_moe_lite`).
## Note on MegaBlocks We tested [MegaBlocks](https://huggingface.co/kernels-community/megablocks) but were unable to ensure numerical accuracy, so we did not integrate it. It was also incompatible with many newer model architectures in transformers. ================================================ FILE: src/axolotl/integrations/kernels/__init__.py ================================================ from .args import KernelsArgs from .plugin import KernelsPlugin __all__ = [ "KernelsArgs", "KernelsPlugin", ] ================================================ FILE: src/axolotl/integrations/kernels/args.py ================================================ from pydantic import BaseModel, model_validator from axolotl.utils.logging import get_logger LOG = get_logger(__name__) class KernelsArgs(BaseModel): use_scattermoe: bool | None = None use_sonicmoe: bool | None = None @model_validator(mode="before") @classmethod def check_mutually_exclusive(cls, data): if data.get("use_scattermoe") and data.get("use_sonicmoe"): raise ValueError( "Cannot use both ScatterMoE and SonicMoE simultaneously. " "Please set only one of `use_scattermoe` or `use_sonicmoe` to true." ) return data @model_validator(mode="before") @classmethod def check_use_kernels(cls, data): if data.get("use_kernels") is not True: LOG.warning( "`use_kernels` must be set to True to use this. Automatically setting it to True." ) data["use_kernels"] = True return data @model_validator(mode="before") @classmethod def check_experts_implementation(cls, data): experts_implementation = data.get("experts_implementation") if experts_implementation is None: # transformers may default to batched_mm when unset data["experts_implementation"] = "eager" elif experts_implementation != "eager": LOG.warning( "`experts_implementation` must be set to 'eager' to use this. Automatically setting it to 'eager'." 
) data["experts_implementation"] = "eager" return data @model_validator(mode="before") @classmethod def disable_mlp_kernel(cls, data): if data.get("use_scattermoe") is True or data.get("use_sonicmoe") is True: if data.get("lora_mlp_kernel") is True: LOG.warning( "Disabling lora_mlp_kernel when using custom MoE kernels due to compatibility issues." ) data["lora_mlp_kernel"] = False data["mlp_kernel"] = False return data ================================================ FILE: src/axolotl/integrations/kernels/autotune_callback.py ================================================ """Trainer callback for reporting Triton autotune results from scattermoe-lora kernels.""" import logging import torch from transformers import ( TrainerCallback, TrainerControl, TrainerState, TrainingArguments, ) LOG = logging.getLogger(__name__) # Give up looking for autotune data after this many training steps. _MAX_POLL_STEP = 5 def _get_gpu_info() -> dict: """Return basic GPU identification for the current device.""" if not torch.cuda.is_available(): return {} try: idx = torch.cuda.current_device() props = torch.cuda.get_device_properties(idx) return { "gpu_name": props.name, "gpu_compute_capability": f"{props.major}.{props.minor}", "gpu_memory_bytes": props.total_memory, } except Exception: # pylint: disable=broad-exception-caught return {} def _get_smem_capacity() -> dict: """Return shared memory capacity from the runtime lora_ops module.""" try: from axolotl.integrations.kernels.autotune_collector import ( _find_lora_ops_module, ) lora_ops = _find_lora_ops_module() if lora_ops is None: return {} fn = getattr(lora_ops, "_get_smem_capacity", None) if fn is None: return {} return {"smem_capacity_bytes": fn()} except Exception: # pylint: disable=broad-exception-caught return {} class AutotuneReportCallback(TrainerCallback): """Reports Triton kernel autotune selections via telemetry. 
Fires **once** after the first training step completes (step 1), at which point the forward and backward passes have both run and the autotuned kernels have populated their caches. If for some reason the caches are still empty (e.g. the kernel was never invoked), the callback retries on subsequent steps up to ``_MAX_POLL_STEP`` and then stops polling. After reporting (or giving up) every subsequent ``on_step_end`` call short-circuits on the ``_reported`` flag — zero hot-path cost. """ def __init__(self): self._reported = False # pylint: disable=unused-argument def on_step_end( self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs, ): if self._reported: return # Lazy import — Triton / scattermoe kernels may not be installed. from axolotl.integrations.kernels.autotune_collector import ( collect_autotune_configs, ) configs = collect_autotune_configs() if not configs: if state.global_step >= _MAX_POLL_STEP: LOG.debug( "No autotune data found after %d steps; giving up.", state.global_step, ) self._reported = True return self._reported = True from axolotl.telemetry.manager import TelemetryManager telemetry_manager = TelemetryManager.get_instance() if not telemetry_manager.enabled: return properties = { "kernel_count": len(configs), "kernels": configs, } properties.update(_get_gpu_info()) properties.update(_get_smem_capacity()) telemetry_manager.send_event( event_type="scattermoe-autotune", properties=properties, ) LOG.info( "Reported %d scattermoe kernel autotune config(s) to telemetry.", len(configs), ) ================================================ FILE: src/axolotl/integrations/kernels/autotune_collector.py ================================================ """Collect Triton autotune results from scattermoe-lora kernels. This module reads the ``.cache`` attribute from Triton ``@triton.autotune`` decorated kernel objects and returns structured dicts describing the selected configurations. 
It has **no** telemetry dependency — callers decide what to do with the data. """ import logging import sys from types import ModuleType from typing import Any LOG = logging.getLogger(__name__) # (human-readable name, attribute on the lora_ops module) _KERNEL_REGISTRY: list[tuple[str, str]] = [ ("scatter2scatter_lora_fwd", "_scatter2scatter_lora"), ("scatter2scatter_lora_dX", "_scatter2scatter_lora_dX"), ("group_bwd_lora", "_group_bwd_lora"), ("group_bwd_lora_fused", "_group_bwd_lora_fused"), ] # The autotune key declared on every kernel: key=["M", "N", "K"] _KEY_NAMES: list[str] = ["M", "N", "K"] def _parse_key_tuple(key_tuple: tuple) -> dict[str, Any]: """Turn the autotune cache key tuple into a labelled dict. Triton builds the cache key from the values of the declared ``key`` args (``M``, ``N``, ``K``) followed by dtype signature elements. We label the first three and store the rest under ``_extra``. """ result: dict[str, Any] = {} for i, name in enumerate(_KEY_NAMES): if i < len(key_tuple): result[name] = key_tuple[i] if len(key_tuple) > len(_KEY_NAMES): result["_extra"] = [str(v) for v in key_tuple[len(_KEY_NAMES) :]] return result def _find_lora_ops_module() -> ModuleType | None: """Locate the *runtime* ``lora_ops`` module in ``sys.modules``. The HF ``kernels`` package loads ``scattermoe_lora`` via ``import_from_path`` which registers it in ``sys.modules`` under a hash-suffixed name (e.g. ``scattermoe_lora_a1b2c3d4``). A normal import (``from axolotl.integrations.kernels...``) would create a *separate* module instance whose kernel objects have empty ``.cache`` dicts because autotuning ran on the runtime copy. We search ``sys.modules`` for any module whose name contains ``lora_ops`` and that has the ``_scatter2scatter_lora`` kernel attribute — that is the runtime copy with populated caches. 
""" for name, module in list(sys.modules.items()): if ( module is not None and "lora_ops" in name and hasattr(module, "_scatter2scatter_lora") ): return module return None def collect_autotune_configs() -> list[dict[str, Any]]: """Read autotune caches from the four scattermoe-lora kernels. Returns a (possibly empty) list of dicts, each containing: * ``kernel`` – human-readable kernel name * ``key`` – dict with the ``M``/``N``/``K`` problem dimensions * ``config`` – dict with the selected tile sizes, ``num_warps``, and ``num_stages`` Returns ``[]`` if the kernel module cannot be found or if no autotune cache entries exist yet. """ lora_ops = _find_lora_ops_module() if lora_ops is None: LOG.debug( "lora_ops module not found in sys.modules; skipping autotune collection" ) return [] results: list[dict[str, Any]] = [] for friendly_name, attr_name in _KERNEL_REGISTRY: kernel_fn = getattr(lora_ops, attr_name, None) if kernel_fn is None: continue cache = getattr(kernel_fn, "cache", None) if not cache: continue for key_tuple, config in cache.items(): config_dict = dict(config.kwargs) config_dict["num_warps"] = config.num_warps config_dict["num_stages"] = config.num_stages if getattr(config, "num_ctas", None) is not None: config_dict["num_ctas"] = config.num_ctas results.append( { "kernel": friendly_name, "key": _parse_key_tuple(key_tuple), "config": config_dict, } ) return results ================================================ FILE: src/axolotl/integrations/kernels/constants.py ================================================ """ Supported MoE block mappings for kernel integrations. Maps model_type to the SparseMoeBlock class name(s) in transformers. Used by both ScatterMoE and SonicMoE kernel paths. Values can be a single class name (str) or a list of class names for models with multiple MoE block types (e.g. qwen3_omni_moe has Thinker + Talker). 
""" import importlib SPARSE_MOE_BLOCK = { # softmax -> topk routing "qwen2_moe": "Qwen2MoeSparseMoeBlock", "qwen3_moe": "Qwen3MoeSparseMoeBlock", "qwen3_5_moe": "Qwen3_5MoeSparseMoeBlock", "qwen3_5_moe_text": "Qwen3_5MoeSparseMoeBlock", "qwen3_next": "Qwen3NextSparseMoeBlock", "qwen3_vl_moe": "Qwen3VLMoeTextSparseMoeBlock", # qwen3_omni_moe: Thinker (standard) + Talker (shared experts + shared_expert_gate) "qwen3_omni_moe": [ "Qwen3OmniMoeThinkerTextSparseMoeBlock", "Qwen3OmniMoeTalkerTextSparseMoeBlock", ], "olmoe": "OlmoeSparseMoeBlock", "mixtral": "MixtralSparseMoeBlock", "minimax": "MiniMaxSparseMoeBlock", # softmax -> topk routing (with group-based expert selection) "mistral4": "Mistral4MoE", # sigmoid -> topk routing (with group-based expert selection) "glm_moe_dsa": "GlmMoeDsaMoE", "deepseek_v3": "DeepseekV3MoE", "glm4_moe": "Glm4MoeMoE", "glm4_moe_lite": "Glm4MoeLiteMoE", "glm4v_moe": "Glm4vMoeTextMoE", # sigmoid -> topk routing (no group selection) "minimax_m2": "MiniMaxM2SparseMoeBlock", # Models below need custom routing (not yet implemented): # "ernie4_5_moe": "Ernie4_5_MoeSparseMoeBlock", # softmax->topk, e_score_correction_bias between softmax and topk # "deepseek_v2": "DeepseekV2Moe", # softmax->topk, group_limited_greedy, different attr names (num_group) # "hunyuan_v1_moe": "HunYuanMoEV1Moe", # softmax->topk, gate.wg (not gate.weight), scatter routing # "gpt_oss": "GptOssMLP", # topk->softmax, transposed layout [E,H,2*I], custom GLU, expert biases } def resolve_moe_block_classes(model_type: str): """Resolve all MoE block classes from transformers for the given model type. Returns a list of classes (one for most models, multiple for models with distinct MoE block types like qwen3_omni_moe). """ entry = SPARSE_MOE_BLOCK.get(model_type) if entry is None: raise ValueError( f"Unsupported MoE model type '{model_type}'. 
" f"Supported types: {list(SPARSE_MOE_BLOCK.keys())}" ) cls_names = entry if isinstance(entry, list) else [entry] module_path = f"transformers.models.{model_type}.modeling_{model_type}" try: module = importlib.import_module(module_path) except ModuleNotFoundError: # Text sub-model types (e.g. qwen3_5_moe_text) share the parent module if model_type.endswith("_text"): parent_type = model_type.removesuffix("_text") module_path = f"transformers.models.{parent_type}.modeling_{parent_type}" module = importlib.import_module(module_path) else: raise classes = [] for cls_name in cls_names: moe_cls = getattr(module, cls_name, None) if moe_cls is None: raise ValueError(f"Could not find class '{cls_name}' in '{module_path}'") classes.append(moe_cls) return classes ================================================ FILE: src/axolotl/integrations/kernels/libs/__init__.py ================================================ ================================================ FILE: src/axolotl/integrations/kernels/libs/scattermoe_lora/__init__.py ================================================ # SPDX-License-Identifier: Apache-2.0 # Copyright (c) Axolotl AI # Licensed under the Apache License, Version 2.0 from . 
import layers from .lora_ops import ParallelExperts from .parallel_experts import flatten_sort_count, parallel_linear from .parallel_linear_lora import ScatterMoELoRA, parallel_linear_lora __all__ = [ "layers", "ParallelExperts", "flatten_sort_count", "parallel_linear", "ScatterMoELoRA", "parallel_linear_lora", "lora_ops", ] ================================================ FILE: src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/__init__.py ================================================ # SPDX-License-Identifier: Apache-2.0 # # Original work Copyright (c) Shawn Tan and ScatterMoE Contributors # Adapted from https://github.com/shawntan/scattermoe # See https://github.com/shawntan/scattermoe/blob/main/LICENSE # # Modifications and LoRA adaptation Copyright (c) Axolotl AI # Licensed under the Apache License, Version 2.0 from . import lora_ops, ops __all__ = ["ops", "lora_ops"] ================================================ FILE: src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/lora_ops.py ================================================ # SPDX-License-Identifier: Apache-2.0 # Copyright (c) Axolotl AI # Licensed under the Apache License, Version 2.0 """ Fused ScatterMoE + LoRA Triton Kernels ======================================= Provides fused forward and backward kernels for ScatterMoE with LoRA adapters. Forward: Y = X @ W + scaling * (X @ A^T) @ B^T Backward (LoRA training, W frozen): - dX = dY @ W^T + scaling * (dY @ B) @ A (input gradient) - dA = scaling * (dY @ B)^T @ X (LoRA A gradient) - dB = scaling * dY^T @ (X @ A^T) (LoRA B gradient) LoRA weight layout (from PEFT ParamWrapper): - A: [r*E, K] -- for expert e, rows [e*r : (e+1)*r] give A_e of shape [r, K] - B: [N, r*E] -- for expert e, cols [e*r : (e+1)*r] give B_e of shape [N, r] Key design decisions: - The forward kernel fuses X@W and X@A^T in the same K-loop for data reuse on X, then computes (X@A^T) @ B^T in the epilogue. 
- The backward dA/dB kernel operates on grouped (expert-contiguous) data and iterates over tokens per expert, accumulating gradients in registers. - R (LoRA rank) is a tl.constexpr, allowing tl.arange(0, R). We pad R to a power-of-2 for Triton tile compatibility; typical ranks (4, 8, 16, 32, 64) already satisfy this. """ from itertools import product from typing import Optional import torch import triton import triton.language as tl # ============================================================================= # Configuration # ============================================================================= BLOCK_M = 128 ALLOW_TF32 = True def _next_power_of_2(n: int) -> int: """Round up to next power of 2.""" n -= 1 n |= n >> 1 n |= n >> 2 n |= n >> 4 n |= n >> 8 n |= n >> 16 return n + 1 # Triton tl.dot requires minimum tile dimensions of 16 on modern GPUs. MIN_TRITON_DOT_SIZE = 16 def _block_r_for_rank(r: int) -> int: """Compute BLOCK_R: next power-of-2 >= max(r, MIN_TRITON_DOT_SIZE).""" return _next_power_of_2(max(r, MIN_TRITON_DOT_SIZE)) # ============================================================================= # Token Rounding: pad expert counts to BLOCK_M multiples # ============================================================================= def round_expert_counts( sorted_expert_idxs: torch.Tensor, sorted_scattered_idxs: torch.Tensor, expert_offsets: torch.Tensor, E: int, block_m: int = BLOCK_M, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """ Pad each expert's token count to a multiple of block_m to eliminate partial-tile waste in the backward kernel. Padding is done by duplicating the last valid token index for each expert. The kernel's M_mask = M_idx < real_end_idx masks these padding entries, so correctness is preserved (they contribute 0 to the accumulation via other=0.0). This only helps the backward dA/dB kernel where per-expert iteration is explicit. The forward scatter2scatter kernel handles partial tiles via masking. 
Args: sorted_expert_idxs: Expert assignments sorted [M*k] sorted_scattered_idxs: Original indices sorted [M*k] expert_offsets: Cumulative token counts per expert [E] E: Number of experts block_m: Block size for token dimension (default: BLOCK_M) Returns: padded_expert_idxs: [M_padded] expert assignments with padding padded_scattered_idxs: [M_padded] original indices with padding padded_offsets: [E] cumulative padded counts (for kernel iteration range) real_offsets: [E] original cumulative counts (for M_mask in kernel) """ device = sorted_expert_idxs.device # Compute per-expert counts counts = torch.zeros(E, dtype=torch.int64, device=device) prev = 0 for e in range(E): curr = expert_offsets[e].item() counts[e] = curr - prev prev = curr # Round up each count to multiple of block_m padded_counts = ((counts + block_m - 1) // block_m) * block_m # Experts with 0 tokens stay at 0 padded_counts = torch.where( counts > 0, padded_counts, torch.zeros_like(padded_counts) ) total_padded = padded_counts.sum().item() padded_expert_idxs = torch.empty( total_padded, dtype=sorted_expert_idxs.dtype, device=device ) padded_scattered_idxs = torch.empty( total_padded, dtype=sorted_scattered_idxs.dtype, device=device ) src_offset = 0 dst_offset = 0 for e in range(E): count = counts[e].item() padded_count = padded_counts[e].item() if count > 0: # Copy original tokens padded_expert_idxs[dst_offset : dst_offset + count] = sorted_expert_idxs[ src_offset : src_offset + count ] padded_scattered_idxs[dst_offset : dst_offset + count] = ( sorted_scattered_idxs[src_offset : src_offset + count] ) # Pad with last valid token (masked out by kernel via M_mask) if padded_count > count: padded_expert_idxs[dst_offset + count : dst_offset + padded_count] = ( sorted_expert_idxs[src_offset + count - 1] ) padded_scattered_idxs[ dst_offset + count : dst_offset + padded_count ] = sorted_scattered_idxs[src_offset + count - 1] src_offset += count dst_offset += padded_count # Padded offsets: cumulative padded 
counts (for iteration range in kernel) padded_offsets = padded_counts.cumsum(-1).to(expert_offsets.dtype) # Real offsets: original cumulative counts (for M_mask in kernel) real_offsets = expert_offsets.clone() return padded_expert_idxs, padded_scattered_idxs, padded_offsets, real_offsets # ============================================================================= # Autotuning: SMEM estimation and config pruning # ============================================================================= _SMEM_CAPACITY: int | None = None def _get_smem_capacity() -> int: """Get device shared memory capacity (bytes). Cached after first call.""" global _SMEM_CAPACITY if _SMEM_CAPACITY is None: props = triton.runtime.driver.active.utils.get_device_properties( torch.cuda.current_device() ) _SMEM_CAPACITY = props["max_shared_mem"] return _SMEM_CAPACITY def _estimate_smem_usage( num_stages: int, BLOCK_M: int, BLOCK_N: int, BLOCK_K: int, dtype_bytes: int = 2 ) -> int: """Estimate shared memory in bytes for a GEMM-style tile. Formula: stages * BLOCK_K * (BLOCK_M + BLOCK_N) + BLOCK_M * BLOCK_N Multiply by dtype_bytes (2 for fp16/bf16). """ return ( num_stages * BLOCK_K * (BLOCK_M + BLOCK_N) + BLOCK_M * BLOCK_N ) * dtype_bytes # Conservative margin (bytes) subtracted from SMEM capacity to account for # estimation inaccuracies and kernel overhead (registers spilled to SMEM, etc.) _SMEM_SLACK = 10_000 def _estimate_register_pressure( num_warps: int, *tile_sizes: tuple[int, int], ) -> float: """Rough estimate of per-thread register footprint from live tile sizes. This is a heuristic, NOT an accurate register count. Triton uses tensor core MMA fragments that pack multiple elements per register, and can spill to local memory when the hardware limit (255 regs/thread) is exceeded. The estimate is used to prune only truly extreme configs that would cause excessive spilling or compilation failures. 
The threshold is set high (``_MAX_REGS_SOFT_LIMIT``) because the heuristic overestimates — it doesn't account for MMA fragment packing. Configs like M=64,N=64,K=64 (est ~520) work fine in practice via spilling. Returns estimated registers per thread. """ # Each thread in a warp holds ~1/32 of the tile elements tile_regs = sum(r * c for r, c in tile_sizes) / 32 scalar_overhead = 40 return tile_regs + scalar_overhead # Soft limit for register pressure pruning. Only prune configs with extreme # tile products (e.g. M=128,K=256,N=256) that reliably crash on Blackwell. # Moderate configs (M=64,N=64,K=64, est ~520) work via register spilling. _MAX_REGS_SOFT_LIMIT = 1024 # ============================================================================= # Forward Kernel: scatter2scatter with fused LoRA # ============================================================================= @triton.jit def _compute_expert_block_lora( E_idx, E_mask, M_in_idx, N_block, N_mask, # Base weight X_ptr, stride_xm, stride_xk, W_ptr, stride_we, stride_wk, stride_wn, # LoRA weights A_ptr, stride_ar, stride_ak, # A: [r*E, K], stride_ar = stride for r*E dim, stride_ak = stride for K dim B_ptr, stride_bn, stride_br, # B: [N, r*E], stride_bn = stride for N dim, stride_br = stride for r*E dim # Dimensions K, ACTUAL_R: tl.constexpr, # True LoRA rank (for indexing into weight arrays) acc, no_k_mask, BLOCK_M: tl.constexpr, BLOCK_K: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_R: tl.constexpr, # Padded tile size >= max(ACTUAL_R, 16) scaling, allow_tf32: tl.constexpr, ): """ Compute Y_block = X_block @ W_e + scaling * (X_block @ A_e^T) @ B_e^T for tokens in this M-block assigned to expert E_idx. ACTUAL_R is the true LoRA rank used for indexing into A[e*r:(e+1)*r, :]. BLOCK_R >= ACTUAL_R is the padded tile dimension (must be >= 16 for tl.dot). When BLOCK_R > ACTUAL_R, loads are masked on the R dimension. 
""" K_block = tl.arange(0, BLOCK_K) R_block = tl.arange(0, BLOCK_R) R_mask = R_block < ACTUAL_R # Mask for padding when BLOCK_R > ACTUAL_R # Base weight pointers: W[E_idx, :, :] is [K, N], load [BLOCK_K, BLOCK_N] X_blk_ptrs = X_ptr + M_in_idx[:, None] * stride_xm + K_block[None, :] * stride_xk W_blk_ptrs = ( W_ptr + E_idx * stride_we + K_block[:, None] * stride_wk + N_block[None, :] * stride_wn ) # LoRA A pointers: A[e*ACTUAL_R:(e+1)*ACTUAL_R, :] for expert e, shape [r, K] A_expert_offset = E_idx * ACTUAL_R A_blk_ptrs = ( A_ptr + (A_expert_offset + R_block)[:, None] * stride_ar + K_block[None, :] * stride_ak ) iters = tl.cdiv(K, BLOCK_K) # Accumulator for X @ A^T: [BLOCK_M, BLOCK_R] xa_acc = tl.zeros((BLOCK_M, BLOCK_R), dtype=tl.float32) # Determine the input element type for consistent casting. # Masked tl.load with other=0.0 can upcast bf16->fp32 in some Triton versions, # causing dtype mismatches in tl.dot. We cast all tiles to the same type. INPUT_DTYPE = X_ptr.dtype.element_ty for i in range(iters): if no_k_mask: x = tl.load(X_blk_ptrs, mask=E_mask[:, None], other=0.0).to(INPUT_DTYPE) w = tl.load(W_blk_ptrs, mask=N_mask[None, :], other=0.0).to(INPUT_DTYPE) a = tl.load(A_blk_ptrs, mask=R_mask[:, None], other=0.0).to(INPUT_DTYPE) else: K_mask = (i * BLOCK_K + K_block) < K x = tl.load( X_blk_ptrs, mask=E_mask[:, None] & K_mask[None, :], other=0.0 ).to(INPUT_DTYPE) w = tl.load( W_blk_ptrs, mask=K_mask[:, None] & N_mask[None, :], other=0.0 ).to(INPUT_DTYPE) a = tl.load( A_blk_ptrs, mask=R_mask[:, None] & K_mask[None, :], other=0.0 ).to(INPUT_DTYPE) # Base: acc += X @ W ([M, K] @ [K, N] -> [M, N]) acc += tl.dot(x, w, allow_tf32=allow_tf32).to(tl.float32) # LoRA: xa_acc += X @ A^T ([M, K] @ [K, R] -> [M, R]) xa_acc += tl.dot(x, tl.trans(a), allow_tf32=allow_tf32).to(tl.float32) X_blk_ptrs += BLOCK_K * stride_xk W_blk_ptrs += BLOCK_K * stride_wk A_blk_ptrs += BLOCK_K * stride_ak # Epilogue: load B[e] and compute (X @ A^T) @ B^T # B[e] is B[:, 
e*ACTUAL_R:(e+1)*ACTUAL_R], shape [N, r]. Load [BLOCK_N, BLOCK_R]. B_expert_offset = E_idx * ACTUAL_R B_blk_ptrs = ( B_ptr + N_block[:, None] * stride_bn + (B_expert_offset + R_block)[None, :] * stride_br ) b = tl.load( B_blk_ptrs, mask=N_mask[:, None] & R_mask[None, :], other=0.0 ) # [BLOCK_N, BLOCK_R] # tl.dot requires non-float32 inputs (tensor cores); cast back to input dtype b_inp = b.to(INPUT_DTYPE) # (X @ A^T) @ B^T: [M, R] @ [R, N] -> [M, N] lora_out = tl.dot(xa_acc.to(INPUT_DTYPE), tl.trans(b_inp), allow_tf32=allow_tf32) acc += scaling * lora_out return acc def _scatter2scatter_lora_configs(): """Generate forward kernel autotune configs. Search space includes BLOCK_M to allow trading token-tile size for larger BLOCK_K/BLOCK_N tiles. On GPUs with ~99KB SMEM, BLOCK_M=128 forces BLOCK_K=32 and BLOCK_N=32; BLOCK_M=64 allows BLOCK_K=128 (4× fewer inner-loop iterations). Search space: BLOCK_M: {32, 64, 128} BLOCK_N: {32, 64, 128, 256} BLOCK_K: {32, 64, 128} num_warps: {4, 8} num_stages: {3, 4, 5} """ configs = [] for block_m, block_n, block_k, warps, stages in product( [32, 64, 128], # BLOCK_M [32, 64, 128, 256], # BLOCK_N [32, 64, 128], # BLOCK_K [4, 8], # num_warps [3, 4, 5], # num_stages ): configs.append( triton.Config( {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k}, num_stages=stages, num_warps=warps, ) ) return configs def _prune_fwd_configs(configs, named_args, **kwargs): """Prune forward configs based on SMEM capacity and register pressure. The forward kernel inner loop loads three tiles per pipeline stage: X[BLOCK_M, BLOCK_K], W[BLOCK_K, BLOCK_N], A[BLOCK_R, BLOCK_K]. The base estimate only accounts for X and W. 
We add: - A tile [BLOCK_R, BLOCK_K] per pipeline stage (loaded in the inner loop) - B tile [BLOCK_N, BLOCK_R] loaded once in the epilogue - Extra headroom for compiler overhead (register spills, metadata) """ smem_cap = _get_smem_capacity() # Get BLOCK_R from named_args if available, else assume worst case block_r = named_args.get("BLOCK_R", 64) scored = [] for config in configs: block_m = config.kwargs["BLOCK_M"] block_n = config.kwargs["BLOCK_N"] block_k = config.kwargs["BLOCK_K"] # Base: stages * BLOCK_K * (BLOCK_M + BLOCK_N) + BLOCK_M * BLOCK_N smem_base = _estimate_smem_usage(config.num_stages, block_m, block_n, block_k) # A tile [BLOCK_R, BLOCK_K] loaded per stage in the inner loop smem_lora_loop = config.num_stages * block_r * block_k * 2 # B tile [BLOCK_N, BLOCK_R] loaded once in epilogue smem_lora_epilogue = block_n * block_r * 2 smem = smem_base + smem_lora_loop + smem_lora_epilogue # Register pressure: live tiles are acc[M,N], xa_acc[M,R], # x[M,K], w[K,N], a[R,K], plus epilogue b[N,R] est_regs = _estimate_register_pressure( config.num_warps, (block_m, block_n), # acc (block_m, block_r), # xa_acc (block_m, block_k), # x tile (block_k, block_n), # w tile (block_r, block_k), # a tile (block_n, block_r), # b tile (epilogue) ) if est_regs > _MAX_REGS_SOFT_LIMIT: continue scored.append((smem, config)) pruned = [c for s, c in scored if s <= smem_cap - _SMEM_SLACK] if pruned: return pruned if scored: # All surviving configs exceed SMEM — return the one with smallest usage scored.sort(key=lambda x: x[0]) return [scored[0][1]] # All configs pruned by register pressure — fall back to smallest tiles return [ min( configs, key=lambda c: ( c.kwargs["BLOCK_M"] * c.kwargs["BLOCK_N"] * c.kwargs["BLOCK_K"] ), ) ] @triton.autotune( configs=_scatter2scatter_lora_configs(), key=["M", "N", "K"], prune_configs_by={"early_config_prune": _prune_fwd_configs}, ) @triton.heuristics( { "NO_K_MASK": lambda args: (args["K"] % args["BLOCK_K"]) == 0, "NO_N_MASK": lambda args: 
(args["N"] % args["BLOCK_N"]) == 0, } ) @triton.jit def _scatter2scatter_lora( # Input/Output X_ptr, stride_xm: tl.constexpr, stride_xk: tl.constexpr, W_ptr, stride_we, stride_wk: tl.constexpr, stride_wn: tl.constexpr, Y_ptr, stride_ym: tl.constexpr, stride_yn: tl.constexpr, # Bias Bias_ptr, stride_bias_e: tl.constexpr, stride_bias_n: tl.constexpr, # LoRA weights LA_ptr, stride_la_r, stride_la_k, # A: [r*E, K] LB_ptr, stride_lb_n, stride_lb_r, # B: [N, r*E] # Routing grouped_idx_ptr, expert_idxs_ptr, # Dimensions FAN_OUT: tl.constexpr, M, K: tl.constexpr, N: tl.constexpr, E: tl.constexpr, ACTUAL_R: tl.constexpr, # True LoRA rank (for weight indexing) # Block sizes BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, BLOCK_R: tl.constexpr, # Padded tile size >= max(ACTUAL_R, 16) # Config ACC_TYPE: tl.constexpr, scaling, allow_tf32: tl.constexpr, x_grouped: tl.constexpr, y_grouped: tl.constexpr, NO_K_MASK: tl.constexpr, NO_N_MASK: tl.constexpr, ): """ Fused scatter2scatter with LoRA: Y = X @ W + scaling * (X @ A^T) @ B^T + bias """ pid = tl.program_id(axis=0) N_BLOCK_COUNT = tl.cdiv(N, BLOCK_N) M_block_id = pid // N_BLOCK_COUNT N_block_id = pid % N_BLOCK_COUNT M_block = M_block_id * BLOCK_M + tl.arange(0, BLOCK_M) N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N) N_mask = N_block < N M_boundary_mask = M_block < (FAN_OUT * M) E_idxs = tl.load(expert_idxs_ptr + M_block, mask=M_boundary_mask, other=E) no_k_mask = NO_K_MASK acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) E_first_idx = tl.min(E_idxs) E_last_idx = tl.minimum(tl.max(E_idxs), E - 1) M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int32) for E_idx in range(E_first_idx, E_last_idx + 1): E_mask = E_idxs == E_idx if x_grouped: M_in_idx = M_block else: M_in_idx = M_idx // FAN_OUT acc = _compute_expert_block_lora( E_idx, E_mask, M_in_idx, N_block, N_mask, X_ptr, stride_xm, stride_xk, W_ptr, stride_we, stride_wk, stride_wn, LA_ptr, stride_la_r, stride_la_k, LB_ptr, 
stride_lb_n, stride_lb_r, K, ACTUAL_R, acc, no_k_mask, BLOCK_M, BLOCK_K, BLOCK_N, BLOCK_R, scaling, allow_tf32=allow_tf32, ) # Add bias if present if Bias_ptr is not None: B_blk_ptrs = ( Bias_ptr + E_idxs[:, None] * stride_bias_e + N_block[None, :] * stride_bias_n ) acc += tl.load(B_blk_ptrs, mask=M_boundary_mask[:, None] & N_mask[None, :]) # Store output if y_grouped: M_out_idx = M_block else: M_out_idx = M_idx Y_blk_ptrs = Y_ptr + (M_out_idx[:, None] * stride_ym + N_block[None, :] * stride_yn) tl.store(Y_blk_ptrs, acc, mask=M_boundary_mask[:, None] & N_mask[None, :]) def _scatter2scatter_lora_split( X: torch.Tensor, W: torch.Tensor, sorted_expert_idxs: torch.Tensor, sorted_scattered_idxs: torch.Tensor, k: int, lora_A: torch.Tensor, lora_B: torch.Tensor, scaling: float, b: Optional[torch.Tensor] = None, x_grouped: bool = False, y_grouped: bool = False, out: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Split base+LoRA forward: 3 scatter2scatter calls, no fused LoRA kernel. Faster for models with few large experts (e.g. Mixtral E=8, I=14336) because the base kernel runs at full speed without LoRA SMEM overhead, and the LoRA matmuls (R=16) are tiny separate passes. Y = scatter(X, W) + scaling * scatter(scatter(X, A^T), B^T) """ from axolotl.integrations.kernels.libs.scattermoe_lora.kernels.ops import ( scatter2scatter, ) E = W.size(0) R = lora_A.size(0) // E K = W.size(1) N = W.size(2) # 1. Base: Y_base = X @ W (uses base kernel with optimal tile sizes) output = scatter2scatter( X=X, W=W, b=b, sorted_expert_idxs=sorted_expert_idxs, sorted_scattered_idxs=sorted_scattered_idxs, k=k, x_grouped=x_grouped, y_grouped=y_grouped, out=out, ) # 2. 
XA = X @ A^T (tiny: output is [M*k, R]) # Reshape A: [R*E, K] → [E, K, R] (expert weights for scatter2scatter) W_A = lora_A.reshape(E, R, K).permute(0, 2, 1).contiguous() XA = scatter2scatter( X=X, W=W_A, sorted_expert_idxs=sorted_expert_idxs, sorted_scattered_idxs=sorted_scattered_idxs, k=k, x_grouped=x_grouped, y_grouped=True, ) # 3. Y_lora = XA @ B^T (R is tiny, so this is very fast) # Reshape B: [N, R*E] → [E, R, N] W_B = lora_B.T.reshape(E, R, N).contiguous() Y_lora = scatter2scatter( X=XA, W=W_B, sorted_expert_idxs=sorted_expert_idxs, sorted_scattered_idxs=sorted_scattered_idxs, k=1, x_grouped=True, y_grouped=y_grouped, ) # 4. Y = Y_base + scaling * Y_lora output.add_(Y_lora, alpha=scaling) return output # Threshold for switching from fused to split LoRA forward. # Split wins when per-expert matmul is large (bandwidth-bound LoRA tile # loads dominate in the fused kernel's inner loop). # Empirically: split wins for E<=32 with K*N > 20M (e.g. Mixtral, Phi-MoE). _SPLIT_LORA_FWD_THRESHOLD = 20_000_000 # per-expert K*N _SPLIT_LORA_FWD_MAX_EXPERTS = 32 def scatter2scatter_lora( X: torch.Tensor, W: torch.Tensor, sorted_expert_idxs: torch.Tensor, sorted_scattered_idxs: torch.Tensor, k: int, lora_A: torch.Tensor, lora_B: torch.Tensor, scaling: float, b: Optional[torch.Tensor] = None, x_grouped: bool = False, y_grouped: bool = False, out: Optional[torch.Tensor] = None, ) -> torch.Tensor: """ Scatter2scatter with LoRA: Y[i] = X[i] @ W[e] + scaling * (X[i] @ A[e]^T) @ B[e]^T + b[e] Automatically selects between: - Fused kernel: single Triton kernel with LoRA in the inner loop. Best for many small experts (E>=64, small K*N). - Split dispatch: 3 separate scatter2scatter calls (base + XA + lora). Best for few large experts (E<=32, large K*N like Mixtral). 
Args: X: Input [M, K] or [M*k, K] if x_grouped W: Expert weights [E, K, N] sorted_expert_idxs: Expert assignments sorted [M*k] sorted_scattered_idxs: Original indices sorted [M*k] k: Fan-out (top-k) lora_A: LoRA A weights [r*E, K] lora_B: LoRA B weights [N, r*E] scaling: LoRA scaling factor (alpha/r) b: Optional bias [E, N] x_grouped: Input pre-grouped by expert y_grouped: Keep output grouped out: Optional pre-allocated output buffer Returns: Y: Output [M*k, N] """ E = W.size(0) K = W.size(1) N = W.size(2) # Dispatch: split for few large experts, fused for many small experts if E <= _SPLIT_LORA_FWD_MAX_EXPERTS and K * N >= _SPLIT_LORA_FWD_THRESHOLD: return _scatter2scatter_lora_split( X, W, sorted_expert_idxs, sorted_scattered_idxs, k, lora_A, lora_B, scaling, b, x_grouped, y_grouped, out, ) assert sorted_scattered_idxs.size(0) == sorted_expert_idxs.size(0) assert sorted_scattered_idxs.size(0) == X.size(0) * k R = lora_A.size(0) // E # Pad R to power of 2 for Triton tile size BLOCK_R = _block_r_for_rank(R) L_scattered = sorted_expert_idxs.size(0) if out is None: output = torch.empty((L_scattered, N), device=X.device, dtype=X.dtype) else: assert out.size(0) == L_scattered and out.size(1) == N output = out def grid(META): return ( triton.cdiv(L_scattered, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]), ) if b is None: stride_be = stride_bn = 0 b_ptr = None else: stride_be, stride_bn = b.stride() b_ptr = b _scatter2scatter_lora[grid]( X, X.stride(0), X.stride(1), W, W.stride(0), W.stride(1), W.stride(2), output, output.stride(0), output.stride(1), b_ptr, stride_be, stride_bn, lora_A, lora_A.stride(0), lora_A.stride(1), lora_B, lora_B.stride(0), lora_B.stride(1), sorted_scattered_idxs, sorted_expert_idxs, FAN_OUT=k, M=X.size(0), K=K, N=N, E=E, ACTUAL_R=R, BLOCK_R=BLOCK_R, ACC_TYPE=tl.float32, scaling=scaling, allow_tf32=ALLOW_TF32, x_grouped=x_grouped, y_grouped=y_grouped, ) return output # 
# =============================================================================
# Backward Kernel: Fused dX = dY @ W^T + scaling * (dY @ B) @ A
# =============================================================================


@triton.jit
def _compute_expert_block_lora_dX(
    E_idx,
    E_mask,
    M_in_idx,
    K_block,
    K_mask,
    # Input: DY (gradient w.r.t. output)
    DY_ptr,
    stride_dym,
    stride_dyn,
    # Base weight W^T: we load W[e] as [K, N] and index as W^T[e] = [N, K]
    W_ptr,
    stride_we,
    stride_wk,
    stride_wn,
    # LoRA weights
    A_ptr,
    stride_ar,
    stride_ak,  # A: [r*E, K]
    B_ptr,
    stride_bn,
    stride_br,  # B: [N, r*E]
    # Dimensions
    N,
    ACTUAL_R: tl.constexpr,
    acc,
    no_n_mask,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
    BLOCK_R: tl.constexpr,
    scaling,
    allow_tf32: tl.constexpr,
):
    """
    Compute dX_block = DY_block @ W_e^T + scaling * (DY_block @ B_e) @ A_e
    for tokens in this M-block assigned to expert E_idx.

    Inner loop over N dimension (reduction dim for dY @ W^T and dY @ B).
    Output dimension is K. Epilogue computes (dY @ B) @ A.

    Transpose mapping from forward:
      Forward: X@W (K-loop), X@A^T (K-loop), (X@A^T)@B^T (epilogue)
      Backward: DY@W^T (N-loop), DY@B (N-loop), (DY@B)@A (epilogue)

    Rows whose E_mask is False load DY as zeros, so they add nothing to
    ``acc``; the updated ``acc`` is returned to the caller.
    """
    N_block = tl.arange(0, BLOCK_N)
    R_block = tl.arange(0, BLOCK_R)
    # Columns beyond the true rank are padding and are masked to zero on load.
    R_mask = R_block < ACTUAL_R

    # DY pointers: DY is [M_total, N], load [BLOCK_M, BLOCK_N]
    DY_blk_ptrs = (
        DY_ptr + M_in_idx[:, None] * stride_dym + N_block[None, :] * stride_dyn
    )

    # W^T pointers: W[e] is [K, N], W^T[e] is [N, K]. We load W^T as [BLOCK_N, BLOCK_K].
    # W stored as [E, K, N], so W^T[e][n, k] = W[e][k, n] = W_ptr + e*stride_we + k*stride_wk + n*stride_wn
    # As [BLOCK_N, BLOCK_K] tile: row=n, col=k
    WT_blk_ptrs = (
        W_ptr
        + E_idx * stride_we
        + N_block[:, None] * stride_wn  # row = n dimension
        + K_block[None, :] * stride_wk
    )  # col = k dimension

    # B pointers: B[e] is B[:, e*R:(e+1)*R], shape [N, R]. Load [BLOCK_N, BLOCK_R].
    B_expert_offset = E_idx * ACTUAL_R
    B_blk_ptrs = (
        B_ptr
        + N_block[:, None] * stride_bn
        + (B_expert_offset + R_block)[None, :] * stride_br
    )

    iters = tl.cdiv(N, BLOCK_N)

    # Accumulator for DY @ B: [BLOCK_M, BLOCK_R]
    dy_b_acc = tl.zeros((BLOCK_M, BLOCK_R), dtype=tl.float32)

    # Determine the input element type for consistent casting.
    INPUT_DTYPE = DY_ptr.dtype.element_ty

    for i in range(iters):
        if no_n_mask:
            # N is a multiple of BLOCK_N: no bounds check along N is needed.
            dy = tl.load(DY_blk_ptrs, mask=E_mask[:, None], other=0.0).to(INPUT_DTYPE)
            wt = tl.load(WT_blk_ptrs, mask=K_mask[None, :], other=0.0).to(INPUT_DTYPE)
            b = tl.load(B_blk_ptrs, mask=R_mask[None, :], other=0.0).to(INPUT_DTYPE)
        else:
            N_mask_iter = (i * BLOCK_N + N_block) < N
            dy = tl.load(
                DY_blk_ptrs, mask=E_mask[:, None] & N_mask_iter[None, :], other=0.0
            ).to(INPUT_DTYPE)
            wt = tl.load(
                WT_blk_ptrs, mask=N_mask_iter[:, None] & K_mask[None, :], other=0.0
            ).to(INPUT_DTYPE)
            b = tl.load(
                B_blk_ptrs, mask=N_mask_iter[:, None] & R_mask[None, :], other=0.0
            ).to(INPUT_DTYPE)

        # Base: acc += DY @ W^T ([M, N] @ [N, K] -> [M, K])
        acc += tl.dot(dy, wt, allow_tf32=allow_tf32).to(tl.float32)

        # LoRA: dy_b_acc += DY @ B ([M, N] @ [N, R] -> [M, R])
        dy_b_acc += tl.dot(dy, b, allow_tf32=allow_tf32).to(tl.float32)

        # Advance all three tiles to the next BLOCK_N slice of the reduction.
        DY_blk_ptrs += BLOCK_N * stride_dyn
        WT_blk_ptrs += BLOCK_N * stride_wn
        B_blk_ptrs += BLOCK_N * stride_bn

    # Epilogue: load A[e] and compute (DY @ B) @ A
    # A[e] is A[e*R:(e+1)*R, :], shape [R, K]. Load [BLOCK_R, BLOCK_K].
    A_expert_offset = E_idx * ACTUAL_R
    A_blk_ptrs = (
        A_ptr
        + (A_expert_offset + R_block)[:, None] * stride_ar
        + K_block[None, :] * stride_ak
    )
    a_e = tl.load(A_blk_ptrs, mask=R_mask[:, None] & K_mask[None, :], other=0.0).to(
        INPUT_DTYPE
    )

    # (DY @ B) @ A: [M, R] @ [R, K] -> [M, K]
    # tl.dot requires non-float32 inputs (tensor cores); cast accumulator back to input dtype
    lora_dx = tl.dot(dy_b_acc.to(INPUT_DTYPE), a_e, allow_tf32=allow_tf32)
    acc += scaling * lora_dx

    return acc


def _scatter2scatter_lora_dX_configs():
    """Generate backward dX kernel autotune configs.

    The inner loop is over N (not K as in forward). The output dimension is K.
    So BLOCK_K tiles the output and BLOCK_N tiles the reduction.
    BLOCK_M is now autotunable (was fixed at 128).

    Search space:
        BLOCK_M: {32, 64, 128} (token tile)
        BLOCK_K: {32, 64, 128, 256} (output tile)
        BLOCK_N: {32, 64, 128, 256} (reduction tile)
        num_warps: {4, 8}
        num_stages: {3, 4, 5}

    Returns:
        A list with one ``triton.Config`` per element of the Cartesian product.
    """
    configs = []
    for block_m, block_k, block_n, warps, stages in product(
        [32, 64, 128],  # BLOCK_M
        [32, 64, 128, 256],  # BLOCK_K (output dimension)
        [32, 64, 128, 256],  # BLOCK_N (reduction dimension)
        [4, 8],  # num_warps
        [3, 4, 5],  # num_stages
    ):
        configs.append(
            triton.Config(
                {"BLOCK_M": block_m, "BLOCK_K": block_k, "BLOCK_N": block_n},
                num_stages=stages,
                num_warps=warps,
            )
        )
    return configs


def _prune_dX_configs(configs, named_args, **kwargs):
    """Prune backward dX configs based on SMEM capacity and register pressure.

    The dX kernel inner loop loads three tiles per pipeline stage:
    DY[BLOCK_M, BLOCK_N], W^T[BLOCK_N, BLOCK_K], B[BLOCK_N, BLOCK_R].
    The base estimate only accounts for DY and W^T. We add:
      - B tile [BLOCK_N, BLOCK_R] per pipeline stage (loaded in the inner loop)
      - A tile [BLOCK_R, BLOCK_K] loaded once in the epilogue
      - Extra headroom for compiler overhead (register spills, metadata)

    Args:
        configs: Candidate ``triton.Config`` objects (not modified).
        named_args: Kernel arguments by name; only "BLOCK_R" is read.
        **kwargs: Accepted and ignored.

    Returns:
        A non-empty list of configs: all that pass both checks; otherwise the
        register-passing config with the smallest estimated SMEM; otherwise
        the config with the smallest BLOCK_M * BLOCK_K * BLOCK_N.
    """
    smem_cap = _get_smem_capacity()

    # Get BLOCK_R from named_args if available, else assume worst case
    block_r = named_args.get("BLOCK_R", 64)

    scored = []
    for config in configs:
        block_m = config.kwargs["BLOCK_M"]
        block_k = config.kwargs["BLOCK_K"]
        block_n = config.kwargs["BLOCK_N"]

        # Base: stages * BLOCK_N * (BLOCK_M + BLOCK_K) + BLOCK_M * BLOCK_K
        smem_base = _estimate_smem_usage(config.num_stages, block_m, block_k, block_n)
        # B tile [BLOCK_N, BLOCK_R] loaded per stage in the inner loop
        # (the factor 2 is presumably bytes per 16-bit element; confirm
        # against _estimate_smem_usage)
        smem_lora_loop = config.num_stages * block_n * block_r * 2
        # A tile [BLOCK_R, BLOCK_K] loaded once in epilogue
        smem_lora_epilogue = block_r * block_k * 2
        smem = smem_base + smem_lora_loop + smem_lora_epilogue

        # Register pressure: live tiles are acc[M,K], dy_b_acc[M,R],
        # dy[M,N], wt[N,K], b[N,R], plus epilogue a[R,K]
        est_regs = _estimate_register_pressure(
            config.num_warps,
            (block_m, block_k),  # acc
            (block_m, block_r),  # dy_b_acc
            (block_m, block_n),  # dy tile
            (block_n, block_k),  # wt tile
            (block_n, block_r),  # b tile
            (block_r, block_k),  # a tile (epilogue)
        )
        if est_regs > _MAX_REGS_SOFT_LIMIT:
            continue

        scored.append((smem, config))

    pruned = [c for s, c in scored if s <= smem_cap - _SMEM_SLACK]
    if pruned:
        return pruned
    if scored:
        # All surviving configs exceed SMEM — return the one with smallest usage
        scored.sort(key=lambda x: x[0])
        return [scored[0][1]]
    # All configs pruned by register pressure — fall back to smallest tiles
    return [
        min(
            configs,
            key=lambda c: (
                c.kwargs["BLOCK_M"] * c.kwargs["BLOCK_K"] * c.kwargs["BLOCK_N"]
            ),
        )
    ]


@triton.autotune(
    configs=_scatter2scatter_lora_dX_configs(),
    key=["M", "N", "K"],
    prune_configs_by={"early_config_prune": _prune_dX_configs},
)
@triton.heuristics(
    {
        "NO_K_MASK": lambda args: (args["K"] % args["BLOCK_K"]) == 0,
        "NO_N_MASK": lambda args: (args["N"] % args["BLOCK_N"]) == 0,
    }
)
@triton.jit
def _scatter2scatter_lora_dX(
    # Input: DY (gradient w.r.t. output, grouped)
    DY_ptr,
    stride_dym: tl.constexpr,
    stride_dyn: tl.constexpr,
    # Base weight: W [E, K, N] (we compute DY @ W^T)
    W_ptr,
    stride_we,
    stride_wk: tl.constexpr,
    stride_wn: tl.constexpr,
    # Output: dX
    DX_ptr,
    stride_dxm: tl.constexpr,
    stride_dxk: tl.constexpr,
    # LoRA weights
    LA_ptr,
    stride_la_r,
    stride_la_k,  # A: [r*E, K]
    LB_ptr,
    stride_lb_n,
    stride_lb_r,  # B: [N, r*E]
    # Routing
    grouped_idx_ptr,
    expert_idxs_ptr,
    # Dimensions
    FAN_OUT: tl.constexpr,
    M,
    K: tl.constexpr,
    N: tl.constexpr,
    E: tl.constexpr,
    ACTUAL_R: tl.constexpr,
    # Block sizes
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
    BLOCK_R: tl.constexpr,
    # Config
    ACC_TYPE: tl.constexpr,
    scaling,
    allow_tf32: tl.constexpr,
    dy_grouped: tl.constexpr,
    dx_grouped: tl.constexpr,
    NO_K_MASK: tl.constexpr,
    NO_N_MASK: tl.constexpr,
):
    """
    Fused backward dX = DY @ W^T + scaling * (DY @ B) @ A

    DY is in expert-grouped order (x_grouped=True).
    dX is output in ungrouped or grouped order based on dx_grouped.

    Grid: (cdiv(M_total, BLOCK_M) * cdiv(K, BLOCK_K),)

    NOTE: NO_K_MASK is supplied by the heuristic but is not read in this
    body; K_mask is always applied. Only NO_N_MASK changes the load path.
    """
    pid = tl.program_id(axis=0)

    # Decompose the flat program id into (token tile, output-K tile).
    K_BLOCK_COUNT = tl.cdiv(K, BLOCK_K)
    M_block_id = pid // K_BLOCK_COUNT
    K_block_id = pid % K_BLOCK_COUNT

    M_block = M_block_id * BLOCK_M + tl.arange(0, BLOCK_M)
    K_block = K_block_id * BLOCK_K + tl.arange(0, BLOCK_K)
    K_mask = K_block < K
    M_boundary_mask = M_block < (FAN_OUT * M)
    # Out-of-range rows get the sentinel expert id E, which matches no expert.
    E_idxs = tl.load(expert_idxs_ptr + M_block, mask=M_boundary_mask, other=E)

    no_n_mask = NO_N_MASK

    acc = tl.zeros((BLOCK_M, BLOCK_K), dtype=ACC_TYPE)
    E_first_idx = tl.min(E_idxs)
    # Clamp so the sentinel E from masked rows never becomes a loop bound.
    E_last_idx = tl.minimum(tl.max(E_idxs), E - 1)
    M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int32)

    # A token tile may straddle several experts; visit each one present.
    for E_idx in range(E_first_idx, E_last_idx + 1):
        E_mask = E_idxs == E_idx
        if dy_grouped:
            M_in_idx = M_block
        else:
            M_in_idx = M_idx // FAN_OUT
        acc = _compute_expert_block_lora_dX(
            E_idx,
            E_mask,
            M_in_idx,
            K_block,
            K_mask,
            DY_ptr,
            stride_dym,
            stride_dyn,
            W_ptr,
            stride_we,
            stride_wk,
            stride_wn,
            LA_ptr,
            stride_la_r,
            stride_la_k,
            LB_ptr,
            stride_lb_n,
            stride_lb_r,
            N,
            ACTUAL_R,
            acc,
            no_n_mask,
            BLOCK_M,
            BLOCK_N,
            BLOCK_K,
            BLOCK_R,
            scaling,
            allow_tf32=allow_tf32,
        )

    # Store output
    if dx_grouped:
        M_out_idx = M_block
    else:
        M_out_idx = M_idx

    DX_blk_ptrs = DX_ptr + (
        M_out_idx[:, None] * stride_dxm + K_block[None, :] * stride_dxk
    )
    tl.store(DX_blk_ptrs, acc, mask=M_boundary_mask[:, None] & K_mask[None, :])


def scatter2scatter_lora_dX(
    DY: torch.Tensor,
    W: torch.Tensor,
    sorted_expert_idxs: torch.Tensor,
    sorted_scattered_idxs: torch.Tensor,
    k: int,
    lora_A: torch.Tensor,
    lora_B: torch.Tensor,
    scaling: float,
    dy_grouped: bool = True,
    dx_grouped: bool = False,
    out: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """
    Fused backward dX = DY @ W^T + scaling * (DY @ B) @ A

    Replaces the separate:
      1. base_ops.scatter2scatter(DY, W^T, x_grouped=True, ...)
      2. _compute_lora_input_grad(DY, A, B, ...)

    Args:
        DY: Gradient w.r.t. output [M*k, N] (grouped by expert)
        W: Expert weights [E, K, N] (NOT transposed — kernel handles W^T internally)
        sorted_expert_idxs: Expert assignments sorted [M*k]
        sorted_scattered_idxs: Original indices sorted [M*k]
        k: Fan-out (top-k)
        lora_A: LoRA A weights [r*E, K]
        lora_B: LoRA B weights [N, r*E]
        scaling: LoRA scaling factor
        dy_grouped: Whether DY is in grouped (expert-sorted) order (default True)
        dx_grouped: Whether to output dX in grouped order (default False)
        out: Optional pre-allocated output buffer

    Returns:
        dX: Input gradient [M*k, K]
    """
    assert sorted_scattered_idxs.size(0) == sorted_expert_idxs.size(0)

    E = W.size(0)
    K = W.size(1)
    N = W.size(2)
    R = lora_A.size(0) // E
    BLOCK_R = _block_r_for_rank(R)
    L_scattered = sorted_expert_idxs.size(0)

    # Both branches pass DY.size(0) as M; only FAN_OUT differs. The kernel
    # bounds rows with FAN_OUT * M and, when DY is not grouped, indexes DY
    # with grouped_idx // FAN_OUT.
    if dy_grouped:
        M = DY.size(0)
        fan_out = 1  # DY is already expanded
    else:
        M = DY.size(0)
        fan_out = k

    if out is None:
        output = torch.empty((L_scattered, K), device=DY.device, dtype=DY.dtype)
    else:
        assert out.size(0) == L_scattered and out.size(1) == K
        output = out

    def grid(META):
        return (
            triton.cdiv(L_scattered, META["BLOCK_M"])
            * triton.cdiv(K, META["BLOCK_K"]),
        )

    _scatter2scatter_lora_dX[grid](
        DY,
        DY.stride(0),
        DY.stride(1),
        W,
        W.stride(0),
        W.stride(1),
        W.stride(2),
        output,
        output.stride(0),
        output.stride(1),
        lora_A,
        lora_A.stride(0),
        lora_A.stride(1),
        lora_B,
        lora_B.stride(0),
        lora_B.stride(1),
        sorted_scattered_idxs,
        sorted_expert_idxs,
        FAN_OUT=fan_out,
        M=M,
        K=K,
        N=N,
        E=E,
        ACTUAL_R=R,
        # BLOCK_M is autotuned (injected by triton.autotune from Config kwargs)
        BLOCK_R=BLOCK_R,
        ACC_TYPE=tl.float32,
        scaling=scaling,
        allow_tf32=ALLOW_TF32,
        dy_grouped=dy_grouped,
        dx_grouped=dx_grouped,
    )
    return output


# =============================================================================
# Backward Kernel: LoRA gradient computation (dA, dB)
# =============================================================================


def _group_bwd_lora_configs():
    """Generate backward (dA/dB) kernel autotune configs.

    Search space includes smaller tile sizes and fewer pipeline stages
    to support GPUs with limited shared memory (e.g. ~99KB on some GPUs).

    Search space:
        BLOCK_M: {32, 64, 128, 256} (token-loop tile)
        BLOCK_K: {32, 64, 128, 256}
        BLOCK_N: {32, 64, 128, 256}
        num_warps: {4, 8}
        num_stages: {3, 4, 5}

    The backward kernel also uses BLOCK_R (from LoRA rank), but that is
    determined by the rank and not autotunable.
    """
    configs = []
    for block_m, block_k, block_n, warps, stages in product(
        [32, 64, 128, 256],  # BLOCK_M
        [32, 64, 128, 256],  # BLOCK_K
        [32, 64, 128, 256],  # BLOCK_N
        [4, 8],  # num_warps
        [3, 4, 5],  # num_stages
    ):
        configs.append(
            triton.Config(
                {"BLOCK_M": block_m, "BLOCK_K": block_k, "BLOCK_N": block_n},
                num_stages=stages,
                num_warps=warps,
            )
        )
    return configs


def _prune_bwd_lora_configs(configs, named_args, **kwargs):
    """Prune backward configs based on SMEM capacity and register pressure.

    The backward kernel loads X[BLOCK_M, BLOCK_K] and DY[BLOCK_M, BLOCK_N]
    in the inner loop, plus holds A[BLOCK_R, BLOCK_K] and B[BLOCK_N, BLOCK_R]
    for the full expert. We estimate SMEM based on the dominant terms.

    Returns a non-empty list using the same three-tier fallback as
    ``_prune_dX_configs``; ``configs`` itself is not modified.
    """
    smem_cap = _get_smem_capacity()
    block_r = named_args.get("BLOCK_R", 64)

    scored = []
    for config in configs:
        block_m = config.kwargs["BLOCK_M"]
        block_k = config.kwargs["BLOCK_K"]
        block_n = config.kwargs["BLOCK_N"]

        # Inner loop loads X[M,K] and DY[M,N], pipeline over M iterations
        # NOTE(review): the argument order (m, n, k) differs from the dX
        # pruner's (m, k, n) — confirm against _estimate_smem_usage.
        smem_base = _estimate_smem_usage(config.num_stages, block_m, block_n, block_k)
        # A[BLOCK_R, BLOCK_K] and B[BLOCK_N, BLOCK_R] held for the full expert
        smem_lora = (block_r * block_k + block_n * block_r) * 2
        smem = smem_base + smem_lora

        # Register pressure: dA_acc[R,K], dB_acc[N,R], x[M,K], dy[M,N],
        # a[R,K], b[N,R], xa[M,R]. The kernel's dy_b[M,R] intermediate is
        # not passed to the estimate below.
        est_regs = _estimate_register_pressure(
            config.num_warps,
            (block_r, block_k),  # dA_acc
            (block_n, block_r),  # dB_acc
            (block_m, block_k),  # x tile
            (block_m, block_n),  # dy tile
            (block_r, block_k),  # a tile
            (block_n, block_r),  # b tile
            (block_m, block_r),  # xa intermediate
        )
        if est_regs > _MAX_REGS_SOFT_LIMIT:
            continue

        scored.append((smem, config))

    pruned = [c for s, c in scored if s <= smem_cap - _SMEM_SLACK]
    if pruned:
        return pruned
    if scored:
        # All surviving configs exceed SMEM — return the one with smallest usage
        scored.sort(key=lambda x: x[0])
        return [scored[0][1]]
    # All configs pruned by register pressure — fall back to smallest tiles
    return [
        min(
            configs,
            key=lambda c: (
                c.kwargs["BLOCK_M"] * c.kwargs["BLOCK_K"] * c.kwargs["BLOCK_N"]
            ),
        )
    ]


@triton.autotune(
    configs=_group_bwd_lora_configs(),
    key=["M", "N", "K"],
    prune_configs_by={"early_config_prune": _prune_bwd_lora_configs},
    reset_to_zero=["DLA_ptr", "DLB_ptr"],
)
@triton.heuristics(
    {
        "NO_K_MASK": lambda args: (args["K"] % args["BLOCK_K"]) == 0,
        "NO_N_MASK": lambda args: (args["N"] % args["BLOCK_N"]) == 0,
    }
)
@triton.jit
def _group_bwd_lora(
    # Inputs
    DY_ptr,
    stride_dym,
    stride_dyn,
    X_ptr,
    stride_xm,
    stride_xk,
    # LoRA weights (needed for cross-terms)
    LA_ptr,
    stride_la_r,
    stride_la_k,  # A: [r*E, K]
    LB_ptr,
    stride_lb_n,
    stride_lb_r,  # B: [N, r*E]
    # Gradient outputs
    DLA_ptr,
    stride_dla_r,
    stride_dla_k,
    DLB_ptr,
    stride_dlb_n,
    stride_dlb_r,
    # Expert offsets
    expert_offsets_ptr,
    # Dimensions
    M,
    K: tl.constexpr,
    N: tl.constexpr,
    ACTUAL_R: tl.constexpr,  # True LoRA rank (for weight indexing)
    BLOCK_R: tl.constexpr,  # Padded tile size >= max(ACTUAL_R, 16)
    scaling,
    # Block sizes
    BLOCK_M: tl.constexpr,
    BLOCK_K: tl.constexpr,
    BLOCK_N: tl.constexpr,
    ACC_TYPE: tl.constexpr,
    allow_tf32: tl.constexpr,
    NO_K_MASK: tl.constexpr,
    NO_N_MASK: tl.constexpr,
):
    """
    Compute LoRA gradients for each expert on grouped data.

    Grid: (E * cdiv(K, BLOCK_K), cdiv(N, BLOCK_N))

    For expert e:
      dA[e] = scaling * (dY @ B[e])^T @ X -> [r, K], accumulate over M tokens
      dB[e] = scaling * dY^T @ (X @ A[e]^T) -> [N, r], accumulate over M tokens

    ACTUAL_R is the true LoRA rank. BLOCK_R >= ACTUAL_R is padded for tl.dot min size.

    Results are added with tl.atomic_add, so DLA/DLB must start at zero
    (the autotuner resets them between trials via reset_to_zero).
    """
    pid0 = tl.program_id(axis=0)
    pid1 = tl.program_id(axis=1)

    K_BLOCK_COUNT = tl.cdiv(K, BLOCK_K)
    E_idx = pid0 // K_BLOCK_COUNT
    K_block_id = pid0 % K_BLOCK_COUNT
    N_block_id = pid1

    # Get expert's token range from cumulative offsets
    if E_idx == 0:
        start_idx = 0
    else:
        start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int32)
    end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int32)
    num_tokens = end_idx - start_idx

    if num_tokens > 0:
        M_block = tl.arange(0, BLOCK_M)
        K_block = K_block_id * BLOCK_K + tl.arange(0, BLOCK_K)
        K_mask = K_block < K
        N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
        N_mask = N_block < N
        R_block = tl.arange(0, BLOCK_R)
        R_mask = R_block < ACTUAL_R  # Mask for padding
        lora_offset = E_idx * ACTUAL_R

        # Determine input element type for consistent casting.
        INPUT_DTYPE = X_ptr.dtype.element_ty

        # Load B[e]: [BLOCK_N, BLOCK_R] (masked on R and N, other=0 for padding)
        B_blk_ptrs = (
            LB_ptr
            + N_block[:, None] * stride_lb_n
            + (lora_offset + R_block)[None, :] * stride_lb_r
        )
        b_e = tl.load(B_blk_ptrs, mask=N_mask[:, None] & R_mask[None, :], other=0.0).to(
            INPUT_DTYPE
        )

        # Load A[e]: [BLOCK_R, BLOCK_K] (masked on R and K, other=0 for padding)
        A_blk_ptrs = (
            LA_ptr
            + (lora_offset + R_block)[:, None] * stride_la_r
            + K_block[None, :] * stride_la_k
        )
        a_e = tl.load(A_blk_ptrs, mask=R_mask[:, None] & K_mask[None, :], other=0.0).to(
            INPUT_DTYPE
        )

        # Accumulators
        dA_acc = tl.zeros((BLOCK_R, BLOCK_K), dtype=ACC_TYPE)
        dB_acc = tl.zeros((BLOCK_N, BLOCK_R), dtype=ACC_TYPE)

        iters = tl.cdiv(num_tokens, BLOCK_M)
        for i in range(iters):
            M_idx = start_idx + i * BLOCK_M + M_block
            M_mask = M_idx < end_idx

            # Load X: [BLOCK_M, BLOCK_K]
            X_blk_ptrs = (
                X_ptr + M_idx[:, None] * stride_xm + K_block[None, :] * stride_xk
            )
            x = tl.load(
                X_blk_ptrs, mask=M_mask[:, None] & K_mask[None, :], other=0.0
            ).to(INPUT_DTYPE)

            # Load dY: [BLOCK_M, BLOCK_N]
            DY_blk_ptrs = (
                DY_ptr + M_idx[:, None] * stride_dym + N_block[None, :] * stride_dyn
            )
            dy = tl.load(
                DY_blk_ptrs, mask=M_mask[:, None] & N_mask[None, :], other=0.0
            ).to(INPUT_DTYPE)

            # X @ A[e]^T: [M, K] @ [K, R] -> [M, R]
            xa = tl.dot(x, tl.trans(a_e), allow_tf32=allow_tf32)

            # dY @ B[e]: [M, N] @ [N, R] -> [M, R]
            dy_b = tl.dot(dy, b_e, allow_tf32=allow_tf32)

            # Cast intermediates to input dtype for subsequent tl.dot calls
            # (tl.dot requires both operands to have the same dtype)
            dy_b_cast = dy_b.to(INPUT_DTYPE)
            xa_cast = xa.to(INPUT_DTYPE)

            # dA += (dY @ B)^T @ X: [R, M] @ [M, K] -> [R, K]
            dA_acc += tl.dot(tl.trans(dy_b_cast), x, allow_tf32=allow_tf32)

            # dB += dY^T @ (X @ A^T): [N, M] @ [M, R] -> [N, R]
            dB_acc += tl.dot(tl.trans(dy), xa_cast, allow_tf32=allow_tf32)

        # Store dA with scaling (atomic add since multiple N_blocks contribute)
        # Only store the actual R rows, not the padded ones
        DLA_blk_ptrs = (
            DLA_ptr +
(lora_offset + R_block)[:, None] * stride_dla_r + K_block[None, :] * stride_dla_k ) tl.atomic_add( DLA_blk_ptrs, (dA_acc * scaling).to(DLA_ptr.dtype.element_ty), mask=R_mask[:, None] & K_mask[None, :], ) # Store dB with scaling (atomic add since multiple K_blocks contribute) DLB_blk_ptrs = ( DLB_ptr + N_block[:, None] * stride_dlb_n + (lora_offset + R_block)[None, :] * stride_dlb_r ) tl.atomic_add( DLB_blk_ptrs, (dB_acc * scaling).to(DLB_ptr.dtype.element_ty), mask=N_mask[:, None] & R_mask[None, :], ) def _group_bwd_split_configs(): """Autotune configs for split dA/dB kernels.""" configs = [] for block_m, block_dim, warps, stages in product( [32, 64, 128], # BLOCK_M (token tile) [32, 64, 128, 256], # BLOCK_DIM (K for dA, N for dB — output tile) [4, 8], # num_warps [3, 4, 5], # num_stages ): configs.append( triton.Config( {"BLOCK_M": block_m, "BLOCK_DIM": block_dim}, num_stages=stages, num_warps=warps, ) ) return configs def _prune_split_configs(configs, named_args, **kwargs): """Prune split kernel configs based on SMEM capacity and register pressure.""" smem_cap = _get_smem_capacity() block_r = named_args.get("BLOCK_R", 64) # Fixed inner tile for reduction dimension BLOCK_INNER = 64 pruned = [] for config in configs: block_m = config.kwargs["BLOCK_M"] block_dim = config.kwargs["BLOCK_DIM"] # Inner loop loads: input[M, INNER] and other[M, INNER_or_DIM] smem = config.num_stages * BLOCK_INNER * (block_m + block_dim) * 2 # LoRA weights held in registers: [INNER, R] or [R, DIM] smem += (block_r * max(block_dim, BLOCK_INNER)) * 2 # Register pressure check est_regs = _estimate_register_pressure( config.num_warps, (block_r, block_dim), # acc (block_m, BLOCK_INNER), # input tile (block_m, block_dim), # other tile (block_r, BLOCK_INNER), # lora weight ) if est_regs > _MAX_REGS_SOFT_LIMIT: continue if smem <= smem_cap - _SMEM_SLACK: pruned.append(config) if pruned: return pruned configs.sort(key=lambda c: c.kwargs["BLOCK_M"] * c.kwargs["BLOCK_DIM"]) return [configs[0]] 
@triton.autotune(
    configs=_group_bwd_split_configs(),
    key=["M", "K", "N"],
    prune_configs_by={"early_config_prune": _prune_split_configs},
)
@triton.heuristics(
    {
        "NO_DIM_MASK": lambda args: (
            (args["K"] % args["BLOCK_DIM"]) == 0
            if args["COMPUTE_DA"]
            else (args["N"] % args["BLOCK_DIM"]) == 0
        ),
    }
)
@triton.jit
def _group_bwd_lora_split(
    # Data tensors (DY and X are always present)
    DY_ptr,
    stride_dym,
    stride_dyn,
    X_ptr,
    stride_xm,
    stride_xk,
    # LoRA weight for the inner reduction (B for dA, A for dB)
    LW_ptr,
    stride_lw0,
    stride_lw1,
    # Output gradient tensor (dA or dB)
    OUT_ptr,
    stride_out0,
    stride_out1,
    # Expert offsets
    expert_offsets_ptr,
    # Dimensions
    M,
    K: tl.constexpr,
    N: tl.constexpr,
    ACTUAL_R: tl.constexpr,
    BLOCK_R: tl.constexpr,
    INNER_DIM: tl.constexpr,  # reduction dimension (N for dA, K for dB)
    scaling,
    # Mode flag
    COMPUTE_DA: tl.constexpr,  # True = compute dA, False = compute dB
    # Tile sizes
    BLOCK_M: tl.constexpr,
    BLOCK_DIM: tl.constexpr,
    ACC_TYPE: tl.constexpr,
    allow_tf32: tl.constexpr,
    NO_DIM_MASK: tl.constexpr,
):
    """
    Unified split kernel for LoRA gradient computation.

    When COMPUTE_DA=True:
      dA[e] = scaling * (dY @ B[e])^T @ X → [R, K]
      Grid: (E, cdiv(K, BLOCK_DIM))
      - outer_ptr/stride = X (read [M, K_block])
      - inner reduction over N using DY and B
      - output shape [BLOCK_R, BLOCK_DIM]

    When COMPUTE_DA=False:
      dB[e] = scaling * dY^T @ (X @ A[e]^T) → [N, R]
      Grid: (E, cdiv(N, BLOCK_DIM))
      - outer_ptr/stride = DY (read [M, N_block])
      - inner reduction over K using X and A
      - output shape [BLOCK_DIM, BLOCK_R]

    No atomic adds — each (E, dim_block) pair is written by exactly one block.

    NOTE: NO_DIM_MASK and M are accepted but not read in this body (M is an
    autotune key); dim_mask is always applied to loads and stores.
    """
    E_idx = tl.program_id(0)
    dim_block_id = tl.program_id(1)

    # Token range of this expert, from the cumulative offsets array.
    if E_idx == 0:
        start_idx = 0
    else:
        start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int32)
    end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int32)
    num_tokens = end_idx - start_idx

    # Output dimension tile (K for dA, N for dB)
    if COMPUTE_DA:
        OUT_DIM: tl.constexpr = K  # type: ignore[no-redef]
    else:
        OUT_DIM: tl.constexpr = N  # type: ignore[no-redef]

    dim_block = dim_block_id * BLOCK_DIM + tl.arange(0, BLOCK_DIM)
    dim_mask = dim_block < OUT_DIM
    R_block = tl.arange(0, BLOCK_R)
    R_mask = R_block < ACTUAL_R
    lora_offset = E_idx * ACTUAL_R

    # Output pointers — layout differs: dA is [R, K], dB is [N, R]
    if COMPUTE_DA:
        out_blk_ptrs = (
            OUT_ptr
            + (lora_offset + R_block)[:, None] * stride_out0
            + dim_block[None, :] * stride_out1
        )
        out_mask = R_mask[:, None] & dim_mask[None, :]
    else:
        out_blk_ptrs = (
            OUT_ptr
            + dim_block[:, None] * stride_out0
            + (lora_offset + R_block)[None, :] * stride_out1
        )
        out_mask = dim_mask[:, None] & R_mask[None, :]

    if num_tokens > 0:
        M_block = tl.arange(0, BLOCK_M)
        INPUT_DTYPE = X_ptr.dtype.element_ty
        # Fixed reduction tile; _prune_split_configs assumes the same value.
        BLOCK_INNER: tl.constexpr = 64
        inner_iters = tl.cdiv(INNER_DIM, BLOCK_INNER)

        if COMPUTE_DA:
            acc = tl.zeros((BLOCK_R, BLOCK_DIM), dtype=ACC_TYPE)
        else:
            acc = tl.zeros((BLOCK_DIM, BLOCK_R), dtype=ACC_TYPE)

        M_iters = tl.cdiv(num_tokens, BLOCK_M)
        for i in range(M_iters):
            M_idx = start_idx + i * BLOCK_M + M_block
            M_mask = M_idx < end_idx

            if COMPUTE_DA:
                # Load X[M, K_block] (the "outer" tensor for dA)
                outer = tl.load(
                    X_ptr + M_idx[:, None] * stride_xm + dim_block[None, :] * stride_xk,
                    mask=M_mask[:, None] & dim_mask[None, :],
                    other=0.0,
                ).to(INPUT_DTYPE)

                # Reduce DY[M, :] @ B[e][:, R] over N → [M, R]
                reduced = tl.zeros((BLOCK_M, BLOCK_R), dtype=ACC_TYPE)
                inner_range = tl.arange(0, BLOCK_INNER)
                for j in range(inner_iters):
                    inn_off = j * BLOCK_INNER + inner_range
                    inn_mask = inn_off < N

                    dy_tile = tl.load(
                        DY_ptr
                        + M_idx[:, None] * stride_dym
                        + inn_off[None, :] * stride_dyn,
                        mask=M_mask[:, None] & inn_mask[None, :],
                        other=0.0,
                    ).to(INPUT_DTYPE)

                    # B layout: [N, r*E] → stride_lw0=N stride, stride_lw1=r*E stride
                    lw_tile = tl.load(
                        LW_ptr
                        + inn_off[:, None] * stride_lw0
                        + (lora_offset + R_block)[None, :] * stride_lw1,
                        mask=inn_mask[:, None] & R_mask[None, :],
                        other=0.0,
                    ).to(INPUT_DTYPE)

                    reduced += tl.dot(dy_tile, lw_tile, allow_tf32=allow_tf32)

                # dA += (DY@B)^T @ X: [R, M] @ [M, K_block] → [R, K_block]
                acc += tl.dot(
                    tl.trans(reduced.to(INPUT_DTYPE)), outer, allow_tf32=allow_tf32
                )
            else:
                # Load DY[M, N_block] (the "outer" tensor for dB)
                outer = tl.load(
                    DY_ptr
                    + M_idx[:, None] * stride_dym
                    + dim_block[None, :] * stride_dyn,
                    mask=M_mask[:, None] & dim_mask[None, :],
                    other=0.0,
                ).to(INPUT_DTYPE)

                # Reduce X[M, :] @ A[e][:, :].T over K → [M, R]
                reduced = tl.zeros((BLOCK_M, BLOCK_R), dtype=ACC_TYPE)
                inner_range = tl.arange(0, BLOCK_INNER)
                for j in range(inner_iters):
                    inn_off = j * BLOCK_INNER + inner_range
                    inn_mask = inn_off < K

                    x_tile = tl.load(
                        X_ptr
                        + M_idx[:, None] * stride_xm
                        + inn_off[None, :] * stride_xk,
                        mask=M_mask[:, None] & inn_mask[None, :],
                        other=0.0,
                    ).to(INPUT_DTYPE)

                    # A layout: [r*E, K] → stride_lw0=r*E stride, stride_lw1=K stride
                    # We want A[e]^T: [K, R], so load as [K_inner, R]
                    lw_tile = tl.load(
                        LW_ptr
                        + (lora_offset + R_block)[None, :] * stride_lw0
                        + inn_off[:, None] * stride_lw1,
                        mask=inn_mask[:, None] & R_mask[None, :],
                        other=0.0,
                    ).to(INPUT_DTYPE)

                    reduced += tl.dot(x_tile, lw_tile, allow_tf32=allow_tf32)

                # dB += DY^T @ (X@A^T): [N_block, M] @ [M, R] → [N_block, R]
                acc += tl.dot(
                    tl.trans(outer), reduced.to(INPUT_DTYPE), allow_tf32=allow_tf32
                )

        # Plain store: this program is the only writer of this output tile.
        tl.store(
            out_blk_ptrs, (acc * scaling).to(OUT_ptr.dtype.element_ty), mask=out_mask
        )
    else:
        # Zero out this expert's slice — needed because output uses empty_like
        if COMPUTE_DA:
            tl.store(
                out_blk_ptrs,
                tl.zeros((BLOCK_R, BLOCK_DIM), dtype=OUT_ptr.dtype.element_ty),
                mask=out_mask,
            )
        else:
            tl.store(
                out_blk_ptrs,
                tl.zeros((BLOCK_DIM, BLOCK_R), dtype=OUT_ptr.dtype.element_ty),
                mask=out_mask,
            )


def group_bwd_lora(
    DY: torch.Tensor,
    X: torch.Tensor,
    lora_A: torch.Tensor,
    lora_B: torch.Tensor,
    expert_offsets: torch.Tensor,
    E: int,
    scaling: float,
    sorted_scattered_idxs: Optional[torch.Tensor] = None,
    k: int = 1,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Compute LoRA gradients for A and B on expert-grouped data.

    Uses split dA/dB kernels that eliminate atomic adds by giving each
    (expert, output_block) pair its own thread block.

    Args:
        DY: Gradient w.r.t. output [M_total, N] (grouped by expert)
        X: Input [M_total, K] (grouped by expert)
        lora_A: LoRA A weights [r*E, K]
        lora_B: LoRA B weights [N, r*E]
        expert_offsets: Cumulative token counts per expert [E]
        E: Number of experts
        scaling: LoRA scaling factor
        sorted_scattered_idxs: Accepted but not used by this function.
        k: Accepted but not used by this function.

    Returns:
        dA: Gradient for A [r*E, K]
        dB: Gradient for B [N, r*E]
    """
    R = lora_A.size(0) // E
    K = X.size(1)
    N = DY.size(1)

    # No zero-init needed: the split kernels write zeros for experts with
    # zero routed tokens directly in the kernel (else branch).
    dA = torch.empty_like(lora_A)
    dB = torch.empty_like(lora_B)

    BLOCK_R = _block_r_for_rank(R)

    def grid_dA(META):
        return (E, triton.cdiv(K, META["BLOCK_DIM"]))

    # dA pass: the reduction weight is B and the reduction runs over N.
    _group_bwd_lora_split[grid_dA](
        DY,
        DY.stride(0),
        DY.stride(1),
        X,
        X.stride(0),
        X.stride(1),
        lora_B,
        lora_B.stride(0),
        lora_B.stride(1),
        dA,
        dA.stride(0),
        dA.stride(1),
        expert_offsets,
        M=DY.size(0),
        K=K,
        N=N,
        ACTUAL_R=R,
        BLOCK_R=BLOCK_R,
        INNER_DIM=N,
        scaling=scaling,
        COMPUTE_DA=True,
        ACC_TYPE=tl.float32,
        allow_tf32=ALLOW_TF32,
    )

    def grid_dB(META):
        return (E, triton.cdiv(N, META["BLOCK_DIM"]))

    # dB pass: the reduction weight is A and the reduction runs over K.
    _group_bwd_lora_split[grid_dB](
        DY,
        DY.stride(0),
        DY.stride(1),
        X,
        X.stride(0),
        X.stride(1),
        lora_A,
        lora_A.stride(0),
        lora_A.stride(1),
        dB,
        dB.stride(0),
        dB.stride(1),
        expert_offsets,
        M=DY.size(0),
        K=K,
        N=N,
        ACTUAL_R=R,
        BLOCK_R=BLOCK_R,
        INNER_DIM=K,
        scaling=scaling,
        COMPUTE_DA=False,
        ACC_TYPE=tl.float32,
        allow_tf32=ALLOW_TF32,
    )

    return dA, dB


# =============================================================================
# Backward Kernel: Fused gather + LoRA gradient (dA, dB) —
eliminates group() # ============================================================================= @triton.autotune( configs=_group_bwd_lora_configs(), key=["M", "N", "K"], prune_configs_by={"early_config_prune": _prune_bwd_lora_configs}, reset_to_zero=["DLA_ptr", "DLB_ptr"], ) @triton.heuristics( { "NO_K_MASK": lambda args: (args["K"] % args["BLOCK_K"]) == 0, "NO_N_MASK": lambda args: (args["N"] % args["BLOCK_N"]) == 0, } ) @triton.jit def _group_bwd_lora_fused( # Inputs (ungrouped or grouped) DY_ptr, stride_dym, stride_dyn, X_ptr, stride_xm, stride_xk, # Scatter indices for gather-on-load sorted_scattered_idxs_ptr, FAN_OUT: tl.constexpr, # LoRA weights (needed for cross-terms) LA_ptr, stride_la_r, stride_la_k, # A: [r*E, K] LB_ptr, stride_lb_n, stride_lb_r, # B: [N, r*E] # Gradient outputs DLA_ptr, stride_dla_r, stride_dla_k, DLB_ptr, stride_dlb_n, stride_dlb_r, # Expert offsets expert_offsets_ptr, # Real expert offsets (for M_mask when using token rounding, else same as expert_offsets_ptr) real_expert_offsets_ptr, # Dimensions M, K: tl.constexpr, N: tl.constexpr, ACTUAL_R: tl.constexpr, BLOCK_R: tl.constexpr, scaling, # Block sizes BLOCK_M: tl.constexpr, BLOCK_K: tl.constexpr, BLOCK_N: tl.constexpr, ACC_TYPE: tl.constexpr, allow_tf32: tl.constexpr, NO_K_MASK: tl.constexpr, NO_N_MASK: tl.constexpr, # Whether DY is already in grouped (expert-sorted) order dy_grouped: tl.constexpr = False, ): """ Fused gather + LoRA gradient computation. Same as _group_bwd_lora but reads X from ungrouped buffers using sorted_scattered_idxs for indirect indexing, eliminating the need for a separate group(X) call. When dy_grouped=False (default): both X and DY are read via indirect indexing through sorted_scattered_idxs. This eliminates both group() calls entirely. When dy_grouped=True: DY is already in grouped order (e.g. gate_up_proj backward where grouped_out=True) and is read directly. Only X uses indirect indexing. 
This avoids the group(X) allocation while still supporting the grouped DY case. Grid: (E * cdiv(K, BLOCK_K), cdiv(N, BLOCK_N)) For expert e: dA[e] = scaling * (dY @ B[e])^T @ X -> [r, K] dB[e] = scaling * dY^T @ (X @ A[e]^T) -> [N, r] Supports token rounding: expert_offsets_ptr gives the iteration range (padded to BLOCK_M multiples), real_expert_offsets_ptr gives the real token count for M_mask (to exclude padding tokens). """ pid0 = tl.program_id(axis=0) pid1 = tl.program_id(axis=1) K_BLOCK_COUNT = tl.cdiv(K, BLOCK_K) E_idx = pid0 // K_BLOCK_COUNT K_block_id = pid0 % K_BLOCK_COUNT N_block_id = pid1 # Get expert's token range from cumulative offsets # start_idx/end_idx from expert_offsets_ptr: iteration range (possibly padded) # real_end_idx from real_expert_offsets_ptr: for M_mask (real token count) if E_idx == 0: start_idx = 0 real_start_idx = 0 else: start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int32) real_start_idx = tl.load(real_expert_offsets_ptr + E_idx - 1).to(tl.int32) end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int32) real_end_idx = tl.load(real_expert_offsets_ptr + E_idx).to(tl.int32) num_tokens = end_idx - start_idx if num_tokens > 0: M_block = tl.arange(0, BLOCK_M) K_block = K_block_id * BLOCK_K + tl.arange(0, BLOCK_K) K_mask = K_block < K N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N) N_mask = N_block < N R_block = tl.arange(0, BLOCK_R) R_mask = R_block < ACTUAL_R lora_offset = E_idx * ACTUAL_R # Determine input element type for consistent casting. 
INPUT_DTYPE = X_ptr.dtype.element_ty # Load B[e] and A[e] — same as non-fused kernel B_blk_ptrs = ( LB_ptr + N_block[:, None] * stride_lb_n + (lora_offset + R_block)[None, :] * stride_lb_r ) b_e = tl.load(B_blk_ptrs, mask=N_mask[:, None] & R_mask[None, :], other=0.0).to( INPUT_DTYPE ) A_blk_ptrs = ( LA_ptr + (lora_offset + R_block)[:, None] * stride_la_r + K_block[None, :] * stride_la_k ) a_e = tl.load(A_blk_ptrs, mask=R_mask[:, None] & K_mask[None, :], other=0.0).to( INPUT_DTYPE ) # Accumulators dA_acc = tl.zeros((BLOCK_R, BLOCK_K), dtype=ACC_TYPE) dB_acc = tl.zeros((BLOCK_N, BLOCK_R), dtype=ACC_TYPE) real_num_tokens = real_end_idx - real_start_idx iters = tl.cdiv(num_tokens, BLOCK_M) for i in range(iters): M_idx = start_idx + i * BLOCK_M + M_block # Use real token count for masking (excludes padding tokens) M_local = i * BLOCK_M + M_block M_mask = M_local < real_num_tokens # Fused gather: load scatter indices for indirect X access scatter_idx = tl.load( sorted_scattered_idxs_ptr + M_idx, mask=M_mask, other=0 ).to(tl.int32) X_token_idx = scatter_idx // FAN_OUT # X is [M, K], not expanded by k # Load X via indirect index: [BLOCK_M, BLOCK_K] X_blk_ptrs = ( X_ptr + X_token_idx[:, None] * stride_xm + K_block[None, :] * stride_xk ) x = tl.load( X_blk_ptrs, mask=M_mask[:, None] & K_mask[None, :], other=0.0 ).to(INPUT_DTYPE) # Load DY: indirect via scatter_idx when ungrouped, direct via M_idx when grouped if dy_grouped: DY_blk_ptrs = ( DY_ptr + M_idx[:, None] * stride_dym + N_block[None, :] * stride_dyn ) else: DY_blk_ptrs = ( DY_ptr + scatter_idx[:, None] * stride_dym + N_block[None, :] * stride_dyn ) dy = tl.load( DY_blk_ptrs, mask=M_mask[:, None] & N_mask[None, :], other=0.0 ).to(INPUT_DTYPE) # X @ A[e]^T: [M, K] @ [K, R] -> [M, R] xa = tl.dot(x, tl.trans(a_e), allow_tf32=allow_tf32) # dY @ B[e]: [M, N] @ [N, R] -> [M, R] dy_b = tl.dot(dy, b_e, allow_tf32=allow_tf32) dy_b_cast = dy_b.to(INPUT_DTYPE) xa_cast = xa.to(INPUT_DTYPE) # dA += (dY @ B)^T @ X: [R, M] @ [M, K] 
def group_bwd_lora_fused(
    DY: torch.Tensor,
    X: torch.Tensor,
    lora_A: torch.Tensor,
    lora_B: torch.Tensor,
    expert_offsets: torch.Tensor,
    sorted_scattered_idxs: torch.Tensor,
    E: int,
    k: int,
    scaling: float,
    real_expert_offsets: Optional[torch.Tensor] = None,
    dy_grouped: bool = False,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Launch the fused gather + LoRA-gradient kernel.

    Produces the same gradients as grouping ``X`` and ``DY`` first and then
    running the non-fused LoRA backward, but reads the ungrouped tensors
    through ``sorted_scattered_idxs`` so no grouped copies are materialised.

    Args:
        DY: Gradient w.r.t. the output, ``[M*k, N]``. Read indirectly through
            ``sorted_scattered_idxs`` when ``dy_grouped`` is False, or
            directly (already expert-sorted) when ``dy_grouped`` is True.
        X: Input ``[M, K]`` in original token order; always read indirectly.
        lora_A: LoRA A weights ``[r*E, K]``.
        lora_B: LoRA B weights ``[N, r*E]``.
        expert_offsets: Cumulative token counts per expert ``[E]`` (padded
            offsets when token rounding is in use).
        sorted_scattered_idxs: Grouped position -> original position,
            ``[M*k]`` (padded variant when token rounding is in use).
        E: Number of experts.
        k: Fan-out (top-k).
        scaling: LoRA scaling factor applied to both gradients.
        real_expert_offsets: Unpadded cumulative counts used for token
            masking under token rounding. Defaults to ``expert_offsets``.
        dy_grouped: Set when ``DY`` is already in grouped order (the
            gate_up_proj backward, where ``grouped_out=True``).

    Returns:
        ``(dA, dB)`` with the shapes of ``lora_A`` and ``lora_B``.
    """
    rank = lora_A.size(0) // E
    in_dim = X.size(1)
    out_dim = DY.size(1)

    # The kernel accumulates with atomic adds, so both outputs start at zero.
    grad_A = torch.zeros_like(lora_A)
    grad_B = torch.zeros_like(lora_B)

    # Without token rounding one set of offsets serves iteration and masking.
    masking_offsets = (
        expert_offsets if real_expert_offsets is None else real_expert_offsets
    )

    def launch_grid(meta):
        k_tiles = triton.cdiv(in_dim, meta["BLOCK_K"])
        n_tiles = triton.cdiv(out_dim, meta["BLOCK_N"])
        return (E * k_tiles, n_tiles)

    _group_bwd_lora_fused[launch_grid](
        DY,
        DY.stride(0),
        DY.stride(1),
        X,
        X.stride(0),
        X.stride(1),
        sorted_scattered_idxs,
        FAN_OUT=k,
        LA_ptr=lora_A,
        stride_la_r=lora_A.stride(0),
        stride_la_k=lora_A.stride(1),
        LB_ptr=lora_B,
        stride_lb_n=lora_B.stride(0),
        stride_lb_r=lora_B.stride(1),
        DLA_ptr=grad_A,
        stride_dla_r=grad_A.stride(0),
        stride_dla_k=grad_A.stride(1),
        DLB_ptr=grad_B,
        stride_dlb_n=grad_B.stride(0),
        stride_dlb_r=grad_B.stride(1),
        expert_offsets_ptr=expert_offsets,
        real_expert_offsets_ptr=masking_offsets,
        M=sorted_scattered_idxs.size(0),
        K=in_dim,
        N=out_dim,
        ACTUAL_R=rank,
        BLOCK_R=_block_r_for_rank(rank),
        scaling=scaling,
        ACC_TYPE=tl.float32,
        allow_tf32=ALLOW_TF32,
        dy_grouped=dy_grouped,
    )

    return grad_A, grad_B
@triton.jit
def _compute_expert_block(
    E_idx,
    E_mask,
    M_in_idx,
    N_block,
    N_mask,
    X_ptr,
    stride_xm,
    stride_xk,
    W_ptr,
    stride_we,
    stride_wk,
    stride_wn,
    K,
    acc,
    no_k_mask,
    BLOCK_K,
    allow_tf32=True,
):
    """Accumulate ``X[M_in_idx] @ W[E_idx]`` into ``acc`` for one expert.

    Walks the K dimension in ``BLOCK_K`` steps. Rows whose ``E_mask`` is
    false and columns whose ``N_mask`` is false must contribute nothing,
    because ``acc`` is shared across every expert visited by the caller.

    Args:
        E_idx: Expert whose weight slice ``W[E_idx]`` is multiplied.
        E_mask: Per-row mask selecting the rows routed to ``E_idx``.
        M_in_idx: Per-row index into ``X``.
        N_block / N_mask: Output-column offsets and their validity mask.
        X_ptr, stride_xm, stride_xk: Input pointer and strides.
        W_ptr, stride_we, stride_wk, stride_wn: Weight pointer and strides.
        K: Reduction length.
        acc: Running accumulator, returned updated.
        no_k_mask: True when ``K`` is a multiple of ``BLOCK_K`` so the K
            bounds check can be skipped.
        BLOCK_K: Tile size along K.
        allow_tf32: Forwarded to ``tl.dot``.

    Returns:
        The updated accumulator.
    """
    K_block = tl.arange(0, BLOCK_K)
    X_blk_ptrs = X_ptr + M_in_idx[:, None] * stride_xm + K_block[None, :] * stride_xk
    W_blk_ptrs = (
        W_ptr
        + K_block[:, None] * stride_wk
        + N_block[None, :] * stride_wn
        + E_idx * stride_we
    )
    iters = tl.cdiv(K, BLOCK_K)

    for K_block_id in range(iters):
        # `other=0.0` is required: a masked load without `other` leaves the
        # masked lanes undefined, and those lanes are fed straight into
        # tl.dot below, where they would pollute the shared accumulator.
        if no_k_mask:
            x = tl.load(X_blk_ptrs, mask=E_mask[:, None], other=0.0)
            w = tl.load(W_blk_ptrs, mask=N_mask[None, :], other=0.0)
        else:
            K_mask = (K_block_id * BLOCK_K + K_block) < K
            x = tl.load(
                X_blk_ptrs, mask=E_mask[:, None] & K_mask[None, :], other=0.0
            )
            w = tl.load(
                W_blk_ptrs, mask=K_mask[:, None] & N_mask[None, :], other=0.0
            )

        X_blk_ptrs += BLOCK_K * stride_xk
        W_blk_ptrs += BLOCK_K * stride_wk
        acc = tl.dot(x, w, acc, allow_tf32=allow_tf32)
    return acc
# OUT_M, allow_tf32: tl.constexpr, x_grouped: tl.constexpr, y_grouped: tl.constexpr, NO_K_MASK: tl.constexpr, NO_N_MASK: tl.constexpr, ): pid = tl.program_id(axis=0) N_BLOCK_COUNT = tl.cdiv(N, BLOCK_N) M_block_id = pid // N_BLOCK_COUNT N_block_id = pid % N_BLOCK_COUNT M_block = M_block_id * BLOCK_M + tl.arange(0, BLOCK_M) N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N) N_mask = N_block < N M_boundary_mask = M_block < (FAN_OUT * M) E_idxs = tl.load(expert_idxs_ptr + M_block, mask=M_boundary_mask, other=E) no_k_mask = K % BLOCK_K == 0 acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) E_first_idx = tl.min(E_idxs) E_last_idx = tl.minimum(tl.max(E_idxs), E - 1) M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int32) for E_idx in range(E_first_idx, E_last_idx + 1): E_mask = E_idxs == E_idx E_M_idx = M_idx if x_grouped: M_in_idx = M_block else: M_in_idx = E_M_idx // FAN_OUT acc = _compute_expert_block( E_idx, E_mask, M_in_idx, N_block, N_mask, X_ptr, stride_xm, stride_xk, W_ptr, stride_we, stride_wk, stride_wn, K, acc, no_k_mask, BLOCK_K, allow_tf32=allow_tf32, ) if B_ptr is not None: B_blk_ptrs = B_ptr + E_idxs[:, None] * stride_be + N_block[None, :] * stride_bn acc += tl.load(B_blk_ptrs, mask=M_boundary_mask[:, None] & N_mask[None, :]) if y_grouped: M_out_idx = M_block else: M_out_idx = M_idx Y_blk_ptrs = Y_ptr + (M_out_idx[:, None] * stride_ym + N_block[None, :] * stride_yn) tl.store(Y_blk_ptrs, acc, mask=M_boundary_mask[:, None] & N_mask[None, :]) def scatter2scatter( X, W, sorted_expert_idxs, sorted_scattered_idxs, k, b=None, x_grouped=False, y_grouped=False, out=None, ): assert sorted_scattered_idxs.size(0) == sorted_expert_idxs.size(0) assert sorted_scattered_idxs.size(0) == X.size(0) * k # Pre-kernel setup y_dim = W.size(-1) L_scattered = sorted_expert_idxs.size(0) if out is None: output = torch.empty((L_scattered, y_dim), device=X.device, dtype=X.dtype) else: assert out.size(0) == L_scattered and out.size(1) == y_dim output = out 
@torch.library.custom_op("scattermoe::scatter2scatter", mutates_args={"output"})
def scatter2scatter_compileable(
    output: torch.Tensor,
    W: torch.Tensor,
    X: torch.Tensor,
    k: int,
    sorted_expert_idxs: torch.Tensor,
    sorted_scattered_idxs: torch.Tensor,
    b: Optional[torch.Tensor],
    x_grouped: bool,
    y_grouped: bool,
) -> None:
    """Launch the ``_scatter2scatter`` kernel, writing the result into ``output``.

    Registered as a custom op that mutates ``output`` in place and returns
    nothing. ``b`` is an optional per-expert bias; when it is absent the
    kernel receives ``None`` together with zero bias strides.
    """
    num_routed_rows = sorted_expert_idxs.size(0)

    def launch_grid(meta):
        # 1-D launch: one program per (row tile, column tile) pair.
        row_tiles = triton.cdiv(num_routed_rows, meta["BLOCK_M"])
        col_tiles = triton.cdiv(meta["N"], meta["BLOCK_N"])
        return (row_tiles * col_tiles,)

    stride_be, stride_bn = (0, 0) if b is None else b.stride()

    _scatter2scatter[launch_grid](
        # X_ptr, stride_xm, stride_xk
        X,
        X.stride(0),
        X.stride(1),
        # W_ptr, stride_we, stride_wk, stride_wn
        W,
        W.stride(0),
        W.stride(1),
        W.stride(2),
        # Y_ptr, stride_ym, stride_yn
        output,
        output.stride(0),
        output.stride(1),
        # B_ptr, stride_be, stride_bn
        b,
        stride_be,
        stride_bn,
        grouped_idx_ptr=sorted_scattered_idxs,
        expert_idxs_ptr=sorted_expert_idxs,
        FAN_OUT=k,
        M=X.size(0),
        K=X.size(1),
        N=output.size(1),
        E=W.size(0),
        BLOCK_M=BLOCK_M,
        ACC_TYPE=tl.float32,
        allow_tf32=ALLOW_TF32,
        x_grouped=x_grouped,
        y_grouped=y_grouped,
    )
triton.cdiv(META["K"], META["BLOCK_K"]), triton.cdiv(META["N"], META["BLOCK_N"]), ) return grid if Db is None: stride_dbe = 0 stride_dbn = 0 else: stride_dbe, stride_dbn = Db.stride() _groupXtY[grid]( # DY_ptr, stride_dym, stride_dyk, DY, DY.stride(0), DY.stride(1), # X_ptr, stride_xm, stride_xn, X, X.stride(0), X.stride(1), # DW_ptr, stride_dwe, stride_dwk, stride_dwn, DW, DW.stride(0), DW.stride(1), DW.stride(2), # Db_ptr, stride_dwe, stride_dbn, Db, stride_dbe, stride_dbn, # expert_offsets_ptr, expert_offsets, # K: tl.constexpr, N: tl.constexpr, M=DY.size(0), N=DY.size(-1), K=X.size(-1), # ACC_TYPE: tl.constexpr, ACC_TYPE=tl.float32, allow_tf32=ALLOW_TF32, ) @triton.autotune( configs=_config_XtY(), key=["M", "N", "K"], ) @triton.heuristics( { "NO_K_MASK": lambda args: (args["K"] % args["BLOCK_K"]) == 0, "NO_N_MASK": lambda args: (args["N"] % args["BLOCK_N"]) == 0, } ) @triton.jit def _groupXtY( DY_ptr, stride_dym, stride_dyk, X_ptr, stride_xm, stride_xn, DW_ptr, stride_dwe, stride_dwk, stride_dwn, Db_ptr, stride_dbe, stride_dbn, expert_offsets_ptr, M, K: tl.constexpr, N: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, ACC_TYPE: tl.constexpr, allow_tf32: tl.constexpr, NO_K_MASK: tl.constexpr, NO_N_MASK: tl.constexpr, ): pid0 = tl.program_id(axis=0) pid1 = tl.program_id(axis=1) num0 = tl.num_programs(0) num1 = tl.num_programs(1) # pid1, pid0 = tl.swizzle2d(pid1, pid0, num1, num0, 128) pid0, pid1 = tl.swizzle2d(pid0, pid1, num0, num1, 4) K_BLOCK_COUNT = tl.cdiv(K, BLOCK_K) E_idx = pid0 // K_BLOCK_COUNT K_block_id = pid0 % K_BLOCK_COUNT N_block_id = pid1 if E_idx == 0: start_idx = 0 else: start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int32) end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int32) if end_idx > start_idx: M_block = tl.max_contiguous(start_idx + tl.arange(0, BLOCK_M), BLOCK_M) K_block = K_block_id * BLOCK_K + tl.arange(0, BLOCK_K) K_mask = K_block < K K_block = tl.max_contiguous(tl.multiple_of(K_block % 
@triton.jit
def _xty_and_bias(
    E_idx,
    start_idx,
    end_idx,
    M_block,
    K_block,
    K_mask,
    N_block,
    N_mask,
    dy_blk_ptrs,
    stride_dym,
    xt_blk_ptrs,
    stride_xm,
    DW_ptr,
    stride_dwe,
    stride_dwk,
    stride_dwn,
    Db_ptr,
    stride_dbe,
    stride_dbn,
    BLOCK_M,
    BLOCK_N,
    BLOCK_K,
    ACC_TYPE,
    allow_tf32,
    NO_K_MASK,
    NO_N_MASK,
    compute_bias: tl.constexpr,
):
    """Accumulate ``X^T @ DY`` over one expert's rows and store it to ``DW``.

    Iterates over the expert's row range in ``BLOCK_M`` steps, advancing the
    supplied block pointers each step. When ``compute_bias`` is set, the
    column sums of ``DY`` are accumulated alongside and stored to ``Db``.

    Args:
        E_idx: Expert index selecting the ``DW`` / ``Db`` slice to write.
        start_idx, end_idx: Row range of this expert; the iteration count is
            ``cdiv(end_idx - start_idx, BLOCK_M)``.
        M_block: Row offsets of the first tile, compared against ``end_idx``
            to mask the ragged final tile.
        K_block / K_mask, N_block / N_mask: Tile offsets and validity masks.
        dy_blk_ptrs, xt_blk_ptrs: Pointers to the first ``DY`` / ``X^T`` tile.
        stride_dym, stride_xm: Row strides used to advance those pointers.
        DW_ptr, Db_ptr and strides: Output tensors.
        NO_K_MASK, NO_N_MASK: True when the K / N bounds checks can be
            skipped on loads.
        compute_bias: Compile-time switch for the bias gradient.
    """
    if compute_bias:
        db_acc = tl.zeros((BLOCK_N,), dtype=ACC_TYPE)
    else:
        db_acc = None

    acc = tl.zeros((BLOCK_K, BLOCK_N), dtype=ACC_TYPE)
    iters = tl.cdiv(end_idx - start_idx, BLOCK_M)
    for i in range(0, iters):
        M_mask = (i * BLOCK_M + M_block) < end_idx
        # `other=0.0` on every masked load: masked lanes are otherwise
        # undefined, and both tl.dot and tl.sum below consume them, so rows
        # past `end_idx` must read as exact zeros.
        if NO_K_MASK:
            xt = tl.load(xt_blk_ptrs, mask=M_mask[None, :], other=0.0)
        else:
            xt = tl.load(
                xt_blk_ptrs, mask=K_mask[:, None] & M_mask[None, :], other=0.0
            )
        if NO_N_MASK:
            dy = tl.load(dy_blk_ptrs, mask=M_mask[:, None], other=0.0)
        else:
            dy = tl.load(
                dy_blk_ptrs, mask=M_mask[:, None] & N_mask[None, :], other=0.0
            )

        acc += tl.dot(xt, dy, out_dtype=ACC_TYPE, allow_tf32=allow_tf32)

        xt_blk_ptrs += BLOCK_M * stride_xm
        dy_blk_ptrs += BLOCK_M * stride_dym

        if compute_bias:
            db_acc += tl.sum(dy, axis=0)

    DW_blk_ptrs = (
        DW_ptr
        + E_idx * stride_dwe
        + K_block[:, None] * stride_dwk
        + N_block[None, :] * stride_dwn
    )
    acc = acc.to(DW_blk_ptrs.dtype.element_ty)
    tl.store(DW_blk_ptrs, acc, mask=K_mask[:, None] & N_mask[None, :])
    if compute_bias:
        Db_blk_ptrs = Db_ptr + E_idx * stride_dbe + N_block * stride_dbn
        tl.store(Db_blk_ptrs, db_acc, mask=N_mask)
@triton.jit
def _single2scatter(
    X_ptr,
    stride_xm,
    stride_xk,
    W_ptr,
    stride_we,
    stride_wk,
    stride_wn,
    Y_ptr,
    stride_ym,
    stride_yn,
    expert_idxs_ptr,
    FAN_OUT: tl.constexpr,
    K: tl.constexpr,
    N: tl.constexpr,
    E: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
    ACC_TYPE: tl.constexpr,
):
    """Vector-matrix product of one input row with one expert's weights.

    Program ``(pid0, pid1)`` computes the ``pid0``-th ``BLOCK_N``-wide slice
    of output row ``pid1``: ``Y[pid1, n] = sum_k X[in_idx, k] * W[E_idx, k, n]``
    where ``E_idx = expert_idxs[pid1]``. With ``FAN_OUT == 1`` each output
    row reads its own input row; otherwise every output row reads row 0.
    """
    pid0 = tl.program_id(axis=0)
    pid1 = tl.program_id(axis=1)

    N_block_id = pid0
    if FAN_OUT == 1:
        in_idx = pid1
    else:
        in_idx = 0
    out_idx = pid1

    K_block = tl.arange(0, BLOCK_K)
    # Take the bounds mask BEFORE wrapping with `% N`. Computed after the
    # wrap it is always true, so tail lanes would redo work and issue
    # duplicate stores to columns owned by the first tile.
    N_offsets = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
    N_mask = N_offsets < N
    # The wrap only keeps addresses of masked-off lanes inside the tensor.
    N_block = tl.max_contiguous(tl.multiple_of(N_offsets % N, BLOCK_N), BLOCK_N)
    E_idx = tl.load(expert_idxs_ptr + pid1)
    X_blk_ptrs = X_ptr + in_idx * stride_xm + K_block[:, None] * stride_xk
    W_blk_ptrs = (
        W_ptr
        + E_idx * stride_we
        + K_block[:, None] * stride_wk
        + N_block[None, :] * stride_wn
    )
    acc = tl.zeros((1, BLOCK_N), dtype=ACC_TYPE)
    for _K_block_id in range(0, tl.cdiv(K, BLOCK_K)):
        K_mask = K_block < K
        x = tl.load(X_blk_ptrs, mask=K_mask[:, None], other=0.0)
        w = tl.load(W_blk_ptrs, mask=K_mask[:, None] & N_mask[None, :], other=0.0)
        acc += tl.sum(x * w, axis=0)[None, :]
        X_blk_ptrs += BLOCK_K * stride_xk
        W_blk_ptrs += BLOCK_K * stride_wk
        K_block += BLOCK_K
    Y_blk_ptrs = Y_ptr + out_idx * stride_ym + N_block[None, :] * stride_yn
    tl.store(Y_blk_ptrs, acc, mask=N_mask[None, :])
def peft_lora_to_scattermoe(peft_A, peft_B, num_experts, rank):
    """Convert peft LoRA weights to scattermoe layout (with A<->B swap).

    peft operates on the parameter in its native storage layout
    ``[E, dim1, dim2]`` where ``in_features=dim1, out_features=dim2``.
    ScatterMoE transposes the parameter (``W = param.transpose(2, 1)``)
    giving ``[E, dim2, dim1]`` with ``K=dim2, N=dim1``. Because of this
    transposition, peft's A and B roles are swapped relative to scattermoe's
    convention.

    peft gives:        lora_A ``[r*E, dim1]``, lora_B ``[dim2, r*E]``
    scattermoe needs:  lora_A ``[r*E, K=dim2]``, lora_B ``[N=dim1, r*E]``

    peft's B columns are rank-major (column ``r_idx * E + e``), while both
    outputs are expert-major (expert ``e`` owns the contiguous slice
    ``e*r:(e+1)*r``). Each output is produced by a single permutation, with
    no Python loop over experts and no intermediate expert-major copy of B.

    Works for **both** gate_up_proj and down_proj since the transposition
    issue is the same for any parameter.

    Args:
        peft_A: peft lora_A, ``[r*E, dim1]``.
        peft_B: peft lora_B, ``[dim2, r*E]`` (rank-major columns).
        num_experts: Number of experts ``E``.
        rank: LoRA rank ``r`` per expert.

    Returns:
        ``(smoe_A, smoe_B)`` with shapes ``[r*E, dim2]`` and ``[dim1, r*E]``.
    """
    dim1 = peft_A.shape[1]  # peft in_features  -> scattermoe N
    dim2 = peft_B.shape[0]  # peft out_features -> scattermoe K

    # smoe_A: for each expert, the transpose of its B slice.
    #   [dim2, r*E] -> [dim2, r, E] -> [E, r, dim2] -> [E*r, dim2]
    smoe_A = (
        peft_B.reshape(dim2, rank, num_experts)
        .permute(2, 1, 0)
        .contiguous()
        .reshape(num_experts * rank, dim2)
    )

    # smoe_B: for each expert, the transpose of its A slice.
    #   [E*r, dim1] -> [E, r, dim1] -> [dim1, E, r] -> [dim1, E*r]
    smoe_B = (
        peft_A.reshape(num_experts, rank, dim1)
        .permute(2, 0, 1)
        .contiguous()
        .reshape(dim1, num_experts * rank)
    )

    return smoe_A, smoe_B
def _unwrap_experts_lora(experts_module):
    """Walk a peft ``ParamWrapper`` chain on ``self.experts``.

    When peft targets ``experts.gate_up_proj`` and ``experts.down_proj`` via
    ``target_parameters``, ``self.experts`` becomes a nested chain::

        ParamWrapper(down_proj)
          -> base_layer: ParamWrapper(gate_up_proj)
            -> base_layer: OlmoeExperts (the real module)

    This function walks the chain, collects the wrappers keyed by their
    ``parameter_name``, and returns the innermost (base) experts module
    together with the converted LoRA weights.

    Returns:
        (base_experts, gup_lora, down_lora)

        Each ``*_lora`` is either ``(smoe_A, smoe_B, scaling)`` or ``None``.
        A/B are already in scattermoe layout.

    Raises:
        ValueError: If active LoRA weights are found but the number of
            experts cannot be determined from the base module.
    """
    # Collect ParamWrapper layers by their parameter_name.
    wrappers = {}
    module = experts_module
    while hasattr(module, "base_layer") and hasattr(module, "lora_A"):
        param_name = getattr(module, "parameter_name", None)
        if param_name is not None:
            wrappers[param_name] = module
        module = module.base_layer

    base_experts = module

    if not wrappers:
        return base_experts, None, None

    # Determine num_experts from the base module, falling back to the
    # leading dimension of gate_up_proj.
    num_experts = getattr(base_experts, "num_experts", None)
    if num_experts is None:
        gup = getattr(base_experts, "gate_up_proj", None)
        if gup is not None:
            num_experts = gup.shape[0]

    def _extract(param_name):
        """Return scattermoe-layout LoRA for ``param_name`` or ``None``."""
        wrapper = wrappers.get(param_name)
        if wrapper is None:
            return None
        lora_A, lora_B, scaling = get_lora_params_from_wrapper(wrapper)
        if lora_A is None:
            return None
        if num_experts is None:
            # Checked lazily so chains without active LoRA never raise.
            raise ValueError(
                f"Cannot determine num_experts for LoRA on '{param_name}': "
                f"{type(base_experts).__name__} has neither `num_experts` "
                "nor `gate_up_proj`."
            )
        rank = lora_A.shape[0] // num_experts
        # Both projections need the A<->B swap due to the transposition.
        return _convert_smoe_lora(lora_A, lora_B, num_experts, rank, scaling)

    gup_lora = _extract("gate_up_proj")
    down_lora = _extract("down_proj")

    return base_experts, gup_lora, down_lora
def _sigmoid_topk_route(
    moe_block, base_gate, hidden_states, gate_weight, gate_lora_delta
):
    """Route tokens with sigmoid scores followed by top-k selection.

    Logits are computed in float32 from ``gate_weight`` plus the optional
    ``gate_lora_delta``. Expert *selection* uses the sigmoid scores shifted
    by ``e_score_correction_bias`` (read from the gate first, then from the
    block) and, when ``n_group > 1``, restricted to the best ``topk_group``
    groups. The returned *weights* are gathered from the uncorrected sigmoid
    scores, optionally renormalised (``norm_topk_prob``) and multiplied by
    ``routed_scaling_factor``.

    Returns:
        (routing_weights [T, K], selected_experts [T, K], top_k, num_experts)
    """
    hidden_fp32 = hidden_states.float()
    logits = F.linear(hidden_fp32, gate_weight.float())
    if gate_lora_delta is not None:
        logits = logits + F.linear(hidden_fp32, gate_lora_delta.float())
    probs = logits.sigmoid()  # [T, E]

    gate_top_k = getattr(base_gate, "top_k", None)
    top_k = getattr(moe_block, "top_k", gate_top_k)
    num_experts = getattr(moe_block, "n_routed_experts", gate_weight.shape[0])

    # The correction bias affects which experts are chosen, never the
    # weights they receive. Gate takes precedence over the block.
    bias = getattr(base_gate, "e_score_correction_bias", None)
    if bias is None:
        bias = getattr(moe_block, "e_score_correction_bias", None)
    selection_scores = probs if bias is None else probs + bias

    n_group = getattr(moe_block, "n_group", 1)
    if n_group > 1:
        per_group = num_experts // n_group
        # A group's score is the sum of its two best experts.
        best_two = selection_scores.view(-1, n_group, per_group).topk(2, dim=-1)[0]
        group_scores = best_two.sum(dim=-1)  # [T, n_group]

        topk_group = getattr(moe_block, "topk_group", n_group)
        kept_groups = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[1]
        group_mask = torch.zeros_like(group_scores)
        group_mask.scatter_(1, kept_groups, 1)

        # Broadcast the per-group keep flag to every expert in the group.
        expert_mask = group_mask.unsqueeze(-1).expand(-1, n_group, per_group)
        expert_mask = expert_mask.reshape(-1, num_experts)
        selection_scores = selection_scores.masked_fill(~expert_mask.bool(), 0.0)

    topk_indices = torch.topk(selection_scores, k=top_k, dim=-1, sorted=False)[1]
    topk_weights = probs.gather(1, topk_indices)

    if getattr(moe_block, "norm_topk_prob", True):
        denom = topk_weights.sum(dim=-1, keepdim=True) + 1e-20
        topk_weights = topk_weights / denom

    topk_weights = topk_weights * getattr(moe_block, "routed_scaling_factor", 1.0)

    return topk_weights, topk_indices, top_k, num_experts
def _compute_shared_expert(moe_block, hidden_states_flat):
    """Run the block's shared expert, if any, on the flattened hidden states.

    The shared expert is looked up under three attribute names, in order:
    ``shared_expert`` (qwen2_moe), ``shared_experts``
    (glm_moe_dsa/deepseek_v3) and ``shared_mlp`` (hunyuan_v1_moe). When
    ``shared_expert_gate`` is present (Qwen2MoE pattern) its sigmoid scales
    the shared output.

    Both modules are simply called, so any standard LoRA that peft has
    wrapped around their linear layers is applied by their own forward().

    Returns:
        The shared expert output, or ``None`` when the block has none.
    """
    shared_module = None
    for attr_name in ("shared_expert", "shared_experts", "shared_mlp"):
        shared_module = getattr(moe_block, attr_name, None)
        if shared_module:
            break

    if shared_module is None:
        return None

    output = shared_module(hidden_states_flat)

    gate = getattr(moe_block, "shared_expert_gate", None)
    if gate is None:
        return output

    return F.sigmoid(gate(hidden_states_flat)) * output
class HFScatterMoEGatedMLP(nn.Module):
    """
    ScatterMoE-accelerated forward pass for HF MoEs.

    Used as a kernel layer via the HF ``kernels`` library.  The ``forward``
    method replaces the original SparseMoeBlock.forward.

    Supports:

    * **Softmax→topk routing**: OLMoE, Qwen2/3MoE, Mixtral, MiniMax
    * **Sigmoid→topk routing**: GLM, DeepSeek V3, MiniMax M2
    * **Full-parameter training**: uses ``parallel_linear`` (base ScatterMoE)
    * **LoRA fine-tuning**: detects peft ``ParamWrapper`` on ``self.experts``,
      extracts adapter weights, and uses ``parallel_linear_lora`` (fused kernel)
    """

    @staticmethod
    def forward(self: nn.Module, layer_input: torch.Tensor):
        """
        Forward pass using ScatterMoE kernels.

        Declared as a ``staticmethod`` that still takes ``self`` explicitly:
        the MoE block whose forward is being replaced is passed in as the
        first argument.

        Steps, in order:
            1. Flatten the input to ``[tokens, hidden]``.
            2. Compute the optional shared-expert output.
            3. Route tokens (``_route``) and sort the expert assignments
               (``flatten_sort_count``).
            4. Unwrap expert LoRA adapters, if any.
            5. Optionally restrict weights / LoRA / indices to the experts
               that are actually used (selective dequantization).
            6. Gate+up projection, activation, down projection (the down
               projection also applies the routing weights via ``gates``).
            7. Add the shared-expert output and restore the input shape.

        Args:
            self: The MoeSparseMoeBlock module containing:
                - self.gate: Router (or peft ParamWrapper wrapping it)
                - self.experts: Experts module (or peft ParamWrapper chain)
                - self.shared_expert(s): Optional shared expert
                - self.shared_expert_gate: Optional shared expert gate
            layer_input: Input tensor [batch_size, seq_len, hidden_size]

        Returns:
            Tensor: [batch_size, seq_len, hidden_size]
        """
        batch_size, sequence_length, hidden_dim = layer_input.shape
        # Collapse batch and sequence dims: all kernels below work per token.
        hidden_states_flat = layer_input.view(-1, hidden_dim)

        # ====================================================================
        # Shared Expert (if present, e.g. Qwen2MoE, DeepSeek V3)
        # ====================================================================
        shared_expert_output = _compute_shared_expert(self, hidden_states_flat)

        # ====================================================================
        # Router Computation (with optional gate LoRA)
        # ====================================================================
        base_gate, gate_weight, gate_lora_delta = _unwrap_gate_lora(self.gate)
        routing_weights, selected_experts, top_k, num_experts = _route(
            self, base_gate, hidden_states_flat, gate_weight, gate_lora_delta
        )
        # Match the activation dtype before the weights are used as gates.
        routing_weights = routing_weights.to(hidden_states_flat.dtype)

        # Sort the flattened (token, slot) -> expert assignments by expert id
        # and get the cumulative per-expert counts used by the kernels.
        sorted_expert_idxs, sorted_scattered_idxs, expert_offsets = flatten_sort_count(
            selected_experts, num_experts=num_experts
        )

        # ====================================================================
        # Detect LoRA (peft ParamWrapper) and extract adapter weights
        # ====================================================================
        # gup_lora / down_lora are either None or (lora_A, lora_B, scaling).
        experts, gup_lora, down_lora = _unwrap_experts_lora(self.experts)

        # ====================================================================
        # Selective expert weight dequantization
        # ====================================================================
        # When experts are BnB-quantized (quantize_moe_experts), dequantize
        # only the active experts instead of all E. This saves ~97% memory
        # for the transient dequant buffer when few experts are active.
        use_selective = (
            getattr(self, "_use_selective_dequant", False)
            and hasattr(experts, "parametrizations")
            and "gate_up_proj" in experts.parametrizations
        )

        if use_selective:
            # Imported lazily so the module is only loaded when this path is used.
            from axolotl.integrations.kernels.libs.scattermoe_lora.selective_dequant import (
                get_active_experts,
                remap_expert_indices,
                selective_expert_weights,
                selective_lora_weights,
            )

            active_experts = get_active_experts(sorted_expert_idxs, num_experts)
            # Global expert ids -> compact ids [0..num_active-1]; these are
            # reused unchanged for the down projection below.
            remapped_expert_idxs, compact_offsets = remap_expert_indices(
                sorted_expert_idxs,
                expert_offsets,
                active_experts,
                num_experts,
            )

            # Dequantize only active experts' weights
            gate_up_W = selective_expert_weights(
                experts,
                "gate_up_proj",
                active_experts,
            ).transpose(2, 1)  # [num_active, hidden, 2*inter]

            # Remap LoRA weights to match compact expert indices
            if gup_lora is not None:
                gup_A, gup_B, gup_scaling = gup_lora
                gup_A, gup_B = selective_lora_weights(
                    gup_A,
                    gup_B,
                    active_experts,
                    num_experts,
                )
                gup_lora = (gup_A, gup_B, gup_scaling)

            # Use remapped indices for ScatterMoE kernels
            sei_gup = remapped_expert_idxs
            eo_gup = compact_offsets
        else:
            gate_up_W = experts.gate_up_proj.transpose(2, 1)  # [E, hidden, 2*inter]
            sei_gup = sorted_expert_idxs
            eo_gup = expert_offsets

        # ====================================================================
        # Gate + Up projection
        # ====================================================================
        # Input is in token order (grouped_in=False); output stays grouped by
        # expert (grouped_out=True) so the down projection can consume it
        # directly.
        if gup_lora is not None:
            gup_A, gup_B, gup_scaling = gup_lora
            gup = parallel_linear_lora(
                hidden_states_flat,
                gate_up_W,
                top_k,
                sei_gup,
                sorted_scattered_idxs,
                eo_gup,
                lora_A=gup_A,
                lora_B=gup_B,
                scaling=gup_scaling,
                grouped_in=False,
                grouped_out=True,
                use_fused_dX=True,
                use_fused_gather=True,
            )
        else:
            gup = parallel_linear(
                hidden_states_flat,
                gate_up_W,
                top_k,
                sei_gup,
                sorted_scattered_idxs,
                eo_gup,
                grouped_in=False,
                grouped_out=True,
            )

        # The fused projection output holds the gate half and the up half
        # side by side along the last dim.
        gates, h = gup.chunk(2, dim=-1)
        h = experts.act_fn(gates) * h

        # ====================================================================
        # Down projection
        # ====================================================================
        if use_selective:
            down_W = selective_expert_weights(
                experts,
                "down_proj",
                active_experts,
            ).transpose(2, 1)  # [num_active, inter, hidden]

            if down_lora is not None:
                down_A, down_B, down_scaling = down_lora
                down_A, down_B = selective_lora_weights(
                    down_A,
                    down_B,
                    active_experts,
                    num_experts,
                )
                down_lora = (down_A, down_B, down_scaling)

            sei_down = remapped_expert_idxs
            eo_down = compact_offsets
        else:
            down_W = experts.down_proj.transpose(2, 1)  # [E, inter, hidden]
            sei_down = sorted_expert_idxs
            eo_down = expert_offsets

        # k=1 here: the input is already expanded and grouped per expert.
        # Passing gates=routing_weights makes the kernel wrapper combine the
        # top-k expert outputs of each token with their routing weights.
        if down_lora is not None:
            down_A, down_B, down_scaling = down_lora
            # NOTE(review): ScatterMoELoRA.backward only takes the
            # fused-gather path when grouped_in is False and gates is None,
            # so use_fused_gather=True appears to have no effect for this call.
            expert_output = parallel_linear_lora(
                h,
                down_W,
                1,
                sei_down,
                sorted_scattered_idxs,
                eo_down,
                lora_A=down_A,
                lora_B=down_B,
                scaling=down_scaling,
                gates=routing_weights,
                grouped_in=True,
                grouped_out=False,
                use_fused_dX=True,
                use_fused_gather=True,
            )
        else:
            expert_output = parallel_linear(
                h,
                down_W,
                1,
                sei_down,
                sorted_scattered_idxs,
                eo_down,
                grouped_in=True,
                grouped_out=False,
                gates=routing_weights,
            )

        # ====================================================================
        # Combine with shared expert and reshape
        # ====================================================================
        if shared_expert_output is not None:
            expert_output = expert_output + shared_expert_output

        expert_output = expert_output.view(batch_size, sequence_length, hidden_dim)
        return expert_output
class ParallelExperts(nn.Module):
    """
    Per-expert linear layer with optional fused LoRA.

    Same constructor and ``forward`` signature as the original
    ParallelExperts.  Once adapter tensors are attached with ``set_lora()``
    the forward pass runs the fused kernel:

        Y = X @ W + scaling * (X @ A^T) @ B^T

    Without adapters ``parallel_linear_lora`` is still called, with
    ``lora_A``/``lora_B`` set to ``None`` and a scaling of ``1.0``.
    """

    def __init__(
        self,
        num_experts: int,
        input_size: int,
        output_size: int,
        bias: bool = False,
    ) -> None:
        super().__init__()

        # Stored as [E, output, input]; forward() swaps the last two dims.
        weight_shape = (num_experts, output_size, input_size)
        self.weight = nn.Parameter(torch.empty(*weight_shape))
        self.bias = (
            nn.Parameter(torch.empty(num_experts, output_size)) if bias else None
        )

        self.num_experts = num_experts
        self.input_size = input_size
        self.output_size = output_size

        # No adapter attached until set_lora() is called.
        self._lora_A: torch.Tensor | None = None
        self._lora_B: torch.Tensor | None = None
        self._lora_scaling: float | None = None

        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Draw weights from N(0, 0.02^2) and zero the bias, if any."""
        nn.init.normal_(self.weight, std=0.02)
        if self.bias is None:
            return
        nn.init.zeros_(self.bias)

    def extra_repr(self) -> str:
        """Return the size summary shown in the module's repr."""
        parts = (
            f"num_experts={self.num_experts}",
            f"input_size={self.input_size}",
            f"output_size={self.output_size}",
        )
        return ", ".join(parts)

    def set_lora(self, lora_A: torch.Tensor, lora_B: torch.Tensor, scaling: float):
        """Attach LoRA tensors and their scaling for the fused forward."""
        self._lora_A, self._lora_B, self._lora_scaling = lora_A, lora_B, scaling

    def clear_lora(self):
        """Detach any previously attached LoRA tensors."""
        self._lora_A = self._lora_B = self._lora_scaling = None

    def forward(
        self,
        inputs: torch.Tensor,
        k: int,
        sorted_expert_idxs: torch.Tensor,
        sorted_scattered_idxs: torch.Tensor,
        expert_offsets: torch.Tensor,
        gates: Optional[torch.Tensor] = None,
        grouped_in: bool = False,
        grouped_out: bool = False,
    ) -> torch.Tensor:
        """Run the (optionally LoRA-augmented) expert projection."""
        # Default the scaling to 1.0 when no adapter has been attached.
        lora_scale = 1.0 if self._lora_scaling is None else self._lora_scaling
        weight_in_out = self.weight.transpose(1, 2)  # [E, input, output]
        return parallel_linear_lora(
            inputs,
            weight_in_out,
            k,
            sorted_expert_idxs,
            sorted_scattered_idxs,
            expert_offsets,
            lora_A=self._lora_A,
            lora_B=self._lora_B,
            scaling=lora_scale,
            expert_biases=self.bias,
            gates=gates,
            grouped_in=grouped_in,
            grouped_out=grouped_out,
        )
x_grouped=grouped_in, y_grouped=grouped_out, ) if gates is not None: output_expanded = output.view( gates.size(0), gates.size(1), output.size(-1) ) output = (gates.unsqueeze(1) @ output_expanded).squeeze(1) else: output_expanded = None ctx.save_for_backward( x, expert_weights, expert_biases, sorted_expert_idxs, sorted_scattered_idxs, expert_offsets, gates, output_expanded, ) ctx.grouped_in = grouped_in ctx.grouped_out = grouped_out ctx.k = k return output @staticmethod def backward(ctx, grad_out: torch.Tensor): with torch.device(grad_out.device): ( x, expert_weights, expert_biases, sorted_expert_idxs, sorted_scattered_idxs, expert_offsets, gates, output_expanded, ) = ctx.saved_tensors k = ctx.k grouped_in = ctx.grouped_in grouped_out = ctx.grouped_out if gates is not None: # calculate gates gradient # d_gates = torch.bmm(output_expanded, grad_out[:, :, None]).squeeze(-1) d_gates = (output_expanded @ grad_out.unsqueeze(-1)).squeeze(-1) gates_flat = gates.flatten() gate_fan = gates.size(1) grouped_grad_out = output_expanded.flatten( 0, 1 ) # reuse expanded buffer later else: d_gates = None gates_flat = None gate_fan = 1 grouped_grad_out = None if grouped_out: grouped_grad_out = grad_out else: grouped_grad_out = kernels.ops.group( grad_out, sorted_scattered_idxs, fan_out=gate_fan, coeff=gates_flat, out=grouped_grad_out, ) if grouped_in: grouped_x = x d_expanded_input = None else: grouped_x = kernels.ops.group(x, sorted_scattered_idxs, fan_out=k) d_expanded_input = grouped_x d_weights, d_biases = kernels.ops.group_bwd_W( DY=grouped_grad_out, X=grouped_x, expert_offsets=expert_offsets, E=expert_weights.size(0), has_bias=expert_biases is not None, ) d_expanded_input = kernels.ops.scatter2scatter( X=grouped_grad_out, x_grouped=True, W=expert_weights.permute(0, 2, 1), sorted_expert_idxs=sorted_expert_idxs, sorted_scattered_idxs=sorted_scattered_idxs, k=1, y_grouped=grouped_in, out=d_expanded_input, # Reuse grouped_x buffer ) if k == 1: d_input = d_expanded_input else: 
class ParallelExperts(nn.Module):
    """Bank of per-expert linear layers evaluated through ``parallel_linear``."""

    def __init__(self, num_experts, input_size, output_size, bias=False) -> None:
        super().__init__()

        # Stored as [E, output, input]; forward() swaps the last two dims.
        self.weight = nn.Parameter(torch.empty(num_experts, output_size, input_size))
        self.bias = (
            nn.Parameter(torch.empty(num_experts, output_size)) if bias else None
        )

        self.num_experts = num_experts
        self.input_size = input_size
        self.output_size = output_size

        self.reset_parameters()

    def extra_repr(self):
        """Return the size summary shown in the module's repr."""
        return (
            f"num_experts={self.num_experts}, "
            f"input_size={self.input_size}, "
            f"output_size={self.output_size}"
        )

    def reset_parameters(self) -> None:
        """Draw weights from N(0, 0.02^2) and zero the bias, if any."""
        nn.init.normal_(self.weight, std=0.02)
        if self.bias is None:
            return
        nn.init.zeros_(self.bias)

    def forward(
        self,
        inputs,
        k,
        sorted_expert_idxs,
        sorted_scattered_idxs,
        expert_offsets,
        gates=None,
        grouped_in=False,
        grouped_out=False,
    ):
        """Apply every expert's projection via ``parallel_linear``."""
        weight_in_out = self.weight.transpose(1, 2)  # [E, input, output]
        return parallel_linear(
            inputs,
            weight_in_out,
            k,
            sorted_expert_idxs,
            sorted_scattered_idxs,
            expert_offsets,
            expert_biases=self.bias,
            gates=gates,
            grouped_in=grouped_in,
            grouped_out=grouped_out,
        )
Version 2.0 """ ScatterMoE + LoRA Autograd Function ==================================== Provides the autograd function and Python interface for fused ScatterMoE + LoRA. Key design for LoRA training: - Expert weights W are FROZEN (no gradient computed for W). - Only LoRA adapter weights (A, B) receive gradients. - The input gradient dX is still computed (needed for upstream layers). - This avoids the expensive group_bwd_W computation entirely. Forward: Y = X @ W + scaling * (X @ A^T) @ B^T Backward (W frozen): dX = dY @ W^T + scaling * (dY @ B) @ A (via scatter2scatter for base, separate for LoRA) dA = scaling * (dY @ B)^T @ X (per-expert, on grouped data) dB = scaling * dY^T @ (X @ A^T) (per-expert, on grouped data) """ from typing import Optional import torch from .kernels import ops as base_ops from .kernels.lora_ops import ( group_bwd_lora, group_bwd_lora_fused, scatter2scatter_lora, scatter2scatter_lora_dX, ) class ScatterMoELoRA(torch.autograd.Function): """ Autograd function for fused ScatterMoE + LoRA with frozen expert weights. 
This function is optimized for the LoRA fine-tuning scenario where: - Expert weights W are frozen (requires_grad=False) - Only LoRA A and B matrices receive gradients - Input gradients are computed for upstream layer backprop """ @staticmethod def forward( ctx, x: torch.Tensor, expert_weights: torch.Tensor, k: int, sorted_expert_idxs: torch.Tensor, sorted_scattered_idxs: torch.Tensor, expert_offsets: torch.Tensor, lora_A: torch.Tensor, lora_B: torch.Tensor, scaling: float, expert_biases: Optional[torch.Tensor] = None, gates: Optional[torch.Tensor] = None, grouped_in: bool = False, grouped_out: bool = False, use_fused_dX: bool = False, use_fused_gather: bool = False, ): with torch.device(x.device): # Fused forward: Y = X @ W + scaling * (X @ A^T) @ B^T output = scatter2scatter_lora( X=x, W=expert_weights, sorted_expert_idxs=sorted_expert_idxs, sorted_scattered_idxs=sorted_scattered_idxs, k=k, lora_A=lora_A, lora_B=lora_B, scaling=scaling, b=expert_biases, x_grouped=grouped_in, y_grouped=grouped_out, ) # Handle gating (weighted combination of top-k expert outputs) if gates is not None: output_expanded = output.view( gates.size(0), gates.size(1), output.size(-1) ) output = (gates.unsqueeze(1) @ output_expanded).squeeze(1) else: output_expanded = None ctx.save_for_backward( x, lora_A, lora_B, sorted_expert_idxs, sorted_scattered_idxs, expert_offsets, gates, output_expanded, ) # Store frozen weights as plain Python attributes instead of # save_for_backward. This avoids: # 1. Version-check conflicts with FSDP unshard/reshard # 2. Pinning all-gathered parameters via saved_tensors hooks # 3. Interfering with activation offloading pack/unpack hooks # Safe because expert_weights are frozen (requires_grad=False). 
ctx.expert_weights = expert_weights ctx.expert_biases = expert_biases ctx.grouped_in = grouped_in ctx.grouped_out = grouped_out ctx.k = k ctx.scaling = scaling ctx.use_fused_dX = use_fused_dX ctx.use_fused_gather = use_fused_gather return output @staticmethod def backward(ctx, grad_out: torch.Tensor): with torch.device(grad_out.device): ( x, lora_A, lora_B, sorted_expert_idxs, sorted_scattered_idxs, expert_offsets, gates, output_expanded, ) = ctx.saved_tensors expert_weights = ctx.expert_weights k = ctx.k scaling = ctx.scaling grouped_in = ctx.grouped_in grouped_out = ctx.grouped_out E = expert_weights.size(0) # ------------------------------------------------------------------ # Gate gradients (if using top-k gating with routing weights) # ------------------------------------------------------------------ if gates is not None: # d_gates[t, j] = output_expanded[t, j, :] . grad_out[t, :] d_gates = (output_expanded @ grad_out.unsqueeze(-1)).squeeze(-1) gates_flat = gates.flatten() gate_fan = gates.size(1) # Reuse output_expanded buffer for grouped_grad_out grouped_grad_out = output_expanded.flatten(0, 1) else: d_gates = None gates_flat = None gate_fan = 1 grouped_grad_out = None # ------------------------------------------------------------------ # LoRA gradients (dA, dB) and setup for dX # ------------------------------------------------------------------ # Fused gather uses sorted_scattered_idxs for indirect X access # in the Triton kernel, avoiding the group(x) allocation. 
# # can_fuse_gather: X is ungrouped and not too large for scatter loads # - When gates is None and grouped_out=False: both DY and X ungrouped # - When grouped_out=True (gate_up_proj): DY already grouped, X ungrouped # -> use dy_grouped=True in the fused kernel M_total = sorted_scattered_idxs.size(0) K_dim = x.size(-1) N_dim = expert_weights.size(-1) fuse_gather_workload = M_total * max(K_dim, N_dim) _FUSE_GATHER_THRESHOLD = 2**24 # ~16M elements can_fuse_gather = ( ctx.use_fused_gather and not grouped_in # X must be ungrouped for scatter access and gates is None # gate coeff requires multiplicative gather and fuse_gather_workload < _FUSE_GATHER_THRESHOLD ) if can_fuse_gather: # ------------------------------------------------------------------ # Fused path: skip group(x) entirely # ------------------------------------------------------------------ d_expanded_input = None d_lora_A, d_lora_B = group_bwd_lora_fused( DY=grad_out, X=x, lora_A=lora_A, lora_B=lora_B, expert_offsets=expert_offsets, sorted_scattered_idxs=sorted_scattered_idxs, E=E, k=k, scaling=scaling, dy_grouped=grouped_out, ) # Prepare grouped_grad_out for the dX path (needed by both # the fused dX kernel when grouped_out=True, and the non-fused path) if grouped_out: grouped_grad_out = grad_out elif not ctx.use_fused_dX: grouped_grad_out = base_ops.group( grad_out, sorted_scattered_idxs, fan_out=gate_fan, coeff=gates_flat, out=grouped_grad_out, ) else: # ------------------------------------------------------------------ # Original path: explicit group() calls # ------------------------------------------------------------------ if grouped_out: grouped_grad_out = grad_out else: grouped_grad_out = base_ops.group( grad_out, sorted_scattered_idxs, fan_out=gate_fan, coeff=gates_flat, out=grouped_grad_out, ) if grouped_in: grouped_x = x d_expanded_input = None else: grouped_x = base_ops.group(x, sorted_scattered_idxs, fan_out=k) d_expanded_input = grouped_x # Will be overwritten; reuse buffer d_lora_A, d_lora_B 
= group_bwd_lora( DY=grouped_grad_out, X=grouped_x, lora_A=lora_A, lora_B=lora_B, expert_offsets=expert_offsets, E=E, scaling=scaling, ) # ------------------------------------------------------------------ # Input gradient: dX = dY @ W^T + scaling * (dY @ B) @ A # ------------------------------------------------------------------ if ctx.use_fused_dX: if can_fuse_gather and not grouped_out: # Fully fused: read ungrouped DY via scatter pattern d_expanded_input = scatter2scatter_lora_dX( DY=grad_out, W=expert_weights, sorted_expert_idxs=sorted_expert_idxs, sorted_scattered_idxs=sorted_scattered_idxs, k=1, lora_A=lora_A, lora_B=lora_B, scaling=scaling, dy_grouped=False, dx_grouped=grouped_in, out=d_expanded_input, ) else: # Fused dX only: read from pre-grouped DY d_expanded_input = scatter2scatter_lora_dX( DY=grouped_grad_out, W=expert_weights, sorted_expert_idxs=sorted_expert_idxs, sorted_scattered_idxs=sorted_scattered_idxs, k=1, lora_A=lora_A, lora_B=lora_B, scaling=scaling, dy_grouped=True, dx_grouped=grouped_in, out=d_expanded_input, ) else: # Original path: separate base scatter2scatter + LoRA Python loop d_expanded_input = base_ops.scatter2scatter( X=grouped_grad_out, x_grouped=True, W=expert_weights.permute(0, 2, 1), # [E, N, K] sorted_expert_idxs=sorted_expert_idxs, sorted_scattered_idxs=sorted_scattered_idxs, k=1, y_grouped=grouped_in, out=d_expanded_input, ) # LoRA part: dX_lora = scaling * (dY @ B) @ A if scaling != 0.0: d_input_lora_grouped = _compute_lora_input_grad( grouped_grad_out, lora_A, lora_B, expert_offsets, E, scaling, ) if grouped_in: d_expanded_input.add_(d_input_lora_grouped) else: # Scatter-add LoRA gradient directly into d_expanded_input. 
# Avoids allocating a zeros_like + add result d_expanded_input[sorted_scattered_idxs] += d_input_lora_grouped # Reduce over top-k if k > 1 if k == 1: d_input = d_expanded_input else: d_input = d_expanded_input.view( x.size(0), k, d_expanded_input.size(-1) ).sum(-2) # W is frozen during LoRA training -- skip weight gradient d_weights = ( torch.zeros_like(expert_weights) if expert_weights.requires_grad else None ) d_biases = None return ( d_input, d_weights, None, None, None, None, # k, sorted indices, offsets d_lora_A, d_lora_B, None, # lora_A, lora_B, scaling d_biases, d_gates, None, None, # grouped_in, grouped_out None, # use_fused_dX None, # use_fused_gather ) def _compute_lora_input_grad( grouped_grad_out: torch.Tensor, lora_A: torch.Tensor, lora_B: torch.Tensor, expert_offsets: torch.Tensor, E: int, scaling: float, ) -> torch.Tensor: """ Compute the LoRA contribution to the input gradient: dX_lora = scaling * (dY @ B) @ A Uses PyTorch ops on expert-grouped data. Each expert e: dX_e = scaling * (dY_e @ B_e) @ A_e """ R = lora_A.size(0) // E K = lora_A.size(1) M_total = grouped_grad_out.size(0) d_input_lora = torch.zeros( (M_total, K), device=grouped_grad_out.device, dtype=grouped_grad_out.dtype ) compute_dtype = grouped_grad_out.dtype prev_offset = 0 for e in range(E): curr_offset = expert_offsets[e].item() if curr_offset > prev_offset: dy_e = grouped_grad_out[prev_offset:curr_offset] # [M_e, N] a_e = lora_A[e * R : (e + 1) * R, :].to(compute_dtype) # [r, K] b_e = lora_B[:, e * R : (e + 1) * R].to(compute_dtype) # [N, r] # dX_e = scaling * (dY_e @ B_e) @ A_e dy_b = dy_e @ b_e # [M_e, r] dx_e = scaling * (dy_b @ a_e) # [M_e, K] d_input_lora[prev_offset:curr_offset] = dx_e prev_offset = curr_offset return d_input_lora # ============================================================================= # Helper: Extract LoRA params from PEFT ParamWrapper # ============================================================================= def 
def parallel_linear_lora(
    inputs: torch.Tensor,
    expert_weights: torch.Tensor,
    k: int,
    sorted_expert_idxs: torch.Tensor,
    sorted_scattered_idxs: torch.Tensor,
    expert_offsets: torch.Tensor,
    lora_A: Optional[torch.Tensor] = None,
    lora_B: Optional[torch.Tensor] = None,
    scaling: float = 1.0,
    expert_biases: Optional[torch.Tensor] = None,
    gates: Optional[torch.Tensor] = None,
    grouped_in: bool = False,
    grouped_out: bool = False,
    use_fused_dX: bool = False,
    use_fused_gather: bool = False,
):
    """
    LoRA-aware counterpart of ``parallel_linear``.

    With both ``lora_A`` and ``lora_B`` supplied the call goes through the
    fused ``ScatterMoELoRA`` autograd function.  If either is missing, the
    plain ``ParallelLinear`` function is used and ``scaling``,
    ``use_fused_dX`` and ``use_fused_gather`` are ignored.
    """
    has_adapter = lora_A is not None and lora_B is not None

    if not has_adapter:
        # Imported here, only on the no-LoRA path, as in the original layout.
        from .parallel_experts import ParallelLinear

        return ParallelLinear.apply(
            inputs,
            expert_weights,
            k,
            sorted_expert_idxs,
            sorted_scattered_idxs,
            expert_offsets,
            expert_biases,
            gates,
            grouped_in,
            grouped_out,
        )

    # autograd.Function.apply takes positional arguments only; the order
    # must match ScatterMoELoRA.forward.
    return ScatterMoELoRA.apply(
        inputs,
        expert_weights,
        k,
        sorted_expert_idxs,
        sorted_scattered_idxs,
        expert_offsets,
        lora_A,
        lora_B,
        scaling,
        expert_biases,
        gates,
        grouped_in,
        grouped_out,
        use_fused_dX,
        use_fused_gather,
    )
def _selective_dequant_bnb4(
    raw_param: torch.Tensor,
    quant_state,
    active_experts: torch.Tensor,
    expert_shape: tuple[int, int],
) -> torch.Tensor:
    """Dequantize only selected experts from BnB 4-bit packed data.

    The raw parameter is a flattened 4-bit packed tensor. Each expert's data
    is contiguous (stored in expert-major order), so we can gather the packed
    data and absmax blocks for active experts, then dequantize as one
    contiguous block.

    Per-expert slicing is only valid when every expert covers a whole number
    of quantization blocks.  When it does not (expert smaller than one block,
    or expert size not a multiple of the block size) the whole tensor is
    dequantized and then indexed.

    Args:
        raw_param: Flattened uint8 tensor of packed 4-bit weights
        quant_state: BnB QuantState with absmax, blocksize, code, etc.
        active_experts: [num_active] expert indices to dequantize
        expert_shape: (dim1, dim2) shape per expert (e.g. (1024, 2048))

    Returns:
        Dequantized weights [num_active, dim1, dim2] in original dtype
    """
    import bitsandbytes.functional as F  # noqa: N812
    from bitsandbytes.functional import QuantState

    blocksize = quant_state.blocksize
    expert_numel = expert_shape[0] * expert_shape[1]
    packed_per_expert = expert_numel // 2  # 4-bit = 2 values per byte
    blocks_per_expert = expert_numel // blocksize
    num_active = len(active_experts)

    if blocks_per_expert == 0 or expert_numel % blocksize != 0:
        # Quantization blocks span across expert boundaries (expert smaller
        # than one block, or not block-aligned), so slicing absmax per
        # expert would pick the wrong scales.
        # Fallback: full dequantize + index.
        full = F.dequantize_4bit(raw_param, quant_state)
        E_total = full.numel() // expert_numel
        return full.reshape(E_total, *expert_shape)[active_experts]

    # Handle nested (double) quantization: dequantize absmax first.
    # BnB uses dequantize_blockwise (not _4bit) for nested absmax + offset.
    # Both paths below need this, so compute it once.
    if quant_state.nested:
        absmax = F.dequantize_blockwise(quant_state.absmax, quant_state.state2)
        absmax += quant_state.offset
        if absmax.dtype != torch.float32:
            absmax = absmax.float()
    else:
        absmax = quant_state.absmax

    # Use fused Triton kernel for NF4 (handles selective gather + dequant in one pass)
    if quant_state.quant_type == "nf4" and raw_param.dtype == torch.uint8:
        from axolotl.integrations.kernels.libs.scattermoe_lora.selective_dequant_kernel import (
            selective_dequant_nf4_triton,
        )

        return selective_dequant_nf4_triton(
            packed_data=raw_param,
            absmax=absmax,
            active_experts=active_experts,
            expert_shape=expert_shape,
            blocksize=blocksize,
            dtype=quant_state.dtype,
            codebook=quant_state.code,
        )

    # Fallback: gather + BnB dequant (for fp4 or non-uint8 packed formats)
    device = raw_param.device
    expert_base = active_experts.long()[:, None]

    # Flat indices of every packed byte belonging to an active expert.
    offsets_qt = (
        expert_base * packed_per_expert
        + torch.arange(packed_per_expert, device=device)[None, :]
    ).reshape(-1)
    qt_gathered = raw_param.reshape(-1)[offsets_qt]

    # Flat indices of every absmax block belonging to an active expert.
    offsets_abs = (
        expert_base * blocks_per_expert
        + torch.arange(blocks_per_expert, device=device)[None, :]
    ).reshape(-1)
    absmax_gathered = absmax[offsets_abs]

    # The gathered bytes are passed to dequantize_4bit as a column.
    qt_gathered = qt_gathered.unsqueeze(1) if qt_gathered.dim() == 1 else qt_gathered

    # Fresh, non-nested QuantState describing only the gathered experts.
    gathered_qs = QuantState(
        absmax=absmax_gathered,
        shape=torch.Size([num_active * expert_numel]),
        blocksize=blocksize,
        quant_type=quant_state.quant_type,
        code=quant_state.code,
        dtype=quant_state.dtype,
    )

    deq = F.dequantize_4bit(qt_gathered, gathered_qs)
    return deq.reshape(num_active, *expert_shape)
Qwen3_5MoeExperts) param_name: "gate_up_proj" or "down_proj" active_experts: [num_active] sorted unique expert indices Returns: Compact weight tensor [num_active, dim1, dim2] ready for ScatterMoE """ # Check if the parameter is BnB-quantized via parametrize if ( hasattr(experts_module, "parametrizations") and param_name in experts_module.parametrizations ): param_list = experts_module.parametrizations[param_name] parametrization = param_list[0] # BnB 4-bit parametrization if hasattr(parametrization, "quant_state"): # The raw quantized data is on the ParametrizationList, not the # individual Bnb4bitParametrization module raw_param = param_list.original qs = parametrization.quant_state # qs.shape is the original tensor shape before flattening. # For MoE experts it's [E, d1, d2] (3D) or [total_elements] (1D). orig_shape = qs.shape if isinstance(orig_shape, torch.Size) and len(orig_shape) == 3: expert_shape = (orig_shape[1], orig_shape[2]) elif isinstance(orig_shape, torch.Size) and len(orig_shape) == 1: # Flattened — need to infer from module attributes E_total = getattr(experts_module, "num_experts", None) if E_total is None: E_total = int(active_experts.max().item()) + 1 expert_numel = orig_shape[0] // E_total d2 = getattr(experts_module, "hidden_dim", None) or getattr( experts_module, "intermediate_dim", None ) if d2 and expert_numel % d2 == 0: expert_shape = (expert_numel // d2, d2) else: full = getattr(experts_module, param_name) return full[active_experts] else: full = getattr(experts_module, param_name) return full[active_experts] return _selective_dequant_bnb4(raw_param, qs, active_experts, expert_shape) # Dense parameter (bf16/fp32) — direct indexing param = getattr(experts_module, param_name) if param.dim() == 3: return param[active_experts] # Fallback: full access return param def selective_lora_weights( lora_A: torch.Tensor, lora_B: torch.Tensor, active_experts: torch.Tensor, E: int, ) -> tuple[torch.Tensor, torch.Tensor]: """Select LoRA A and B weights 
for only the active experts. LoRA layout (scattermoe format): A: [r*E, K] — expert e occupies rows [e*r : (e+1)*r] B: [N, r*E] — expert e occupies cols [e*r : (e+1)*r] Returns compact: A: [r*num_active, K] B: [N, r*num_active] """ R = lora_A.size(0) // E # Vectorized gather: active_experts[:, None] * R + arange(R)[None, :] row_idx = ( active_experts.long()[:, None] * R + torch.arange(R, device=lora_A.device)[None, :] ).reshape(-1) compact_A = lora_A[row_idx] # [r*num_active, K] compact_B = lora_B[:, row_idx] # [N, r*num_active] return compact_A, compact_B ================================================ FILE: src/axolotl/integrations/kernels/libs/scattermoe_lora/selective_dequant_kernel.py ================================================ """ Triton kernel for fused selective expert gather + NF4 dequantization. Instead of: 1. Gather packed uint8 data for active experts (memory copy) 2. Gather absmax for active experts (memory copy) 3. Call BnB dequantize_4bit CUDA kernel This kernel does all three in one pass: - Reads packed NF4 bytes from expert-strided positions - Looks up the NF4 codebook - Multiplies by the per-block absmax - Writes bf16 output directly This eliminates the intermediate gather buffer entirely. 
""" import torch import triton import triton.language as tl # NF4 codebook (16 values, precomputed by BnB) # These are the normalized float4 reconstruction values NF4_CODEBOOK = [ -1.0, -0.6961928009986877, -0.5250730514526367, -0.39491748809814453, -0.28444138169288635, -0.18477343022823334, -0.09105003625154495, 0.0, 0.07958029955625534, 0.16093020141124725, 0.24611230194568634, 0.33791524171829224, 0.44070982933044434, 0.5626170039176941, 0.7229568362236023, 1.0, ] @triton.jit def _selective_dequant_nf4_kernel( # Input: packed NF4 data (flattened, expert-major order) packed_ptr, # Input: absmax values (flattened, expert-major order) absmax_ptr, # Input: active expert indices active_experts_ptr, # Input: NF4 codebook (16 float values) codebook_ptr, # Output: dequantized bf16 weights [num_active, expert_numel] out_ptr, stride_out_e, # stride for expert dim in output # Dimensions num_active, packed_per_expert, # expert_numel // 2 blocks_per_expert, # expert_numel // blocksize blocksize: tl.constexpr, # Tile size BLOCK_SIZE: tl.constexpr, # elements per thread block (must be multiple of 2) ): """ Each program processes BLOCK_SIZE elements from one expert. Grid: (num_active, cdiv(expert_numel, BLOCK_SIZE)) For each output element: 1. Compute which byte in packed data contains this element 2. Extract the 4-bit nibble (high or low) 3. Look up in NF4 codebook 4. 
Scale by absmax for this block """ expert_local_idx = tl.program_id(0) # which active expert (0..num_active-1) block_id = tl.program_id(1) # which element block # Load the global expert index expert_global = tl.load(active_experts_ptr + expert_local_idx).to(tl.int64) expert_numel = packed_per_expert * 2 # 2 elements per packed byte elem_offset = block_id * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) mask = elem_offset < expert_numel # Each element is packed as: byte[i//2], low nibble for even i, high for odd i byte_idx = elem_offset // 2 is_high = (elem_offset % 2) == 1 # Read packed bytes from the global expert's region packed_global_offset = expert_global * packed_per_expert + byte_idx packed_bytes = tl.load(packed_ptr + packed_global_offset, mask=mask, other=0).to( tl.int32 ) # Extract 4-bit nibble # BnB packing: high nibble = even element, low nibble = odd element nibble = tl.where(is_high, packed_bytes & 0xF, (packed_bytes >> 4) & 0xF) # NF4 codebook lookup # Load all 16 codebook values (small, fits in registers) # Use gather from codebook pointer code_val = tl.load(codebook_ptr + nibble, mask=mask, other=0.0) # Load absmax for this element's quantization block block_idx = elem_offset // blocksize absmax_global_offset = expert_global * blocks_per_expert + block_idx absmax_val = tl.load(absmax_ptr + absmax_global_offset, mask=mask, other=1.0) # Dequantize: value = codebook[nibble] * absmax result = code_val * absmax_val # Store to output out_offset = expert_local_idx * stride_out_e + elem_offset tl.store(out_ptr + out_offset, result.to(out_ptr.dtype.element_ty), mask=mask) def selective_dequant_nf4_triton( packed_data: torch.Tensor, absmax: torch.Tensor, active_experts: torch.Tensor, expert_shape: tuple[int, int], blocksize: int, dtype: torch.dtype = torch.bfloat16, codebook: torch.Tensor | None = None, ) -> torch.Tensor: """Fused selective gather + NF4 dequantization via Triton kernel. 
Args: packed_data: Flattened packed NF4 data [total_packed] or [total_packed, 1] absmax: Per-block scaling factors [total_blocks] active_experts: Sorted indices of experts to dequantize [num_active] expert_shape: (dim1, dim2) per expert blocksize: Quantization block size dtype: Output dtype (default bf16) codebook: NF4 lookup table [16] (uses default NF4 codebook if None) Returns: Dequantized weights [num_active, dim1, dim2] """ num_active = active_experts.shape[0] expert_numel = expert_shape[0] * expert_shape[1] packed_per_expert = expert_numel // 2 blocks_per_expert = expert_numel // blocksize # Prepare codebook on device if codebook is None: codebook = torch.tensor( NF4_CODEBOOK, dtype=torch.float32, device=packed_data.device ) else: codebook = codebook.to(device=packed_data.device, dtype=torch.float32) # Flatten inputs packed_flat = packed_data.reshape(-1) absmax_flat = absmax.reshape(-1).float() # absmax is usually fp32 # Output buffer out = torch.empty(num_active, expert_numel, dtype=dtype, device=packed_data.device) BLOCK_SIZE = 1024 # Process 1024 elements per thread block grid = (num_active, triton.cdiv(expert_numel, BLOCK_SIZE)) _selective_dequant_nf4_kernel[grid]( packed_flat, absmax_flat, active_experts, codebook, out, out.stride(0), num_active=num_active, packed_per_expert=packed_per_expert, blocks_per_expert=blocks_per_expert, blocksize=blocksize, BLOCK_SIZE=BLOCK_SIZE, ) return out.reshape(num_active, *expert_shape) ================================================ FILE: src/axolotl/integrations/kernels/plugin.py ================================================ import importlib import os from pathlib import Path import torch from axolotl.integrations.base import BasePlugin from axolotl.utils.logging import get_logger LOG = get_logger(__name__) def _check_sonicmoe_gpu_compat(): """Validate GPU compute capability for SonicMoE and configure env. Supported: Hopper (sm_90), Blackwell (sm_100 - sm_103). B300 (sm_103) additionally requires Triton 3.6.0. 
""" if not torch.cuda.is_available(): return cc = torch.cuda.get_device_capability() if cc < (9, 0): raise RuntimeError( f"SonicMoE requires Hopper (sm_90) or Blackwell (sm_100+) GPU, " f"but detected sm_{cc[0]}{cc[1]}." ) if cc > (10, 3): raise RuntimeError( f"SonicMoE does not yet support sm_{cc[0]}{cc[1]}. " f"Supported: Hopper (sm_90) and Blackwell (sm_100 - sm_103)." ) # Blackwell (sm_100+): enable QuACK GEMM kernels if cc >= (10, 0): os.environ.setdefault("USE_QUACK_GEMM", "1") LOG.info( f"Blackwell GPU (sm_{cc[0]}{cc[1]}) detected, enabling USE_QUACK_GEMM=1" ) # B300 (sm_103): requires Triton 3.6.0 if cc == (10, 3): triton_spec = importlib.util.find_spec("triton") if triton_spec is None: raise RuntimeError( "B300 (sm_103) requires Triton 3.6.0, but Triton is not installed." ) import triton triton_version = tuple(int(x) for x in triton.__version__.split(".")[:2]) if triton_version != (3, 6): raise RuntimeError( f"B300 (sm_103) requires Triton 3.6.x, but found {triton.__version__}." ) class KernelsPlugin(BasePlugin): def get_input_args(self): return "axolotl.integrations.kernels.KernelsArgs" def pre_model_load(self, cfg): from axolotl.integrations.kernels.constants import SPARSE_MOE_BLOCK # Prefer text backbone type for VLMs, but fall back to base type # when the text type isn't in the supported mapping (e.g. qwen3_5_moe_text) moe_model_type = cfg.model_config_type_text or cfg.model_config_type if ( moe_model_type not in SPARSE_MOE_BLOCK and cfg.model_config_type in SPARSE_MOE_BLOCK ): moe_model_type = cfg.model_config_type if cfg.use_scattermoe: self._register_kernels() self._kernelize_model(moe_model_type) elif cfg.use_sonicmoe: if not importlib.util.find_spec("sonicmoe"): raise RuntimeError( "SonicMoE is not installed. 
See installation instructions at " "https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/integrations/kernels/README.md#sonicmoe-installation" ) _check_sonicmoe_gpu_compat() from axolotl.integrations.kernels.sonicmoe import patch_sonicmoe LOG.info(f"Applying SonicMoE patches for model type: {moe_model_type}") patch_sonicmoe( moe_model_type, torch_compile=bool(getattr(cfg, "torch_compile", False)), ) def _register_kernels(self): from kernels import ( LocalLayerRepository, Mode, register_kernel_mapping, ) plugin_root = Path(__file__).parent register_kernel_mapping( { "HFScatterMoEParallelExperts": { "cuda": { Mode.TRAINING: LocalLayerRepository( repo_path=plugin_root / "libs" / "scattermoe_lora", package_name="scattermoe_lora", layer_name="HFScatterMoEGatedMLP", ), Mode.INFERENCE: LocalLayerRepository( repo_path=plugin_root / "libs" / "scattermoe_lora", package_name="scattermoe_lora", layer_name="HFScatterMoEGatedMLP", ), }, } } ) def add_callbacks_pre_trainer(self, cfg, model): callbacks = [] if cfg.use_scattermoe: from axolotl.integrations.kernels.autotune_callback import ( AutotuneReportCallback, ) callbacks.append(AutotuneReportCallback()) return callbacks def _kernelize_model(self, model_type: str): from kernels import replace_kernel_forward_from_hub from axolotl.integrations.kernels.constants import resolve_moe_block_classes for model_moe_cls in resolve_moe_block_classes(model_type): replace_kernel_forward_from_hub( model_moe_cls, "HFScatterMoEParallelExperts" ) ================================================ FILE: src/axolotl/integrations/kernels/sonicmoe/__init__.py ================================================ from .patch import patch_sonicmoe __all__ = ["patch_sonicmoe"] ================================================ FILE: src/axolotl/integrations/kernels/sonicmoe/patch.py ================================================ """ SonicMoE patching for SparseMoeBlock forward pass. 
def patch_sonicmoe(model_type: str, torch_compile: bool = False):
    """Main entry point: patch SparseMoeBlock for SonicMoE support.

    Args:
        model_type: The HuggingFace model type (e.g. "qwen3_moe").
        torch_compile: If True, wrap routing functions with torch.compile
            for kernel fusion (fuses softmax+topk+renorm into fewer launches).
    """
    from .routing import get_model_moe_config
    from .weight_converter import register_sonicmoe_weight_converter

    routing_fn, activation, router_attr = get_model_moe_config(model_type)

    # Only the general path has a Python routing function to compile.
    if torch_compile and routing_fn is not None:
        routing_fn = _try_compile_routing(routing_fn)

    for moe_cls in resolve_moe_block_classes(model_type):
        _patch_forward(moe_cls, routing_fn, activation, router_attr)

    register_sonicmoe_weight_converter(model_type)


def _try_compile_routing(routing_fn):
    """Attempt to torch.compile the routing function, fall back to eager on failure.

    Returns the compiled callable, or ``routing_fn`` unchanged if wrapping it
    raised.
    """
    # Resolve the display name once, tolerating callables without __name__
    # (e.g. functools.partial) so the fallback path cannot itself raise.
    fn_name = getattr(routing_fn, "__name__", repr(routing_fn))
    try:
        # NOTE: torch.compile defers tracing to the first call, so this guard
        # only catches failures raised while creating the wrapper.
        compiled_fn = torch.compile(routing_fn, mode="reduce-overhead", dynamic=False)
    except Exception as exc:  # pylint: disable=broad-except
        LOG.warning(
            f"torch.compile failed for routing function {fn_name}, "
            f"falling back to eager: {exc}"
        )
        return routing_fn
    LOG.info(f"torch.compile enabled for routing function: {fn_name}")
    return compiled_fn


def _patch_forward(moe_cls, routing_fn, activation, router_attr):
    """Monkeypatch the SparseMoeBlock class with a SonicMoE forward.

    The patched forward handles shared experts generically: if
    ``self.shared_expert``, ``self.shared_experts`` or ``self.shared_mlp``
    exists, it is computed and added to the routed output. If
    ``self.shared_expert_gate`` also exists, it applies sigmoid gating to the
    shared expert contribution (as in qwen2_moe).

    The previous ``forward`` is kept on the class as ``_original_forward``;
    its presence is also what marks a class as already patched.

    Args:
        moe_cls: The SparseMoeBlock class to patch.
        routing_fn: Routing function (e.g. softmax_topk_routing), or None
            for the fused moe_TC_softmax_topk_layer path.
        activation: SonicMoE ActivationType enum value.
        router_attr: Name of the router module attribute on the MoE block.
    """
    if hasattr(moe_cls, "_original_forward"):
        LOG.info(f"{moe_cls.__name__}.forward already patched with SonicMoE, skipping")
        return

    original_forward = moe_cls.forward

    if routing_fn is not None:
        _make_general_forward(moe_cls, routing_fn, activation)
    else:
        _make_fused_forward(moe_cls, activation, router_attr)

    moe_cls._original_forward = original_forward
    LOG.info(f"Patched {moe_cls.__name__}.forward with SonicMoE implementation")


def _apply_shared_expert_gate(moe_block, hidden_states_flat, shared_expert_output):
    """Scale the shared-expert output by its sigmoid gate when the block has one."""
    if hasattr(moe_block, "shared_expert_gate"):
        return (
            F.sigmoid(moe_block.shared_expert_gate(hidden_states_flat))
            * shared_expert_output
        )
    return shared_expert_output


def _make_general_forward(moe_cls, routing_fn, activation):
    """Create forward using routing_fn + moe_general_routing_inputs."""

    def sonicmoe_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        from sonicmoe import moe_general_routing_inputs

        batch_size, sequence_length, hidden_dim = hidden_states.shape
        hidden_states_flat = hidden_states.view(-1, hidden_dim)

        # Shared expert (computed early, matching original model ordering)
        shared_expert_output = _compute_shared_expert(self, hidden_states_flat)

        # Routing
        router_scores, token_indices, expert_indices, _router_logits = routing_fn(
            hidden_states_flat, self
        )

        # Permute weights to SonicMoE layout:
        #   gate_up: [E, 2*I, H] -> [2*I, H, E]
        #   down:    [E, H, I]   -> [H, I, E]
        gate_up_weight = self.experts.gate_up_proj.permute(1, 2, 0)
        down_weight = self.experts.down_proj.permute(1, 2, 0)
        E = gate_up_weight.shape[-1]

        output, _ = moe_general_routing_inputs(
            hidden_states_flat,
            router_scores,
            token_indices,
            expert_indices,
            gate_up_weight,
            None,  # b1 (no gate/up bias)
            down_weight,
            None,  # b2 (no down bias)
            E,
            torch.cuda.current_stream().cuda_stream,
            activation,
            False,  # is_inference_mode
        )

        # Add shared expert contribution if present
        if shared_expert_output is not None:
            output = output + _apply_shared_expert_gate(
                self, hidden_states_flat, shared_expert_output
            )

        return output.view(batch_size, sequence_length, hidden_dim)

    moe_cls.forward = sonicmoe_forward


def _make_fused_forward(moe_cls, activation, router_attr):
    """Create forward using moe_TC_softmax_topk_layer (topk -> softmax)."""

    def sonicmoe_fused_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        from sonicmoe import moe_TC_softmax_topk_layer

        batch_size, sequence_length, hidden_dim = hidden_states.shape
        hidden_states_flat = hidden_states.view(-1, hidden_dim)

        # Shared expert (computed early, matching original model ordering)
        shared_expert_output = _compute_shared_expert(self, hidden_states_flat)

        router = getattr(self, router_attr)

        # Permute weights to SonicMoE layout:
        #   gate_up: [E, 2*I, H] -> [2*I, H, E]
        #   down:    [E, H, I]   -> [H, I, E]
        gate_up_weight = self.experts.gate_up_proj.permute(1, 2, 0)
        down_weight = self.experts.down_proj.permute(1, 2, 0)

        output, _router_logits, _expert_freq = moe_TC_softmax_topk_layer(
            hidden_states_flat,
            router.weight,
            gate_up_weight,
            None,  # b1 (no gate/up bias)
            down_weight,
            None,  # b2 (no down bias)
            router.top_k,
            torch.cuda.current_stream().cuda_stream,
            activation,
            False,  # is_inference_mode
        )

        # Add shared expert contribution if present
        if shared_expert_output is not None:
            output = output + _apply_shared_expert_gate(
                self, hidden_states_flat, shared_expert_output
            )

        return output.view(batch_size, sequence_length, hidden_dim)

    moe_cls.forward = sonicmoe_fused_forward


def _compute_shared_expert(moe_block, hidden_states_flat):
    """Compute shared expert output if the block has one.

    Handles singular (qwen2_moe: ``shared_expert``), plural
    (glm_moe_dsa/deepseek_v3: ``shared_experts``), and MLP
    (hunyuan_v1_moe: ``shared_mlp``) attribute names, in that order.

    Returns:
        The shared expert's output for ``hidden_states_flat``, or ``None``
        when none of the attributes is set.
    """
    # Compare against None explicitly: chaining ``or`` would skip a module
    # that is present but falsy (e.g. a container type defining ``__len__``).
    for attr_name in ("shared_expert", "shared_experts", "shared_mlp"):
        shared_expert = getattr(moe_block, attr_name, None)
        if shared_expert is not None:
            return shared_expert(hidden_states_flat)
    return None
""" from sonicmoe.enums import ActivationType if model_type in ( "qwen2_moe", "qwen3_moe", "qwen3_5_moe", "qwen3_next", "qwen3_vl_moe", "qwen3_omni_moe", "olmoe", "mixtral", "minimax", ): return softmax_topk_routing, ActivationType.SWIGLU, "gate" elif model_type in ("mistral4",): return softmax_group_topk_routing, ActivationType.SWIGLU, "gate" elif model_type in ( "glm_moe_dsa", "deepseek_v3", "glm4_moe", "glm4_moe_lite", "glm4v_moe", "minimax_m2", ): return sigmoid_topk_routing, ActivationType.SWIGLU, "gate" # elif model_type in ("ernie4_5_moe",): # # Softmax→topk with e_score_correction_bias applied between softmax and topk. # return ..., ActivationType.SWIGLU, "gate" # elif model_type in ("deepseek_v2",): # # Softmax→topk with group_limited_greedy. Different attr names: num_group # # (not n_group), gate is nn.Linear (not a router class). # return ..., ActivationType.SWIGLU, "gate" # elif model_type in ("hunyuan_v1_moe",): # # Softmax→topk but gate structure differs: gate.wg (not gate.weight), # # top_k on block not gate, creates scatter routing matrix. # return ..., ActivationType.SWIGLU, "gate" # Fused topk -> softmax path (routing_fn=None): # elif model_type in ("gpt_oss",): # # NOTE: gpt_oss has a router bias which moe_TC_softmax_topk_layer # # ignores (it only takes router_w, not bias). Also has transposed # # weight layout [E, H, 2*I] and custom GLU activation. # return None, ActivationType.SWIGLU, "router" else: raise ValueError(f"SonicMoE: unsupported model type '{model_type}'") def softmax_topk_routing( hidden_states: torch.Tensor, moe_block ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """Qwen3/Qwen2-style routing: softmax -> topk -> optional renorm. 
Args: hidden_states: [T, H] flattened token representations moe_block: MoE block module (accesses moe_block.gate.*) Returns: router_scores: [T*K] flattened scores (float32) token_indices: [T*K] which token each entry belongs to (int32), sorted ascending expert_indices: [T*K] which expert (int32) router_logits: [T, E] original logits for aux loss """ gate = moe_block.gate T, H = hidden_states.shape K = gate.top_k # Compute router logits and softmax over all experts router_logits = F.linear(hidden_states, gate.weight) # [T, E] router_probs = F.softmax(router_logits, dim=-1, dtype=torch.float32) # [T, E] # Select top-k experts per token top_values, top_indices = torch.topk(router_probs, K, dim=-1) # [T, K] each # Renormalize if configured (default True for models without the attribute, # e.g. Mixtral/MiniMax which always normalize) if getattr(gate, "norm_topk_prob", True): top_values = top_values / top_values.sum(dim=-1, keepdim=True) # no-op: matches transformers which casts to softmax output dtype (float32). # top_values = top_values.to(router_probs.dtype) # Flatten for moe_general_routing_inputs. # Token indices are naturally sorted ascending from the [T, K] layout: # [0, 0, ..., 1, 1, ..., T-1, T-1, ...] — this is required by SonicMoE. # Expert sorting is handled internally by general_routing_router_metadata. 
token_indices = ( torch.arange(T, device=hidden_states.device, dtype=torch.int32) .unsqueeze(1) .expand(T, K) ) flat_scores = top_values.reshape(-1) # [T*K] flat_token_idx = token_indices.reshape(-1) # [T*K] flat_expert_idx = top_indices.to(torch.int32).reshape(-1) # [T*K] return flat_scores, flat_token_idx, flat_expert_idx, router_logits def softmax_group_topk_routing( hidden_states: torch.Tensor, moe_block ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """Mistral4-style routing: softmax -> group selection -> topk -> renorm -> scale.""" gate = moe_block.gate T, H = hidden_states.shape K = moe_block.top_k E = getattr(moe_block, "n_routed_experts", gate.weight.shape[0]) n_group = getattr(moe_block, "n_group", 1) router_logits = F.linear(hidden_states, gate.weight) # [T, E] router_probs = F.softmax(router_logits, dim=-1, dtype=torch.float32) # [T, E] scores_for_choice = router_probs # Group selection: pick top groups, mask the rest if n_group > 1: group_scores = ( scores_for_choice.view(-1, n_group, E // n_group) .topk(2, dim=-1)[0] .sum(dim=-1) ) group_idx = torch.topk( group_scores, k=moe_block.topk_group, dim=-1, sorted=False )[1] group_mask = torch.zeros_like(group_scores) group_mask.scatter_(1, group_idx, 1) score_mask = ( group_mask.unsqueeze(-1).expand(-1, n_group, E // n_group).reshape(-1, E) ) scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0) topk_indices = torch.topk(scores_for_choice, k=K, dim=-1, sorted=False)[1] topk_weights = router_probs.gather(1, topk_indices) # Renormalization + scaling norm_topk_prob = getattr(moe_block, "norm_topk_prob", True) if norm_topk_prob: topk_weights = topk_weights / (topk_weights.sum(dim=-1, keepdim=True) + 1e-20) routed_scaling_factor = getattr(moe_block, "routed_scaling_factor", 1.0) topk_weights = topk_weights * routed_scaling_factor # Flatten for moe_general_routing_inputs token_indices = ( torch.arange(T, device=hidden_states.device, dtype=torch.int32) .unsqueeze(1) 
.expand(T, K) ) flat_scores = topk_weights.to(torch.float32).reshape(-1) # [T*K] flat_token_idx = token_indices.reshape(-1) # [T*K] flat_expert_idx = topk_indices.to(torch.int32).reshape(-1) # [T*K] return flat_scores, flat_token_idx, flat_expert_idx, router_logits def sigmoid_topk_routing( hidden_states: torch.Tensor, moe_block ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """Sigmoid-based routing: sigmoid -> optional group selection -> topk. Supports two variants: - **Group selection** (glm_moe_dsa, deepseek_v3, etc.): n_group > 1, bias on gate, group-based masking before topk. - **No group selection** (minimax_m2): n_group == 1 (or absent), bias on moe_block, straight topk from all experts. Final routing weights come from the original sigmoid scores (not bias-corrected), with optional renormalization and scaling. Args: hidden_states: [T, H] flattened token representations moe_block: MoE block module (accesses moe_block.gate.* and optional moe_block.n_group, .topk_group, .top_k, .norm_topk_prob, .routed_scaling_factor, .n_routed_experts) Returns: router_scores: [T*K] flattened scores (float32) token_indices: [T*K] which token each entry belongs to (int32), sorted ascending expert_indices: [T*K] which expert (int32) router_logits: [T, E] original logits for aux loss """ gate = moe_block.gate T, H = hidden_states.shape K = moe_block.top_k E = getattr(moe_block, "n_routed_experts", gate.weight.shape[0]) n_group = getattr(moe_block, "n_group", 1) # Compute router logits and sigmoid probabilities router_logits = F.linear(hidden_states.float(), gate.weight.float()) # [T, E] router_probs = router_logits.sigmoid() # [T, E] # Bias-corrected scores for expert selection (not used for final weights). # glm_moe_dsa/deepseek_v3 store the bias on gate; minimax_m2 stores it on the block. 
e_score_correction_bias = getattr(gate, "e_score_correction_bias", None) if e_score_correction_bias is None: e_score_correction_bias = getattr(moe_block, "e_score_correction_bias", None) if e_score_correction_bias is None: raise AttributeError( f"sigmoid_topk_routing requires e_score_correction_bias on " f"gate ({type(gate)}) or moe_block ({type(moe_block)}), but neither has it" ) scores_for_choice = router_probs + e_score_correction_bias # Group-based selection: pick top groups, mask the rest (skip when n_group == 1) if n_group > 1: group_scores = ( scores_for_choice.view(-1, n_group, E // n_group) .topk(2, dim=-1)[0] .sum(dim=-1) ) # [T, n_group] group_idx = torch.topk( group_scores, k=moe_block.topk_group, dim=-1, sorted=False )[1] group_mask = torch.zeros_like(group_scores) group_mask.scatter_(1, group_idx, 1) score_mask = ( group_mask.unsqueeze(-1).expand(-1, n_group, E // n_group).reshape(-1, E) ) scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0) # Final topk from (possibly masked) scores topk_indices = torch.topk(scores_for_choice, k=K, dim=-1, sorted=False)[1] # Gather weights from original sigmoid scores (not bias-corrected) topk_weights = router_probs.gather(1, topk_indices) # Optional renormalization + scaling norm_topk_prob = getattr(moe_block, "norm_topk_prob", True) if norm_topk_prob: topk_weights = topk_weights / (topk_weights.sum(dim=-1, keepdim=True) + 1e-20) routed_scaling_factor = getattr(moe_block, "routed_scaling_factor", 1.0) topk_weights = topk_weights * routed_scaling_factor # Flatten for moe_general_routing_inputs. # Token indices are naturally sorted ascending from the [T, K] layout. 
def interleave_gate_up(tensor: torch.Tensor, dim: int = -2) -> torch.Tensor:
    """[gate..., up...] -> [g0, u0, g1, u1, ...] along the 2*I dimension.

    Args:
        tensor: Weight whose ``dim`` axis holds the gate rows followed by the
            up rows (even length). Must have at least two dimensions.
        dim: Axis carrying the stacked ``2*I`` rows. Defaults to ``-2``, the
            axis this helper has always operated on.

    Returns:
        A tensor of the same shape with gate/up rows alternating along ``dim``.
    """
    # Move the target axis to position -2 so a single einops pattern serves
    # every ``dim``; for the default this is a no-op view.
    moved = tensor.movedim(dim, -2)
    interleaved = rearrange(moved, "... (two out) h -> ... (out two) h", two=2)
    return interleaved.movedim(-2, dim)


def deinterleave_gate_up(tensor: torch.Tensor, dim: int = -2) -> torch.Tensor:
    """[g0, u0, g1, u1, ...] -> [gate..., up...] along the 2*I dimension.

    Exact inverse of :func:`interleave_gate_up` for the same ``dim``.

    Args:
        tensor: Weight whose ``dim`` axis alternates gate and up rows.
        dim: Axis carrying the interleaved ``2*I`` rows. Defaults to ``-2``.

    Returns:
        A tensor of the same shape with all gate rows first, then all up rows.
    """
    moved = tensor.movedim(dim, -2)
    concatenated = rearrange(moved, "... (out two) h -> ... (two out) h", two=2)
    return concatenated.movedim(-2, dim)


def _resolve_target_pattern(
    input_dict: dict[str, Any],
    source_patterns: list[str],
    target_patterns: list[str],
) -> str:
    """Pick the output key for a single-tensor conversion op.

    Shared by both gate/up layout ops; per the original comment this follows
    the same logic as ``Transpose.get_target_pattern`` in transformers.

    Raises:
        ValueError: If more than one input entry is supplied, or if several
            target patterns exist without a unique source pattern.
    """
    if len(input_dict) != 1:
        raise ValueError("Undefined Operation encountered!")
    if len(target_patterns) > 1:
        # Several targets: only unambiguous when there is exactly one source,
        # in which case the source pattern doubles as the output key.
        if len(source_patterns) == 1:
            return source_patterns[0]
        raise ValueError("Undefined Operation encountered!")
    return target_patterns[0]


def _single_tensor(input_dict: dict[str, Any]) -> torch.Tensor:
    """Return the tensor held by the first entry, unwrapping a list if needed."""
    tensors = next(iter(input_dict.values()))
    return tensors[0] if isinstance(tensors, list) else tensors


class ConcatenatedToInterleaved(ConversionOps):
    """Convert concatenated gate/up projections to interleaved format.

    Input:  [E, 2*I, H] with gate=[E, :I, H] and up=[E, I:, H]
    Output: [E, 2*I, H] with rows alternating [g0, u0, g1, u1, ...]

    This operation is applied along ``dim`` (default 1, the 2*I dimension).
    For the documented 3-D layout ``dim=1`` is the same axis as ``-2``.
    """

    def __init__(self, dim: int = 1):
        self.dim = dim

    @torch.no_grad()
    def convert(
        self,
        input_dict: dict[str, Any],
        source_patterns: list[str],
        target_patterns: list[str],
        **kwargs,
    ) -> dict[str, torch.Tensor]:
        """Interleave the single input tensor and return it under the target key."""
        target_pattern = self._get_target_pattern(
            input_dict, source_patterns, target_patterns
        )
        tensor = _single_tensor(input_dict)
        # Honour the configured axis instead of silently assuming axis -2.
        return {target_pattern: interleave_gate_up(tensor, dim=self.dim)}

    def _get_target_pattern(
        self,
        input_dict: dict[str, Any],
        source_patterns: list[str],
        target_patterns: list[str],
    ) -> str:
        return _resolve_target_pattern(input_dict, source_patterns, target_patterns)

    @property
    def reverse_op(self) -> ConversionOps:
        """Inverse op (used when reverting the layout), sharing the same ``dim``."""
        return InterleavedToConcatenated(self.dim)


class InterleavedToConcatenated(ConversionOps):
    """Convert interleaved gate/up projections back to concatenated format.

    Input:  [E, 2*I, H] with rows alternating [g0, u0, g1, u1, ...]
    Output: [E, 2*I, H] with gate=[E, :I, H] and up=[E, I:, H]

    This is the reverse of ``ConcatenatedToInterleaved`` and is applied along
    the same ``dim`` (default 1, the 2*I dimension).
    """

    def __init__(self, dim: int = 1):
        self.dim = dim

    @torch.no_grad()
    def convert(
        self,
        input_dict: dict[str, Any],
        source_patterns: list[str],
        target_patterns: list[str],
        **kwargs,
    ) -> dict[str, torch.Tensor]:
        """De-interleave the single input tensor and return it under the target key."""
        target_pattern = self._get_target_pattern(
            input_dict, source_patterns, target_patterns
        )
        tensor = _single_tensor(input_dict)
        return {target_pattern: deinterleave_gate_up(tensor, dim=self.dim)}

    def _get_target_pattern(
        self,
        input_dict: dict[str, Any],
        source_patterns: list[str],
        target_patterns: list[str],
    ) -> str:
        return _resolve_target_pattern(input_dict, source_patterns, target_patterns)

    @property
    def reverse_op(self) -> ConversionOps:
        """Inverse op, sharing the same ``dim``."""
        return ConcatenatedToInterleaved(self.dim)
def register_sonicmoe_weight_converter(model_type: str):
    """Add the gate/up interleave step to ``model_type``'s checkpoint conversion.

    Looks up the registered conversion mapping, finds the first converter that
    has an ``operations`` list and targets ``gate_up_proj``, and appends a
    ``ConcatenatedToInterleaved(dim=1)`` op to it. For qwen3_moe the chain
    then reads:
        MergeModulelist(dim=0) -> Concatenate(dim=1) -> ConcatenatedToInterleaved(dim=1)

    The reverse is auto-generated for saving:
        InterleavedToConcatenated(dim=1) -> Chunk(dim=1) -> SplitModulelist(dim=0)

    Nothing is registered (only a log line is emitted) when no mapping exists,
    when no gate_up_proj converter is found, or when the op is already present.
    """
    from transformers.conversion_mapping import (
        get_checkpoint_conversion_mapping,
        register_checkpoint_conversion_mapping,
    )

    mapping = get_checkpoint_conversion_mapping(model_type)
    if mapping is None:
        LOG.warning(
            f"No conversion mapping found for model type '{model_type}'. "
            "SonicMoE weight interleaving will not be applied during checkpoint loading."
        )
        return

    # First converter with an op chain that writes a gate_up_proj target.
    gate_up_converter = next(
        (
            candidate
            for candidate in mapping
            if hasattr(candidate, "operations")
            and any("gate_up_proj" in pat for pat in candidate.target_patterns)
        ),
        None,
    )
    if gate_up_converter is None:
        LOG.warning(
            f"Could not find gate_up_proj converter for model type '{model_type}'. "
            "SonicMoE weight interleaving will not be applied during checkpoint loading."
        )
        return

    # Guard against double registration (e.g. plugin reloaded)
    already_present = any(
        isinstance(op, ConcatenatedToInterleaved)
        for op in gate_up_converter.operations
    )
    if already_present:
        LOG.info(f"SonicMoE weight converter already registered for '{model_type}'")
        return

    gate_up_converter.operations.append(ConcatenatedToInterleaved(dim=1))
    register_checkpoint_conversion_mapping(model_type, mapping, overwrite=True)
    LOG.info(f"Registered SonicMoE weight converter for model type '{model_type}'")
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: src/axolotl/integrations/liger/README.md ================================================ # Liger Kernel Integration Liger Kernel provides efficient Triton kernels for LLM training, offering: - 20% increase in multi-GPU training throughput - 60% reduction in memory usage - Compatibility with both FSDP and DeepSpeed See https://github.com/linkedin/Liger-Kernel ## Usage ```yaml plugins: - axolotl.integrations.liger.LigerPlugin liger_rope: true liger_rms_norm: true liger_glu_activation: true liger_layer_norm: true liger_fused_linear_cross_entropy: true # FLCE-specific liger_use_token_scaling: true ``` ## Supported Models - deepseek_v2 - gemma - gemma2 - gemma3 - granite - jamba - llama - mistral - mixtral - mllama - mllama_text_model - olmo2 - paligemma - phi3 - qwen2 - qwen2_5_vl - qwen2_vl ## Citation ```bib @article{hsu2024ligerkernelefficienttriton, title={Liger Kernel: Efficient Triton Kernels for LLM Training}, author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen}, year={2024}, eprint={2410.10989}, archivePrefix={arXiv}, primaryClass={cs.LG}, url={https://arxiv.org/abs/2410.10989}, journal={arXiv preprint arXiv:2410.10989}, } ``` ================================================ FILE: src/axolotl/integrations/liger/__init__.py 
class LigerArgs(BaseModel):
    """
    Input args for LIGER.

    Every flag is optional (``None`` means "not set"). The validators below
    reject flag combinations that are known not to work together and migrate
    the deprecated ``liger_swiglu`` flag to ``liger_glu_activation``.
    """

    liger_rope: bool | None = None
    liger_rms_norm: bool | None = None
    liger_layer_norm: bool | None = None
    # Deprecated alias of ``liger_glu_activation``; see check_deprecated_swiglu.
    liger_swiglu: bool | None = None
    liger_glu_activation: bool | None = None
    liger_cross_entropy: bool | None = None
    liger_fused_linear_cross_entropy: bool | None = None
    liger_use_token_scaling: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": (
                "Enables use_token_scaling in fused_linear_cross_entropy. "
                "When True, each token's loss is multiplied by its predicted probability (detached from gradients)."
            )
        },
    )

    @model_validator(mode="before")
    @classmethod
    def check_deprecated_swiglu(cls, data):
        """Map deprecated ``liger_swiglu`` onto ``liger_glu_activation``.

        Raises:
            ValueError: If both the deprecated and the new flag are set.
        """
        if data.get("liger_swiglu") is not None:
            if data.get("liger_glu_activation") is not None:
                raise ValueError(
                    "You cannot have both `liger_swiglu` and `liger_glu_activation` set."
                )

            LOG.warning(
                "The 'liger_swiglu' argument is deprecated and will be removed in a future release. "
                "Please use 'liger_glu_activation' instead."
            )
            data["liger_glu_activation"] = data.pop("liger_swiglu")
        return data

    @model_validator(mode="before")
    @classmethod
    def check_tiled_mlp_conflict(cls, data):
        """Reject ``liger_glu_activation`` + ``tiled_mlp`` unless the original MLP is kept."""
        if (
            data.get("liger_glu_activation") is True
            and data.get("tiled_mlp") is True
            and not data.get("tiled_mlp_use_original_mlp")
        ):
            raise ValueError(
                "You cannot have both `liger_glu_activation` and `tiled_mlp` set without `tiled_mlp_use_original_mlp: true`."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_liger_rms_norm_tensor_parallel(cls, data):
        """Reject ``liger_rms_norm`` when tensor parallelism is enabled."""
        # ``or 1`` also covers an explicit ``tensor_parallel_size: null`` in the
        # config, which ``data.get(key, 1)`` would return as None and then fail
        # the ``>`` comparison with a TypeError.
        tensor_parallel_size = data.get("tensor_parallel_size") or 1
        if data.get("liger_rms_norm") and tensor_parallel_size > 1:
            raise ValueError(
                "`liger_rms_norm` is incompatible with tensor parallelism, "
                "see https://github.com/linkedin/Liger-Kernel/issues/826"
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_liger_use_token_scaling_flce(cls, data):
        """Require fused linear cross entropy whenever token scaling is requested."""
        if data.get("liger_use_token_scaling") and not data.get(
            "liger_fused_linear_cross_entropy"
        ):
            raise ValueError(
                "`liger_use_token_scaling: true` requires `liger_fused_linear_cross_entropy` enabled."
            )
        return data

    @model_validator(mode="after")
    def check_tensor_parallel_size_liger_fused_linear_cross_entropy(self):
        """Reject fused linear cross entropy when tensor parallelism is enabled."""
        # TODO @SalmanMohammadi this is a larger fix - investigate
        # ``tensor_parallel_size`` is not a field of this model, so read it
        # defensively: a missing attribute or None is treated as "no tensor
        # parallelism" instead of raising AttributeError/TypeError.
        tensor_parallel_size = getattr(self, "tensor_parallel_size", None) or 1
        if tensor_parallel_size > 1 and self.liger_fused_linear_cross_entropy:
            raise ValueError("Tensor parallelism is not compatible with liger losses.")
        return self
def lce_forward(
    self,
    *args,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    labels: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    skip_logits: Optional[bool] = None,
    **kwargs,
) -> Union[Tuple, CausalLMOutputWithPast]:
    r"""
    Causal-LM forward that can skip logits materialization when computing the loss.

    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
            `input_ids` (special case). If a `torch.Tensor`, it is used directly to index the sequence
            dimension of the hidden states.

        skip_logits (`bool`, *optional*):
            When true, the loss is computed through `lce_maybe_trainable_lm_head` and `logits` stays `None`.
            When `None`, it defaults to "training mode and labels (or `shift_labels`) were supplied".

    Raises:
        ValueError: If `skip_logits` is true but neither `labels` nor `shift_labels` is given.
    """
    cfg = self.config
    if output_attentions is None:
        output_attentions = cfg.output_attentions
    if output_hidden_states is None:
        output_hidden_states = cfg.output_hidden_states
    if return_dict is None:
        return_dict = cfg.use_return_dict

    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        *args,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        **kwargs,
    )
    hidden_states = outputs[0]

    # Only the requested positions are fed to the head / loss.
    if isinstance(logits_to_keep, int):
        slice_indices = slice(-logits_to_keep, None)
    else:
        slice_indices = logits_to_keep
    kept_hidden_states = hidden_states[:, slice_indices, :]

    # shift_labels is removed from kwargs only after the backbone call above.
    shift_labels = kwargs.pop("shift_labels", None)
    has_targets = labels is not None or shift_labels is not None

    if skip_logits and not has_targets:
        raise ValueError("skip_logits is True, but labels and shift_labels are None")

    if skip_logits is None:
        # By default, if in training mode, don't materialize logits
        skip_logits = self.training and has_targets

    logits = None
    loss = None
    if skip_logits:
        loss = lce_maybe_trainable_lm_head(
            self,
            hidden_states=kept_hidden_states,
            hidden_size=self.config.hidden_size,
            labels=labels,
            shift_labels=shift_labels,
            **kwargs,
        )
    else:
        logits = self.lm_head(kept_hidden_states)
        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

    if return_dict:
        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    output = (logits,) + outputs[1:]
    if loss is None:
        return output
    return (loss,) + output
def lce_maybe_trainable_lm_head(
    self, hidden_states, hidden_size, labels, shift_labels, **loss_kwargs
):
    """Compute the Liger causal-LM loss, routing through FSDP when lm_head is wrapped.

    Returns whatever ``LigerForCausalLMLoss`` returns for the given hidden
    states and labels.
    """
    head = self.lm_head

    # Unwrap the module if lm_head has been added as trainable module in PEFT LoRA configuration,
    # i.e. listed in the modules_to_save field of LoraConfig, so the lm_head weights are read
    # from the unwrapped module.
    # See https://huggingface.co/docs/peft/package_reference/lora for reference.
    if PEFT_AVAILABLE and isinstance(head, ModulesToSaveWrapper):
        head = head.modules_to_save.default

    if not isinstance(head, FullyShardedDataParallel):
        # FSDP is not used so we can read the lm_head weights and call the kernel directly.
        # NOTE(review): this path passes ``self.lm_head`` (possibly still the PEFT
        # wrapper) rather than the unwrapped ``head`` - confirm this is intended.
        return _liger_for_causal_lm_loss(
            lm_head=self.lm_head,
            hidden_states=hidden_states,
            hidden_size=hidden_size,
            labels=labels,
            shift_labels=shift_labels,
            **loss_kwargs,
        )

    # If FSDP is used and lm_head is trainable, e.g., during full fine-tuning or with LoRA,
    # reading the lm_head module weights and calling the kernel must be done within FSDP forward pass
    # so the module entire parameters are summoned and kept in memory during the kernel execution.
    redirect = _FSDPForwardRedirection()
    return redirect(
        head,
        _liger_for_causal_lm_loss,
        head.module,
        hidden_states,
        hidden_size,
        labels,
        shift_labels,
        **loss_kwargs,
    )


def _liger_for_causal_lm_loss(
    lm_head, hidden_states, hidden_size, labels, shift_labels, **loss_kwargs
):
    """Call ``LigerForCausalLMLoss`` with ``lm_head.weight`` as the projection weight."""
    head_weight = lm_head.weight
    return LigerForCausalLMLoss(
        hidden_states=hidden_states,
        lm_head_weight=head_weight,
        labels=labels,
        hidden_size=hidden_size,
        shift_labels=shift_labels,
        **loss_kwargs,
    )


def patch_lce_forward(
    model_type,
):
    """Replace ``forward`` on transformers' ``<Prefix>ForCausalLM`` for ``model_type``.

    The class is looked up in ``transformers.models.<model_type>.modeling_<model_type>``
    and its ``forward`` is set to this module's ``lce_forward``.

    Raises:
        RuntimeError: If the module or the class cannot be resolved.
    """
    module_path = f"transformers.models.{model_type}.modeling_{model_type}"
    try:
        # Dynamically import the module and the ForCausalLM class
        cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type)
        cls_name = f"{cls_prefix}ForCausalLM"
        modeling_module = __import__(module_path, fromlist=[cls_name])
        causal_lm_cls = getattr(modeling_module, cls_name)
        causal_lm_cls.forward = lce_forward
    except (ImportError, AttributeError) as e:
        raise RuntimeError(
            f"Could not import ForCausalLM class for model_type: {model_type}. "
            f"Error: {str(e)}"
        ) from e
def lce_forward(
    self,
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
    r"""
    DeepseekV2 causal-LM forward using Liger's fused linear cross entropy in training.

    When the module is in training mode and `labels` are given, the loss is computed by
    `LigerFusedLinearCrossEntropyLoss` directly from the hidden states and `logits` stays
    `None`. Otherwise logits are materialized (as float) and, if `labels` are given, a
    standard shifted `CrossEntropyLoss` is computed.

    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

    Returns:
        A `CausalLMOutputWithPast` when `return_dict` resolves to true, otherwise a tuple
        `(loss?, logits, *model_outputs[1:])` where `loss` is present only if it was computed.

    Example:

    ```python
    >>> from transformers import AutoTokenizer, DeepseekV2ForCausalLM

    >>> model = DeepseekV2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
    >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

    >>> prompt = "Hey, are you conscious? Can you talk to me?"
    >>> inputs = tokenizer(prompt, return_tensors="pt")

    >>> # Generate
    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
    ```"""
    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )
    return_dict = (
        return_dict if return_dict is not None else self.config.use_return_dict
    )

    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    hidden_states = outputs[0]

    loss = None
    logits = None

    # The fused path needs labels; a training-mode call without labels (e.g. a
    # forward pass used only for its logits) must fall through to the regular
    # path instead of failing on ``labels[..., 1:]``.
    if self.training and labels is not None:
        shift_hidden_states = hidden_states[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        # flatten tokens
        shift_hidden_states = shift_hidden_states.view(-1, self.config.hidden_size)
        shift_labels = shift_labels.view(-1)

        lce = LigerFusedLinearCrossEntropyLoss()
        loss = lce(self.lm_head.weight, shift_hidden_states, shift_labels)
    else:
        logits = self.lm_head(hidden_states)
        logits = logits.float()

        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

    if not return_dict:
        output = (logits,) + outputs[1:]
        return (loss,) + output if loss is not None else output

    return CausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
def lce_forward(
    self,
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[HybridMambaAttentionDynamicCache] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    output_router_logits: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    num_logits_to_keep: Optional[Union[int, None]] = None,
) -> Union[Tuple, MoeCausalLMOutputWithPast]:
    r"""
    Jamba causal-LM forward using Liger's fused linear cross entropy in training.

    When the module is in training mode and `labels` are given, the loss is computed by
    `LigerFusedLinearCrossEntropyLoss` directly from the hidden states and `logits` stays
    `None`. Otherwise logits are materialized (as float) and, if `labels` are given, a
    standard shifted `CrossEntropyLoss` is computed. When router logits are requested the
    load-balancing auxiliary loss is computed and, if `labels` are given, added to the loss
    scaled by `self.router_aux_loss_coef`.

    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        num_logits_to_keep (`int` or `None`, *optional*):
            Calculate logits for the last `num_logits_to_keep` tokens. If `None`, calculate logits for all
            `input_ids`. Only last token logits are needed for generation, and calculating them only for that token
            can save memory, which becomes pretty significant for long sequences. Only used on the
            logits-materializing path.

    Returns:
        A `MoeCausalLMOutputWithPast` when `return_dict` resolves to true, otherwise a tuple
        `(loss?, aux_loss?, logits, *model_outputs[1:])`.

    Example:

    ```python
    >>> from transformers import AutoTokenizer, JambaForCausalLM

    >>> model = JambaForCausalLM.from_pretrained("ai21labs/Jamba-v0.1")
    >>> tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")

    >>> prompt = "Hey, are you conscious? Can you talk to me?"
    >>> inputs = tokenizer(prompt, return_tensors="pt")

    >>> # Generate
    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
    ```"""

    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_router_logits = (
        output_router_logits
        if output_router_logits is not None
        else self.config.output_router_logits
    )

    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )
    return_dict = (
        return_dict if return_dict is not None else self.config.use_return_dict
    )

    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        output_router_logits=output_router_logits,
        cache_position=cache_position,
        return_dict=return_dict,
    )

    hidden_states = outputs[0]

    loss = None
    logits = None

    # The fused path needs labels; a training-mode call without labels must
    # fall through to the regular path instead of failing on ``labels[..., 1:]``.
    if self.training and labels is not None:
        shift_hidden_states = hidden_states[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        # flatten tokens
        shift_hidden_states = shift_hidden_states.view(-1, self.config.hidden_size)
        shift_labels = shift_labels.view(-1)

        lce = LigerFusedLinearCrossEntropyLoss()
        loss = lce(self.lm_head.weight, shift_hidden_states, shift_labels)
    else:
        if num_logits_to_keep is None:
            logits = self.lm_head(hidden_states)
        else:
            logits = self.lm_head(hidden_states[..., -num_logits_to_keep:, :])
        logits = logits.float()

        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

    aux_loss = None
    if output_router_logits:
        aux_loss = load_balancing_loss_func(
            outputs.router_logits if return_dict else outputs[-1],
            self.num_experts,
            self.num_experts_per_tok,
            attention_mask,
        )
        if labels is not None:
            loss += self.router_aux_loss_coef * aux_loss.to(
                loss.device
            )  # make sure to reside in the same device

    if not return_dict:
        output = (logits,) + outputs[1:]
        if output_router_logits:
            output = (aux_loss,) + output
        return (loss,) + output if loss is not None else output

    return MoeCausalLMOutputWithPast(
        loss=loss,
        aux_loss=aux_loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
        router_logits=outputs.router_logits,
    )
Optional[torch.LongTensor] = None, past_key_values: Optional[ Union["Cache", List[torch.FloatTensor]] # noqa: F821 ] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, logits_to_keep: Union[int, torch.Tensor] = 0, **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. logits_to_keep (`int` or `torch.Tensor`, *optional*): If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that token can save memory, which becomes pretty significant for long sequences or large vocabulary size. If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. This is useful when using packed tensor format (single dimension for batch and sequence length). 
def apply_liger_kernel_to_llama4(
    cross_entropy: bool = False,
    fused_linear_cross_entropy: bool = False,
    rms_norm: bool = False,
    glu_activation: bool = False,
    layer_norm: bool = False,
    **kwargs,
) -> None:
    """
    Apply Liger kernels to replace original implementation in HuggingFace Llama4 models

    The patches are applied by rebinding attributes on the already-imported
    `transformers.models.llama4.modeling_llama4` module, so they affect models
    constructed after this call.

    Args:
        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
        fused_linear_cross_entropy (bool):
            Whether to apply Liger's fused linear cross entropy loss. Default is False.
            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is False.
        glu_activation (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
        **kwargs: Accepted for call-site compatibility; not used by this function.

    Raises:
        ValueError: If both `cross_entropy` and `fused_linear_cross_entropy` are True.
    """
    # Validate up front with a real exception: an `assert` is stripped under
    # `python -O`, and failing before the imports guarantees nothing is patched.
    if cross_entropy and fused_linear_cross_entropy:
        raise ValueError(
            "cross_entropy and fused_linear_cross_entropy cannot both be True."
        )

    import transformers.models.llama4.modeling_llama4  # noqa: F401
    from liger_kernel.transformers.functional import liger_cross_entropy
    from liger_kernel.transformers.layer_norm import LigerLayerNorm
    from liger_kernel.transformers.rms_norm import LigerRMSNorm
    from liger_kernel.transformers.swiglu import LigerSwiGLUMLP

    modeling_llama4 = sys.modules["transformers.models.llama4.modeling_llama4"]

    if rms_norm:
        modeling_llama4.Llama4TextRMSNorm = LigerRMSNorm
    if glu_activation:

        def _liger_swiglu_mlp_wrapper(config, intermediate_size=None, **kwargs):
            "Accepts intermediate_size to pass to LigerSwiGLUMLP"
            # clone config to avoid modifying the original
            config = deepcopy(config)
            if intermediate_size:
                config.intermediate_size = intermediate_size
            return LigerSwiGLUMLP(config, **kwargs)

        modeling_llama4.Llama4TextMLP = _liger_swiglu_mlp_wrapper
    if layer_norm:
        # NOTE(review): `modeling_llama4.nn` is presumably the shared `torch.nn`
        # module, in which case this rebinding is visible process-wide - confirm.
        modeling_llama4.nn.LayerNorm = LigerLayerNorm

    if cross_entropy:
        from transformers.loss.loss_utils import nn

        nn.functional.cross_entropy = liger_cross_entropy

    if fused_linear_cross_entropy:
        modeling_llama4.Llama4ForCausalLM.forward = lce_forward
Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. logits_to_keep (`int` or `torch.Tensor`, *optional*): If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that token can save memory, which becomes pretty significant for long sequences or large vocabulary size. If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. This is useful when using packed tensor format (single dimension for batch and sequence length). Returns: """ output_attentions = ( output_attentions if output_attentions is not None else self.config.output_attentions ) output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, cache_position=cache_position, **kwargs, ) hidden_states = outputs[0] logits = None loss = None # if in training mode, don't materialize logits if self.training and (labels is not None): loss = LigerForCausalLMLoss( hidden_states=hidden_states, lm_head_weight=self.lm_head.weight, labels=labels, hidden_size=self.config.hidden_size, **kwargs, ) else: # if in inference mode materialize logits slice_indices = ( slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep ) logits = self.lm_head(hidden_states[:, slice_indices, :]) if labels is not None: loss = self.loss_function( 
logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs, ) return CausalLMOutputWithPast( loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) def apply_liger_kernel_to_qwen3( cross_entropy: bool = False, fused_linear_cross_entropy: bool = False, rms_norm: bool = False, glu_activation: bool = False, layer_norm: bool = False, **kwargs, ) -> None: """ Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3) Args: cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False. fused_linear_cross_entropy (bool): Whether to apply Liger's fused linear cross entropy loss. Default is False. `cross_entropy` and `fused_linear_cross_entropy` cannot both be False. If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient. rms_norm (bool): Whether to apply Liger's RMSNorm. Default is False. glu_activation (bool): Whether to apply Liger's SwiGLU MLP. Default is False. layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False. """ import transformers.models.qwen3.modeling_qwen3 # noqa: F401 from liger_kernel.transformers.functional import liger_cross_entropy from liger_kernel.transformers.layer_norm import LigerLayerNorm from liger_kernel.transformers.rms_norm import LigerRMSNorm from liger_kernel.transformers.swiglu import LigerSwiGLUMLP assert not (cross_entropy and fused_linear_cross_entropy), ( "cross_entropy and fused_linear_cross_entropy cannot both be True." 
def lce_forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    output_router_logits: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    **kwargs,
) -> MoeCausalLMOutputWithPast:
    r"""
    Forward pass for `Qwen3MoeForCausalLM` using Liger's fused linear cross entropy.

    In training mode with `labels`, the loss is computed by `LigerForCausalLMLoss`
    from the hidden states and the `lm_head` weight, and `logits` stays `None`.
    Otherwise logits are materialized through `lm_head` and, if `labels` are given,
    the loss comes from `self.loss_function`.

    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
            This is useful when using packed tensor format (single dimension for batch and sequence length).

        **kwargs:
            Forwarded unchanged to `self.model(...)` and to the loss function.

    Returns:
        `MoeCausalLMOutputWithPast` with `loss`, `aux_loss`, `logits`, the cache, hidden states,
        attentions and the router logits produced by the decoder.
    """

    # Explicit arguments win; otherwise fall back to the model config.
    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_router_logits = (
        output_router_logits
        if output_router_logits is not None
        else self.config.output_router_logits
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )

    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        output_router_logits=output_router_logits,
        cache_position=cache_position,
        **kwargs,
    )

    hidden_states = outputs[0]

    logits = None
    loss = None
    # if in training mode, don't materialize logits
    if self.training and (labels is not None):
        loss = LigerForCausalLMLoss(
            hidden_states=hidden_states,
            lm_head_weight=self.lm_head.weight,
            labels=labels,
            hidden_size=self.config.hidden_size,
            **kwargs,
        )

    else:  # if in inference mode materialize logits
        slice_indices = (
            slice(-logits_to_keep, None)
            if isinstance(logits_to_keep, int)
            else logits_to_keep
        )
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

    aux_loss = None
    if output_router_logits:
        aux_loss = load_balancing_loss_func(
            outputs.router_logits,
            self.num_experts,
            self.num_experts_per_tok,
            attention_mask,
        )
        if labels is not None:
            loss += self.router_aux_loss_coef * aux_loss.to(
                loss.device
            )  # make sure to reside in the same device

    return MoeCausalLMOutputWithPast(
        loss=loss,
        aux_loss=aux_loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
        # Pass the router logits through so callers that requested
        # `output_router_logits=True` actually receive them.
        router_logits=outputs.router_logits,
    )
class LigerPlugin(BasePlugin):
    """
    Plugin for LIGER integration with Axolotl.

    Before the model is loaded, rebinds classes and functions in the relevant
    `transformers` (or remote-code) modeling module to their Liger-Kernel
    counterparts, according to the `liger_*` options on the config.
    """

    def get_input_args(self):
        """Return the dotted path of the args model that declares this plugin's config options."""
        return "axolotl.integrations.liger.LigerArgs"

    def pre_model_load(self, cfg):
        """
        Apply the Liger patches selected in `cfg` for `cfg.model_config_type`.

        Dispatch order: model types registered in Liger's own
        `MODEL_TYPE_TO_APPLY_LIGER_FN`, then the model types handled explicitly
        below, then a generic fused-linear-cross-entropy patch, and finally a
        warning when nothing could be applied.

        Args:
            cfg: Axolotl config; the `liger_*`, `torch_compile`, `model_config_type`,
                `base_model` and `trust_remote_code` attributes are read here.

        Raises:
            ValueError: If both `liger_cross_entropy` and
                `liger_fused_linear_cross_entropy` are set.
        """
        # shim: liger-kernel 0.7.0 imports ORPOTrainer from old trl path
        import trl.trainer
        from trl.experimental.orpo import ORPOTrainer

        trl.trainer.ORPOTrainer = ORPOTrainer

        if cfg.torch_compile:
            # torch compile will unnecessarily attempt to optimize the triton kernel unless explicitly disabled
            import liger_kernel.ops.fused_linear_cross_entropy

            from .utils import patch_with_compile_disable

            patch_with_compile_disable(
                liger_kernel.ops.fused_linear_cross_entropy,
                "fused_linear_cross_entropy_forward",
            )
            patch_with_compile_disable(
                liger_kernel.ops.fused_linear_cross_entropy,
                "fused_linear_cross_entropy_backward",
            )
        # These imports come after the shim and compile patches above.
        from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
        from liger_kernel.transformers.functional import liger_cross_entropy
        from liger_kernel.transformers.layer_norm import LigerLayerNorm
        from liger_kernel.transformers.monkey_patch import MODEL_TYPE_TO_APPLY_LIGER_FN
        from liger_kernel.transformers.rms_norm import LigerRMSNorm
        from liger_kernel.transformers.rope import liger_rotary_pos_emb
        from liger_kernel.transformers.swiglu import LigerSwiGLUMLP

        if cfg.liger_cross_entropy and cfg.liger_fused_linear_cross_entropy:
            raise ValueError(
                "Cannot have both `liger_cross_entropy` and `liger_fused_linear_cross_entropy` set."
            )

        if cfg.liger_use_token_scaling:
            # Patch FLCE to set token_scaling=True for function and class API
            from liger_kernel.transformers import functional
            from liger_kernel.transformers.fused_linear_cross_entropy import (
                LigerFusedLinearCrossEntropyLoss,
            )

            old_liger_fused_linear_cross_entropy = (
                functional.liger_fused_linear_cross_entropy
            )

            def patched_liger_fused_linear_cross_entropy(*args, **kwargs):
                # Force the flag regardless of what the caller passed.
                kwargs["use_token_scaling"] = True
                return old_liger_fused_linear_cross_entropy(*args, **kwargs)

            functional.liger_fused_linear_cross_entropy = (
                patched_liger_fused_linear_cross_entropy
            )

            old_init = LigerFusedLinearCrossEntropyLoss.__init__

            def patched_init(self, *args, **kwargs):
                # Same override for the class-based API.
                kwargs["use_token_scaling"] = True
                return old_init(self, *args, **kwargs)

            LigerFusedLinearCrossEntropyLoss.__init__ = patched_init

        if cfg.model_config_type in MODEL_TYPE_TO_APPLY_LIGER_FN:
            apply_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN[cfg.model_config_type]
            # Only pass the options that this particular apply function declares.
            liger_fn_sig = inspect.signature(apply_liger_fn)
            kwargs = {}
            if "rope" in liger_fn_sig.parameters:
                kwargs["rope"] = cfg.liger_rope
            if "cross_entropy" in liger_fn_sig.parameters:
                kwargs["cross_entropy"] = cfg.liger_cross_entropy
            if "fused_linear_cross_entropy" in liger_fn_sig.parameters:
                kwargs["fused_linear_cross_entropy"] = (
                    cfg.liger_fused_linear_cross_entropy
                )
            if "rms_norm" in liger_fn_sig.parameters:
                kwargs["rms_norm"] = cfg.liger_rms_norm
            if "layer_norm" in liger_fn_sig.parameters:
                kwargs["layer_norm"] = cfg.liger_layer_norm
            # The single `liger_glu_activation` option maps to whichever of
            # `geglu` / `swiglu` the function exposes (geglu takes precedence).
            if "geglu" in liger_fn_sig.parameters:
                kwargs["geglu"] = cfg.liger_glu_activation
            elif "swiglu" in liger_fn_sig.parameters:
                kwargs["swiglu"] = cfg.liger_glu_activation
            LOG.info(f"Applying LIGER to {cfg.model_config_type} with kwargs: {kwargs}")
            apply_liger_fn(**kwargs)
        elif cfg.model_config_type == "jamba":
            from transformers.models.jamba import modeling_jamba

            from .models.jamba import lce_forward as jamba_lce_forward

            if cfg.liger_rope:
                modeling_jamba.apply_rotary_pos_emb = liger_rotary_pos_emb
            if cfg.liger_rms_norm:
                modeling_jamba.JambaRMSNorm = LigerRMSNorm
            if cfg.liger_glu_activation:
                modeling_jamba.JambaMLP = LigerSwiGLUMLP
            if cfg.liger_layer_norm:
                modeling_jamba.nn.LayerNorm = LigerLayerNorm
            if cfg.liger_cross_entropy:
                from transformers.loss.loss_utils import nn

                nn.functional.cross_entropy = liger_cross_entropy
            if cfg.liger_fused_linear_cross_entropy:
                modeling_jamba.JambaForCausalLM.forward = jamba_lce_forward
        elif cfg.model_config_type == "deepseek_v2":
            from accelerate import init_empty_weights
            from transformers import AutoModelForCausalLM

            # Instantiate the model under `init_empty_weights` only to find out
            # which module its class lives in; that module is what gets patched.
            with init_empty_weights():
                model = AutoModelForCausalLM.from_pretrained(
                    cfg.base_model, trust_remote_code=cfg.trust_remote_code or False
                )
            modeling_mod = sys.modules[model.__class__.__module__]

            from .models.deepseekv2 import lce_forward as deepseekv2_lce_forward

            if cfg.liger_rope:
                # The DeepseekV2 version of RoPE is different than upstream LLaMA.
                # See https://github.com/linkedin/Liger-Kernel/issues/129#issuecomment-2313763528
                LOG.warning("Fused liger_rope is not supported for DeepseekV2.")
            if cfg.liger_rms_norm:
                modeling_mod.DeepseekV2RMSNorm = LigerRMSNorm
            if cfg.liger_glu_activation:
                # Only the forward method is swapped; the MLP class itself is kept.
                modeling_mod.DeepseekV2MLP.forward = LigerSwiGLUMLP.forward
            if cfg.liger_layer_norm:
                LOG.warning("liger_layer_norm is not supported for DeepseekV2.")
            if cfg.liger_cross_entropy:
                # We do not patch `nn.functional.cross_entropy` for DeepseekV2 as it still uses
                # nn.CrossEntropyLoss in the forward method.
                modeling_mod.CrossEntropyLoss = LigerCrossEntropyLoss
            if cfg.liger_fused_linear_cross_entropy:
                modeling_mod.DeepseekV2ForCausalLM.forward = deepseekv2_lce_forward
        elif cfg.model_config_type == "llama4":
            from axolotl.integrations.liger.models.llama4 import (
                apply_liger_kernel_to_llama4,
            )

            apply_liger_kernel_to_llama4(
                cross_entropy=cfg.liger_cross_entropy,
                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
                glu_activation=cfg.liger_glu_activation,
                rms_norm=cfg.liger_rms_norm,
                layer_norm=cfg.liger_layer_norm,
            )
        elif cfg.model_config_type == "qwen3":
            from axolotl.integrations.liger.models.qwen3 import (
                apply_liger_kernel_to_qwen3,
            )

            apply_liger_kernel_to_qwen3(
                cross_entropy=cfg.liger_cross_entropy,
                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
                glu_activation=cfg.liger_glu_activation,
                rms_norm=cfg.liger_rms_norm,
                layer_norm=cfg.liger_layer_norm,
            )
        elif cfg.model_config_type == "qwen3_moe":
            from axolotl.integrations.liger.models.qwen3_moe import (
                apply_liger_kernel_to_qwen3_moe,
            )

            apply_liger_kernel_to_qwen3_moe(
                cross_entropy=cfg.liger_cross_entropy,
                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
                glu_activation=cfg.liger_glu_activation,
                rms_norm=cfg.liger_rms_norm,
                layer_norm=cfg.liger_layer_norm,
            )
        elif cfg.model_config_type == "granitemoe":
            # granitemoe reuses Liger's granite apply function; `liger_layer_norm`
            # is not forwarded on this path.
            from liger_kernel.transformers import apply_liger_kernel_to_granite

            apply_liger_kernel_to_granite(
                rope=cfg.liger_rope,
                cross_entropy=cfg.liger_cross_entropy,
                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
                rms_norm=cfg.liger_rms_norm,
                swiglu=cfg.liger_glu_activation,
            )
        elif cfg.liger_fused_linear_cross_entropy:
            # No dedicated support: try the generic FLCE forward patch only.
            try:
                from .models.base import patch_lce_forward

                patch_lce_forward(cfg.model_config_type)
                LOG.warning_once(
                    f"Applied ONLY liger_fused_linear_cross_entropy genericpatches for model type: {cfg.model_config_type}"
                )
                LOG.warning_once(
                    f"Liger + {cfg.model_config_type} generic FLCE support is experimental and may not work as expected."
                )
            except RuntimeError:
                # NOTE(review): presumably `patch_lce_forward` raises RuntimeError
                # for model types it cannot patch - confirm against models/base.py.
                LOG.warning(
                    f"Unsupported model config type: {cfg.model_config_type}. Liger not applied."
                )
        else:
            LOG.warning(
                f"Unsupported model config type: {cfg.model_config_type}. Liger not applied."
            )
--- ## Requirements - Axolotl with `llmcompressor` extras: ```bash pip install "axolotl[llmcompressor]" ``` - Requires `llmcompressor >= 0.5.1` This will install all necessary dependencies to fine-tune sparsified models using the integration. --- ## Usage To enable sparse fine-tuning with this integration, include the plugin in your Axolotl config: ```yaml plugins: - axolotl.integrations.llm_compressor.LLMCompressorPlugin llmcompressor: recipe: finetuning_stage: finetuning_modifiers: ConstantPruningModifier: targets: [ 're:.*q_proj.weight', 're:.*k_proj.weight', 're:.*v_proj.weight', 're:.*o_proj.weight', 're:.*gate_proj.weight', 're:.*up_proj.weight', 're:.*down_proj.weight', ] start: 0 save_compressed: true # ... (other training arguments) ``` This plugin **does not apply pruning or sparsification itself** — it is intended for **fine-tuning models that have already been sparsified**. Pre-sparsified checkpoints can be: - Generated using [LLMCompressor](https://github.com/vllm-project/llm-compressor) - Downloaded from [Neural Magic's Hugging Face page](https://huggingface.co/neuralmagic) - Any custom LLM with compatible sparsity patterns that you've created yourself To learn more about writing and customizing LLMCompressor recipes, refer to the official documentation: [https://github.com/vllm-project/llm-compressor/blob/main/README.md](https://github.com/vllm-project/llm-compressor/blob/main/README.md) ### Storage Optimization with save_compressed Setting `save_compressed: true` in your configuration enables saving models in a compressed format, which: - Reduces disk space usage by approximately 40% - Maintains compatibility with vLLM for accelerated inference - Maintains compatibility with llmcompressor for further optimization (example: quantization) This option is highly recommended when working with sparse models to maximize the benefits of model compression. 
### Example Config See [`examples/llama-3/sparse-finetuning.yaml`](examples/llama-3/sparse-finetuning.yaml) for a complete example. --- ## Inference with vLLM After fine-tuning your sparse model, you can leverage vLLM for efficient inference. You can also use LLMCompressor to apply additional quantization to your fine-tuned sparse model before inference for even greater performance benefits. The following example runs inference with vLLM: ```python from vllm import LLM, SamplingParams prompts = [ "Hello, my name is", "The president of the United States is", "The capital of France is", "The future of AI is", ] sampling_params = SamplingParams(temperature=0.8, top_p=0.95) llm = LLM("path/to/your/sparse/model") outputs = llm.generate(prompts, sampling_params) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` For more details on vLLM's capabilities and advanced configuration options, see the [official vLLM documentation](https://docs.vllm.ai/). ## Learn More For details on available sparsity and quantization schemes, fine-tuning recipes, and usage examples, visit the official LLMCompressor repository: [https://github.com/vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor) ================================================ FILE: src/axolotl/integrations/llm_compressor/__init__.py ================================================ """Integration entry point for the LLMCompressor plugin.""" from .plugin import LLMCompressorPlugin __all__ = ["LLMCompressorPlugin"] ================================================ FILE: src/axolotl/integrations/llm_compressor/args.py ================================================ """ LLMCompressor and Sparse Finetuning config models.
class LLMCompressorCallbackHandler(TrainerCallback):
    """
    Trainer callback for Sparse Finetuning.

    Maintains sparsity patterns during training by applying masks after optimization steps,
    ensuring zero-weight updates are canceled out.

    The handler forwards Trainer lifecycle events (train begin/end, step
    begin/end, loss computed) to the active LLMCompressor session.
    """

    def __init__(self, trainer: Trainer, recipe: Any):
        """
        Initialize the Sparse Finetuning callback handler.

        Replaces ``trainer.compute_loss`` with a wrapped version (see
        ``compute_loss_wrapper``) and calls ``create_session()``.

        Args:
            trainer (Trainer): Huggingface Trainer instance.
            recipe (Recipe | dict): Sparse finetuning recipe to apply.
        """
        super().__init__()
        self.trainer = trainer
        # Accept either a ready Recipe or raw data that Recipe can validate.
        self.recipe = (
            Recipe.model_validate(recipe) if not isinstance(recipe, Recipe) else recipe
        )
        # Keep the unwrapped method so ``on_train_end`` can put it back.
        self.original_compute_loss = trainer.compute_loss
        self.trainer.compute_loss = compute_loss_wrapper(self.trainer.compute_loss)
        create_session()

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Called at the beginning of training. Initializes the compression session.

        Args:
            args (TrainingArguments): Training arguments.
            state (TrainerState): Trainer state.
            control (TrainerControl): Trainer control.
        """
        super().on_train_begin(args, state, control, **kwargs)
        # Session initialization is bracketed by two wait_for_everyone() calls
        # on the trainer's accelerator.
        self.trainer.accelerator.wait_for_everyone()
        active_session().initialize(
            model=self.trainer.model,
            optimizer=self.trainer.optimizer,
            start=state.epoch,
            recipe=self.recipe,
        )
        self.trainer.accelerator.wait_for_everyone()

    def on_step_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Called at the beginning of a training step. Triggers batch_start callback.
        """
        super().on_step_begin(args, state, control, **kwargs)
        session_callbacks.batch_start()

    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Called at the end of a training step. Triggers optimizer and batch_end callbacks.
        """
        super().on_step_end(args, state, control, **kwargs)
        # Both optimizer hooks are fired here, back to back, followed by
        # batch_end.
        session_callbacks.optim_pre_step()
        session_callbacks.optim_post_step()
        session_callbacks.batch_end()

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Called at the end of training. Finalizes the compression session and
        restores the trainer's original ``compute_loss``.
        """
        super().on_train_end(args, state, control, **kwargs)
        active_session().finalize()
        # Undo the wrapping done in ``__init__``. The wrapper was installed on
        # ``trainer.compute_loss``, so that is the attribute to restore;
        # assigning to ``compute_loss_func`` would leave the wrapper in place
        # and overwrite an unrelated trainer attribute.
        self.trainer.compute_loss = self.original_compute_loss
def save_compressed_model(
    model: PreTrainedModel,
    output_dir: Union[str, bytes],
    trainer: Trainer,
    save_compressed: bool = False,
) -> None:
    """
    Wait for all processes, then save the model from the main process only.

    Args:
        model (PreTrainedModel): The model to be saved.
        output_dir (str or bytes): Path where the model files will be written.
        trainer (Trainer): Hugging Face Trainer whose accelerator is used for
            process synchronization and main-process detection.
        save_compressed (bool): Forwarded to ``save_pretrained``; when False,
            sparsity compression stats are skipped as well.
    """
    accelerator = trainer.accelerator

    # Every process reaches this point before any file is written.
    accelerator.wait_for_everyone()

    if accelerator.is_main_process:
        # Install the llmcompressor save hook on this model, then save through it.
        modify_save_pretrained(model)
        model.save_pretrained(
            output_dir,
            save_compressed=save_compressed,
            skip_sparsity_compression_stats=not save_compressed,
        )
Post-Training Evaluation When training with the plugin enabled, evaluation runs automatically after training completes: ```yaml plugins: - axolotl.integrations.lm_eval.LMEvalPlugin lm_eval_tasks: - gsm8k - hellaswag - arc_easy lm_eval_batch_size: # Batch size for evaluation # Directory to save evaluation results. # The final model is loaded from this directory # unless specified otherwise (see below) output_dir: ``` Run training as usual: ```bash axolotl train config.yml ``` ### 2. Standalone CLI Evaluation Evaluate any model directly without training: ```yaml lm_eval_model: meta-llama/Llama-2-7b-hf plugins: - axolotl.integrations.lm_eval.LMEvalPlugin lm_eval_tasks: - gsm8k - hellaswag - arc_easy lm_eval_batch_size: 8 output_dir: ./outputs ``` Run evaluation: ```bash axolotl lm-eval config.yml ``` ## Model Selection Priority The model to evaluate is selected in the following priority order: 1. **`lm_eval_model`** - Explicit model path or HuggingFace repo (highest priority) 2. **`hub_model_id`** - Trained model pushed to HuggingFace Hub 3. 
class LMEvalPlugin(BasePlugin):
    """
    Plugin for LM Evaluation Harness integration with Axolotl.
    """

    def get_input_args(self):
        """Return the dotted path of the args class that declares this plugin's options."""
        return "axolotl.integrations.lm_eval.LMEvalArgs"

    def post_train_unload(self, cfg):
        """
        Run the configured lm_eval tasks when ``cfg.lm_eval_post_train`` is set.

        One ``lm_eval`` subprocess is launched per command produced by
        ``build_lm_eval_command``; a non-zero exit status raises
        ``subprocess.CalledProcessError`` because ``check=True`` is used.
        """
        if not cfg.lm_eval_post_train:
            return

        commands = build_lm_eval_command(
            cfg.lm_eval_tasks,
            bfloat16=cfg.bfloat16 or cfg.bf16,
            flash_attention=cfg.flash_attention,
            output_dir=cfg.output_dir,
            batch_size=cfg.lm_eval_batch_size,
            wandb_project=cfg.wandb_project,
            wandb_entity=cfg.wandb_entity,
            wandb_name=cfg.wandb_name,
            model=get_model_path(cfg),
        )
        for command in commands:
            subprocess.run(command, check=True)  # nosec
def build_lm_eval_command(
    tasks: list[str],
    bfloat16=True,
    flash_attention=False,
    output_dir="./",
    batch_size=8,
    wandb_project=None,
    wandb_entity=None,
    wandb_name=None,
    model=None,
    revision=None,
    apply_chat_template=None,
    fewshot_as_multiturn=None,
):
    """
    Yield one ``lm_eval`` argument list per distinct few-shot setting.

    Each entry of ``tasks`` is either ``"name"`` or ``"name:num_fewshot"``.
    Tasks that share a few-shot count are joined into a single command; tasks
    without an explicit count are grouped together and get no
    ``--num_fewshot`` flag.

    Args:
        tasks: Task specs; a single string is treated as a one-element list.
        bfloat16: Use ``dtype=bfloat16`` when truthy, otherwise ``dtype=float16``.
        flash_attention: Add ``attn_implementation=flash_attention_2`` when truthy.
        output_dir: Directory under which ``lm_eval_results/<timestamp>`` is
            written; also used as the model path when ``model`` is not given.
            ``None`` falls back to ``"./"``.
        batch_size: Value passed to ``--batch_size``.
        wandb_project: Optional ``project=`` entry for ``--wandb_args``.
        wandb_entity: Optional ``entity=`` entry for ``--wandb_args``.
        wandb_name: Optional ``name=`` entry for ``--wandb_args``.
        model: Model path or repo id; overrides ``output_dir`` as the
            ``pretrained=`` value.
        revision: Optional model revision appended to ``--model_args``.
        apply_chat_template: Add ``--apply_chat_template`` when truthy.
        fewshot_as_multiturn: Add ``--fewshot_as_multiturn`` when truthy and
            ``apply_chat_template`` is also truthy.

    Yields:
        list[str]: Arguments suitable for ``subprocess.run``.
    """
    if isinstance(tasks, str):
        tasks = [tasks]
    # Callers pass ``cfg.output_dir`` straight through, which is ``None`` when
    # the key is absent; fall back to the signature default instead of
    # failing on ``None.endswith``.
    if output_dir is None:
        output_dir = "./"

    # Group task names by few-shot count; "-1" marks "no count given".
    tasks_by_num_fewshot: dict[str, list] = defaultdict(list)
    for task in tasks:
        task_parts = task.split(":")
        if len(task_parts) == 2:
            task_name, num_fewshot = task_parts
        else:
            task_name, num_fewshot = task_parts[0], "-1"
        tasks_by_num_fewshot[str(num_fewshot)].append(task_name)

    # Everything below is identical for every group, so build it once. In
    # particular the revision suffix must not be rebuilt from its own previous
    # value on each iteration, which would produce ",revision=,revision=<rev>"
    # for every command after the first.
    pretrained = "pretrained=" + (model if model else output_dir)
    fa2 = ",attn_implementation=flash_attention_2" if flash_attention else ""
    dtype = ",dtype=bfloat16" if bfloat16 else ",dtype=float16"
    revision_arg = f",revision={revision}" if revision else ""
    model_args = f"{pretrained}{fa2}{dtype}{revision_arg}"

    results_root = output_dir if output_dir.endswith("/") else output_dir + "/"
    results_root += "lm_eval_results/"

    wandb_args = []
    if wandb_project:
        wandb_args.append(f"project={wandb_project}")
    if wandb_entity:
        wandb_args.append(f"entity={wandb_entity}")
    if wandb_name:
        wandb_args.append(f"name={wandb_name}")

    for num_fewshot, tasks_list in tasks_by_num_fewshot.items():
        num_fewshot_val = num_fewshot if num_fewshot != "-1" else None
        # Timestamp is taken per command, as each run gets its own directory.
        output_path = results_root + datetime.now().strftime("%Y%m%d_%H%M%S")

        lm_eval_args = [
            "lm_eval",
            "--model",
            "hf",
            "--model_args",
            model_args,
            "--tasks",
            ",".join(tasks_list),
            "--batch_size",
            str(batch_size),
            "--output_path",
            output_path,
        ]
        if wandb_args:
            lm_eval_args.append("--wandb_args")
            lm_eval_args.append(",".join(wandb_args))
        if apply_chat_template:
            lm_eval_args.append("--apply_chat_template")
        # Truthiness (not ``is not None``) so an empty count such as "task:"
        # still means "no --num_fewshot flag".
        if num_fewshot_val:
            lm_eval_args.append("--num_fewshot")
            lm_eval_args.append(str(num_fewshot_val))
        if apply_chat_template and fewshot_as_multiturn:
            lm_eval_args.append("--fewshot_as_multiturn")

        yield lm_eval_args
================================================ FILE: src/axolotl/integrations/spectrum/LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: src/axolotl/integrations/spectrum/README.md ================================================ # Spectrum: Targeted Training on Signal to Noise Ratio by Eric Hartford, Lucas Atkins, Fernando Fernandes, David Golchinfar This plugin contains code to freeze the bottom fraction of modules in a model, based on the Signal-to-Noise Ratio (SNR). See https://github.com/cognitivecomputations/spectrum ## Overview Spectrum is a tool for scanning and evaluating the Signal-to-Noise Ratio (SNR) of layers in large language models. By identifying the top n% of layers with the highest SNR, you can optimize training efficiency. ## Usage ```yaml plugins: - axolotl.integrations.spectrum.SpectrumPlugin spectrum_top_fraction: 0.5 # Optional if using a pre-scanned model as your base_model. 
Useful if using a model mirror spectrum_model_name: meta-llama/Meta-Llama-3.1-8B ``` ## Citation ```bib @misc{hartford2024spectrumtargetedtrainingsignal, title={Spectrum: Targeted Training on Signal to Noise Ratio}, author={Eric Hartford and Lucas Atkins and Fernando Fernandes Neto and David Golchinfar}, year={2024}, eprint={2406.06623}, archivePrefix={arXiv}, primaryClass={cs.LG}, url={https://arxiv.org/abs/2406.06623}, } ``` ================================================ FILE: src/axolotl/integrations/spectrum/__init__.py ================================================ # Copyright 2024 Axolotl AI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Spectrum Plugin to automatically generate unfrozen parameters based on SNR data. 
def _generate_unfrozen_params_yaml(snr_data, top_fraction=0.5):
    """
    Pick the highest-SNR layers of each layer type to leave unfrozen.

    Args:
        snr_data: Mapping of layer name to a dict with ``"type"`` and ``"snr"``
            entries.
        top_fraction: Share of each type's layers to keep; the count is
            ``int(len(layers) * top_fraction)``, i.e. rounded toward zero.

    Returns:
        A list that starts with the ``lm_head`` and ``embed_tokens`` weight
        patterns, followed by the selected layer names grouped by type in
        first-seen order, each group sorted by descending SNR.
    """
    # Bucket (name, snr) pairs by layer type, keeping first-seen type order.
    grouped = {}
    for name, entry in snr_data.items():
        grouped.setdefault(entry["type"], []).append((name, entry["snr"]))

    selected = [
        "^lm_head.weight$",
        "^model.embed_tokens.weight$",
    ]
    for candidates in grouped.values():
        keep = int(len(candidates) * top_fraction)
        # sorted() is stable, so layers with equal SNR keep their input order.
        ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
        selected.extend(name for name, _ in ranked[:keep])
    return selected
def pre_model_load(self, cfg):
    """
    Populate ``cfg["unfrozen_parameters"]`` from Spectrum SNR results.

    The SNR file for ``cfg["spectrum_model_name"]`` (or ``cfg["base_model"]``
    when that is unset) is read from ``self.base_path`` if it exists there,
    otherwise downloaded from ``self.base_url``. If the download or its JSON
    parsing fails, a warning is logged and ``cfg`` is left untouched.

    Args:
        cfg: Config mapping; read for ``spectrum_model_name``, ``base_model``
            and ``spectrum_top_fraction``, and updated in place.
    """
    model_name = cfg.get("spectrum_model_name") or cfg["base_model"]

    # ``top_fraction`` is a fraction, not a percentage: it is multiplied by
    # the layer count, so a fallback such as 50 would select every layer.
    # Use 0.5, the same default as SpectrumArgs and the helper below.
    top_fraction = cfg.get("spectrum_top_fraction")
    if top_fraction is None:
        top_fraction = 0.5

    model_slug = model_name.replace("/", "-").replace("_", "-")
    snr_file = self.snr_file_template.format(model_name_slug=model_slug)
    snr_url = self.base_url + snr_file
    snr_path = self.base_path + snr_file

    # first check if the files exist locally and read the json
    snr_data = None
    try:
        with open(snr_path, "r", encoding="utf-8") as fin:
            snr_data = json.load(fin)
    except FileNotFoundError:
        pass
    except Exception as exc:
        # Deliberately best-effort: any unreadable local file falls through
        # to the download below.
        LOG.warning(f"Failed to read SNR data from {snr_path}: {exc}")

    if not snr_data:
        try:
            response = requests.get(snr_url, timeout=60)
            # Report a missing file (HTTP 404 etc.) as a fetch failure rather
            # than as a JSON parse failure of the error page.
            response.raise_for_status()
            snr_data = response.json()
        except requests.exceptions.RequestException as exc:
            LOG.warning(f"Failed to fetch SNR data from {snr_url}: {exc}")
            return
        # also catch json parsing errors
        except json.JSONDecodeError as exc:
            LOG.warning(f"Failed to parse SNR data from {snr_url}: {exc}")
            return

    unfrozen_parameters = _generate_unfrozen_params_yaml(
        snr_data, top_fraction=top_fraction
    )
    cfg["unfrozen_parameters"] = unfrozen_parameters
class SpectrumArgs(BaseModel):
    """
    Input args for Spectrum.
    """

    # Share of layers per type to leave unfrozen.
    spectrum_top_fraction: Optional[float] = 0.5
    # Model name used to look up SNR results instead of base_model.
    spectrum_model_name: Optional[str] = None

    @model_validator(mode="before")
    @classmethod
    def check_fsdp_use_orig_params(cls, data):
        """
        Reject FSDP without ``use_orig_params`` when SpectrumPlugin is enabled.

        Raises:
            ValueError: If ``fsdp`` and ``fsdp_config`` are set,
                ``fsdp_config.use_orig_params`` is falsy, and any entry of
                ``plugins`` contains ``"SpectrumPlugin"``.
        """
        fsdp_without_orig_params = (
            data.get("fsdp")
            and data.get("fsdp_config")
            and not data["fsdp_config"].get("use_orig_params")
        )
        if not fsdp_without_orig_params:
            return data

        plugins = data.get("plugins")
        if plugins and any("SpectrumPlugin" in plugin for plugin in plugins):
            # would otherwise raise
            # ValueError: Must flatten tensors with uniform `requires_grad` when `use_orig_params=False`
            raise ValueError(
                "FSDP + SpectrumPlugin cannot be used together when `use_orig_params=False` is set"
            )
        return data
"model.layers.7.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.8.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.9.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.10.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.11.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.12.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.13.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.14.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.15.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.16.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.17.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.18.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.19.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.20.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.21.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.22.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.23.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.24.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.25.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.26.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.27.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "lm_head": { "snr": Infinity, "type": "lm_head" }, "model.layers.0.mlp.down_proj": { "snr": 70.50235748291016, "type": "mlp.down_proj" }, "model.layers.1.mlp.down_proj": { "snr": 134.4214630126953, "type": "mlp.down_proj" }, "model.layers.2.mlp.down_proj": { 
"snr": 235.74794006347656, "type": "mlp.down_proj" }, "model.layers.3.mlp.down_proj": { "snr": 73.25755310058594, "type": "mlp.down_proj" }, "model.layers.4.mlp.down_proj": { "snr": 27.22879981994629, "type": "mlp.down_proj" }, "model.layers.5.mlp.down_proj": { "snr": 17.5551815032959, "type": "mlp.down_proj" }, "model.layers.6.mlp.down_proj": { "snr": 54.210426330566406, "type": "mlp.down_proj" }, "model.layers.7.mlp.down_proj": { "snr": 38.808937072753906, "type": "mlp.down_proj" }, "model.layers.8.mlp.down_proj": { "snr": 29.799747467041016, "type": "mlp.down_proj" }, "model.layers.9.mlp.down_proj": { "snr": 10.296355247497559, "type": "mlp.down_proj" }, "model.layers.10.mlp.down_proj": { "snr": 8.86428165435791, "type": "mlp.down_proj" }, "model.layers.11.mlp.down_proj": { "snr": 6.43813943862915, "type": "mlp.down_proj" }, "model.layers.12.mlp.down_proj": { "snr": 7.0912184715271, "type": "mlp.down_proj" }, "model.layers.13.mlp.down_proj": { "snr": 3.285884141921997, "type": "mlp.down_proj" }, "model.layers.14.mlp.down_proj": { "snr": 6.073758125305176, "type": "mlp.down_proj" }, "model.layers.15.mlp.down_proj": { "snr": 5.325990676879883, "type": "mlp.down_proj" }, "model.layers.16.mlp.down_proj": { "snr": 4.591946601867676, "type": "mlp.down_proj" }, "model.layers.17.mlp.down_proj": { "snr": 7.021907329559326, "type": "mlp.down_proj" }, "model.layers.18.mlp.down_proj": { "snr": 6.392782211303711, "type": "mlp.down_proj" }, "model.layers.19.mlp.down_proj": { "snr": 210.51983642578125, "type": "mlp.down_proj" }, "model.layers.20.mlp.down_proj": { "snr": 7.1035943031311035, "type": "mlp.down_proj" }, "model.layers.21.mlp.down_proj": { "snr": 18.701711654663086, "type": "mlp.down_proj" }, "model.layers.22.mlp.down_proj": { "snr": 14.842622756958008, "type": "mlp.down_proj" }, "model.layers.23.mlp.down_proj": { "snr": 10.50004768371582, "type": "mlp.down_proj" }, "model.layers.24.mlp.down_proj": { "snr": 7.225146770477295, "type": "mlp.down_proj" }, 
"model.layers.25.mlp.down_proj": { "snr": 7.463952541351318, "type": "mlp.down_proj" }, "model.layers.26.mlp.down_proj": { "snr": 15.226134300231934, "type": "mlp.down_proj" }, "model.layers.27.mlp.down_proj": { "snr": 105.4173355102539, "type": "mlp.down_proj" }, "model.layers.0.mlp.gate_proj": { "snr": 0.5021594166755676, "type": "mlp.gate_proj" }, "model.layers.1.mlp.gate_proj": { "snr": 34.75935363769531, "type": "mlp.gate_proj" }, "model.layers.2.mlp.gate_proj": { "snr": 22.855531692504883, "type": "mlp.gate_proj" }, "model.layers.3.mlp.gate_proj": { "snr": 25.09166717529297, "type": "mlp.gate_proj" }, "model.layers.4.mlp.gate_proj": { "snr": 28.533172607421875, "type": "mlp.gate_proj" }, "model.layers.5.mlp.gate_proj": { "snr": 18.625717163085938, "type": "mlp.gate_proj" }, "model.layers.6.mlp.gate_proj": { "snr": 39.77565383911133, "type": "mlp.gate_proj" }, "model.layers.7.mlp.gate_proj": { "snr": 24.77678680419922, "type": "mlp.gate_proj" }, "model.layers.8.mlp.gate_proj": { "snr": 11.854388236999512, "type": "mlp.gate_proj" }, "model.layers.9.mlp.gate_proj": { "snr": 20.372356414794922, "type": "mlp.gate_proj" }, "model.layers.10.mlp.gate_proj": { "snr": 14.639552116394043, "type": "mlp.gate_proj" }, "model.layers.11.mlp.gate_proj": { "snr": 9.82955551147461, "type": "mlp.gate_proj" }, "model.layers.12.mlp.gate_proj": { "snr": 13.942151069641113, "type": "mlp.gate_proj" }, "model.layers.13.mlp.gate_proj": { "snr": 12.524999618530273, "type": "mlp.gate_proj" }, "model.layers.14.mlp.gate_proj": { "snr": 8.19681167602539, "type": "mlp.gate_proj" }, "model.layers.15.mlp.gate_proj": { "snr": 8.561081886291504, "type": "mlp.gate_proj" }, "model.layers.16.mlp.gate_proj": { "snr": 6.421900749206543, "type": "mlp.gate_proj" }, "model.layers.17.mlp.gate_proj": { "snr": 5.568161964416504, "type": "mlp.gate_proj" }, "model.layers.18.mlp.gate_proj": { "snr": 10.090147972106934, "type": "mlp.gate_proj" }, "model.layers.19.mlp.gate_proj": { "snr": 5.6181230545043945, 
"type": "mlp.gate_proj" }, "model.layers.20.mlp.gate_proj": { "snr": 5.173826694488525, "type": "mlp.gate_proj" }, "model.layers.21.mlp.gate_proj": { "snr": 5.663441181182861, "type": "mlp.gate_proj" }, "model.layers.22.mlp.gate_proj": { "snr": 6.824708461761475, "type": "mlp.gate_proj" }, "model.layers.23.mlp.gate_proj": { "snr": 4.724992275238037, "type": "mlp.gate_proj" }, "model.layers.24.mlp.gate_proj": { "snr": 6.829834938049316, "type": "mlp.gate_proj" }, "model.layers.25.mlp.gate_proj": { "snr": 9.968582153320312, "type": "mlp.gate_proj" }, "model.layers.26.mlp.gate_proj": { "snr": 14.35350513458252, "type": "mlp.gate_proj" }, "model.layers.27.mlp.gate_proj": { "snr": 20.121768951416016, "type": "mlp.gate_proj" }, "model.layers.0.mlp.up_proj": { "snr": 1.9020992517471313, "type": "mlp.up_proj" }, "model.layers.1.mlp.up_proj": { "snr": 46.9393424987793, "type": "mlp.up_proj" }, "model.layers.2.mlp.up_proj": { "snr": 76.04901123046875, "type": "mlp.up_proj" }, "model.layers.3.mlp.up_proj": { "snr": 104.08525848388672, "type": "mlp.up_proj" }, "model.layers.4.mlp.up_proj": { "snr": 77.74343872070312, "type": "mlp.up_proj" }, "model.layers.5.mlp.up_proj": { "snr": 104.15605926513672, "type": "mlp.up_proj" }, "model.layers.6.mlp.up_proj": { "snr": 105.16349792480469, "type": "mlp.up_proj" }, "model.layers.7.mlp.up_proj": { "snr": 78.4150390625, "type": "mlp.up_proj" }, "model.layers.8.mlp.up_proj": { "snr": 57.51069641113281, "type": "mlp.up_proj" }, "model.layers.9.mlp.up_proj": { "snr": 50.26409912109375, "type": "mlp.up_proj" }, "model.layers.10.mlp.up_proj": { "snr": 50.36701965332031, "type": "mlp.up_proj" }, "model.layers.11.mlp.up_proj": { "snr": 56.66413497924805, "type": "mlp.up_proj" }, "model.layers.12.mlp.up_proj": { "snr": 62.384559631347656, "type": "mlp.up_proj" }, "model.layers.13.mlp.up_proj": { "snr": 44.97883987426758, "type": "mlp.up_proj" }, "model.layers.14.mlp.up_proj": { "snr": 69.7376480102539, "type": "mlp.up_proj" }, 
"model.layers.15.mlp.up_proj": { "snr": 35.93111801147461, "type": "mlp.up_proj" }, "model.layers.16.mlp.up_proj": { "snr": 33.63168716430664, "type": "mlp.up_proj" }, "model.layers.17.mlp.up_proj": { "snr": 37.695919036865234, "type": "mlp.up_proj" }, "model.layers.18.mlp.up_proj": { "snr": 43.516517639160156, "type": "mlp.up_proj" }, "model.layers.19.mlp.up_proj": { "snr": 30.479318618774414, "type": "mlp.up_proj" }, "model.layers.20.mlp.up_proj": { "snr": 12.495409965515137, "type": "mlp.up_proj" }, "model.layers.21.mlp.up_proj": { "snr": 19.616689682006836, "type": "mlp.up_proj" }, "model.layers.22.mlp.up_proj": { "snr": 18.42948341369629, "type": "mlp.up_proj" }, "model.layers.23.mlp.up_proj": { "snr": 10.799560546875, "type": "mlp.up_proj" }, "model.layers.24.mlp.up_proj": { "snr": 14.167623519897461, "type": "mlp.up_proj" }, "model.layers.25.mlp.up_proj": { "snr": 14.938597679138184, "type": "mlp.up_proj" }, "model.layers.26.mlp.up_proj": { "snr": 8.896568298339844, "type": "mlp.up_proj" }, "model.layers.27.mlp.up_proj": { "snr": 25.774547576904297, "type": "mlp.up_proj" }, "model.embed_tokens": { "snr": Infinity, "type": "model.embed_tokens" }, "model.norm": { "snr": Infinity, "type": "model.norm" }, "model.layers.0.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.1.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.2.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.3.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.4.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.5.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.6.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.7.post_attention_layernorm": { "snr": Infinity, "type": 
"post_attention_layernorm" }, "model.layers.8.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.9.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.10.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.11.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.12.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.13.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.14.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.15.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.16.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.17.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.18.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.19.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.20.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.21.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.22.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.23.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.24.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.25.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.26.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.27.post_attention_layernorm": { "snr": 
Infinity, "type": "post_attention_layernorm" }, "model.layers.0.self_attn.k_proj": { "snr": 1.8306859731674194, "type": "self_attn.k_proj" }, "model.layers.1.self_attn.k_proj": { "snr": 0.896544337272644, "type": "self_attn.k_proj" }, "model.layers.2.self_attn.k_proj": { "snr": 2.345759868621826, "type": "self_attn.k_proj" }, "model.layers.3.self_attn.k_proj": { "snr": 2.0610744953155518, "type": "self_attn.k_proj" }, "model.layers.4.self_attn.k_proj": { "snr": 2.3658556938171387, "type": "self_attn.k_proj" }, "model.layers.5.self_attn.k_proj": { "snr": 1.6586917638778687, "type": "self_attn.k_proj" }, "model.layers.6.self_attn.k_proj": { "snr": 1.7613047361373901, "type": "self_attn.k_proj" }, "model.layers.7.self_attn.k_proj": { "snr": 1.325312852859497, "type": "self_attn.k_proj" }, "model.layers.8.self_attn.k_proj": { "snr": 1.458108901977539, "type": "self_attn.k_proj" }, "model.layers.9.self_attn.k_proj": { "snr": 1.4319790601730347, "type": "self_attn.k_proj" }, "model.layers.10.self_attn.k_proj": { "snr": 0.9579543471336365, "type": "self_attn.k_proj" }, "model.layers.11.self_attn.k_proj": { "snr": 0.8787619471549988, "type": "self_attn.k_proj" }, "model.layers.12.self_attn.k_proj": { "snr": 1.0447536706924438, "type": "self_attn.k_proj" }, "model.layers.13.self_attn.k_proj": { "snr": 0.9157310724258423, "type": "self_attn.k_proj" }, "model.layers.14.self_attn.k_proj": { "snr": 0.7528730630874634, "type": "self_attn.k_proj" }, "model.layers.15.self_attn.k_proj": { "snr": 0.9293556213378906, "type": "self_attn.k_proj" }, "model.layers.16.self_attn.k_proj": { "snr": 0.8057093620300293, "type": "self_attn.k_proj" }, "model.layers.17.self_attn.k_proj": { "snr": 1.2973601818084717, "type": "self_attn.k_proj" }, "model.layers.18.self_attn.k_proj": { "snr": 1.1357901096343994, "type": "self_attn.k_proj" }, "model.layers.19.self_attn.k_proj": { "snr": 1.3661632537841797, "type": "self_attn.k_proj" }, "model.layers.20.self_attn.k_proj": { "snr": 0.8829066753387451, 
"type": "self_attn.k_proj" }, "model.layers.21.self_attn.k_proj": { "snr": 0.9105398654937744, "type": "self_attn.k_proj" }, "model.layers.22.self_attn.k_proj": { "snr": 2.086926221847534, "type": "self_attn.k_proj" }, "model.layers.23.self_attn.k_proj": { "snr": 1.0393351316452026, "type": "self_attn.k_proj" }, "model.layers.24.self_attn.k_proj": { "snr": 1.114574670791626, "type": "self_attn.k_proj" }, "model.layers.25.self_attn.k_proj": { "snr": 2.599745035171509, "type": "self_attn.k_proj" }, "model.layers.26.self_attn.k_proj": { "snr": 1.1256712675094604, "type": "self_attn.k_proj" }, "model.layers.27.self_attn.k_proj": { "snr": 1.1784162521362305, "type": "self_attn.k_proj" }, "model.layers.0.self_attn.o_proj": { "snr": 0.8094121813774109, "type": "self_attn.o_proj" }, "model.layers.1.self_attn.o_proj": { "snr": 0.22000817954540253, "type": "self_attn.o_proj" }, "model.layers.2.self_attn.o_proj": { "snr": 0.21972468495368958, "type": "self_attn.o_proj" }, "model.layers.3.self_attn.o_proj": { "snr": 0.22064059972763062, "type": "self_attn.o_proj" }, "model.layers.4.self_attn.o_proj": { "snr": 0.22308556735515594, "type": "self_attn.o_proj" }, "model.layers.5.self_attn.o_proj": { "snr": 0.22396250069141388, "type": "self_attn.o_proj" }, "model.layers.6.self_attn.o_proj": { "snr": 0.228360116481781, "type": "self_attn.o_proj" }, "model.layers.7.self_attn.o_proj": { "snr": 0.2306283563375473, "type": "self_attn.o_proj" }, "model.layers.8.self_attn.o_proj": { "snr": 0.2430228292942047, "type": "self_attn.o_proj" }, "model.layers.9.self_attn.o_proj": { "snr": 0.2115175724029541, "type": "self_attn.o_proj" }, "model.layers.10.self_attn.o_proj": { "snr": 0.18226943910121918, "type": "self_attn.o_proj" }, "model.layers.11.self_attn.o_proj": { "snr": 0.144245907664299, "type": "self_attn.o_proj" }, "model.layers.12.self_attn.o_proj": { "snr": 0.21965907514095306, "type": "self_attn.o_proj" }, "model.layers.13.self_attn.o_proj": { "snr": 0.1797526627779007, "type": 
"self_attn.o_proj" }, "model.layers.14.self_attn.o_proj": { "snr": 0.26513636112213135, "type": "self_attn.o_proj" }, "model.layers.15.self_attn.o_proj": { "snr": 0.19463808834552765, "type": "self_attn.o_proj" }, "model.layers.16.self_attn.o_proj": { "snr": 0.22129350900650024, "type": "self_attn.o_proj" }, "model.layers.17.self_attn.o_proj": { "snr": 0.22545330226421356, "type": "self_attn.o_proj" }, "model.layers.18.self_attn.o_proj": { "snr": 0.25302645564079285, "type": "self_attn.o_proj" }, "model.layers.19.self_attn.o_proj": { "snr": 0.26326504349708557, "type": "self_attn.o_proj" }, "model.layers.20.self_attn.o_proj": { "snr": 0.15203869342803955, "type": "self_attn.o_proj" }, "model.layers.21.self_attn.o_proj": { "snr": 0.22418837249279022, "type": "self_attn.o_proj" }, "model.layers.22.self_attn.o_proj": { "snr": 0.23777326941490173, "type": "self_attn.o_proj" }, "model.layers.23.self_attn.o_proj": { "snr": 0.18076598644256592, "type": "self_attn.o_proj" }, "model.layers.24.self_attn.o_proj": { "snr": 0.19919466972351074, "type": "self_attn.o_proj" }, "model.layers.25.self_attn.o_proj": { "snr": 0.11310968548059464, "type": "self_attn.o_proj" }, "model.layers.26.self_attn.o_proj": { "snr": 0.08452697843313217, "type": "self_attn.o_proj" }, "model.layers.27.self_attn.o_proj": { "snr": 0.1029304787516594, "type": "self_attn.o_proj" }, "model.layers.0.self_attn.q_proj": { "snr": 0.03922705352306366, "type": "self_attn.q_proj" }, "model.layers.1.self_attn.q_proj": { "snr": 0.1410205066204071, "type": "self_attn.q_proj" }, "model.layers.2.self_attn.q_proj": { "snr": 0.18240582942962646, "type": "self_attn.q_proj" }, "model.layers.3.self_attn.q_proj": { "snr": 0.1702580451965332, "type": "self_attn.q_proj" }, "model.layers.4.self_attn.q_proj": { "snr": 0.19508686661720276, "type": "self_attn.q_proj" }, "model.layers.5.self_attn.q_proj": { "snr": 0.21549257636070251, "type": "self_attn.q_proj" }, "model.layers.6.self_attn.q_proj": { "snr": 0.22021502256393433, 
"type": "self_attn.q_proj" }, "model.layers.7.self_attn.q_proj": { "snr": 0.2044307142496109, "type": "self_attn.q_proj" }, "model.layers.8.self_attn.q_proj": { "snr": 0.22745060920715332, "type": "self_attn.q_proj" }, "model.layers.9.self_attn.q_proj": { "snr": 0.23825915157794952, "type": "self_attn.q_proj" }, "model.layers.10.self_attn.q_proj": { "snr": 0.2181481122970581, "type": "self_attn.q_proj" }, "model.layers.11.self_attn.q_proj": { "snr": 0.23490090668201447, "type": "self_attn.q_proj" }, "model.layers.12.self_attn.q_proj": { "snr": 0.2379382699728012, "type": "self_attn.q_proj" }, "model.layers.13.self_attn.q_proj": { "snr": 0.19233369827270508, "type": "self_attn.q_proj" }, "model.layers.14.self_attn.q_proj": { "snr": 0.2587313652038574, "type": "self_attn.q_proj" }, "model.layers.15.self_attn.q_proj": { "snr": 0.07332809269428253, "type": "self_attn.q_proj" }, "model.layers.16.self_attn.q_proj": { "snr": 0.22992204129695892, "type": "self_attn.q_proj" }, "model.layers.17.self_attn.q_proj": { "snr": 0.2537729740142822, "type": "self_attn.q_proj" }, "model.layers.18.self_attn.q_proj": { "snr": 0.2389948070049286, "type": "self_attn.q_proj" }, "model.layers.19.self_attn.q_proj": { "snr": 0.20716068148612976, "type": "self_attn.q_proj" }, "model.layers.20.self_attn.q_proj": { "snr": 0.2575169503688812, "type": "self_attn.q_proj" }, "model.layers.21.self_attn.q_proj": { "snr": 0.22347678244113922, "type": "self_attn.q_proj" }, "model.layers.22.self_attn.q_proj": { "snr": 0.18831054866313934, "type": "self_attn.q_proj" }, "model.layers.23.self_attn.q_proj": { "snr": 0.19853907823562622, "type": "self_attn.q_proj" }, "model.layers.24.self_attn.q_proj": { "snr": 0.16343259811401367, "type": "self_attn.q_proj" }, "model.layers.25.self_attn.q_proj": { "snr": 0.1583252102136612, "type": "self_attn.q_proj" }, "model.layers.26.self_attn.q_proj": { "snr": 0.254446804523468, "type": "self_attn.q_proj" }, "model.layers.27.self_attn.q_proj": { "snr": 
0.23828543722629547, "type": "self_attn.q_proj" }, "model.layers.0.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.1.self_attn.v_proj": { "snr": 856.5148315429688, "type": "self_attn.v_proj" }, "model.layers.2.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.3.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.4.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.5.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.6.self_attn.v_proj": { "snr": 48.941104888916016, "type": "self_attn.v_proj" }, "model.layers.7.self_attn.v_proj": { "snr": 70.25466918945312, "type": "self_attn.v_proj" }, "model.layers.8.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.9.self_attn.v_proj": { "snr": 370.885986328125, "type": "self_attn.v_proj" }, "model.layers.10.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.11.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.12.self_attn.v_proj": { "snr": 75.51139831542969, "type": "self_attn.v_proj" }, "model.layers.13.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.14.self_attn.v_proj": { "snr": 52.004058837890625, "type": "self_attn.v_proj" }, "model.layers.15.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.16.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.17.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.18.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.19.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.20.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.21.self_attn.v_proj": { "snr": 641.026611328125, "type": "self_attn.v_proj" }, 
"model.layers.22.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.23.self_attn.v_proj": { "snr": 323.4858093261719, "type": "self_attn.v_proj" }, "model.layers.24.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.25.self_attn.v_proj": { "snr": 2.1745388507843018, "type": "self_attn.v_proj" }, "model.layers.26.self_attn.v_proj": { "snr": 3.0791690349578857, "type": "self_attn.v_proj" }, "model.layers.27.self_attn.v_proj": { "snr": 2.029968023300171, "type": "self_attn.v_proj" } } ================================================ FILE: src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-1.5B.json ================================================ { "model.layers.0.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.1.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.2.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.3.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.4.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.5.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.6.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.7.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.8.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.9.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.10.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.11.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.12.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.13.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.14.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, 
"model.layers.15.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.16.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.17.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.18.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.19.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.20.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.21.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.22.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.23.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.24.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.25.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.26.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.27.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "lm_head": { "snr": Infinity, "type": "lm_head" }, "model.layers.0.mlp.down_proj": { "snr": 70.4939193725586, "type": "mlp.down_proj" }, "model.layers.1.mlp.down_proj": { "snr": 134.2310028076172, "type": "mlp.down_proj" }, "model.layers.2.mlp.down_proj": { "snr": 235.44140625, "type": "mlp.down_proj" }, "model.layers.3.mlp.down_proj": { "snr": 73.19381713867188, "type": "mlp.down_proj" }, "model.layers.4.mlp.down_proj": { "snr": 27.216264724731445, "type": "mlp.down_proj" }, "model.layers.5.mlp.down_proj": { "snr": 17.544504165649414, "type": "mlp.down_proj" }, "model.layers.6.mlp.down_proj": { "snr": 54.17462158203125, "type": "mlp.down_proj" }, "model.layers.7.mlp.down_proj": { "snr": 38.78171920776367, "type": "mlp.down_proj" }, "model.layers.8.mlp.down_proj": { "snr": 29.777149200439453, "type": "mlp.down_proj" }, "model.layers.9.mlp.down_proj": { "snr": 10.289377212524414, "type": "mlp.down_proj" }, 
"model.layers.10.mlp.down_proj": { "snr": 8.858332633972168, "type": "mlp.down_proj" }, "model.layers.11.mlp.down_proj": { "snr": 6.433396816253662, "type": "mlp.down_proj" }, "model.layers.12.mlp.down_proj": { "snr": 7.085702419281006, "type": "mlp.down_proj" }, "model.layers.13.mlp.down_proj": { "snr": 3.323948383331299, "type": "mlp.down_proj" }, "model.layers.14.mlp.down_proj": { "snr": 6.204164505004883, "type": "mlp.down_proj" }, "model.layers.15.mlp.down_proj": { "snr": 5.321533203125, "type": "mlp.down_proj" }, "model.layers.16.mlp.down_proj": { "snr": 4.588479995727539, "type": "mlp.down_proj" }, "model.layers.17.mlp.down_proj": { "snr": 7.01450252532959, "type": "mlp.down_proj" }, "model.layers.18.mlp.down_proj": { "snr": 6.386813163757324, "type": "mlp.down_proj" }, "model.layers.19.mlp.down_proj": { "snr": 210.38458251953125, "type": "mlp.down_proj" }, "model.layers.20.mlp.down_proj": { "snr": 7.096683979034424, "type": "mlp.down_proj" }, "model.layers.21.mlp.down_proj": { "snr": 18.68245506286621, "type": "mlp.down_proj" }, "model.layers.22.mlp.down_proj": { "snr": 14.824685096740723, "type": "mlp.down_proj" }, "model.layers.23.mlp.down_proj": { "snr": 10.491303443908691, "type": "mlp.down_proj" }, "model.layers.24.mlp.down_proj": { "snr": 7.2194437980651855, "type": "mlp.down_proj" }, "model.layers.25.mlp.down_proj": { "snr": 7.458613872528076, "type": "mlp.down_proj" }, "model.layers.26.mlp.down_proj": { "snr": 15.222760200500488, "type": "mlp.down_proj" }, "model.layers.27.mlp.down_proj": { "snr": 105.41569519042969, "type": "mlp.down_proj" }, "model.layers.0.mlp.gate_proj": { "snr": 0.5017311573028564, "type": "mlp.gate_proj" }, "model.layers.1.mlp.gate_proj": { "snr": 34.71562576293945, "type": "mlp.gate_proj" }, "model.layers.2.mlp.gate_proj": { "snr": 22.82915496826172, "type": "mlp.gate_proj" }, "model.layers.3.mlp.gate_proj": { "snr": 25.0699520111084, "type": "mlp.gate_proj" }, "model.layers.4.mlp.gate_proj": { "snr": 28.508079528808594, 
"type": "mlp.gate_proj" }, "model.layers.5.mlp.gate_proj": { "snr": 18.608009338378906, "type": "mlp.gate_proj" }, "model.layers.6.mlp.gate_proj": { "snr": 39.732391357421875, "type": "mlp.gate_proj" }, "model.layers.7.mlp.gate_proj": { "snr": 24.760026931762695, "type": "mlp.gate_proj" }, "model.layers.8.mlp.gate_proj": { "snr": 11.842738151550293, "type": "mlp.gate_proj" }, "model.layers.9.mlp.gate_proj": { "snr": 20.35906982421875, "type": "mlp.gate_proj" }, "model.layers.10.mlp.gate_proj": { "snr": 14.627532958984375, "type": "mlp.gate_proj" }, "model.layers.11.mlp.gate_proj": { "snr": 9.821962356567383, "type": "mlp.gate_proj" }, "model.layers.12.mlp.gate_proj": { "snr": 13.930404663085938, "type": "mlp.gate_proj" }, "model.layers.13.mlp.gate_proj": { "snr": 12.509871482849121, "type": "mlp.gate_proj" }, "model.layers.14.mlp.gate_proj": { "snr": 8.187695503234863, "type": "mlp.gate_proj" }, "model.layers.15.mlp.gate_proj": { "snr": 8.553187370300293, "type": "mlp.gate_proj" }, "model.layers.16.mlp.gate_proj": { "snr": 6.414614200592041, "type": "mlp.gate_proj" }, "model.layers.17.mlp.gate_proj": { "snr": 5.561778545379639, "type": "mlp.gate_proj" }, "model.layers.18.mlp.gate_proj": { "snr": 10.078697204589844, "type": "mlp.gate_proj" }, "model.layers.19.mlp.gate_proj": { "snr": 5.61345100402832, "type": "mlp.gate_proj" }, "model.layers.20.mlp.gate_proj": { "snr": 5.265484809875488, "type": "mlp.gate_proj" }, "model.layers.21.mlp.gate_proj": { "snr": 5.659949779510498, "type": "mlp.gate_proj" }, "model.layers.22.mlp.gate_proj": { "snr": 6.8203511238098145, "type": "mlp.gate_proj" }, "model.layers.23.mlp.gate_proj": { "snr": 4.721294403076172, "type": "mlp.gate_proj" }, "model.layers.24.mlp.gate_proj": { "snr": 6.82572603225708, "type": "mlp.gate_proj" }, "model.layers.25.mlp.gate_proj": { "snr": 9.963521003723145, "type": "mlp.gate_proj" }, "model.layers.26.mlp.gate_proj": { "snr": 14.342291831970215, "type": "mlp.gate_proj" }, "model.layers.27.mlp.gate_proj": 
{ "snr": 20.092098236083984, "type": "mlp.gate_proj" }, "model.layers.0.mlp.up_proj": { "snr": 1.901187777519226, "type": "mlp.up_proj" }, "model.layers.1.mlp.up_proj": { "snr": 46.9141731262207, "type": "mlp.up_proj" }, "model.layers.2.mlp.up_proj": { "snr": 76.07878112792969, "type": "mlp.up_proj" }, "model.layers.3.mlp.up_proj": { "snr": 103.9194564819336, "type": "mlp.up_proj" }, "model.layers.4.mlp.up_proj": { "snr": 77.62561798095703, "type": "mlp.up_proj" }, "model.layers.5.mlp.up_proj": { "snr": 104.01624298095703, "type": "mlp.up_proj" }, "model.layers.6.mlp.up_proj": { "snr": 105.0235366821289, "type": "mlp.up_proj" }, "model.layers.7.mlp.up_proj": { "snr": 78.33445739746094, "type": "mlp.up_proj" }, "model.layers.8.mlp.up_proj": { "snr": 57.44070816040039, "type": "mlp.up_proj" }, "model.layers.9.mlp.up_proj": { "snr": 50.20344924926758, "type": "mlp.up_proj" }, "model.layers.10.mlp.up_proj": { "snr": 50.32845687866211, "type": "mlp.up_proj" }, "model.layers.11.mlp.up_proj": { "snr": 56.6197624206543, "type": "mlp.up_proj" }, "model.layers.12.mlp.up_proj": { "snr": 62.338096618652344, "type": "mlp.up_proj" }, "model.layers.13.mlp.up_proj": { "snr": 44.92917251586914, "type": "mlp.up_proj" }, "model.layers.14.mlp.up_proj": { "snr": 69.69624328613281, "type": "mlp.up_proj" }, "model.layers.15.mlp.up_proj": { "snr": 35.90705108642578, "type": "mlp.up_proj" }, "model.layers.16.mlp.up_proj": { "snr": 33.610374450683594, "type": "mlp.up_proj" }, "model.layers.17.mlp.up_proj": { "snr": 37.67365646362305, "type": "mlp.up_proj" }, "model.layers.18.mlp.up_proj": { "snr": 43.488929748535156, "type": "mlp.up_proj" }, "model.layers.19.mlp.up_proj": { "snr": 30.451993942260742, "type": "mlp.up_proj" }, "model.layers.20.mlp.up_proj": { "snr": 12.480182647705078, "type": "mlp.up_proj" }, "model.layers.21.mlp.up_proj": { "snr": 19.595102310180664, "type": "mlp.up_proj" }, "model.layers.22.mlp.up_proj": { "snr": 19.067970275878906, "type": "mlp.up_proj" }, 
"model.layers.23.mlp.up_proj": { "snr": 10.786394119262695, "type": "mlp.up_proj" }, "model.layers.24.mlp.up_proj": { "snr": 14.150126457214355, "type": "mlp.up_proj" }, "model.layers.25.mlp.up_proj": { "snr": 14.927021026611328, "type": "mlp.up_proj" }, "model.layers.26.mlp.up_proj": { "snr": 8.891448020935059, "type": "mlp.up_proj" }, "model.layers.27.mlp.up_proj": { "snr": 25.74305534362793, "type": "mlp.up_proj" }, "model.embed_tokens": { "snr": Infinity, "type": "model.embed_tokens" }, "model.norm": { "snr": Infinity, "type": "model.norm" }, "model.layers.0.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.1.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.2.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.3.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.4.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.5.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.6.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.7.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.8.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.9.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.10.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.11.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.12.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.13.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, 
"model.layers.14.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.15.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.16.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.17.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.18.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.19.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.20.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.21.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.22.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.23.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.24.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.25.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.26.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.27.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.0.self_attn.k_proj": { "snr": 1.7818864583969116, "type": "self_attn.k_proj" }, "model.layers.1.self_attn.k_proj": { "snr": 0.8955822587013245, "type": "self_attn.k_proj" }, "model.layers.2.self_attn.k_proj": { "snr": 2.344149351119995, "type": "self_attn.k_proj" }, "model.layers.3.self_attn.k_proj": { "snr": 2.0597119331359863, "type": "self_attn.k_proj" }, "model.layers.4.self_attn.k_proj": { "snr": 2.36411714553833, "type": "self_attn.k_proj" }, "model.layers.5.self_attn.k_proj": { "snr": 1.6570613384246826, "type": "self_attn.k_proj" }, 
"model.layers.6.self_attn.k_proj": { "snr": 1.7604507207870483, "type": "self_attn.k_proj" }, "model.layers.7.self_attn.k_proj": { "snr": 1.3245182037353516, "type": "self_attn.k_proj" }, "model.layers.8.self_attn.k_proj": { "snr": 1.4567548036575317, "type": "self_attn.k_proj" }, "model.layers.9.self_attn.k_proj": { "snr": 1.4310829639434814, "type": "self_attn.k_proj" }, "model.layers.10.self_attn.k_proj": { "snr": 0.95713210105896, "type": "self_attn.k_proj" }, "model.layers.11.self_attn.k_proj": { "snr": 0.8781776428222656, "type": "self_attn.k_proj" }, "model.layers.12.self_attn.k_proj": { "snr": 1.0438013076782227, "type": "self_attn.k_proj" }, "model.layers.13.self_attn.k_proj": { "snr": 0.9315219521522522, "type": "self_attn.k_proj" }, "model.layers.14.self_attn.k_proj": { "snr": 0.7521569728851318, "type": "self_attn.k_proj" }, "model.layers.15.self_attn.k_proj": { "snr": 0.9286947250366211, "type": "self_attn.k_proj" }, "model.layers.16.self_attn.k_proj": { "snr": 0.8047553896903992, "type": "self_attn.k_proj" }, "model.layers.17.self_attn.k_proj": { "snr": 1.2965552806854248, "type": "self_attn.k_proj" }, "model.layers.18.self_attn.k_proj": { "snr": 1.134974479675293, "type": "self_attn.k_proj" }, "model.layers.19.self_attn.k_proj": { "snr": 1.3648872375488281, "type": "self_attn.k_proj" }, "model.layers.20.self_attn.k_proj": { "snr": 0.8667459487915039, "type": "self_attn.k_proj" }, "model.layers.21.self_attn.k_proj": { "snr": 0.9100639224052429, "type": "self_attn.k_proj" }, "model.layers.22.self_attn.k_proj": { "snr": 2.127535820007324, "type": "self_attn.k_proj" }, "model.layers.23.self_attn.k_proj": { "snr": 1.0382369756698608, "type": "self_attn.k_proj" }, "model.layers.24.self_attn.k_proj": { "snr": 1.113753318786621, "type": "self_attn.k_proj" }, "model.layers.25.self_attn.k_proj": { "snr": 2.597890853881836, "type": "self_attn.k_proj" }, "model.layers.26.self_attn.k_proj": { "snr": 1.1248247623443604, "type": "self_attn.k_proj" }, 
"model.layers.27.self_attn.k_proj": { "snr": 1.1984941959381104, "type": "self_attn.k_proj" }, "model.layers.0.self_attn.o_proj": { "snr": 0.8139898777008057, "type": "self_attn.o_proj" }, "model.layers.1.self_attn.o_proj": { "snr": 0.21965594589710236, "type": "self_attn.o_proj" }, "model.layers.2.self_attn.o_proj": { "snr": 0.219479501247406, "type": "self_attn.o_proj" }, "model.layers.3.self_attn.o_proj": { "snr": 0.22144284844398499, "type": "self_attn.o_proj" }, "model.layers.4.self_attn.o_proj": { "snr": 0.22390463948249817, "type": "self_attn.o_proj" }, "model.layers.5.self_attn.o_proj": { "snr": 0.22383669018745422, "type": "self_attn.o_proj" }, "model.layers.6.self_attn.o_proj": { "snr": 0.22818723320960999, "type": "self_attn.o_proj" }, "model.layers.7.self_attn.o_proj": { "snr": 0.23134392499923706, "type": "self_attn.o_proj" }, "model.layers.8.self_attn.o_proj": { "snr": 0.24275101721286774, "type": "self_attn.o_proj" }, "model.layers.9.self_attn.o_proj": { "snr": 0.21139128506183624, "type": "self_attn.o_proj" }, "model.layers.10.self_attn.o_proj": { "snr": 0.18210072815418243, "type": "self_attn.o_proj" }, "model.layers.11.self_attn.o_proj": { "snr": 0.14415481686592102, "type": "self_attn.o_proj" }, "model.layers.12.self_attn.o_proj": { "snr": 0.21947966516017914, "type": "self_attn.o_proj" }, "model.layers.13.self_attn.o_proj": { "snr": 0.17875106632709503, "type": "self_attn.o_proj" }, "model.layers.14.self_attn.o_proj": { "snr": 0.264996200799942, "type": "self_attn.o_proj" }, "model.layers.15.self_attn.o_proj": { "snr": 0.19353187084197998, "type": "self_attn.o_proj" }, "model.layers.16.self_attn.o_proj": { "snr": 0.22111012041568756, "type": "self_attn.o_proj" }, "model.layers.17.self_attn.o_proj": { "snr": 0.2242278754711151, "type": "self_attn.o_proj" }, "model.layers.18.self_attn.o_proj": { "snr": 0.2527434229850769, "type": "self_attn.o_proj" }, "model.layers.19.self_attn.o_proj": { "snr": 0.26184532046318054, "type": "self_attn.o_proj" }, 
"model.layers.20.self_attn.o_proj": { "snr": 0.1519661247730255, "type": "self_attn.o_proj" }, "model.layers.21.self_attn.o_proj": { "snr": 0.22386522591114044, "type": "self_attn.o_proj" }, "model.layers.22.self_attn.o_proj": { "snr": 0.2386160045862198, "type": "self_attn.o_proj" }, "model.layers.23.self_attn.o_proj": { "snr": 0.18057651817798615, "type": "self_attn.o_proj" }, "model.layers.24.self_attn.o_proj": { "snr": 0.1989467740058899, "type": "self_attn.o_proj" }, "model.layers.25.self_attn.o_proj": { "snr": 0.11306505650281906, "type": "self_attn.o_proj" }, "model.layers.26.self_attn.o_proj": { "snr": 0.08449216932058334, "type": "self_attn.o_proj" }, "model.layers.27.self_attn.o_proj": { "snr": 0.10287519544363022, "type": "self_attn.o_proj" }, "model.layers.0.self_attn.q_proj": { "snr": 0.039204664528369904, "type": "self_attn.q_proj" }, "model.layers.1.self_attn.q_proj": { "snr": 0.14075909554958344, "type": "self_attn.q_proj" }, "model.layers.2.self_attn.q_proj": { "snr": 0.18212397396564484, "type": "self_attn.q_proj" }, "model.layers.3.self_attn.q_proj": { "snr": 0.1700422316789627, "type": "self_attn.q_proj" }, "model.layers.4.self_attn.q_proj": { "snr": 0.1948907971382141, "type": "self_attn.q_proj" }, "model.layers.5.self_attn.q_proj": { "snr": 0.2153141051530838, "type": "self_attn.q_proj" }, "model.layers.6.self_attn.q_proj": { "snr": 0.21998055279254913, "type": "self_attn.q_proj" }, "model.layers.7.self_attn.q_proj": { "snr": 0.20416118204593658, "type": "self_attn.q_proj" }, "model.layers.8.self_attn.q_proj": { "snr": 0.2272879034280777, "type": "self_attn.q_proj" }, "model.layers.9.self_attn.q_proj": { "snr": 0.23795834183692932, "type": "self_attn.q_proj" }, "model.layers.10.self_attn.q_proj": { "snr": 0.21887299418449402, "type": "self_attn.q_proj" }, "model.layers.11.self_attn.q_proj": { "snr": 0.23469635844230652, "type": "self_attn.q_proj" }, "model.layers.12.self_attn.q_proj": { "snr": 0.23774078488349915, "type": "self_attn.q_proj" }, 
"model.layers.13.self_attn.q_proj": { "snr": 0.1920779049396515, "type": "self_attn.q_proj" }, "model.layers.14.self_attn.q_proj": { "snr": 0.2584812641143799, "type": "self_attn.q_proj" }, "model.layers.15.self_attn.q_proj": { "snr": 0.07330238074064255, "type": "self_attn.q_proj" }, "model.layers.16.self_attn.q_proj": { "snr": 0.23073157668113708, "type": "self_attn.q_proj" }, "model.layers.17.self_attn.q_proj": { "snr": 0.2523840367794037, "type": "self_attn.q_proj" }, "model.layers.18.self_attn.q_proj": { "snr": 0.23874858021736145, "type": "self_attn.q_proj" }, "model.layers.19.self_attn.q_proj": { "snr": 0.20698708295822144, "type": "self_attn.q_proj" }, "model.layers.20.self_attn.q_proj": { "snr": 0.25723400712013245, "type": "self_attn.q_proj" }, "model.layers.21.self_attn.q_proj": { "snr": 0.223300039768219, "type": "self_attn.q_proj" }, "model.layers.22.self_attn.q_proj": { "snr": 0.18824049830436707, "type": "self_attn.q_proj" }, "model.layers.23.self_attn.q_proj": { "snr": 0.19840741157531738, "type": "self_attn.q_proj" }, "model.layers.24.self_attn.q_proj": { "snr": 0.16326843202114105, "type": "self_attn.q_proj" }, "model.layers.25.self_attn.q_proj": { "snr": 0.1581888198852539, "type": "self_attn.q_proj" }, "model.layers.26.self_attn.q_proj": { "snr": 0.25306230783462524, "type": "self_attn.q_proj" }, "model.layers.27.self_attn.q_proj": { "snr": 0.23808495700359344, "type": "self_attn.q_proj" }, "model.layers.0.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.1.self_attn.v_proj": { "snr": 864.8881225585938, "type": "self_attn.v_proj" }, "model.layers.2.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.3.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.4.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.5.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.6.self_attn.v_proj": { "snr": 
48.853694915771484, "type": "self_attn.v_proj" }, "model.layers.7.self_attn.v_proj": { "snr": 70.18457794189453, "type": "self_attn.v_proj" }, "model.layers.8.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.9.self_attn.v_proj": { "snr": 371.1153259277344, "type": "self_attn.v_proj" }, "model.layers.10.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.11.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.12.self_attn.v_proj": { "snr": 75.41203308105469, "type": "self_attn.v_proj" }, "model.layers.13.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.14.self_attn.v_proj": { "snr": 51.92624282836914, "type": "self_attn.v_proj" }, "model.layers.15.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.16.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.17.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.18.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.19.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.20.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.21.self_attn.v_proj": { "snr": 642.9313354492188, "type": "self_attn.v_proj" }, "model.layers.22.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.23.self_attn.v_proj": { "snr": 323.5724182128906, "type": "self_attn.v_proj" }, "model.layers.24.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.25.self_attn.v_proj": { "snr": 2.1736748218536377, "type": "self_attn.v_proj" }, "model.layers.26.self_attn.v_proj": { "snr": 3.1729259490966797, "type": "self_attn.v_proj" }, "model.layers.27.self_attn.v_proj": { "snr": 2.024953842163086, "type": "self_attn.v_proj" } } ================================================ FILE: 
src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-3B-Instruct.json ================================================ { "model.layers.0.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.1.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.2.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.3.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.4.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.5.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.6.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.7.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.8.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.9.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.10.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.11.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.12.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.13.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.14.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.15.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.16.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.17.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.18.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.19.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.20.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.21.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.22.input_layernorm": { 
"snr": Infinity, "type": "input_layernorm" }, "model.layers.23.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.24.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.25.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.26.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.27.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.28.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.29.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.30.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.31.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.32.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.33.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.34.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.35.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "lm_head": { "snr": Infinity, "type": "lm_head" }, "model.layers.0.mlp.down_proj": { "snr": 20.964319229125977, "type": "mlp.down_proj" }, "model.layers.1.mlp.down_proj": { "snr": 0.11561352014541626, "type": "mlp.down_proj" }, "model.layers.2.mlp.down_proj": { "snr": 0.14991413056850433, "type": "mlp.down_proj" }, "model.layers.3.mlp.down_proj": { "snr": 0.3673713207244873, "type": "mlp.down_proj" }, "model.layers.4.mlp.down_proj": { "snr": 0.5076134204864502, "type": "mlp.down_proj" }, "model.layers.5.mlp.down_proj": { "snr": 33.89468002319336, "type": "mlp.down_proj" }, "model.layers.6.mlp.down_proj": { "snr": 45.08732986450195, "type": "mlp.down_proj" }, "model.layers.7.mlp.down_proj": { "snr": 33.234222412109375, "type": "mlp.down_proj" }, "model.layers.8.mlp.down_proj": { "snr": 29.3447322845459, "type": "mlp.down_proj" }, "model.layers.9.mlp.down_proj": { 
"snr": 26.664169311523438, "type": "mlp.down_proj" }, "model.layers.10.mlp.down_proj": { "snr": 22.323949813842773, "type": "mlp.down_proj" }, "model.layers.11.mlp.down_proj": { "snr": 18.259737014770508, "type": "mlp.down_proj" }, "model.layers.12.mlp.down_proj": { "snr": 14.422037124633789, "type": "mlp.down_proj" }, "model.layers.13.mlp.down_proj": { "snr": 22.172054290771484, "type": "mlp.down_proj" }, "model.layers.14.mlp.down_proj": { "snr": 27.363698959350586, "type": "mlp.down_proj" }, "model.layers.15.mlp.down_proj": { "snr": 28.474334716796875, "type": "mlp.down_proj" }, "model.layers.16.mlp.down_proj": { "snr": 10.4143648147583, "type": "mlp.down_proj" }, "model.layers.17.mlp.down_proj": { "snr": 10.719133377075195, "type": "mlp.down_proj" }, "model.layers.18.mlp.down_proj": { "snr": 8.6494722366333, "type": "mlp.down_proj" }, "model.layers.19.mlp.down_proj": { "snr": 5.69321870803833, "type": "mlp.down_proj" }, "model.layers.20.mlp.down_proj": { "snr": 23.889677047729492, "type": "mlp.down_proj" }, "model.layers.21.mlp.down_proj": { "snr": 11.59121036529541, "type": "mlp.down_proj" }, "model.layers.22.mlp.down_proj": { "snr": 5.997435569763184, "type": "mlp.down_proj" }, "model.layers.23.mlp.down_proj": { "snr": 19.415578842163086, "type": "mlp.down_proj" }, "model.layers.24.mlp.down_proj": { "snr": 8.241704940795898, "type": "mlp.down_proj" }, "model.layers.25.mlp.down_proj": { "snr": 12.993823051452637, "type": "mlp.down_proj" }, "model.layers.26.mlp.down_proj": { "snr": 36.26508712768555, "type": "mlp.down_proj" }, "model.layers.27.mlp.down_proj": { "snr": 19.957971572875977, "type": "mlp.down_proj" }, "model.layers.28.mlp.down_proj": { "snr": 6.067765235900879, "type": "mlp.down_proj" }, "model.layers.29.mlp.down_proj": { "snr": 5.369481086730957, "type": "mlp.down_proj" }, "model.layers.30.mlp.down_proj": { "snr": 7.358774662017822, "type": "mlp.down_proj" }, "model.layers.31.mlp.down_proj": { "snr": 7.8687238693237305, "type": "mlp.down_proj" }, 
"model.layers.32.mlp.down_proj": { "snr": 8.713484764099121, "type": "mlp.down_proj" }, "model.layers.33.mlp.down_proj": { "snr": 21.233531951904297, "type": "mlp.down_proj" }, "model.layers.34.mlp.down_proj": { "snr": 32.37357711791992, "type": "mlp.down_proj" }, "model.layers.35.mlp.down_proj": { "snr": 179.8053741455078, "type": "mlp.down_proj" }, "model.layers.0.mlp.gate_proj": { "snr": 0.24989914894104004, "type": "mlp.gate_proj" }, "model.layers.1.mlp.gate_proj": { "snr": 0.11613649874925613, "type": "mlp.gate_proj" }, "model.layers.2.mlp.gate_proj": { "snr": 0.16354432702064514, "type": "mlp.gate_proj" }, "model.layers.3.mlp.gate_proj": { "snr": 0.36216047406196594, "type": "mlp.gate_proj" }, "model.layers.4.mlp.gate_proj": { "snr": 0.3485107719898224, "type": "mlp.gate_proj" }, "model.layers.5.mlp.gate_proj": { "snr": 2.6546616554260254, "type": "mlp.gate_proj" }, "model.layers.6.mlp.gate_proj": { "snr": 8.362885475158691, "type": "mlp.gate_proj" }, "model.layers.7.mlp.gate_proj": { "snr": 7.38665246963501, "type": "mlp.gate_proj" }, "model.layers.8.mlp.gate_proj": { "snr": 13.016111373901367, "type": "mlp.gate_proj" }, "model.layers.9.mlp.gate_proj": { "snr": 14.94902515411377, "type": "mlp.gate_proj" }, "model.layers.10.mlp.gate_proj": { "snr": 20.92418670654297, "type": "mlp.gate_proj" }, "model.layers.11.mlp.gate_proj": { "snr": 15.954015731811523, "type": "mlp.gate_proj" }, "model.layers.12.mlp.gate_proj": { "snr": 8.980009078979492, "type": "mlp.gate_proj" }, "model.layers.13.mlp.gate_proj": { "snr": 17.59958267211914, "type": "mlp.gate_proj" }, "model.layers.14.mlp.gate_proj": { "snr": 17.23070526123047, "type": "mlp.gate_proj" }, "model.layers.15.mlp.gate_proj": { "snr": 23.725330352783203, "type": "mlp.gate_proj" }, "model.layers.16.mlp.gate_proj": { "snr": 17.000444412231445, "type": "mlp.gate_proj" }, "model.layers.17.mlp.gate_proj": { "snr": 18.293012619018555, "type": "mlp.gate_proj" }, "model.layers.18.mlp.gate_proj": { "snr": 
12.644190788269043, "type": "mlp.gate_proj" }, "model.layers.19.mlp.gate_proj": { "snr": 16.278690338134766, "type": "mlp.gate_proj" }, "model.layers.20.mlp.gate_proj": { "snr": 7.407368183135986, "type": "mlp.gate_proj" }, "model.layers.21.mlp.gate_proj": { "snr": 6.109912395477295, "type": "mlp.gate_proj" }, "model.layers.22.mlp.gate_proj": { "snr": 5.3692426681518555, "type": "mlp.gate_proj" }, "model.layers.23.mlp.gate_proj": { "snr": 9.354235649108887, "type": "mlp.gate_proj" }, "model.layers.24.mlp.gate_proj": { "snr": 7.655010223388672, "type": "mlp.gate_proj" }, "model.layers.25.mlp.gate_proj": { "snr": 6.252986431121826, "type": "mlp.gate_proj" }, "model.layers.26.mlp.gate_proj": { "snr": 14.26718521118164, "type": "mlp.gate_proj" }, "model.layers.27.mlp.gate_proj": { "snr": 7.705836772918701, "type": "mlp.gate_proj" }, "model.layers.28.mlp.gate_proj": { "snr": 5.998677730560303, "type": "mlp.gate_proj" }, "model.layers.29.mlp.gate_proj": { "snr": 6.044872760772705, "type": "mlp.gate_proj" }, "model.layers.30.mlp.gate_proj": { "snr": 9.027137756347656, "type": "mlp.gate_proj" }, "model.layers.31.mlp.gate_proj": { "snr": 5.449969291687012, "type": "mlp.gate_proj" }, "model.layers.32.mlp.gate_proj": { "snr": 4.206825256347656, "type": "mlp.gate_proj" }, "model.layers.33.mlp.gate_proj": { "snr": 5.22825288772583, "type": "mlp.gate_proj" }, "model.layers.34.mlp.gate_proj": { "snr": 43.71927261352539, "type": "mlp.gate_proj" }, "model.layers.35.mlp.gate_proj": { "snr": 45.37385177612305, "type": "mlp.gate_proj" }, "model.layers.0.mlp.up_proj": { "snr": 0.7069714665412903, "type": "mlp.up_proj" }, "model.layers.1.mlp.up_proj": { "snr": 0.17766596376895905, "type": "mlp.up_proj" }, "model.layers.2.mlp.up_proj": { "snr": 0.28577035665512085, "type": "mlp.up_proj" }, "model.layers.3.mlp.up_proj": { "snr": 0.6763099431991577, "type": "mlp.up_proj" }, "model.layers.4.mlp.up_proj": { "snr": 0.8340913653373718, "type": "mlp.up_proj" }, "model.layers.5.mlp.up_proj": { 
"snr": 3.946547031402588, "type": "mlp.up_proj" }, "model.layers.6.mlp.up_proj": { "snr": 19.56715202331543, "type": "mlp.up_proj" }, "model.layers.7.mlp.up_proj": { "snr": 36.21149826049805, "type": "mlp.up_proj" }, "model.layers.8.mlp.up_proj": { "snr": 44.28759002685547, "type": "mlp.up_proj" }, "model.layers.9.mlp.up_proj": { "snr": 45.47198486328125, "type": "mlp.up_proj" }, "model.layers.10.mlp.up_proj": { "snr": 79.00128936767578, "type": "mlp.up_proj" }, "model.layers.11.mlp.up_proj": { "snr": 52.28038787841797, "type": "mlp.up_proj" }, "model.layers.12.mlp.up_proj": { "snr": 48.08102035522461, "type": "mlp.up_proj" }, "model.layers.13.mlp.up_proj": { "snr": 56.071285247802734, "type": "mlp.up_proj" }, "model.layers.14.mlp.up_proj": { "snr": 72.24358367919922, "type": "mlp.up_proj" }, "model.layers.15.mlp.up_proj": { "snr": 54.818233489990234, "type": "mlp.up_proj" }, "model.layers.16.mlp.up_proj": { "snr": 47.251495361328125, "type": "mlp.up_proj" }, "model.layers.17.mlp.up_proj": { "snr": 51.585636138916016, "type": "mlp.up_proj" }, "model.layers.18.mlp.up_proj": { "snr": 43.47938919067383, "type": "mlp.up_proj" }, "model.layers.19.mlp.up_proj": { "snr": 38.132469177246094, "type": "mlp.up_proj" }, "model.layers.20.mlp.up_proj": { "snr": 21.78435707092285, "type": "mlp.up_proj" }, "model.layers.21.mlp.up_proj": { "snr": 22.261096954345703, "type": "mlp.up_proj" }, "model.layers.22.mlp.up_proj": { "snr": 30.751861572265625, "type": "mlp.up_proj" }, "model.layers.23.mlp.up_proj": { "snr": 28.61063575744629, "type": "mlp.up_proj" }, "model.layers.24.mlp.up_proj": { "snr": 20.21415901184082, "type": "mlp.up_proj" }, "model.layers.25.mlp.up_proj": { "snr": 20.759052276611328, "type": "mlp.up_proj" }, "model.layers.26.mlp.up_proj": { "snr": 33.80818557739258, "type": "mlp.up_proj" }, "model.layers.27.mlp.up_proj": { "snr": 17.274362564086914, "type": "mlp.up_proj" }, "model.layers.28.mlp.up_proj": { "snr": 13.943653106689453, "type": "mlp.up_proj" }, 
"model.layers.29.mlp.up_proj": { "snr": 16.202186584472656, "type": "mlp.up_proj" }, "model.layers.30.mlp.up_proj": { "snr": 24.25114631652832, "type": "mlp.up_proj" }, "model.layers.31.mlp.up_proj": { "snr": 10.68645191192627, "type": "mlp.up_proj" }, "model.layers.32.mlp.up_proj": { "snr": 5.7449774742126465, "type": "mlp.up_proj" }, "model.layers.33.mlp.up_proj": { "snr": 11.879876136779785, "type": "mlp.up_proj" }, "model.layers.34.mlp.up_proj": { "snr": 25.948715209960938, "type": "mlp.up_proj" }, "model.layers.35.mlp.up_proj": { "snr": 38.63526153564453, "type": "mlp.up_proj" }, "model.embed_tokens": { "snr": Infinity, "type": "model.embed_tokens" }, "model.norm": { "snr": Infinity, "type": "model.norm" }, "model.layers.0.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.1.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.2.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.3.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.4.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.5.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.6.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.7.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.8.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.9.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.10.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.11.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.12.post_attention_layernorm": { "snr": Infinity, "type": 
"post_attention_layernorm" }, "model.layers.13.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.14.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.15.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.16.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.17.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.18.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.19.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.20.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.21.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.22.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.23.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.24.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.25.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.26.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.27.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.28.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.29.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.30.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.31.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.32.post_attention_layernorm": { 
"snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.33.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.34.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.35.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.0.self_attn.k_proj": { "snr": 12.243099212646484, "type": "self_attn.k_proj" }, "model.layers.1.self_attn.k_proj": { "snr": 0.6446183323860168, "type": "self_attn.k_proj" }, "model.layers.2.self_attn.k_proj": { "snr": 0.7159711718559265, "type": "self_attn.k_proj" }, "model.layers.3.self_attn.k_proj": { "snr": 5.5100932121276855, "type": "self_attn.k_proj" }, "model.layers.4.self_attn.k_proj": { "snr": 3.0802414417266846, "type": "self_attn.k_proj" }, "model.layers.5.self_attn.k_proj": { "snr": 1.0472767353057861, "type": "self_attn.k_proj" }, "model.layers.6.self_attn.k_proj": { "snr": 3.576918601989746, "type": "self_attn.k_proj" }, "model.layers.7.self_attn.k_proj": { "snr": 3.3793225288391113, "type": "self_attn.k_proj" }, "model.layers.8.self_attn.k_proj": { "snr": 2.9598212242126465, "type": "self_attn.k_proj" }, "model.layers.9.self_attn.k_proj": { "snr": 6.102792263031006, "type": "self_attn.k_proj" }, "model.layers.10.self_attn.k_proj": { "snr": 2.231630325317383, "type": "self_attn.k_proj" }, "model.layers.11.self_attn.k_proj": { "snr": 2.176372766494751, "type": "self_attn.k_proj" }, "model.layers.12.self_attn.k_proj": { "snr": 1.3229435682296753, "type": "self_attn.k_proj" }, "model.layers.13.self_attn.k_proj": { "snr": 2.6183862686157227, "type": "self_attn.k_proj" }, "model.layers.14.self_attn.k_proj": { "snr": 2.608288526535034, "type": "self_attn.k_proj" }, "model.layers.15.self_attn.k_proj": { "snr": 1.5090984106063843, "type": "self_attn.k_proj" }, "model.layers.16.self_attn.k_proj": { "snr": 1.284422516822815, "type": "self_attn.k_proj" }, "model.layers.17.self_attn.k_proj": { 
"snr": 0.8903945088386536, "type": "self_attn.k_proj" }, "model.layers.18.self_attn.k_proj": { "snr": 1.8880385160446167, "type": "self_attn.k_proj" }, "model.layers.19.self_attn.k_proj": { "snr": 0.8905735015869141, "type": "self_attn.k_proj" }, "model.layers.20.self_attn.k_proj": { "snr": 0.9060881733894348, "type": "self_attn.k_proj" }, "model.layers.21.self_attn.k_proj": { "snr": 0.7572551965713501, "type": "self_attn.k_proj" }, "model.layers.22.self_attn.k_proj": { "snr": 0.940827488899231, "type": "self_attn.k_proj" }, "model.layers.23.self_attn.k_proj": { "snr": 3.7776191234588623, "type": "self_attn.k_proj" }, "model.layers.24.self_attn.k_proj": { "snr": 1.328923225402832, "type": "self_attn.k_proj" }, "model.layers.25.self_attn.k_proj": { "snr": 1.3986345529556274, "type": "self_attn.k_proj" }, "model.layers.26.self_attn.k_proj": { "snr": 1.2436336278915405, "type": "self_attn.k_proj" }, "model.layers.27.self_attn.k_proj": { "snr": 0.7737217545509338, "type": "self_attn.k_proj" }, "model.layers.28.self_attn.k_proj": { "snr": 2.6027626991271973, "type": "self_attn.k_proj" }, "model.layers.29.self_attn.k_proj": { "snr": 2.2332751750946045, "type": "self_attn.k_proj" }, "model.layers.30.self_attn.k_proj": { "snr": 2.476585626602173, "type": "self_attn.k_proj" }, "model.layers.31.self_attn.k_proj": { "snr": 1.1115432977676392, "type": "self_attn.k_proj" }, "model.layers.32.self_attn.k_proj": { "snr": 0.8251476287841797, "type": "self_attn.k_proj" }, "model.layers.33.self_attn.k_proj": { "snr": 0.9331105947494507, "type": "self_attn.k_proj" }, "model.layers.34.self_attn.k_proj": { "snr": 6.602395534515381, "type": "self_attn.k_proj" }, "model.layers.35.self_attn.k_proj": { "snr": 10.151693344116211, "type": "self_attn.k_proj" }, "model.layers.0.self_attn.o_proj": { "snr": 0.3661542534828186, "type": "self_attn.o_proj" }, "model.layers.1.self_attn.o_proj": { "snr": 0.19571374356746674, "type": "self_attn.o_proj" }, "model.layers.2.self_attn.o_proj": { "snr": 
0.2244851142168045, "type": "self_attn.o_proj" }, "model.layers.3.self_attn.o_proj": { "snr": 0.2593664526939392, "type": "self_attn.o_proj" }, "model.layers.4.self_attn.o_proj": { "snr": 0.2569783926010132, "type": "self_attn.o_proj" }, "model.layers.5.self_attn.o_proj": { "snr": 0.2564302980899811, "type": "self_attn.o_proj" }, "model.layers.6.self_attn.o_proj": { "snr": 0.18539844453334808, "type": "self_attn.o_proj" }, "model.layers.7.self_attn.o_proj": { "snr": 0.2328651398420334, "type": "self_attn.o_proj" }, "model.layers.8.self_attn.o_proj": { "snr": 0.22055882215499878, "type": "self_attn.o_proj" }, "model.layers.9.self_attn.o_proj": { "snr": 0.21800543367862701, "type": "self_attn.o_proj" }, "model.layers.10.self_attn.o_proj": { "snr": 0.22867777943611145, "type": "self_attn.o_proj" }, "model.layers.11.self_attn.o_proj": { "snr": 0.23986175656318665, "type": "self_attn.o_proj" }, "model.layers.12.self_attn.o_proj": { "snr": 0.17598563432693481, "type": "self_attn.o_proj" }, "model.layers.13.self_attn.o_proj": { "snr": 0.20469218492507935, "type": "self_attn.o_proj" }, "model.layers.14.self_attn.o_proj": { "snr": 0.21040217578411102, "type": "self_attn.o_proj" }, "model.layers.15.self_attn.o_proj": { "snr": 0.23787625133991241, "type": "self_attn.o_proj" }, "model.layers.16.self_attn.o_proj": { "snr": 0.16339677572250366, "type": "self_attn.o_proj" }, "model.layers.17.self_attn.o_proj": { "snr": 0.2070712298154831, "type": "self_attn.o_proj" }, "model.layers.18.self_attn.o_proj": { "snr": 0.1826934814453125, "type": "self_attn.o_proj" }, "model.layers.19.self_attn.o_proj": { "snr": 0.19459959864616394, "type": "self_attn.o_proj" }, "model.layers.20.self_attn.o_proj": { "snr": 0.2668156027793884, "type": "self_attn.o_proj" }, "model.layers.21.self_attn.o_proj": { "snr": 0.16906610131263733, "type": "self_attn.o_proj" }, "model.layers.22.self_attn.o_proj": { "snr": 0.18790249526500702, "type": "self_attn.o_proj" }, "model.layers.23.self_attn.o_proj": { 
"snr": 0.18883933126926422, "type": "self_attn.o_proj" }, "model.layers.24.self_attn.o_proj": { "snr": 0.1793188899755478, "type": "self_attn.o_proj" }, "model.layers.25.self_attn.o_proj": { "snr": 0.1800570785999298, "type": "self_attn.o_proj" }, "model.layers.26.self_attn.o_proj": { "snr": 0.17790433764457703, "type": "self_attn.o_proj" }, "model.layers.27.self_attn.o_proj": { "snr": 0.2029498964548111, "type": "self_attn.o_proj" }, "model.layers.28.self_attn.o_proj": { "snr": 0.17044201493263245, "type": "self_attn.o_proj" }, "model.layers.29.self_attn.o_proj": { "snr": 0.19938386976718903, "type": "self_attn.o_proj" }, "model.layers.30.self_attn.o_proj": { "snr": 0.23108959197998047, "type": "self_attn.o_proj" }, "model.layers.31.self_attn.o_proj": { "snr": 0.16427059471607208, "type": "self_attn.o_proj" }, "model.layers.32.self_attn.o_proj": { "snr": 0.10631092637777328, "type": "self_attn.o_proj" }, "model.layers.33.self_attn.o_proj": { "snr": 0.09417019784450531, "type": "self_attn.o_proj" }, "model.layers.34.self_attn.o_proj": { "snr": 0.1324978619813919, "type": "self_attn.o_proj" }, "model.layers.35.self_attn.o_proj": { "snr": 0.11784011125564575, "type": "self_attn.o_proj" }, "model.layers.0.self_attn.q_proj": { "snr": 0.05565479397773743, "type": "self_attn.q_proj" }, "model.layers.1.self_attn.q_proj": { "snr": 0.138458251953125, "type": "self_attn.q_proj" }, "model.layers.2.self_attn.q_proj": { "snr": 0.12992437183856964, "type": "self_attn.q_proj" }, "model.layers.3.self_attn.q_proj": { "snr": 0.15362468361854553, "type": "self_attn.q_proj" }, "model.layers.4.self_attn.q_proj": { "snr": 0.1563446819782257, "type": "self_attn.q_proj" }, "model.layers.5.self_attn.q_proj": { "snr": 0.15544593334197998, "type": "self_attn.q_proj" }, "model.layers.6.self_attn.q_proj": { "snr": 0.15956827998161316, "type": "self_attn.q_proj" }, "model.layers.7.self_attn.q_proj": { "snr": 0.17549948394298553, "type": "self_attn.q_proj" }, "model.layers.8.self_attn.q_proj": { 
"snr": 0.16668449342250824, "type": "self_attn.q_proj" }, "model.layers.9.self_attn.q_proj": { "snr": 0.15626586973667145, "type": "self_attn.q_proj" }, "model.layers.10.self_attn.q_proj": { "snr": 0.18318884074687958, "type": "self_attn.q_proj" }, "model.layers.11.self_attn.q_proj": { "snr": 0.171547532081604, "type": "self_attn.q_proj" }, "model.layers.12.self_attn.q_proj": { "snr": 0.18164905905723572, "type": "self_attn.q_proj" }, "model.layers.13.self_attn.q_proj": { "snr": 0.2091975212097168, "type": "self_attn.q_proj" }, "model.layers.14.self_attn.q_proj": { "snr": 0.17431670427322388, "type": "self_attn.q_proj" }, "model.layers.15.self_attn.q_proj": { "snr": 0.20902502536773682, "type": "self_attn.q_proj" }, "model.layers.16.self_attn.q_proj": { "snr": 0.15439842641353607, "type": "self_attn.q_proj" }, "model.layers.17.self_attn.q_proj": { "snr": 0.1945274919271469, "type": "self_attn.q_proj" }, "model.layers.18.self_attn.q_proj": { "snr": 0.18916545808315277, "type": "self_attn.q_proj" }, "model.layers.19.self_attn.q_proj": { "snr": 0.20778712630271912, "type": "self_attn.q_proj" }, "model.layers.20.self_attn.q_proj": { "snr": 0.20866931974887848, "type": "self_attn.q_proj" }, "model.layers.21.self_attn.q_proj": { "snr": 0.1900305300951004, "type": "self_attn.q_proj" }, "model.layers.22.self_attn.q_proj": { "snr": 0.18200653791427612, "type": "self_attn.q_proj" }, "model.layers.23.self_attn.q_proj": { "snr": 0.2070988416671753, "type": "self_attn.q_proj" }, "model.layers.24.self_attn.q_proj": { "snr": 0.1845332235097885, "type": "self_attn.q_proj" }, "model.layers.25.self_attn.q_proj": { "snr": 0.20868781208992004, "type": "self_attn.q_proj" }, "model.layers.26.self_attn.q_proj": { "snr": 0.19242744147777557, "type": "self_attn.q_proj" }, "model.layers.27.self_attn.q_proj": { "snr": 0.15225112438201904, "type": "self_attn.q_proj" }, "model.layers.28.self_attn.q_proj": { "snr": 0.20065009593963623, "type": "self_attn.q_proj" }, 
"model.layers.29.self_attn.q_proj": { "snr": 0.19390477240085602, "type": "self_attn.q_proj" }, "model.layers.30.self_attn.q_proj": { "snr": 0.18538697063922882, "type": "self_attn.q_proj" }, "model.layers.31.self_attn.q_proj": { "snr": 0.18954339623451233, "type": "self_attn.q_proj" }, "model.layers.32.self_attn.q_proj": { "snr": 0.20089596509933472, "type": "self_attn.q_proj" }, "model.layers.33.self_attn.q_proj": { "snr": 0.19814996421337128, "type": "self_attn.q_proj" }, "model.layers.34.self_attn.q_proj": { "snr": 0.17733213305473328, "type": "self_attn.q_proj" }, "model.layers.35.self_attn.q_proj": { "snr": 0.14075976610183716, "type": "self_attn.q_proj" }, "model.layers.0.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.1.self_attn.v_proj": { "snr": 845.8053588867188, "type": "self_attn.v_proj" }, "model.layers.2.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.3.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.4.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.5.self_attn.v_proj": { "snr": 83.97241973876953, "type": "self_attn.v_proj" }, "model.layers.6.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.7.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.8.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.9.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.10.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.11.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.12.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.13.self_attn.v_proj": { "snr": 213.70960998535156, "type": "self_attn.v_proj" }, "model.layers.14.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, 
"model.layers.15.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.16.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.17.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.18.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.19.self_attn.v_proj": { "snr": 18.950267791748047, "type": "self_attn.v_proj" }, "model.layers.20.self_attn.v_proj": { "snr": 435.8339538574219, "type": "self_attn.v_proj" }, "model.layers.21.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.22.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.23.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.24.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.25.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.26.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.27.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.28.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.29.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.30.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.31.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.32.self_attn.v_proj": { "snr": 1.2341279983520508, "type": "self_attn.v_proj" }, "model.layers.33.self_attn.v_proj": { "snr": 0.6158654689788818, "type": "self_attn.v_proj" }, "model.layers.34.self_attn.v_proj": { "snr": 509.3221130371094, "type": "self_attn.v_proj" }, "model.layers.35.self_attn.v_proj": { "snr": 538.6658325195312, "type": "self_attn.v_proj" } } ================================================ FILE: src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-3B.json 
================================================ { "model.layers.0.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.1.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.2.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.3.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.4.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.5.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.6.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.7.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.8.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.9.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.10.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.11.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.12.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.13.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.14.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.15.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.16.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.17.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.18.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.19.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.20.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.21.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.22.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.23.input_layernorm": { "snr": 
Infinity, "type": "input_layernorm" }, "model.layers.24.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.25.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.26.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.27.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.28.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.29.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.30.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.31.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.32.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.33.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.34.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.35.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "lm_head": { "snr": Infinity, "type": "lm_head" }, "model.layers.0.mlp.down_proj": { "snr": 20.942785263061523, "type": "mlp.down_proj" }, "model.layers.1.mlp.down_proj": { "snr": 0.11550866067409515, "type": "mlp.down_proj" }, "model.layers.2.mlp.down_proj": { "snr": 0.14981402456760406, "type": "mlp.down_proj" }, "model.layers.3.mlp.down_proj": { "snr": 0.36719316244125366, "type": "mlp.down_proj" }, "model.layers.4.mlp.down_proj": { "snr": 0.5072987079620361, "type": "mlp.down_proj" }, "model.layers.5.mlp.down_proj": { "snr": 33.86688232421875, "type": "mlp.down_proj" }, "model.layers.6.mlp.down_proj": { "snr": 45.066246032714844, "type": "mlp.down_proj" }, "model.layers.7.mlp.down_proj": { "snr": 33.20981979370117, "type": "mlp.down_proj" }, "model.layers.8.mlp.down_proj": { "snr": 29.310104370117188, "type": "mlp.down_proj" }, "model.layers.9.mlp.down_proj": { "snr": 26.638381958007812, "type": "mlp.down_proj" }, "model.layers.10.mlp.down_proj": 
{ "snr": 22.302486419677734, "type": "mlp.down_proj" }, "model.layers.11.mlp.down_proj": { "snr": 18.249290466308594, "type": "mlp.down_proj" }, "model.layers.12.mlp.down_proj": { "snr": 14.057564735412598, "type": "mlp.down_proj" }, "model.layers.13.mlp.down_proj": { "snr": 22.154281616210938, "type": "mlp.down_proj" }, "model.layers.14.mlp.down_proj": { "snr": 27.348575592041016, "type": "mlp.down_proj" }, "model.layers.15.mlp.down_proj": { "snr": 28.447378158569336, "type": "mlp.down_proj" }, "model.layers.16.mlp.down_proj": { "snr": 10.405216217041016, "type": "mlp.down_proj" }, "model.layers.17.mlp.down_proj": { "snr": 10.71042251586914, "type": "mlp.down_proj" }, "model.layers.18.mlp.down_proj": { "snr": 8.642854690551758, "type": "mlp.down_proj" }, "model.layers.19.mlp.down_proj": { "snr": 5.690433979034424, "type": "mlp.down_proj" }, "model.layers.20.mlp.down_proj": { "snr": 23.869070053100586, "type": "mlp.down_proj" }, "model.layers.21.mlp.down_proj": { "snr": 11.584356307983398, "type": "mlp.down_proj" }, "model.layers.22.mlp.down_proj": { "snr": 5.992950916290283, "type": "mlp.down_proj" }, "model.layers.23.mlp.down_proj": { "snr": 18.495361328125, "type": "mlp.down_proj" }, "model.layers.24.mlp.down_proj": { "snr": 8.233827590942383, "type": "mlp.down_proj" }, "model.layers.25.mlp.down_proj": { "snr": 12.626734733581543, "type": "mlp.down_proj" }, "model.layers.26.mlp.down_proj": { "snr": 36.21802520751953, "type": "mlp.down_proj" }, "model.layers.27.mlp.down_proj": { "snr": 19.932941436767578, "type": "mlp.down_proj" }, "model.layers.28.mlp.down_proj": { "snr": 6.0616455078125, "type": "mlp.down_proj" }, "model.layers.29.mlp.down_proj": { "snr": 5.363720417022705, "type": "mlp.down_proj" }, "model.layers.30.mlp.down_proj": { "snr": 7.455615520477295, "type": "mlp.down_proj" }, "model.layers.31.mlp.down_proj": { "snr": 7.8631815910339355, "type": "mlp.down_proj" }, "model.layers.32.mlp.down_proj": { "snr": 8.706913948059082, "type": "mlp.down_proj" }, 
"model.layers.33.mlp.down_proj": { "snr": 21.220134735107422, "type": "mlp.down_proj" }, "model.layers.34.mlp.down_proj": { "snr": 32.33852005004883, "type": "mlp.down_proj" }, "model.layers.35.mlp.down_proj": { "snr": 179.8906707763672, "type": "mlp.down_proj" }, "model.layers.0.mlp.gate_proj": { "snr": 0.24970805644989014, "type": "mlp.gate_proj" }, "model.layers.1.mlp.gate_proj": { "snr": 0.11607512086629868, "type": "mlp.gate_proj" }, "model.layers.2.mlp.gate_proj": { "snr": 0.16310769319534302, "type": "mlp.gate_proj" }, "model.layers.3.mlp.gate_proj": { "snr": 0.3621424436569214, "type": "mlp.gate_proj" }, "model.layers.4.mlp.gate_proj": { "snr": 0.3482637107372284, "type": "mlp.gate_proj" }, "model.layers.5.mlp.gate_proj": { "snr": 2.6533455848693848, "type": "mlp.gate_proj" }, "model.layers.6.mlp.gate_proj": { "snr": 8.359040260314941, "type": "mlp.gate_proj" }, "model.layers.7.mlp.gate_proj": { "snr": 7.382037162780762, "type": "mlp.gate_proj" }, "model.layers.8.mlp.gate_proj": { "snr": 13.00683879852295, "type": "mlp.gate_proj" }, "model.layers.9.mlp.gate_proj": { "snr": 14.936161994934082, "type": "mlp.gate_proj" }, "model.layers.10.mlp.gate_proj": { "snr": 20.907283782958984, "type": "mlp.gate_proj" }, "model.layers.11.mlp.gate_proj": { "snr": 15.941497802734375, "type": "mlp.gate_proj" }, "model.layers.12.mlp.gate_proj": { "snr": 8.97419548034668, "type": "mlp.gate_proj" }, "model.layers.13.mlp.gate_proj": { "snr": 17.585100173950195, "type": "mlp.gate_proj" }, "model.layers.14.mlp.gate_proj": { "snr": 17.21462059020996, "type": "mlp.gate_proj" }, "model.layers.15.mlp.gate_proj": { "snr": 23.703285217285156, "type": "mlp.gate_proj" }, "model.layers.16.mlp.gate_proj": { "snr": 16.986576080322266, "type": "mlp.gate_proj" }, "model.layers.17.mlp.gate_proj": { "snr": 18.27729606628418, "type": "mlp.gate_proj" }, "model.layers.18.mlp.gate_proj": { "snr": 12.63351058959961, "type": "mlp.gate_proj" }, "model.layers.19.mlp.gate_proj": { "snr": 
16.2633113861084, "type": "mlp.gate_proj" }, "model.layers.20.mlp.gate_proj": { "snr": 7.399787902832031, "type": "mlp.gate_proj" }, "model.layers.21.mlp.gate_proj": { "snr": 6.10424280166626, "type": "mlp.gate_proj" }, "model.layers.22.mlp.gate_proj": { "snr": 5.363350868225098, "type": "mlp.gate_proj" }, "model.layers.23.mlp.gate_proj": { "snr": 9.344535827636719, "type": "mlp.gate_proj" }, "model.layers.24.mlp.gate_proj": { "snr": 7.647364616394043, "type": "mlp.gate_proj" }, "model.layers.25.mlp.gate_proj": { "snr": 6.143579959869385, "type": "mlp.gate_proj" }, "model.layers.26.mlp.gate_proj": { "snr": 14.254817008972168, "type": "mlp.gate_proj" }, "model.layers.27.mlp.gate_proj": { "snr": 7.7000861167907715, "type": "mlp.gate_proj" }, "model.layers.28.mlp.gate_proj": { "snr": 5.994422435760498, "type": "mlp.gate_proj" }, "model.layers.29.mlp.gate_proj": { "snr": 6.041909694671631, "type": "mlp.gate_proj" }, "model.layers.30.mlp.gate_proj": { "snr": 9.027522087097168, "type": "mlp.gate_proj" }, "model.layers.31.mlp.gate_proj": { "snr": 5.450753211975098, "type": "mlp.gate_proj" }, "model.layers.32.mlp.gate_proj": { "snr": 4.149200439453125, "type": "mlp.gate_proj" }, "model.layers.33.mlp.gate_proj": { "snr": 5.223763942718506, "type": "mlp.gate_proj" }, "model.layers.34.mlp.gate_proj": { "snr": 43.65521240234375, "type": "mlp.gate_proj" }, "model.layers.35.mlp.gate_proj": { "snr": 45.312774658203125, "type": "mlp.gate_proj" }, "model.layers.0.mlp.up_proj": { "snr": 0.7065013647079468, "type": "mlp.up_proj" }, "model.layers.1.mlp.up_proj": { "snr": 0.17752516269683838, "type": "mlp.up_proj" }, "model.layers.2.mlp.up_proj": { "snr": 0.2847473919391632, "type": "mlp.up_proj" }, "model.layers.3.mlp.up_proj": { "snr": 0.6757690906524658, "type": "mlp.up_proj" }, "model.layers.4.mlp.up_proj": { "snr": 0.8353318572044373, "type": "mlp.up_proj" }, "model.layers.5.mlp.up_proj": { "snr": 3.940711736679077, "type": "mlp.up_proj" }, "model.layers.6.mlp.up_proj": { "snr": 
19.556047439575195, "type": "mlp.up_proj" }, "model.layers.7.mlp.up_proj": { "snr": 36.19340515136719, "type": "mlp.up_proj" }, "model.layers.8.mlp.up_proj": { "snr": 44.2518424987793, "type": "mlp.up_proj" }, "model.layers.9.mlp.up_proj": { "snr": 45.418025970458984, "type": "mlp.up_proj" }, "model.layers.10.mlp.up_proj": { "snr": 78.90928649902344, "type": "mlp.up_proj" }, "model.layers.11.mlp.up_proj": { "snr": 52.24648666381836, "type": "mlp.up_proj" }, "model.layers.12.mlp.up_proj": { "snr": 48.02030563354492, "type": "mlp.up_proj" }, "model.layers.13.mlp.up_proj": { "snr": 56.016239166259766, "type": "mlp.up_proj" }, "model.layers.14.mlp.up_proj": { "snr": 72.16619873046875, "type": "mlp.up_proj" }, "model.layers.15.mlp.up_proj": { "snr": 54.75283432006836, "type": "mlp.up_proj" }, "model.layers.16.mlp.up_proj": { "snr": 47.204097747802734, "type": "mlp.up_proj" }, "model.layers.17.mlp.up_proj": { "snr": 51.549312591552734, "type": "mlp.up_proj" }, "model.layers.18.mlp.up_proj": { "snr": 43.43872833251953, "type": "mlp.up_proj" }, "model.layers.19.mlp.up_proj": { "snr": 38.09785461425781, "type": "mlp.up_proj" }, "model.layers.20.mlp.up_proj": { "snr": 21.767858505249023, "type": "mlp.up_proj" }, "model.layers.21.mlp.up_proj": { "snr": 22.243661880493164, "type": "mlp.up_proj" }, "model.layers.22.mlp.up_proj": { "snr": 30.71843147277832, "type": "mlp.up_proj" }, "model.layers.23.mlp.up_proj": { "snr": 28.5756778717041, "type": "mlp.up_proj" }, "model.layers.24.mlp.up_proj": { "snr": 20.186717987060547, "type": "mlp.up_proj" }, "model.layers.25.mlp.up_proj": { "snr": 20.742860794067383, "type": "mlp.up_proj" }, "model.layers.26.mlp.up_proj": { "snr": 33.777984619140625, "type": "mlp.up_proj" }, "model.layers.27.mlp.up_proj": { "snr": 17.254213333129883, "type": "mlp.up_proj" }, "model.layers.28.mlp.up_proj": { "snr": 13.930026054382324, "type": "mlp.up_proj" }, "model.layers.29.mlp.up_proj": { "snr": 16.17984390258789, "type": "mlp.up_proj" }, 
"model.layers.30.mlp.up_proj": { "snr": 24.236648559570312, "type": "mlp.up_proj" }, "model.layers.31.mlp.up_proj": { "snr": 10.665648460388184, "type": "mlp.up_proj" }, "model.layers.32.mlp.up_proj": { "snr": 5.735939025878906, "type": "mlp.up_proj" }, "model.layers.33.mlp.up_proj": { "snr": 11.592061042785645, "type": "mlp.up_proj" }, "model.layers.34.mlp.up_proj": { "snr": 25.923419952392578, "type": "mlp.up_proj" }, "model.layers.35.mlp.up_proj": { "snr": 38.579349517822266, "type": "mlp.up_proj" }, "model.embed_tokens": { "snr": Infinity, "type": "model.embed_tokens" }, "model.norm": { "snr": Infinity, "type": "model.norm" }, "model.layers.0.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.1.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.2.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.3.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.4.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.5.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.6.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.7.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.8.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.9.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.10.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.11.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.12.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.13.post_attention_layernorm": { "snr": 
Infinity, "type": "post_attention_layernorm" }, "model.layers.14.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.15.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.16.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.17.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.18.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.19.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.20.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.21.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.22.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.23.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.24.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.25.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.26.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.27.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.28.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.29.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.30.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.31.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.32.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, 
"model.layers.33.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.34.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.35.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.0.self_attn.k_proj": { "snr": 12.24727725982666, "type": "self_attn.k_proj" }, "model.layers.1.self_attn.k_proj": { "snr": 0.6436238288879395, "type": "self_attn.k_proj" }, "model.layers.2.self_attn.k_proj": { "snr": 0.7156716585159302, "type": "self_attn.k_proj" }, "model.layers.3.self_attn.k_proj": { "snr": 5.505439758300781, "type": "self_attn.k_proj" }, "model.layers.4.self_attn.k_proj": { "snr": 3.0760715007781982, "type": "self_attn.k_proj" }, "model.layers.5.self_attn.k_proj": { "snr": 1.0453941822052002, "type": "self_attn.k_proj" }, "model.layers.6.self_attn.k_proj": { "snr": 3.57472562789917, "type": "self_attn.k_proj" }, "model.layers.7.self_attn.k_proj": { "snr": 3.3765170574188232, "type": "self_attn.k_proj" }, "model.layers.8.self_attn.k_proj": { "snr": 2.8859639167785645, "type": "self_attn.k_proj" }, "model.layers.9.self_attn.k_proj": { "snr": 6.09852409362793, "type": "self_attn.k_proj" }, "model.layers.10.self_attn.k_proj": { "snr": 2.229580879211426, "type": "self_attn.k_proj" }, "model.layers.11.self_attn.k_proj": { "snr": 2.173879623413086, "type": "self_attn.k_proj" }, "model.layers.12.self_attn.k_proj": { "snr": 1.3220131397247314, "type": "self_attn.k_proj" }, "model.layers.13.self_attn.k_proj": { "snr": 2.61668062210083, "type": "self_attn.k_proj" }, "model.layers.14.self_attn.k_proj": { "snr": 2.606799840927124, "type": "self_attn.k_proj" }, "model.layers.15.self_attn.k_proj": { "snr": 1.5080311298370361, "type": "self_attn.k_proj" }, "model.layers.16.self_attn.k_proj": { "snr": 1.2841484546661377, "type": "self_attn.k_proj" }, "model.layers.17.self_attn.k_proj": { "snr": 0.8896433115005493, "type": "self_attn.k_proj" }, 
"model.layers.18.self_attn.k_proj": { "snr": 1.8873414993286133, "type": "self_attn.k_proj" }, "model.layers.19.self_attn.k_proj": { "snr": 0.8897770643234253, "type": "self_attn.k_proj" }, "model.layers.20.self_attn.k_proj": { "snr": 0.9051405787467957, "type": "self_attn.k_proj" }, "model.layers.21.self_attn.k_proj": { "snr": 0.7568970322608948, "type": "self_attn.k_proj" }, "model.layers.22.self_attn.k_proj": { "snr": 0.9403582811355591, "type": "self_attn.k_proj" }, "model.layers.23.self_attn.k_proj": { "snr": 3.777062177658081, "type": "self_attn.k_proj" }, "model.layers.24.self_attn.k_proj": { "snr": 1.3280683755874634, "type": "self_attn.k_proj" }, "model.layers.25.self_attn.k_proj": { "snr": 1.3980307579040527, "type": "self_attn.k_proj" }, "model.layers.26.self_attn.k_proj": { "snr": 1.2435240745544434, "type": "self_attn.k_proj" }, "model.layers.27.self_attn.k_proj": { "snr": 0.7732619047164917, "type": "self_attn.k_proj" }, "model.layers.28.self_attn.k_proj": { "snr": 2.6010243892669678, "type": "self_attn.k_proj" }, "model.layers.29.self_attn.k_proj": { "snr": 2.232773780822754, "type": "self_attn.k_proj" }, "model.layers.30.self_attn.k_proj": { "snr": 2.4743099212646484, "type": "self_attn.k_proj" }, "model.layers.31.self_attn.k_proj": { "snr": 1.11082923412323, "type": "self_attn.k_proj" }, "model.layers.32.self_attn.k_proj": { "snr": 0.8243986368179321, "type": "self_attn.k_proj" }, "model.layers.33.self_attn.k_proj": { "snr": 0.932928204536438, "type": "self_attn.k_proj" }, "model.layers.34.self_attn.k_proj": { "snr": 6.608611583709717, "type": "self_attn.k_proj" }, "model.layers.35.self_attn.k_proj": { "snr": 10.160987854003906, "type": "self_attn.k_proj" }, "model.layers.0.self_attn.o_proj": { "snr": 0.36662933230400085, "type": "self_attn.o_proj" }, "model.layers.1.self_attn.o_proj": { "snr": 0.1955128312110901, "type": "self_attn.o_proj" }, "model.layers.2.self_attn.o_proj": { "snr": 0.22419843077659607, "type": "self_attn.o_proj" }, 
"model.layers.3.self_attn.o_proj": { "snr": 0.25902292132377625, "type": "self_attn.o_proj" }, "model.layers.4.self_attn.o_proj": { "snr": 0.2567676901817322, "type": "self_attn.o_proj" }, "model.layers.5.self_attn.o_proj": { "snr": 0.2560890316963196, "type": "self_attn.o_proj" }, "model.layers.6.self_attn.o_proj": { "snr": 0.18518221378326416, "type": "self_attn.o_proj" }, "model.layers.7.self_attn.o_proj": { "snr": 0.23254290223121643, "type": "self_attn.o_proj" }, "model.layers.8.self_attn.o_proj": { "snr": 0.2203962802886963, "type": "self_attn.o_proj" }, "model.layers.9.self_attn.o_proj": { "snr": 0.217017263174057, "type": "self_attn.o_proj" }, "model.layers.10.self_attn.o_proj": { "snr": 0.22843335568904877, "type": "self_attn.o_proj" }, "model.layers.11.self_attn.o_proj": { "snr": 0.23816843330860138, "type": "self_attn.o_proj" }, "model.layers.12.self_attn.o_proj": { "snr": 0.17585325241088867, "type": "self_attn.o_proj" }, "model.layers.13.self_attn.o_proj": { "snr": 0.20451271533966064, "type": "self_attn.o_proj" }, "model.layers.14.self_attn.o_proj": { "snr": 0.2095799297094345, "type": "self_attn.o_proj" }, "model.layers.15.self_attn.o_proj": { "snr": 0.23767071962356567, "type": "self_attn.o_proj" }, "model.layers.16.self_attn.o_proj": { "snr": 0.16328400373458862, "type": "self_attn.o_proj" }, "model.layers.17.self_attn.o_proj": { "snr": 0.20690056681632996, "type": "self_attn.o_proj" }, "model.layers.18.self_attn.o_proj": { "snr": 0.18191492557525635, "type": "self_attn.o_proj" }, "model.layers.19.self_attn.o_proj": { "snr": 0.1945018619298935, "type": "self_attn.o_proj" }, "model.layers.20.self_attn.o_proj": { "snr": 0.26658856868743896, "type": "self_attn.o_proj" }, "model.layers.21.self_attn.o_proj": { "snr": 0.16897724568843842, "type": "self_attn.o_proj" }, "model.layers.22.self_attn.o_proj": { "snr": 0.18773262202739716, "type": "self_attn.o_proj" }, "model.layers.23.self_attn.o_proj": { "snr": 0.18808405101299286, "type": "self_attn.o_proj" 
}, "model.layers.24.self_attn.o_proj": { "snr": 0.17919476330280304, "type": "self_attn.o_proj" }, "model.layers.25.self_attn.o_proj": { "snr": 0.1793426126241684, "type": "self_attn.o_proj" }, "model.layers.26.self_attn.o_proj": { "snr": 0.1777871698141098, "type": "self_attn.o_proj" }, "model.layers.27.self_attn.o_proj": { "snr": 0.20279864966869354, "type": "self_attn.o_proj" }, "model.layers.28.self_attn.o_proj": { "snr": 0.17030371725559235, "type": "self_attn.o_proj" }, "model.layers.29.self_attn.o_proj": { "snr": 0.1992504596710205, "type": "self_attn.o_proj" }, "model.layers.30.self_attn.o_proj": { "snr": 0.23085352778434753, "type": "self_attn.o_proj" }, "model.layers.31.self_attn.o_proj": { "snr": 0.1641533523797989, "type": "self_attn.o_proj" }, "model.layers.32.self_attn.o_proj": { "snr": 0.10621391236782074, "type": "self_attn.o_proj" }, "model.layers.33.self_attn.o_proj": { "snr": 0.09411631524562836, "type": "self_attn.o_proj" }, "model.layers.34.self_attn.o_proj": { "snr": 0.13239727914333344, "type": "self_attn.o_proj" }, "model.layers.35.self_attn.o_proj": { "snr": 0.11740171164274216, "type": "self_attn.o_proj" }, "model.layers.0.self_attn.q_proj": { "snr": 0.055595725774765015, "type": "self_attn.q_proj" }, "model.layers.1.self_attn.q_proj": { "snr": 0.13823610544204712, "type": "self_attn.q_proj" }, "model.layers.2.self_attn.q_proj": { "snr": 0.1297825127840042, "type": "self_attn.q_proj" }, "model.layers.3.self_attn.q_proj": { "snr": 0.15291297435760498, "type": "self_attn.q_proj" }, "model.layers.4.self_attn.q_proj": { "snr": 0.15615035593509674, "type": "self_attn.q_proj" }, "model.layers.5.self_attn.q_proj": { "snr": 0.15535500645637512, "type": "self_attn.q_proj" }, "model.layers.6.self_attn.q_proj": { "snr": 0.15993140637874603, "type": "self_attn.q_proj" }, "model.layers.7.self_attn.q_proj": { "snr": 0.1753682643175125, "type": "self_attn.q_proj" }, "model.layers.8.self_attn.q_proj": { "snr": 0.1664913445711136, "type": 
"self_attn.q_proj" }, "model.layers.9.self_attn.q_proj": { "snr": 0.15656901895999908, "type": "self_attn.q_proj" }, "model.layers.10.self_attn.q_proj": { "snr": 0.18300014734268188, "type": "self_attn.q_proj" }, "model.layers.11.self_attn.q_proj": { "snr": 0.1713649481534958, "type": "self_attn.q_proj" }, "model.layers.12.self_attn.q_proj": { "snr": 0.1809009313583374, "type": "self_attn.q_proj" }, "model.layers.13.self_attn.q_proj": { "snr": 0.20895132422447205, "type": "self_attn.q_proj" }, "model.layers.14.self_attn.q_proj": { "snr": 0.17413195967674255, "type": "self_attn.q_proj" }, "model.layers.15.self_attn.q_proj": { "snr": 0.20878490805625916, "type": "self_attn.q_proj" }, "model.layers.16.self_attn.q_proj": { "snr": 0.1547088772058487, "type": "self_attn.q_proj" }, "model.layers.17.self_attn.q_proj": { "snr": 0.1943129003047943, "type": "self_attn.q_proj" }, "model.layers.18.self_attn.q_proj": { "snr": 0.1889297217130661, "type": "self_attn.q_proj" }, "model.layers.19.self_attn.q_proj": { "snr": 0.207680344581604, "type": "self_attn.q_proj" }, "model.layers.20.self_attn.q_proj": { "snr": 0.20839959383010864, "type": "self_attn.q_proj" }, "model.layers.21.self_attn.q_proj": { "snr": 0.18989044427871704, "type": "self_attn.q_proj" }, "model.layers.22.self_attn.q_proj": { "snr": 0.18180623650550842, "type": "self_attn.q_proj" }, "model.layers.23.self_attn.q_proj": { "snr": 0.2069384753704071, "type": "self_attn.q_proj" }, "model.layers.24.self_attn.q_proj": { "snr": 0.1842993050813675, "type": "self_attn.q_proj" }, "model.layers.25.self_attn.q_proj": { "snr": 0.2078687846660614, "type": "self_attn.q_proj" }, "model.layers.26.self_attn.q_proj": { "snr": 0.19224946200847626, "type": "self_attn.q_proj" }, "model.layers.27.self_attn.q_proj": { "snr": 0.15170617401599884, "type": "self_attn.q_proj" }, "model.layers.28.self_attn.q_proj": { "snr": 0.20116600394248962, "type": "self_attn.q_proj" }, "model.layers.29.self_attn.q_proj": { "snr": 0.19373668730258942, 
"type": "self_attn.q_proj" }, "model.layers.30.self_attn.q_proj": { "snr": 0.18462225794792175, "type": "self_attn.q_proj" }, "model.layers.31.self_attn.q_proj": { "snr": 0.18939673900604248, "type": "self_attn.q_proj" }, "model.layers.32.self_attn.q_proj": { "snr": 0.20071947574615479, "type": "self_attn.q_proj" }, "model.layers.33.self_attn.q_proj": { "snr": 0.19740056991577148, "type": "self_attn.q_proj" }, "model.layers.34.self_attn.q_proj": { "snr": 0.17658494412899017, "type": "self_attn.q_proj" }, "model.layers.35.self_attn.q_proj": { "snr": 0.1407373696565628, "type": "self_attn.q_proj" }, "model.layers.0.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.1.self_attn.v_proj": { "snr": 846.30126953125, "type": "self_attn.v_proj" }, "model.layers.2.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.3.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.4.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.5.self_attn.v_proj": { "snr": 83.83415222167969, "type": "self_attn.v_proj" }, "model.layers.6.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.7.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.8.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.9.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.10.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.11.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.12.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.13.self_attn.v_proj": { "snr": 213.51316833496094, "type": "self_attn.v_proj" }, "model.layers.14.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.15.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, 
"model.layers.16.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.17.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.18.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.19.self_attn.v_proj": { "snr": 18.92746925354004, "type": "self_attn.v_proj" }, "model.layers.20.self_attn.v_proj": { "snr": 433.9771728515625, "type": "self_attn.v_proj" }, "model.layers.21.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.22.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.23.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.24.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.25.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.26.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.27.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.28.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.29.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.30.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.31.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.32.self_attn.v_proj": { "snr": 1.2332282066345215, "type": "self_attn.v_proj" }, "model.layers.33.self_attn.v_proj": { "snr": 0.6151890158653259, "type": "self_attn.v_proj" }, "model.layers.34.self_attn.v_proj": { "snr": 509.7169189453125, "type": "self_attn.v_proj" }, "model.layers.35.self_attn.v_proj": { "snr": 536.0748901367188, "type": "self_attn.v_proj" } } ================================================ FILE: src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-7B-Instruct.json ================================================ { "model.layers.0.input_layernorm": { "snr": 
Infinity, "type": "input_layernorm" }, "model.layers.1.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.2.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.3.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.4.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.5.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.6.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.7.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.8.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.9.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.10.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.11.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.12.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.13.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.14.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.15.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.16.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.17.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.18.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.19.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.20.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.21.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.22.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.23.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.24.input_layernorm": { "snr": Infinity, 
"type": "input_layernorm" }, "model.layers.25.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.26.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.27.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "lm_head": { "snr": Infinity, "type": "lm_head" }, "model.layers.0.mlp.down_proj": { "snr": 10.283808708190918, "type": "mlp.down_proj" }, "model.layers.1.mlp.down_proj": { "snr": 1.2089825868606567, "type": "mlp.down_proj" }, "model.layers.2.mlp.down_proj": { "snr": 19.309062957763672, "type": "mlp.down_proj" }, "model.layers.3.mlp.down_proj": { "snr": 50.174461364746094, "type": "mlp.down_proj" }, "model.layers.4.mlp.down_proj": { "snr": 114.28582763671875, "type": "mlp.down_proj" }, "model.layers.5.mlp.down_proj": { "snr": 215.5762176513672, "type": "mlp.down_proj" }, "model.layers.6.mlp.down_proj": { "snr": 204.5117950439453, "type": "mlp.down_proj" }, "model.layers.7.mlp.down_proj": { "snr": 182.5479278564453, "type": "mlp.down_proj" }, "model.layers.8.mlp.down_proj": { "snr": 74.92950439453125, "type": "mlp.down_proj" }, "model.layers.9.mlp.down_proj": { "snr": 16.482666015625, "type": "mlp.down_proj" }, "model.layers.10.mlp.down_proj": { "snr": 55.33920669555664, "type": "mlp.down_proj" }, "model.layers.11.mlp.down_proj": { "snr": 16.851062774658203, "type": "mlp.down_proj" }, "model.layers.12.mlp.down_proj": { "snr": 58.65230178833008, "type": "mlp.down_proj" }, "model.layers.13.mlp.down_proj": { "snr": 11.150161743164062, "type": "mlp.down_proj" }, "model.layers.14.mlp.down_proj": { "snr": 65.32643127441406, "type": "mlp.down_proj" }, "model.layers.15.mlp.down_proj": { "snr": 46.736305236816406, "type": "mlp.down_proj" }, "model.layers.16.mlp.down_proj": { "snr": 14.288785934448242, "type": "mlp.down_proj" }, "model.layers.17.mlp.down_proj": { "snr": 23.40110206604004, "type": "mlp.down_proj" }, "model.layers.18.mlp.down_proj": { "snr": 86.34363555908203, "type": "mlp.down_proj" }, 
"model.layers.19.mlp.down_proj": { "snr": 49.14613342285156, "type": "mlp.down_proj" }, "model.layers.20.mlp.down_proj": { "snr": 1276.84814453125, "type": "mlp.down_proj" }, "model.layers.21.mlp.down_proj": { "snr": 51.803409576416016, "type": "mlp.down_proj" }, "model.layers.22.mlp.down_proj": { "snr": 143.0666046142578, "type": "mlp.down_proj" }, "model.layers.23.mlp.down_proj": { "snr": 35.14984893798828, "type": "mlp.down_proj" }, "model.layers.24.mlp.down_proj": { "snr": 21.41700553894043, "type": "mlp.down_proj" }, "model.layers.25.mlp.down_proj": { "snr": 10.651569366455078, "type": "mlp.down_proj" }, "model.layers.26.mlp.down_proj": { "snr": 21.635149002075195, "type": "mlp.down_proj" }, "model.layers.27.mlp.down_proj": { "snr": 1446.2774658203125, "type": "mlp.down_proj" }, "model.layers.0.mlp.gate_proj": { "snr": 0.04497330263257027, "type": "mlp.gate_proj" }, "model.layers.1.mlp.gate_proj": { "snr": 0.16888172924518585, "type": "mlp.gate_proj" }, "model.layers.2.mlp.gate_proj": { "snr": 0.33653727173805237, "type": "mlp.gate_proj" }, "model.layers.3.mlp.gate_proj": { "snr": 3.1445391178131104, "type": "mlp.gate_proj" }, "model.layers.4.mlp.gate_proj": { "snr": 9.107144355773926, "type": "mlp.gate_proj" }, "model.layers.5.mlp.gate_proj": { "snr": 15.909018516540527, "type": "mlp.gate_proj" }, "model.layers.6.mlp.gate_proj": { "snr": 60.9138069152832, "type": "mlp.gate_proj" }, "model.layers.7.mlp.gate_proj": { "snr": 57.570281982421875, "type": "mlp.gate_proj" }, "model.layers.8.mlp.gate_proj": { "snr": 65.82791137695312, "type": "mlp.gate_proj" }, "model.layers.9.mlp.gate_proj": { "snr": 10.455283164978027, "type": "mlp.gate_proj" }, "model.layers.10.mlp.gate_proj": { "snr": 26.970706939697266, "type": "mlp.gate_proj" }, "model.layers.11.mlp.gate_proj": { "snr": 31.139820098876953, "type": "mlp.gate_proj" }, "model.layers.12.mlp.gate_proj": { "snr": 43.987159729003906, "type": "mlp.gate_proj" }, "model.layers.13.mlp.gate_proj": { "snr": 
20.704849243164062, "type": "mlp.gate_proj" }, "model.layers.14.mlp.gate_proj": { "snr": 21.191452026367188, "type": "mlp.gate_proj" }, "model.layers.15.mlp.gate_proj": { "snr": 42.66447830200195, "type": "mlp.gate_proj" }, "model.layers.16.mlp.gate_proj": { "snr": 22.136825561523438, "type": "mlp.gate_proj" }, "model.layers.17.mlp.gate_proj": { "snr": 22.60980987548828, "type": "mlp.gate_proj" }, "model.layers.18.mlp.gate_proj": { "snr": 81.80574035644531, "type": "mlp.gate_proj" }, "model.layers.19.mlp.gate_proj": { "snr": 20.88619613647461, "type": "mlp.gate_proj" }, "model.layers.20.mlp.gate_proj": { "snr": 58.3524055480957, "type": "mlp.gate_proj" }, "model.layers.21.mlp.gate_proj": { "snr": 22.786706924438477, "type": "mlp.gate_proj" }, "model.layers.22.mlp.gate_proj": { "snr": 16.932226181030273, "type": "mlp.gate_proj" }, "model.layers.23.mlp.gate_proj": { "snr": 16.819862365722656, "type": "mlp.gate_proj" }, "model.layers.24.mlp.gate_proj": { "snr": 19.76348304748535, "type": "mlp.gate_proj" }, "model.layers.25.mlp.gate_proj": { "snr": 28.98714256286621, "type": "mlp.gate_proj" }, "model.layers.26.mlp.gate_proj": { "snr": 36.7071533203125, "type": "mlp.gate_proj" }, "model.layers.27.mlp.gate_proj": { "snr": 51.81539535522461, "type": "mlp.gate_proj" }, "model.layers.0.mlp.up_proj": { "snr": 0.2243107706308365, "type": "mlp.up_proj" }, "model.layers.1.mlp.up_proj": { "snr": 0.4464716613292694, "type": "mlp.up_proj" }, "model.layers.2.mlp.up_proj": { "snr": 1.7838181257247925, "type": "mlp.up_proj" }, "model.layers.3.mlp.up_proj": { "snr": 17.912736892700195, "type": "mlp.up_proj" }, "model.layers.4.mlp.up_proj": { "snr": 47.45841979980469, "type": "mlp.up_proj" }, "model.layers.5.mlp.up_proj": { "snr": 56.3084602355957, "type": "mlp.up_proj" }, "model.layers.6.mlp.up_proj": { "snr": 173.33717346191406, "type": "mlp.up_proj" }, "model.layers.7.mlp.up_proj": { "snr": 148.22750854492188, "type": "mlp.up_proj" }, "model.layers.8.mlp.up_proj": { "snr": 
133.63565063476562, "type": "mlp.up_proj" }, "model.layers.9.mlp.up_proj": { "snr": 83.65129852294922, "type": "mlp.up_proj" }, "model.layers.10.mlp.up_proj": { "snr": 117.94369506835938, "type": "mlp.up_proj" }, "model.layers.11.mlp.up_proj": { "snr": 94.52413940429688, "type": "mlp.up_proj" }, "model.layers.12.mlp.up_proj": { "snr": 130.43333435058594, "type": "mlp.up_proj" }, "model.layers.13.mlp.up_proj": { "snr": 76.11975860595703, "type": "mlp.up_proj" }, "model.layers.14.mlp.up_proj": { "snr": 158.75192260742188, "type": "mlp.up_proj" }, "model.layers.15.mlp.up_proj": { "snr": 143.72706604003906, "type": "mlp.up_proj" }, "model.layers.16.mlp.up_proj": { "snr": 84.28279876708984, "type": "mlp.up_proj" }, "model.layers.17.mlp.up_proj": { "snr": 116.65055084228516, "type": "mlp.up_proj" }, "model.layers.18.mlp.up_proj": { "snr": 177.1201934814453, "type": "mlp.up_proj" }, "model.layers.19.mlp.up_proj": { "snr": 82.4564437866211, "type": "mlp.up_proj" }, "model.layers.20.mlp.up_proj": { "snr": 137.73019409179688, "type": "mlp.up_proj" }, "model.layers.21.mlp.up_proj": { "snr": 89.97538757324219, "type": "mlp.up_proj" }, "model.layers.22.mlp.up_proj": { "snr": 86.30876159667969, "type": "mlp.up_proj" }, "model.layers.23.mlp.up_proj": { "snr": 61.53449249267578, "type": "mlp.up_proj" }, "model.layers.24.mlp.up_proj": { "snr": 45.22392654418945, "type": "mlp.up_proj" }, "model.layers.25.mlp.up_proj": { "snr": 60.3155517578125, "type": "mlp.up_proj" }, "model.layers.26.mlp.up_proj": { "snr": 40.06092071533203, "type": "mlp.up_proj" }, "model.layers.27.mlp.up_proj": { "snr": 48.12322998046875, "type": "mlp.up_proj" }, "model.embed_tokens": { "snr": 0.08805440366268158, "type": "model.embed_tokens" }, "model.norm": { "snr": Infinity, "type": "model.norm" }, "model.layers.0.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.1.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, 
"model.layers.2.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.3.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.4.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.5.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.6.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.7.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.8.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.9.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.10.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.11.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.12.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.13.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.14.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.15.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.16.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.17.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.18.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.19.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.20.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.21.post_attention_layernorm": { "snr": Infinity, "type": 
"post_attention_layernorm" }, "model.layers.22.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.23.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.24.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.25.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.26.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.27.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.0.self_attn.k_proj": { "snr": 4.771554470062256, "type": "self_attn.k_proj" }, "model.layers.1.self_attn.k_proj": { "snr": 0.46674421429634094, "type": "self_attn.k_proj" }, "model.layers.2.self_attn.k_proj": { "snr": 1.6167784929275513, "type": "self_attn.k_proj" }, "model.layers.3.self_attn.k_proj": { "snr": 2.0980119705200195, "type": "self_attn.k_proj" }, "model.layers.4.self_attn.k_proj": { "snr": 1.4339035749435425, "type": "self_attn.k_proj" }, "model.layers.5.self_attn.k_proj": { "snr": 1.7446703910827637, "type": "self_attn.k_proj" }, "model.layers.6.self_attn.k_proj": { "snr": 1.2829725742340088, "type": "self_attn.k_proj" }, "model.layers.7.self_attn.k_proj": { "snr": 2.2314982414245605, "type": "self_attn.k_proj" }, "model.layers.8.self_attn.k_proj": { "snr": 1.5125916004180908, "type": "self_attn.k_proj" }, "model.layers.9.self_attn.k_proj": { "snr": 1.2817912101745605, "type": "self_attn.k_proj" }, "model.layers.10.self_attn.k_proj": { "snr": 3.3553454875946045, "type": "self_attn.k_proj" }, "model.layers.11.self_attn.k_proj": { "snr": 1.591347336769104, "type": "self_attn.k_proj" }, "model.layers.12.self_attn.k_proj": { "snr": 1.1114169359207153, "type": "self_attn.k_proj" }, "model.layers.13.self_attn.k_proj": { "snr": 1.1536189317703247, "type": "self_attn.k_proj" }, "model.layers.14.self_attn.k_proj": { "snr": 
0.994098424911499, "type": "self_attn.k_proj" }, "model.layers.15.self_attn.k_proj": { "snr": 1.484580636024475, "type": "self_attn.k_proj" }, "model.layers.16.self_attn.k_proj": { "snr": 1.2999093532562256, "type": "self_attn.k_proj" }, "model.layers.17.self_attn.k_proj": { "snr": 2.1628623008728027, "type": "self_attn.k_proj" }, "model.layers.18.self_attn.k_proj": { "snr": 1.3842225074768066, "type": "self_attn.k_proj" }, "model.layers.19.self_attn.k_proj": { "snr": 1.440075159072876, "type": "self_attn.k_proj" }, "model.layers.20.self_attn.k_proj": { "snr": 1.7816450595855713, "type": "self_attn.k_proj" }, "model.layers.21.self_attn.k_proj": { "snr": 1.746536135673523, "type": "self_attn.k_proj" }, "model.layers.22.self_attn.k_proj": { "snr": 1.318993091583252, "type": "self_attn.k_proj" }, "model.layers.23.self_attn.k_proj": { "snr": 1.7234206199645996, "type": "self_attn.k_proj" }, "model.layers.24.self_attn.k_proj": { "snr": 2.586996555328369, "type": "self_attn.k_proj" }, "model.layers.25.self_attn.k_proj": { "snr": 1.6486897468566895, "type": "self_attn.k_proj" }, "model.layers.26.self_attn.k_proj": { "snr": 1.3349357843399048, "type": "self_attn.k_proj" }, "model.layers.27.self_attn.k_proj": { "snr": 0.9039687514305115, "type": "self_attn.k_proj" }, "model.layers.0.self_attn.o_proj": { "snr": 0.10605750232934952, "type": "self_attn.o_proj" }, "model.layers.1.self_attn.o_proj": { "snr": 0.2503393292427063, "type": "self_attn.o_proj" }, "model.layers.2.self_attn.o_proj": { "snr": 0.21453581750392914, "type": "self_attn.o_proj" }, "model.layers.3.self_attn.o_proj": { "snr": 0.20600366592407227, "type": "self_attn.o_proj" }, "model.layers.4.self_attn.o_proj": { "snr": 0.22004099190235138, "type": "self_attn.o_proj" }, "model.layers.5.self_attn.o_proj": { "snr": 0.2267625778913498, "type": "self_attn.o_proj" }, "model.layers.6.self_attn.o_proj": { "snr": 0.1736888736486435, "type": "self_attn.o_proj" }, "model.layers.7.self_attn.o_proj": { "snr": 
0.2314220815896988, "type": "self_attn.o_proj" }, "model.layers.8.self_attn.o_proj": { "snr": 0.24031606316566467, "type": "self_attn.o_proj" }, "model.layers.9.self_attn.o_proj": { "snr": 0.13458871841430664, "type": "self_attn.o_proj" }, "model.layers.10.self_attn.o_proj": { "snr": 0.20170633494853973, "type": "self_attn.o_proj" }, "model.layers.11.self_attn.o_proj": { "snr": 0.19507651031017303, "type": "self_attn.o_proj" }, "model.layers.12.self_attn.o_proj": { "snr": 0.1862162947654724, "type": "self_attn.o_proj" }, "model.layers.13.self_attn.o_proj": { "snr": 0.15117767453193665, "type": "self_attn.o_proj" }, "model.layers.14.self_attn.o_proj": { "snr": 0.1857745349407196, "type": "self_attn.o_proj" }, "model.layers.15.self_attn.o_proj": { "snr": 0.2064860314130783, "type": "self_attn.o_proj" }, "model.layers.16.self_attn.o_proj": { "snr": 0.15419450402259827, "type": "self_attn.o_proj" }, "model.layers.17.self_attn.o_proj": { "snr": 0.17895667254924774, "type": "self_attn.o_proj" }, "model.layers.18.self_attn.o_proj": { "snr": 0.18284623324871063, "type": "self_attn.o_proj" }, "model.layers.19.self_attn.o_proj": { "snr": 0.17497135698795319, "type": "self_attn.o_proj" }, "model.layers.20.self_attn.o_proj": { "snr": 0.178844153881073, "type": "self_attn.o_proj" }, "model.layers.21.self_attn.o_proj": { "snr": 0.16190896928310394, "type": "self_attn.o_proj" }, "model.layers.22.self_attn.o_proj": { "snr": 0.19371949136257172, "type": "self_attn.o_proj" }, "model.layers.23.self_attn.o_proj": { "snr": 0.14116843044757843, "type": "self_attn.o_proj" }, "model.layers.24.self_attn.o_proj": { "snr": 0.14100700616836548, "type": "self_attn.o_proj" }, "model.layers.25.self_attn.o_proj": { "snr": 0.14792074263095856, "type": "self_attn.o_proj" }, "model.layers.26.self_attn.o_proj": { "snr": 0.11953117698431015, "type": "self_attn.o_proj" }, "model.layers.27.self_attn.o_proj": { "snr": 0.06241385638713837, "type": "self_attn.o_proj" }, "model.layers.0.self_attn.q_proj": { 
"snr": 0.02127065323293209, "type": "self_attn.q_proj" }, "model.layers.1.self_attn.q_proj": { "snr": 0.14693336188793182, "type": "self_attn.q_proj" }, "model.layers.2.self_attn.q_proj": { "snr": 0.16316214203834534, "type": "self_attn.q_proj" }, "model.layers.3.self_attn.q_proj": { "snr": 0.1218630000948906, "type": "self_attn.q_proj" }, "model.layers.4.self_attn.q_proj": { "snr": 0.13916714489459991, "type": "self_attn.q_proj" }, "model.layers.5.self_attn.q_proj": { "snr": 0.155359148979187, "type": "self_attn.q_proj" }, "model.layers.6.self_attn.q_proj": { "snr": 0.1590007096529007, "type": "self_attn.q_proj" }, "model.layers.7.self_attn.q_proj": { "snr": 0.1958903819322586, "type": "self_attn.q_proj" }, "model.layers.8.self_attn.q_proj": { "snr": 0.22448301315307617, "type": "self_attn.q_proj" }, "model.layers.9.self_attn.q_proj": { "snr": 0.20126597583293915, "type": "self_attn.q_proj" }, "model.layers.10.self_attn.q_proj": { "snr": 0.1980895698070526, "type": "self_attn.q_proj" }, "model.layers.11.self_attn.q_proj": { "snr": 0.2289486974477768, "type": "self_attn.q_proj" }, "model.layers.12.self_attn.q_proj": { "snr": 0.22922305762767792, "type": "self_attn.q_proj" }, "model.layers.13.self_attn.q_proj": { "snr": 0.21452386677265167, "type": "self_attn.q_proj" }, "model.layers.14.self_attn.q_proj": { "snr": 0.24151542782783508, "type": "self_attn.q_proj" }, "model.layers.15.self_attn.q_proj": { "snr": 0.21893717348575592, "type": "self_attn.q_proj" }, "model.layers.16.self_attn.q_proj": { "snr": 0.2321016639471054, "type": "self_attn.q_proj" }, "model.layers.17.self_attn.q_proj": { "snr": 0.24078059196472168, "type": "self_attn.q_proj" }, "model.layers.18.self_attn.q_proj": { "snr": 0.22774985432624817, "type": "self_attn.q_proj" }, "model.layers.19.self_attn.q_proj": { "snr": 0.20914016664028168, "type": "self_attn.q_proj" }, "model.layers.20.self_attn.q_proj": { "snr": 0.22847522795200348, "type": "self_attn.q_proj" }, "model.layers.21.self_attn.q_proj": { 
"snr": 0.2500442862510681, "type": "self_attn.q_proj" }, "model.layers.22.self_attn.q_proj": { "snr": 0.2353251725435257, "type": "self_attn.q_proj" }, "model.layers.23.self_attn.q_proj": { "snr": 0.20365388691425323, "type": "self_attn.q_proj" }, "model.layers.24.self_attn.q_proj": { "snr": 0.21967172622680664, "type": "self_attn.q_proj" }, "model.layers.25.self_attn.q_proj": { "snr": 0.2122868150472641, "type": "self_attn.q_proj" }, "model.layers.26.self_attn.q_proj": { "snr": 0.2415798157453537, "type": "self_attn.q_proj" }, "model.layers.27.self_attn.q_proj": { "snr": 0.12347634881734848, "type": "self_attn.q_proj" }, "model.layers.0.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.1.self_attn.v_proj": { "snr": 230.88636779785156, "type": "self_attn.v_proj" }, "model.layers.2.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.3.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.4.self_attn.v_proj": { "snr": 22.38136100769043, "type": "self_attn.v_proj" }, "model.layers.5.self_attn.v_proj": { "snr": 246.59597778320312, "type": "self_attn.v_proj" }, "model.layers.6.self_attn.v_proj": { "snr": 499.61761474609375, "type": "self_attn.v_proj" }, "model.layers.7.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.8.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.9.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.10.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.11.self_attn.v_proj": { "snr": 69.18345642089844, "type": "self_attn.v_proj" }, "model.layers.12.self_attn.v_proj": { "snr": 984.9320068359375, "type": "self_attn.v_proj" }, "model.layers.13.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.14.self_attn.v_proj": { "snr": 64.06214141845703, "type": "self_attn.v_proj" }, "model.layers.15.self_attn.v_proj": { 
"snr": 28.43911361694336, "type": "self_attn.v_proj" }, "model.layers.16.self_attn.v_proj": { "snr": 725.1439819335938, "type": "self_attn.v_proj" }, "model.layers.17.self_attn.v_proj": { "snr": 63.43681716918945, "type": "self_attn.v_proj" }, "model.layers.18.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.19.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.20.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.21.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.22.self_attn.v_proj": { "snr": 238.4695587158203, "type": "self_attn.v_proj" }, "model.layers.23.self_attn.v_proj": { "snr": 111.88697814941406, "type": "self_attn.v_proj" }, "model.layers.24.self_attn.v_proj": { "snr": 686.2830200195312, "type": "self_attn.v_proj" }, "model.layers.25.self_attn.v_proj": { "snr": 566.2647705078125, "type": "self_attn.v_proj" }, "model.layers.26.self_attn.v_proj": { "snr": 4.070064544677734, "type": "self_attn.v_proj" }, "model.layers.27.self_attn.v_proj": { "snr": 4.3411664962768555, "type": "self_attn.v_proj" } } ================================================ FILE: src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-7B.json ================================================ { "model.layers.0.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.1.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.2.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.3.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.4.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.5.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.6.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.7.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, 
"model.layers.8.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.9.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.10.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.11.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.12.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.13.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.14.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.15.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.16.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.17.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.18.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.19.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.20.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.21.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.22.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.23.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.24.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.25.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.26.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.27.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "lm_head": { "snr": Infinity, "type": "lm_head" }, "model.layers.0.mlp.down_proj": { "snr": 10.277782440185547, "type": "mlp.down_proj" }, "model.layers.1.mlp.down_proj": { "snr": 1.2050706148147583, "type": "mlp.down_proj" }, "model.layers.2.mlp.down_proj": { "snr": 19.284534454345703, "type": "mlp.down_proj" }, 
"model.layers.3.mlp.down_proj": { "snr": 50.16513442993164, "type": "mlp.down_proj" }, "model.layers.4.mlp.down_proj": { "snr": 114.24882507324219, "type": "mlp.down_proj" }, "model.layers.5.mlp.down_proj": { "snr": 215.48194885253906, "type": "mlp.down_proj" }, "model.layers.6.mlp.down_proj": { "snr": 204.39431762695312, "type": "mlp.down_proj" }, "model.layers.7.mlp.down_proj": { "snr": 182.5116729736328, "type": "mlp.down_proj" }, "model.layers.8.mlp.down_proj": { "snr": 74.9266128540039, "type": "mlp.down_proj" }, "model.layers.9.mlp.down_proj": { "snr": 16.474102020263672, "type": "mlp.down_proj" }, "model.layers.10.mlp.down_proj": { "snr": 55.30583572387695, "type": "mlp.down_proj" }, "model.layers.11.mlp.down_proj": { "snr": 16.84047508239746, "type": "mlp.down_proj" }, "model.layers.12.mlp.down_proj": { "snr": 58.62131118774414, "type": "mlp.down_proj" }, "model.layers.13.mlp.down_proj": { "snr": 11.144298553466797, "type": "mlp.down_proj" }, "model.layers.14.mlp.down_proj": { "snr": 65.28057098388672, "type": "mlp.down_proj" }, "model.layers.15.mlp.down_proj": { "snr": 46.701290130615234, "type": "mlp.down_proj" }, "model.layers.16.mlp.down_proj": { "snr": 14.278325080871582, "type": "mlp.down_proj" }, "model.layers.17.mlp.down_proj": { "snr": 23.382247924804688, "type": "mlp.down_proj" }, "model.layers.18.mlp.down_proj": { "snr": 93.8782958984375, "type": "mlp.down_proj" }, "model.layers.19.mlp.down_proj": { "snr": 49.10498809814453, "type": "mlp.down_proj" }, "model.layers.20.mlp.down_proj": { "snr": 1277.5101318359375, "type": "mlp.down_proj" }, "model.layers.21.mlp.down_proj": { "snr": 51.7880859375, "type": "mlp.down_proj" }, "model.layers.22.mlp.down_proj": { "snr": 143.03504943847656, "type": "mlp.down_proj" }, "model.layers.23.mlp.down_proj": { "snr": 35.123931884765625, "type": "mlp.down_proj" }, "model.layers.24.mlp.down_proj": { "snr": 21.403743743896484, "type": "mlp.down_proj" }, "model.layers.25.mlp.down_proj": { "snr": 10.551352500915527, 
"type": "mlp.down_proj" }, "model.layers.26.mlp.down_proj": { "snr": 21.62333869934082, "type": "mlp.down_proj" }, "model.layers.27.mlp.down_proj": { "snr": 1541.98681640625, "type": "mlp.down_proj" }, "model.layers.0.mlp.gate_proj": { "snr": 0.04497644677758217, "type": "mlp.gate_proj" }, "model.layers.1.mlp.gate_proj": { "snr": 0.16878646612167358, "type": "mlp.gate_proj" }, "model.layers.2.mlp.gate_proj": { "snr": 0.336302250623703, "type": "mlp.gate_proj" }, "model.layers.3.mlp.gate_proj": { "snr": 3.141293525695801, "type": "mlp.gate_proj" }, "model.layers.4.mlp.gate_proj": { "snr": 9.098686218261719, "type": "mlp.gate_proj" }, "model.layers.5.mlp.gate_proj": { "snr": 15.89354419708252, "type": "mlp.gate_proj" }, "model.layers.6.mlp.gate_proj": { "snr": 60.85503387451172, "type": "mlp.gate_proj" }, "model.layers.7.mlp.gate_proj": { "snr": 57.53098678588867, "type": "mlp.gate_proj" }, "model.layers.8.mlp.gate_proj": { "snr": 65.77096557617188, "type": "mlp.gate_proj" }, "model.layers.9.mlp.gate_proj": { "snr": 10.453179359436035, "type": "mlp.gate_proj" }, "model.layers.10.mlp.gate_proj": { "snr": 26.94801139831543, "type": "mlp.gate_proj" }, "model.layers.11.mlp.gate_proj": { "snr": 31.111093521118164, "type": "mlp.gate_proj" }, "model.layers.12.mlp.gate_proj": { "snr": 43.963191986083984, "type": "mlp.gate_proj" }, "model.layers.13.mlp.gate_proj": { "snr": 20.690765380859375, "type": "mlp.gate_proj" }, "model.layers.14.mlp.gate_proj": { "snr": 20.47557258605957, "type": "mlp.gate_proj" }, "model.layers.15.mlp.gate_proj": { "snr": 42.63906478881836, "type": "mlp.gate_proj" }, "model.layers.16.mlp.gate_proj": { "snr": 22.11542320251465, "type": "mlp.gate_proj" }, "model.layers.17.mlp.gate_proj": { "snr": 22.590566635131836, "type": "mlp.gate_proj" }, "model.layers.18.mlp.gate_proj": { "snr": 81.74773406982422, "type": "mlp.gate_proj" }, "model.layers.19.mlp.gate_proj": { "snr": 20.872997283935547, "type": "mlp.gate_proj" }, "model.layers.20.mlp.gate_proj": { 
"snr": 58.32197952270508, "type": "mlp.gate_proj" }, "model.layers.21.mlp.gate_proj": { "snr": 22.784095764160156, "type": "mlp.gate_proj" }, "model.layers.22.mlp.gate_proj": { "snr": 16.935768127441406, "type": "mlp.gate_proj" }, "model.layers.23.mlp.gate_proj": { "snr": 16.830224990844727, "type": "mlp.gate_proj" }, "model.layers.24.mlp.gate_proj": { "snr": 19.774564743041992, "type": "mlp.gate_proj" }, "model.layers.25.mlp.gate_proj": { "snr": 27.770675659179688, "type": "mlp.gate_proj" }, "model.layers.26.mlp.gate_proj": { "snr": 36.714595794677734, "type": "mlp.gate_proj" }, "model.layers.27.mlp.gate_proj": { "snr": 51.81637191772461, "type": "mlp.gate_proj" }, "model.layers.0.mlp.up_proj": { "snr": 0.22425401210784912, "type": "mlp.up_proj" }, "model.layers.1.mlp.up_proj": { "snr": 0.4456978142261505, "type": "mlp.up_proj" }, "model.layers.2.mlp.up_proj": { "snr": 1.7769725322723389, "type": "mlp.up_proj" }, "model.layers.3.mlp.up_proj": { "snr": 17.8966121673584, "type": "mlp.up_proj" }, "model.layers.4.mlp.up_proj": { "snr": 47.43608856201172, "type": "mlp.up_proj" }, "model.layers.5.mlp.up_proj": { "snr": 56.2298698425293, "type": "mlp.up_proj" }, "model.layers.6.mlp.up_proj": { "snr": 173.1498260498047, "type": "mlp.up_proj" }, "model.layers.7.mlp.up_proj": { "snr": 148.02874755859375, "type": "mlp.up_proj" }, "model.layers.8.mlp.up_proj": { "snr": 133.5174560546875, "type": "mlp.up_proj" }, "model.layers.9.mlp.up_proj": { "snr": 83.45183563232422, "type": "mlp.up_proj" }, "model.layers.10.mlp.up_proj": { "snr": 117.88772583007812, "type": "mlp.up_proj" }, "model.layers.11.mlp.up_proj": { "snr": 94.41156768798828, "type": "mlp.up_proj" }, "model.layers.12.mlp.up_proj": { "snr": 130.3107452392578, "type": "mlp.up_proj" }, "model.layers.13.mlp.up_proj": { "snr": 76.04458618164062, "type": "mlp.up_proj" }, "model.layers.14.mlp.up_proj": { "snr": 158.59634399414062, "type": "mlp.up_proj" }, "model.layers.15.mlp.up_proj": { "snr": 143.59596252441406, "type": 
"mlp.up_proj" }, "model.layers.16.mlp.up_proj": { "snr": 84.2161636352539, "type": "mlp.up_proj" }, "model.layers.17.mlp.up_proj": { "snr": 116.55204010009766, "type": "mlp.up_proj" }, "model.layers.18.mlp.up_proj": { "snr": 176.95449829101562, "type": "mlp.up_proj" }, "model.layers.19.mlp.up_proj": { "snr": 82.37284088134766, "type": "mlp.up_proj" }, "model.layers.20.mlp.up_proj": { "snr": 137.5695343017578, "type": "mlp.up_proj" }, "model.layers.21.mlp.up_proj": { "snr": 89.87335205078125, "type": "mlp.up_proj" }, "model.layers.22.mlp.up_proj": { "snr": 86.1510238647461, "type": "mlp.up_proj" }, "model.layers.23.mlp.up_proj": { "snr": 61.37428665161133, "type": "mlp.up_proj" }, "model.layers.24.mlp.up_proj": { "snr": 45.10757064819336, "type": "mlp.up_proj" }, "model.layers.25.mlp.up_proj": { "snr": 60.16519546508789, "type": "mlp.up_proj" }, "model.layers.26.mlp.up_proj": { "snr": 39.96969223022461, "type": "mlp.up_proj" }, "model.layers.27.mlp.up_proj": { "snr": 48.04258346557617, "type": "mlp.up_proj" }, "model.embed_tokens": { "snr": 0.08800078183412552, "type": "model.embed_tokens" }, "model.norm": { "snr": Infinity, "type": "model.norm" }, "model.layers.0.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.1.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.2.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.3.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.4.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.5.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.6.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.7.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, 
"model.layers.8.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.9.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.10.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.11.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.12.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.13.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.14.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.15.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.16.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.17.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.18.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.19.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.20.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.21.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.22.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.23.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.24.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.25.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.26.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.27.post_attention_layernorm": { "snr": Infinity, "type": 
"post_attention_layernorm" }, "model.layers.0.self_attn.k_proj": { "snr": 4.764852046966553, "type": "self_attn.k_proj" }, "model.layers.1.self_attn.k_proj": { "snr": 0.46627077460289, "type": "self_attn.k_proj" }, "model.layers.2.self_attn.k_proj": { "snr": 1.6155915260314941, "type": "self_attn.k_proj" }, "model.layers.3.self_attn.k_proj": { "snr": 2.096365451812744, "type": "self_attn.k_proj" }, "model.layers.4.self_attn.k_proj": { "snr": 1.431254267692566, "type": "self_attn.k_proj" }, "model.layers.5.self_attn.k_proj": { "snr": 1.7440669536590576, "type": "self_attn.k_proj" }, "model.layers.6.self_attn.k_proj": { "snr": 1.2815033197402954, "type": "self_attn.k_proj" }, "model.layers.7.self_attn.k_proj": { "snr": 2.2301025390625, "type": "self_attn.k_proj" }, "model.layers.8.self_attn.k_proj": { "snr": 1.5116536617279053, "type": "self_attn.k_proj" }, "model.layers.9.self_attn.k_proj": { "snr": 1.2699830532073975, "type": "self_attn.k_proj" }, "model.layers.10.self_attn.k_proj": { "snr": 3.3086464405059814, "type": "self_attn.k_proj" }, "model.layers.11.self_attn.k_proj": { "snr": 1.59111487865448, "type": "self_attn.k_proj" }, "model.layers.12.self_attn.k_proj": { "snr": 1.1007944345474243, "type": "self_attn.k_proj" }, "model.layers.13.self_attn.k_proj": { "snr": 1.163416862487793, "type": "self_attn.k_proj" }, "model.layers.14.self_attn.k_proj": { "snr": 0.9935113787651062, "type": "self_attn.k_proj" }, "model.layers.15.self_attn.k_proj": { "snr": 1.483581304550171, "type": "self_attn.k_proj" }, "model.layers.16.self_attn.k_proj": { "snr": 1.2992271184921265, "type": "self_attn.k_proj" }, "model.layers.17.self_attn.k_proj": { "snr": 2.162485122680664, "type": "self_attn.k_proj" }, "model.layers.18.self_attn.k_proj": { "snr": 1.3841017484664917, "type": "self_attn.k_proj" }, "model.layers.19.self_attn.k_proj": { "snr": 1.453418493270874, "type": "self_attn.k_proj" }, "model.layers.20.self_attn.k_proj": { "snr": 1.781678557395935, "type": "self_attn.k_proj" }, 
"model.layers.21.self_attn.k_proj": { "snr": 1.7460925579071045, "type": "self_attn.k_proj" }, "model.layers.22.self_attn.k_proj": { "snr": 1.3188031911849976, "type": "self_attn.k_proj" }, "model.layers.23.self_attn.k_proj": { "snr": 1.723441243171692, "type": "self_attn.k_proj" }, "model.layers.24.self_attn.k_proj": { "snr": 2.585094928741455, "type": "self_attn.k_proj" }, "model.layers.25.self_attn.k_proj": { "snr": 1.6478856801986694, "type": "self_attn.k_proj" }, "model.layers.26.self_attn.k_proj": { "snr": 1.3221096992492676, "type": "self_attn.k_proj" }, "model.layers.27.self_attn.k_proj": { "snr": 0.9034463167190552, "type": "self_attn.k_proj" }, "model.layers.0.self_attn.o_proj": { "snr": 0.10636883229017258, "type": "self_attn.o_proj" }, "model.layers.1.self_attn.o_proj": { "snr": 0.24971255660057068, "type": "self_attn.o_proj" }, "model.layers.2.self_attn.o_proj": { "snr": 0.21437697112560272, "type": "self_attn.o_proj" }, "model.layers.3.self_attn.o_proj": { "snr": 0.2058248072862625, "type": "self_attn.o_proj" }, "model.layers.4.self_attn.o_proj": { "snr": 0.21978946030139923, "type": "self_attn.o_proj" }, "model.layers.5.self_attn.o_proj": { "snr": 0.2269466072320938, "type": "self_attn.o_proj" }, "model.layers.6.self_attn.o_proj": { "snr": 0.17318543791770935, "type": "self_attn.o_proj" }, "model.layers.7.self_attn.o_proj": { "snr": 0.23159846663475037, "type": "self_attn.o_proj" }, "model.layers.8.self_attn.o_proj": { "snr": 0.2400084286928177, "type": "self_attn.o_proj" }, "model.layers.9.self_attn.o_proj": { "snr": 0.134766086935997, "type": "self_attn.o_proj" }, "model.layers.10.self_attn.o_proj": { "snr": 0.20152011513710022, "type": "self_attn.o_proj" }, "model.layers.11.self_attn.o_proj": { "snr": 0.19492347538471222, "type": "self_attn.o_proj" }, "model.layers.12.self_attn.o_proj": { "snr": 0.18607021868228912, "type": "self_attn.o_proj" }, "model.layers.13.self_attn.o_proj": { "snr": 0.15107683837413788, "type": "self_attn.o_proj" }, 
"model.layers.14.self_attn.o_proj": { "snr": 0.18565276265144348, "type": "self_attn.o_proj" }, "model.layers.15.self_attn.o_proj": { "snr": 0.20626339316368103, "type": "self_attn.o_proj" }, "model.layers.16.self_attn.o_proj": { "snr": 0.1541011780500412, "type": "self_attn.o_proj" }, "model.layers.17.self_attn.o_proj": { "snr": 0.1784645915031433, "type": "self_attn.o_proj" }, "model.layers.18.self_attn.o_proj": { "snr": 0.18307389318943024, "type": "self_attn.o_proj" }, "model.layers.19.self_attn.o_proj": { "snr": 0.17449897527694702, "type": "self_attn.o_proj" }, "model.layers.20.self_attn.o_proj": { "snr": 0.1787375956773758, "type": "self_attn.o_proj" }, "model.layers.21.self_attn.o_proj": { "snr": 0.161802276968956, "type": "self_attn.o_proj" }, "model.layers.22.self_attn.o_proj": { "snr": 0.1931520402431488, "type": "self_attn.o_proj" }, "model.layers.23.self_attn.o_proj": { "snr": 0.14108893275260925, "type": "self_attn.o_proj" }, "model.layers.24.self_attn.o_proj": { "snr": 0.14064815640449524, "type": "self_attn.o_proj" }, "model.layers.25.self_attn.o_proj": { "snr": 0.14790543913841248, "type": "self_attn.o_proj" }, "model.layers.26.self_attn.o_proj": { "snr": 0.11950570344924927, "type": "self_attn.o_proj" }, "model.layers.27.self_attn.o_proj": { "snr": 0.062389008700847626, "type": "self_attn.o_proj" }, "model.layers.0.self_attn.q_proj": { "snr": 0.02138795144855976, "type": "self_attn.q_proj" }, "model.layers.1.self_attn.q_proj": { "snr": 0.14676862955093384, "type": "self_attn.q_proj" }, "model.layers.2.self_attn.q_proj": { "snr": 0.16297142207622528, "type": "self_attn.q_proj" }, "model.layers.3.self_attn.q_proj": { "snr": 0.12198334187269211, "type": "self_attn.q_proj" }, "model.layers.4.self_attn.q_proj": { "snr": 0.13921146094799042, "type": "self_attn.q_proj" }, "model.layers.5.self_attn.q_proj": { "snr": 0.15567339956760406, "type": "self_attn.q_proj" }, "model.layers.6.self_attn.q_proj": { "snr": 0.1589033454656601, "type": "self_attn.q_proj" 
}, "model.layers.7.self_attn.q_proj": { "snr": 0.195299431681633, "type": "self_attn.q_proj" }, "model.layers.8.self_attn.q_proj": { "snr": 0.22430908679962158, "type": "self_attn.q_proj" }, "model.layers.9.self_attn.q_proj": { "snr": 0.2011336237192154, "type": "self_attn.q_proj" }, "model.layers.10.self_attn.q_proj": { "snr": 0.1982448250055313, "type": "self_attn.q_proj" }, "model.layers.11.self_attn.q_proj": { "snr": 0.22880099713802338, "type": "self_attn.q_proj" }, "model.layers.12.self_attn.q_proj": { "snr": 0.22898294031620026, "type": "self_attn.q_proj" }, "model.layers.13.self_attn.q_proj": { "snr": 0.21394900977611542, "type": "self_attn.q_proj" }, "model.layers.14.self_attn.q_proj": { "snr": 0.24130398035049438, "type": "self_attn.q_proj" }, "model.layers.15.self_attn.q_proj": { "snr": 0.21905161440372467, "type": "self_attn.q_proj" }, "model.layers.16.self_attn.q_proj": { "snr": 0.2319282442331314, "type": "self_attn.q_proj" }, "model.layers.17.self_attn.q_proj": { "snr": 0.24004821479320526, "type": "self_attn.q_proj" }, "model.layers.18.self_attn.q_proj": { "snr": 0.22754515707492828, "type": "self_attn.q_proj" }, "model.layers.19.self_attn.q_proj": { "snr": 0.2086794078350067, "type": "self_attn.q_proj" }, "model.layers.20.self_attn.q_proj": { "snr": 0.2290779948234558, "type": "self_attn.q_proj" }, "model.layers.21.self_attn.q_proj": { "snr": 0.250373899936676, "type": "self_attn.q_proj" }, "model.layers.22.self_attn.q_proj": { "snr": 0.23474709689617157, "type": "self_attn.q_proj" }, "model.layers.23.self_attn.q_proj": { "snr": 0.20302507281303406, "type": "self_attn.q_proj" }, "model.layers.24.self_attn.q_proj": { "snr": 0.21992310881614685, "type": "self_attn.q_proj" }, "model.layers.25.self_attn.q_proj": { "snr": 0.2120121270418167, "type": "self_attn.q_proj" }, "model.layers.26.self_attn.q_proj": { "snr": 0.24161922931671143, "type": "self_attn.q_proj" }, "model.layers.27.self_attn.q_proj": { "snr": 0.12337693572044373, "type": 
"self_attn.q_proj" }, "model.layers.0.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.1.self_attn.v_proj": { "snr": 231.07347106933594, "type": "self_attn.v_proj" }, "model.layers.2.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.3.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.4.self_attn.v_proj": { "snr": 22.34870719909668, "type": "self_attn.v_proj" }, "model.layers.5.self_attn.v_proj": { "snr": 246.30386352539062, "type": "self_attn.v_proj" }, "model.layers.6.self_attn.v_proj": { "snr": 499.5611572265625, "type": "self_attn.v_proj" }, "model.layers.7.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.8.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.9.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.10.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.11.self_attn.v_proj": { "snr": 69.09609985351562, "type": "self_attn.v_proj" }, "model.layers.12.self_attn.v_proj": { "snr": 983.3341674804688, "type": "self_attn.v_proj" }, "model.layers.13.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.14.self_attn.v_proj": { "snr": 64.04925537109375, "type": "self_attn.v_proj" }, "model.layers.15.self_attn.v_proj": { "snr": 28.41021728515625, "type": "self_attn.v_proj" }, "model.layers.16.self_attn.v_proj": { "snr": 724.2736206054688, "type": "self_attn.v_proj" }, "model.layers.17.self_attn.v_proj": { "snr": 63.35670852661133, "type": "self_attn.v_proj" }, "model.layers.18.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.19.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.20.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, "model.layers.21.self_attn.v_proj": { "snr": Infinity, "type": "self_attn.v_proj" }, 
"model.layers.22.self_attn.v_proj": { "snr": 238.2569122314453, "type": "self_attn.v_proj" }, "model.layers.23.self_attn.v_proj": { "snr": 111.78319549560547, "type": "self_attn.v_proj" }, "model.layers.24.self_attn.v_proj": { "snr": 687.0054931640625, "type": "self_attn.v_proj" }, "model.layers.25.self_attn.v_proj": { "snr": 565.3272705078125, "type": "self_attn.v_proj" }, "model.layers.26.self_attn.v_proj": { "snr": 4.064513683319092, "type": "self_attn.v_proj" }, "model.layers.27.self_attn.v_proj": { "snr": 4.335177421569824, "type": "self_attn.v_proj" } } ================================================ FILE: src/axolotl/integrations/spectrum/model_snr_results/snr_results_google-gemma-2-2b.json ================================================ { "model.layers.0.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.1.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.2.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.3.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.4.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.5.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.6.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.7.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.8.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.9.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.10.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.11.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.12.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.13.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.14.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, 
"model.layers.15.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.16.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.17.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.18.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.19.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.20.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.21.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.22.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.23.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.24.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.25.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "lm_head": { "snr": 4.538210391998291, "type": "lm_head" }, "model.layers.0.mlp.down_proj": { "snr": 7.746472358703613, "type": "mlp.down_proj" }, "model.layers.1.mlp.down_proj": { "snr": 4.3358893394470215, "type": "mlp.down_proj" }, "model.layers.2.mlp.down_proj": { "snr": 26.88057518005371, "type": "mlp.down_proj" }, "model.layers.3.mlp.down_proj": { "snr": 8.699942588806152, "type": "mlp.down_proj" }, "model.layers.4.mlp.down_proj": { "snr": 32.808380126953125, "type": "mlp.down_proj" }, "model.layers.5.mlp.down_proj": { "snr": 10.831522941589355, "type": "mlp.down_proj" }, "model.layers.6.mlp.down_proj": { "snr": 18.843679428100586, "type": "mlp.down_proj" }, "model.layers.7.mlp.down_proj": { "snr": 9.348078727722168, "type": "mlp.down_proj" }, "model.layers.8.mlp.down_proj": { "snr": 7.061270236968994, "type": "mlp.down_proj" }, "model.layers.9.mlp.down_proj": { "snr": 5.454320907592773, "type": "mlp.down_proj" }, "model.layers.10.mlp.down_proj": { "snr": 7.386133193969727, "type": "mlp.down_proj" }, "model.layers.11.mlp.down_proj": { "snr": 6.648562908172607, 
"type": "mlp.down_proj" }, "model.layers.12.mlp.down_proj": { "snr": 5.853652477264404, "type": "mlp.down_proj" }, "model.layers.13.mlp.down_proj": { "snr": 8.570493698120117, "type": "mlp.down_proj" }, "model.layers.14.mlp.down_proj": { "snr": 13.120837211608887, "type": "mlp.down_proj" }, "model.layers.15.mlp.down_proj": { "snr": 14.780969619750977, "type": "mlp.down_proj" }, "model.layers.16.mlp.down_proj": { "snr": 6.953134059906006, "type": "mlp.down_proj" }, "model.layers.17.mlp.down_proj": { "snr": 12.589436531066895, "type": "mlp.down_proj" }, "model.layers.18.mlp.down_proj": { "snr": 8.844094276428223, "type": "mlp.down_proj" }, "model.layers.19.mlp.down_proj": { "snr": 7.598869800567627, "type": "mlp.down_proj" }, "model.layers.20.mlp.down_proj": { "snr": 11.293925285339355, "type": "mlp.down_proj" }, "model.layers.21.mlp.down_proj": { "snr": 9.384604454040527, "type": "mlp.down_proj" }, "model.layers.22.mlp.down_proj": { "snr": 12.12533187866211, "type": "mlp.down_proj" }, "model.layers.23.mlp.down_proj": { "snr": 11.217570304870605, "type": "mlp.down_proj" }, "model.layers.24.mlp.down_proj": { "snr": 14.197714805603027, "type": "mlp.down_proj" }, "model.layers.25.mlp.down_proj": { "snr": 12.449926376342773, "type": "mlp.down_proj" }, "model.layers.0.mlp.gate_proj": { "snr": 16.885862350463867, "type": "mlp.gate_proj" }, "model.layers.1.mlp.gate_proj": { "snr": 23.410266876220703, "type": "mlp.gate_proj" }, "model.layers.2.mlp.gate_proj": { "snr": 22.57662582397461, "type": "mlp.gate_proj" }, "model.layers.3.mlp.gate_proj": { "snr": 17.29996681213379, "type": "mlp.gate_proj" }, "model.layers.4.mlp.gate_proj": { "snr": 11.718637466430664, "type": "mlp.gate_proj" }, "model.layers.5.mlp.gate_proj": { "snr": 6.376136779785156, "type": "mlp.gate_proj" }, "model.layers.6.mlp.gate_proj": { "snr": 6.794021129608154, "type": "mlp.gate_proj" }, "model.layers.7.mlp.gate_proj": { "snr": 3.2425343990325928, "type": "mlp.gate_proj" }, "model.layers.8.mlp.gate_proj": { 
"snr": 2.368421792984009, "type": "mlp.gate_proj" }, "model.layers.9.mlp.gate_proj": { "snr": 3.3193087577819824, "type": "mlp.gate_proj" }, "model.layers.10.mlp.gate_proj": { "snr": 3.9515960216522217, "type": "mlp.gate_proj" }, "model.layers.11.mlp.gate_proj": { "snr": 3.2761318683624268, "type": "mlp.gate_proj" }, "model.layers.12.mlp.gate_proj": { "snr": 4.026322841644287, "type": "mlp.gate_proj" }, "model.layers.13.mlp.gate_proj": { "snr": 3.415473699569702, "type": "mlp.gate_proj" }, "model.layers.14.mlp.gate_proj": { "snr": 3.3418092727661133, "type": "mlp.gate_proj" }, "model.layers.15.mlp.gate_proj": { "snr": 3.6233012676239014, "type": "mlp.gate_proj" }, "model.layers.16.mlp.gate_proj": { "snr": 3.2199010848999023, "type": "mlp.gate_proj" }, "model.layers.17.mlp.gate_proj": { "snr": 3.6848936080932617, "type": "mlp.gate_proj" }, "model.layers.18.mlp.gate_proj": { "snr": 3.4439642429351807, "type": "mlp.gate_proj" }, "model.layers.19.mlp.gate_proj": { "snr": 3.7366604804992676, "type": "mlp.gate_proj" }, "model.layers.20.mlp.gate_proj": { "snr": 4.262336254119873, "type": "mlp.gate_proj" }, "model.layers.21.mlp.gate_proj": { "snr": 4.333253860473633, "type": "mlp.gate_proj" }, "model.layers.22.mlp.gate_proj": { "snr": 3.640247344970703, "type": "mlp.gate_proj" }, "model.layers.23.mlp.gate_proj": { "snr": 4.2978034019470215, "type": "mlp.gate_proj" }, "model.layers.24.mlp.gate_proj": { "snr": 4.339972496032715, "type": "mlp.gate_proj" }, "model.layers.25.mlp.gate_proj": { "snr": 3.8502564430236816, "type": "mlp.gate_proj" }, "model.layers.0.mlp.up_proj": { "snr": 28.129924774169922, "type": "mlp.up_proj" }, "model.layers.1.mlp.up_proj": { "snr": 41.49960708618164, "type": "mlp.up_proj" }, "model.layers.2.mlp.up_proj": { "snr": 125.47801971435547, "type": "mlp.up_proj" }, "model.layers.3.mlp.up_proj": { "snr": 119.93355560302734, "type": "mlp.up_proj" }, "model.layers.4.mlp.up_proj": { "snr": 162.62631225585938, "type": "mlp.up_proj" }, 
"model.layers.5.mlp.up_proj": { "snr": 32.36909484863281, "type": "mlp.up_proj" }, "model.layers.6.mlp.up_proj": { "snr": 49.10078430175781, "type": "mlp.up_proj" }, "model.layers.7.mlp.up_proj": { "snr": 28.541580200195312, "type": "mlp.up_proj" }, "model.layers.8.mlp.up_proj": { "snr": 14.764090538024902, "type": "mlp.up_proj" }, "model.layers.9.mlp.up_proj": { "snr": 16.5697078704834, "type": "mlp.up_proj" }, "model.layers.10.mlp.up_proj": { "snr": 19.26059913635254, "type": "mlp.up_proj" }, "model.layers.11.mlp.up_proj": { "snr": 15.082040786743164, "type": "mlp.up_proj" }, "model.layers.12.mlp.up_proj": { "snr": 15.5792875289917, "type": "mlp.up_proj" }, "model.layers.13.mlp.up_proj": { "snr": 9.84595012664795, "type": "mlp.up_proj" }, "model.layers.14.mlp.up_proj": { "snr": 11.506875991821289, "type": "mlp.up_proj" }, "model.layers.15.mlp.up_proj": { "snr": 21.507600784301758, "type": "mlp.up_proj" }, "model.layers.16.mlp.up_proj": { "snr": 15.110466957092285, "type": "mlp.up_proj" }, "model.layers.17.mlp.up_proj": { "snr": 27.062183380126953, "type": "mlp.up_proj" }, "model.layers.18.mlp.up_proj": { "snr": 16.40383529663086, "type": "mlp.up_proj" }, "model.layers.19.mlp.up_proj": { "snr": 13.117464065551758, "type": "mlp.up_proj" }, "model.layers.20.mlp.up_proj": { "snr": 11.393353462219238, "type": "mlp.up_proj" }, "model.layers.21.mlp.up_proj": { "snr": 10.791608810424805, "type": "mlp.up_proj" }, "model.layers.22.mlp.up_proj": { "snr": 7.512388706207275, "type": "mlp.up_proj" }, "model.layers.23.mlp.up_proj": { "snr": 9.889434814453125, "type": "mlp.up_proj" }, "model.layers.24.mlp.up_proj": { "snr": 7.587779521942139, "type": "mlp.up_proj" }, "model.layers.25.mlp.up_proj": { "snr": 4.561068058013916, "type": "mlp.up_proj" }, "model.embed_tokens": { "snr": 4.538210391998291, "type": "model.embed_tokens" }, "model.norm": { "snr": Infinity, "type": "model.norm" }, "model.layers.0.post_attention_layernorm": { "snr": Infinity, "type": 
"post_attention_layernorm" }, "model.layers.1.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.2.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.3.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.4.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.5.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.6.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.7.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.8.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.9.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.10.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.11.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.12.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.13.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.14.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.15.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.16.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.17.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.18.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.19.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.20.post_attention_layernorm": { "snr": 
Infinity, "type": "post_attention_layernorm" }, "model.layers.21.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.22.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.23.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.24.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.25.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.0.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.1.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.2.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.3.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.4.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.5.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.6.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.7.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.8.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.9.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.10.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.11.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.12.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.13.post_feedforward_layernorm": { "snr": Infinity, "type": 
"post_feedforward_layernorm" }, "model.layers.14.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.15.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.16.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.17.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.18.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.19.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.20.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.21.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.22.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.23.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.24.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.25.post_feedforward_layernorm": { "snr": Infinity, "type": "post_feedforward_layernorm" }, "model.layers.0.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.1.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.2.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.3.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.4.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.5.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.6.pre_feedforward_layernorm": { "snr": Infinity, "type": 
"pre_feedforward_layernorm" }, "model.layers.7.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.8.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.9.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.10.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.11.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.12.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.13.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.14.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.15.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.16.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.17.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.18.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.19.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.20.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.21.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.22.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.23.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.24.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, "model.layers.25.pre_feedforward_layernorm": { "snr": Infinity, "type": "pre_feedforward_layernorm" }, 
"model.layers.0.self_attn.k_proj": { "snr": 0.5685535073280334, "type": "self_attn.k_proj" }, "model.layers.1.self_attn.k_proj": { "snr": 1.060130000114441, "type": "self_attn.k_proj" }, "model.layers.2.self_attn.k_proj": { "snr": 1.0735561847686768, "type": "self_attn.k_proj" }, "model.layers.3.self_attn.k_proj": { "snr": 1.0217311382293701, "type": "self_attn.k_proj" }, "model.layers.4.self_attn.k_proj": { "snr": 0.9687430262565613, "type": "self_attn.k_proj" }, "model.layers.5.self_attn.k_proj": { "snr": 0.8411160111427307, "type": "self_attn.k_proj" }, "model.layers.6.self_attn.k_proj": { "snr": 0.936741054058075, "type": "self_attn.k_proj" }, "model.layers.7.self_attn.k_proj": { "snr": 0.7236003279685974, "type": "self_attn.k_proj" }, "model.layers.8.self_attn.k_proj": { "snr": 0.9032857418060303, "type": "self_attn.k_proj" }, "model.layers.9.self_attn.k_proj": { "snr": 0.7513307929039001, "type": "self_attn.k_proj" }, "model.layers.10.self_attn.k_proj": { "snr": 0.6875415444374084, "type": "self_attn.k_proj" }, "model.layers.11.self_attn.k_proj": { "snr": 0.6611058712005615, "type": "self_attn.k_proj" }, "model.layers.12.self_attn.k_proj": { "snr": 0.8023670315742493, "type": "self_attn.k_proj" }, "model.layers.13.self_attn.k_proj": { "snr": 0.7188767194747925, "type": "self_attn.k_proj" }, "model.layers.14.self_attn.k_proj": { "snr": 0.7930117249488831, "type": "self_attn.k_proj" }, "model.layers.15.self_attn.k_proj": { "snr": 0.9076258540153503, "type": "self_attn.k_proj" }, "model.layers.16.self_attn.k_proj": { "snr": 0.7295113801956177, "type": "self_attn.k_proj" }, "model.layers.17.self_attn.k_proj": { "snr": 0.898467481136322, "type": "self_attn.k_proj" }, "model.layers.18.self_attn.k_proj": { "snr": 0.9652048945426941, "type": "self_attn.k_proj" }, "model.layers.19.self_attn.k_proj": { "snr": 0.9855819344520569, "type": "self_attn.k_proj" }, "model.layers.20.self_attn.k_proj": { "snr": 1.2863355875015259, "type": "self_attn.k_proj" }, 
"model.layers.21.self_attn.k_proj": { "snr": 1.116607904434204, "type": "self_attn.k_proj" }, "model.layers.22.self_attn.k_proj": { "snr": 0.7438228130340576, "type": "self_attn.k_proj" }, "model.layers.23.self_attn.k_proj": { "snr": 0.8499895334243774, "type": "self_attn.k_proj" }, "model.layers.24.self_attn.k_proj": { "snr": 0.7764042019844055, "type": "self_attn.k_proj" }, "model.layers.25.self_attn.k_proj": { "snr": 0.7127887606620789, "type": "self_attn.k_proj" }, "model.layers.0.self_attn.o_proj": { "snr": 0.2556447386741638, "type": "self_attn.o_proj" }, "model.layers.1.self_attn.o_proj": { "snr": 0.2930974066257477, "type": "self_attn.o_proj" }, "model.layers.2.self_attn.o_proj": { "snr": 0.27571651339530945, "type": "self_attn.o_proj" }, "model.layers.3.self_attn.o_proj": { "snr": 0.280631959438324, "type": "self_attn.o_proj" }, "model.layers.4.self_attn.o_proj": { "snr": 0.2958097755908966, "type": "self_attn.o_proj" }, "model.layers.5.self_attn.o_proj": { "snr": 0.3072899580001831, "type": "self_attn.o_proj" }, "model.layers.6.self_attn.o_proj": { "snr": 0.31374114751815796, "type": "self_attn.o_proj" }, "model.layers.7.self_attn.o_proj": { "snr": 0.2903076410293579, "type": "self_attn.o_proj" }, "model.layers.8.self_attn.o_proj": { "snr": 0.2625811696052551, "type": "self_attn.o_proj" }, "model.layers.9.self_attn.o_proj": { "snr": 0.2306082546710968, "type": "self_attn.o_proj" }, "model.layers.10.self_attn.o_proj": { "snr": 0.24869701266288757, "type": "self_attn.o_proj" }, "model.layers.11.self_attn.o_proj": { "snr": 0.2556127905845642, "type": "self_attn.o_proj" }, "model.layers.12.self_attn.o_proj": { "snr": 0.28926730155944824, "type": "self_attn.o_proj" }, "model.layers.13.self_attn.o_proj": { "snr": 0.25355643033981323, "type": "self_attn.o_proj" }, "model.layers.14.self_attn.o_proj": { "snr": 0.23122912645339966, "type": "self_attn.o_proj" }, "model.layers.15.self_attn.o_proj": { "snr": 0.28772857785224915, "type": "self_attn.o_proj" }, 
"model.layers.16.self_attn.o_proj": { "snr": 0.22682352364063263, "type": "self_attn.o_proj" }, "model.layers.17.self_attn.o_proj": { "snr": 0.2558597922325134, "type": "self_attn.o_proj" }, "model.layers.18.self_attn.o_proj": { "snr": 0.1773315966129303, "type": "self_attn.o_proj" }, "model.layers.19.self_attn.o_proj": { "snr": 0.2106105089187622, "type": "self_attn.o_proj" }, "model.layers.20.self_attn.o_proj": { "snr": 0.2008877396583557, "type": "self_attn.o_proj" }, "model.layers.21.self_attn.o_proj": { "snr": 0.1973956972360611, "type": "self_attn.o_proj" }, "model.layers.22.self_attn.o_proj": { "snr": 0.25533634424209595, "type": "self_attn.o_proj" }, "model.layers.23.self_attn.o_proj": { "snr": 0.20066529512405396, "type": "self_attn.o_proj" }, "model.layers.24.self_attn.o_proj": { "snr": 0.18342143297195435, "type": "self_attn.o_proj" }, "model.layers.25.self_attn.o_proj": { "snr": 0.3224162459373474, "type": "self_attn.o_proj" }, "model.layers.0.self_attn.q_proj": { "snr": 0.2074502408504486, "type": "self_attn.q_proj" }, "model.layers.1.self_attn.q_proj": { "snr": 0.33233126997947693, "type": "self_attn.q_proj" }, "model.layers.2.self_attn.q_proj": { "snr": 0.3586291968822479, "type": "self_attn.q_proj" }, "model.layers.3.self_attn.q_proj": { "snr": 0.2850974202156067, "type": "self_attn.q_proj" }, "model.layers.4.self_attn.q_proj": { "snr": 0.37816473841667175, "type": "self_attn.q_proj" }, "model.layers.5.self_attn.q_proj": { "snr": 0.31616899371147156, "type": "self_attn.q_proj" }, "model.layers.6.self_attn.q_proj": { "snr": 0.4988365173339844, "type": "self_attn.q_proj" }, "model.layers.7.self_attn.q_proj": { "snr": 0.4238639175891876, "type": "self_attn.q_proj" }, "model.layers.8.self_attn.q_proj": { "snr": 0.2674674689769745, "type": "self_attn.q_proj" }, "model.layers.9.self_attn.q_proj": { "snr": 0.34524214267730713, "type": "self_attn.q_proj" }, "model.layers.10.self_attn.q_proj": { "snr": 0.4472109377384186, "type": "self_attn.q_proj" }, 
"model.layers.11.self_attn.q_proj": { "snr": 0.41363632678985596, "type": "self_attn.q_proj" }, "model.layers.12.self_attn.q_proj": { "snr": 0.44623735547065735, "type": "self_attn.q_proj" }, "model.layers.13.self_attn.q_proj": { "snr": 0.4404333531856537, "type": "self_attn.q_proj" }, "model.layers.14.self_attn.q_proj": { "snr": 0.5200268626213074, "type": "self_attn.q_proj" }, "model.layers.15.self_attn.q_proj": { "snr": 0.4320363700389862, "type": "self_attn.q_proj" }, "model.layers.16.self_attn.q_proj": { "snr": 0.46235284209251404, "type": "self_attn.q_proj" }, "model.layers.17.self_attn.q_proj": { "snr": 0.47477203607559204, "type": "self_attn.q_proj" }, "model.layers.18.self_attn.q_proj": { "snr": 0.4001321494579315, "type": "self_attn.q_proj" }, "model.layers.19.self_attn.q_proj": { "snr": 0.42365774512290955, "type": "self_attn.q_proj" }, "model.layers.20.self_attn.q_proj": { "snr": 0.37057873606681824, "type": "self_attn.q_proj" }, "model.layers.21.self_attn.q_proj": { "snr": 0.3990235924720764, "type": "self_attn.q_proj" }, "model.layers.22.self_attn.q_proj": { "snr": 0.35094162821769714, "type": "self_attn.q_proj" }, "model.layers.23.self_attn.q_proj": { "snr": 0.35721710324287415, "type": "self_attn.q_proj" }, "model.layers.24.self_attn.q_proj": { "snr": 0.2812618315219879, "type": "self_attn.q_proj" }, "model.layers.25.self_attn.q_proj": { "snr": 0.19463211297988892, "type": "self_attn.q_proj" }, "model.layers.0.self_attn.v_proj": { "snr": 1.3365743160247803, "type": "self_attn.v_proj" }, "model.layers.1.self_attn.v_proj": { "snr": 2.402009963989258, "type": "self_attn.v_proj" }, "model.layers.2.self_attn.v_proj": { "snr": 3.8695859909057617, "type": "self_attn.v_proj" }, "model.layers.3.self_attn.v_proj": { "snr": 4.117948055267334, "type": "self_attn.v_proj" }, "model.layers.4.self_attn.v_proj": { "snr": 5.651231288909912, "type": "self_attn.v_proj" }, "model.layers.5.self_attn.v_proj": { "snr": 2.720799446105957, "type": "self_attn.v_proj" }, 
"model.layers.6.self_attn.v_proj": { "snr": 1.4446897506713867, "type": "self_attn.v_proj" }, "model.layers.7.self_attn.v_proj": { "snr": 4.497112274169922, "type": "self_attn.v_proj" }, "model.layers.8.self_attn.v_proj": { "snr": 1.7241870164871216, "type": "self_attn.v_proj" }, "model.layers.9.self_attn.v_proj": { "snr": 1.7104988098144531, "type": "self_attn.v_proj" }, "model.layers.10.self_attn.v_proj": { "snr": 1.4231206178665161, "type": "self_attn.v_proj" }, "model.layers.11.self_attn.v_proj": { "snr": 2.1643989086151123, "type": "self_attn.v_proj" }, "model.layers.12.self_attn.v_proj": { "snr": 1.5254249572753906, "type": "self_attn.v_proj" }, "model.layers.13.self_attn.v_proj": { "snr": 2.3788745403289795, "type": "self_attn.v_proj" }, "model.layers.14.self_attn.v_proj": { "snr": 3.4155967235565186, "type": "self_attn.v_proj" }, "model.layers.15.self_attn.v_proj": { "snr": 4.623549938201904, "type": "self_attn.v_proj" }, "model.layers.16.self_attn.v_proj": { "snr": 1.5291141271591187, "type": "self_attn.v_proj" }, "model.layers.17.self_attn.v_proj": { "snr": 3.9934189319610596, "type": "self_attn.v_proj" }, "model.layers.18.self_attn.v_proj": { "snr": 9.035382270812988, "type": "self_attn.v_proj" }, "model.layers.19.self_attn.v_proj": { "snr": 5.8578925132751465, "type": "self_attn.v_proj" }, "model.layers.20.self_attn.v_proj": { "snr": 3.759958505630493, "type": "self_attn.v_proj" }, "model.layers.21.self_attn.v_proj": { "snr": 4.558528900146484, "type": "self_attn.v_proj" }, "model.layers.22.self_attn.v_proj": { "snr": 0.9163281917572021, "type": "self_attn.v_proj" }, "model.layers.23.self_attn.v_proj": { "snr": 2.564377546310425, "type": "self_attn.v_proj" }, "model.layers.24.self_attn.v_proj": { "snr": 3.689103841781616, "type": "self_attn.v_proj" }, "model.layers.25.self_attn.v_proj": { "snr": 5.6444854736328125, "type": "self_attn.v_proj" } } ================================================ FILE: 
src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-1B-Instruct.json ================================================ { "model.layers.0.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.1.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.2.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.3.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.4.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.5.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.6.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.7.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.8.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.9.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.10.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.11.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.12.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.13.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.14.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.15.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "lm_head": { "snr": Infinity, "type": "lm_head" }, "model.layers.0.mlp.down_proj": { "snr": 70.0594253540039, "type": "mlp.down_proj" }, "model.layers.1.mlp.down_proj": { "snr": 11.135851860046387, "type": "mlp.down_proj" }, "model.layers.2.mlp.down_proj": { "snr": 7.035482883453369, "type": "mlp.down_proj" }, "model.layers.3.mlp.down_proj": { "snr": 6.422532081604004, "type": "mlp.down_proj" }, "model.layers.4.mlp.down_proj": { "snr": 5.748020172119141, "type": "mlp.down_proj" }, "model.layers.5.mlp.down_proj": { "snr": 
3.885556697845459, "type": "mlp.down_proj" }, "model.layers.6.mlp.down_proj": { "snr": 3.4336745738983154, "type": "mlp.down_proj" }, "model.layers.7.mlp.down_proj": { "snr": 2.791595935821533, "type": "mlp.down_proj" }, "model.layers.8.mlp.down_proj": { "snr": 5.36277961730957, "type": "mlp.down_proj" }, "model.layers.9.mlp.down_proj": { "snr": 4.459208011627197, "type": "mlp.down_proj" }, "model.layers.10.mlp.down_proj": { "snr": 6.272170066833496, "type": "mlp.down_proj" }, "model.layers.11.mlp.down_proj": { "snr": 5.264761447906494, "type": "mlp.down_proj" }, "model.layers.12.mlp.down_proj": { "snr": 4.324735641479492, "type": "mlp.down_proj" }, "model.layers.13.mlp.down_proj": { "snr": 3.878648042678833, "type": "mlp.down_proj" }, "model.layers.14.mlp.down_proj": { "snr": 2.9773054122924805, "type": "mlp.down_proj" }, "model.layers.15.mlp.down_proj": { "snr": 4.471445560455322, "type": "mlp.down_proj" }, "model.layers.0.mlp.gate_proj": { "snr": 25.227100372314453, "type": "mlp.gate_proj" }, "model.layers.1.mlp.gate_proj": { "snr": 6.58299446105957, "type": "mlp.gate_proj" }, "model.layers.2.mlp.gate_proj": { "snr": 3.4688243865966797, "type": "mlp.gate_proj" }, "model.layers.3.mlp.gate_proj": { "snr": 1.555246114730835, "type": "mlp.gate_proj" }, "model.layers.4.mlp.gate_proj": { "snr": 0.7770601511001587, "type": "mlp.gate_proj" }, "model.layers.5.mlp.gate_proj": { "snr": 0.6239906549453735, "type": "mlp.gate_proj" }, "model.layers.6.mlp.gate_proj": { "snr": 0.6440379023551941, "type": "mlp.gate_proj" }, "model.layers.7.mlp.gate_proj": { "snr": 0.5120116472244263, "type": "mlp.gate_proj" }, "model.layers.8.mlp.gate_proj": { "snr": 0.6544050574302673, "type": "mlp.gate_proj" }, "model.layers.9.mlp.gate_proj": { "snr": 0.5381016731262207, "type": "mlp.gate_proj" }, "model.layers.10.mlp.gate_proj": { "snr": 0.622873842716217, "type": "mlp.gate_proj" }, "model.layers.11.mlp.gate_proj": { "snr": 0.9361700415611267, "type": "mlp.gate_proj" }, 
"model.layers.12.mlp.gate_proj": { "snr": 1.475605845451355, "type": "mlp.gate_proj" }, "model.layers.13.mlp.gate_proj": { "snr": 1.608325719833374, "type": "mlp.gate_proj" }, "model.layers.14.mlp.gate_proj": { "snr": 1.0720024108886719, "type": "mlp.gate_proj" }, "model.layers.15.mlp.gate_proj": { "snr": 0.7111338973045349, "type": "mlp.gate_proj" }, "model.layers.0.mlp.up_proj": { "snr": 28.431896209716797, "type": "mlp.up_proj" }, "model.layers.1.mlp.up_proj": { "snr": 15.546019554138184, "type": "mlp.up_proj" }, "model.layers.2.mlp.up_proj": { "snr": 23.048023223876953, "type": "mlp.up_proj" }, "model.layers.3.mlp.up_proj": { "snr": 25.790977478027344, "type": "mlp.up_proj" }, "model.layers.4.mlp.up_proj": { "snr": 18.552549362182617, "type": "mlp.up_proj" }, "model.layers.5.mlp.up_proj": { "snr": 8.85106372833252, "type": "mlp.up_proj" }, "model.layers.6.mlp.up_proj": { "snr": 10.653799057006836, "type": "mlp.up_proj" }, "model.layers.7.mlp.up_proj": { "snr": 7.365357875823975, "type": "mlp.up_proj" }, "model.layers.8.mlp.up_proj": { "snr": 11.98373794555664, "type": "mlp.up_proj" }, "model.layers.9.mlp.up_proj": { "snr": 8.04493236541748, "type": "mlp.up_proj" }, "model.layers.10.mlp.up_proj": { "snr": 8.523039817810059, "type": "mlp.up_proj" }, "model.layers.11.mlp.up_proj": { "snr": 5.381742477416992, "type": "mlp.up_proj" }, "model.layers.12.mlp.up_proj": { "snr": 3.9845118522644043, "type": "mlp.up_proj" }, "model.layers.13.mlp.up_proj": { "snr": 3.4893221855163574, "type": "mlp.up_proj" }, "model.layers.14.mlp.up_proj": { "snr": 1.764201045036316, "type": "mlp.up_proj" }, "model.layers.15.mlp.up_proj": { "snr": 0.9730708599090576, "type": "mlp.up_proj" }, "model.embed_tokens": { "snr": Infinity, "type": "model.embed_tokens" }, "model.norm": { "snr": Infinity, "type": "model.norm" }, "model.layers.0.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.1.post_attention_layernorm": { "snr": Infinity, "type": 
"post_attention_layernorm" }, "model.layers.2.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.3.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.4.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.5.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.6.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.7.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.8.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.9.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.10.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.11.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.12.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.13.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.14.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.15.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.0.self_attn.k_proj": { "snr": 0.11727584153413773, "type": "self_attn.k_proj" }, "model.layers.1.self_attn.k_proj": { "snr": 0.24786807596683502, "type": "self_attn.k_proj" }, "model.layers.2.self_attn.k_proj": { "snr": 0.36378130316734314, "type": "self_attn.k_proj" }, "model.layers.3.self_attn.k_proj": { "snr": 0.2983120381832123, "type": "self_attn.k_proj" }, "model.layers.4.self_attn.k_proj": { "snr": 0.33789733052253723, "type": "self_attn.k_proj" }, "model.layers.5.self_attn.k_proj": { "snr": 0.29155924916267395, "type": 
"self_attn.k_proj" }, "model.layers.6.self_attn.k_proj": { "snr": 0.2537297010421753, "type": "self_attn.k_proj" }, "model.layers.7.self_attn.k_proj": { "snr": 0.28204113245010376, "type": "self_attn.k_proj" }, "model.layers.8.self_attn.k_proj": { "snr": 0.2776711583137512, "type": "self_attn.k_proj" }, "model.layers.9.self_attn.k_proj": { "snr": 0.2927376627922058, "type": "self_attn.k_proj" }, "model.layers.10.self_attn.k_proj": { "snr": 0.31486213207244873, "type": "self_attn.k_proj" }, "model.layers.11.self_attn.k_proj": { "snr": 0.32363659143447876, "type": "self_attn.k_proj" }, "model.layers.12.self_attn.k_proj": { "snr": 0.31382912397384644, "type": "self_attn.k_proj" }, "model.layers.13.self_attn.k_proj": { "snr": 0.4635234773159027, "type": "self_attn.k_proj" }, "model.layers.14.self_attn.k_proj": { "snr": 0.25379249453544617, "type": "self_attn.k_proj" }, "model.layers.15.self_attn.k_proj": { "snr": 0.2628238797187805, "type": "self_attn.k_proj" }, "model.layers.0.self_attn.o_proj": { "snr": 0.27602291107177734, "type": "self_attn.o_proj" }, "model.layers.1.self_attn.o_proj": { "snr": 0.2149604707956314, "type": "self_attn.o_proj" }, "model.layers.2.self_attn.o_proj": { "snr": 0.2540294826030731, "type": "self_attn.o_proj" }, "model.layers.3.self_attn.o_proj": { "snr": 0.27978822588920593, "type": "self_attn.o_proj" }, "model.layers.4.self_attn.o_proj": { "snr": 0.3121289908885956, "type": "self_attn.o_proj" }, "model.layers.5.self_attn.o_proj": { "snr": 0.35037684440612793, "type": "self_attn.o_proj" }, "model.layers.6.self_attn.o_proj": { "snr": 0.366205096244812, "type": "self_attn.o_proj" }, "model.layers.7.self_attn.o_proj": { "snr": 0.3692712187767029, "type": "self_attn.o_proj" }, "model.layers.8.self_attn.o_proj": { "snr": 0.3301038146018982, "type": "self_attn.o_proj" }, "model.layers.9.self_attn.o_proj": { "snr": 0.3003396987915039, "type": "self_attn.o_proj" }, "model.layers.10.self_attn.o_proj": { "snr": 0.30804169178009033, "type": 
"self_attn.o_proj" }, "model.layers.11.self_attn.o_proj": { "snr": 0.28501132130622864, "type": "self_attn.o_proj" }, "model.layers.12.self_attn.o_proj": { "snr": 0.2171541005373001, "type": "self_attn.o_proj" }, "model.layers.13.self_attn.o_proj": { "snr": 0.19183959066867828, "type": "self_attn.o_proj" }, "model.layers.14.self_attn.o_proj": { "snr": 0.19215913116931915, "type": "self_attn.o_proj" }, "model.layers.15.self_attn.o_proj": { "snr": 0.25486502051353455, "type": "self_attn.o_proj" }, "model.layers.0.self_attn.q_proj": { "snr": 0.03850084915757179, "type": "self_attn.q_proj" }, "model.layers.1.self_attn.q_proj": { "snr": 0.0713055431842804, "type": "self_attn.q_proj" }, "model.layers.2.self_attn.q_proj": { "snr": 0.07948919385671616, "type": "self_attn.q_proj" }, "model.layers.3.self_attn.q_proj": { "snr": 0.08047746121883392, "type": "self_attn.q_proj" }, "model.layers.4.self_attn.q_proj": { "snr": 0.0852593332529068, "type": "self_attn.q_proj" }, "model.layers.5.self_attn.q_proj": { "snr": 0.09794823825359344, "type": "self_attn.q_proj" }, "model.layers.6.self_attn.q_proj": { "snr": 0.09627152234315872, "type": "self_attn.q_proj" }, "model.layers.7.self_attn.q_proj": { "snr": 0.11065381020307541, "type": "self_attn.q_proj" }, "model.layers.8.self_attn.q_proj": { "snr": 0.12031875550746918, "type": "self_attn.q_proj" }, "model.layers.9.self_attn.q_proj": { "snr": 0.09804573655128479, "type": "self_attn.q_proj" }, "model.layers.10.self_attn.q_proj": { "snr": 0.10897502303123474, "type": "self_attn.q_proj" }, "model.layers.11.self_attn.q_proj": { "snr": 0.09267337620258331, "type": "self_attn.q_proj" }, "model.layers.12.self_attn.q_proj": { "snr": 0.08803492039442062, "type": "self_attn.q_proj" }, "model.layers.13.self_attn.q_proj": { "snr": 0.0902542844414711, "type": "self_attn.q_proj" }, "model.layers.14.self_attn.q_proj": { "snr": 0.10154066979885101, "type": "self_attn.q_proj" }, "model.layers.15.self_attn.q_proj": { "snr": 0.09083802253007889, 
"type": "self_attn.q_proj" }, "model.layers.0.self_attn.v_proj": { "snr": 2.842210054397583, "type": "self_attn.v_proj" }, "model.layers.1.self_attn.v_proj": { "snr": 10.59461498260498, "type": "self_attn.v_proj" }, "model.layers.2.self_attn.v_proj": { "snr": 8.993025779724121, "type": "self_attn.v_proj" }, "model.layers.3.self_attn.v_proj": { "snr": 62.567787170410156, "type": "self_attn.v_proj" }, "model.layers.4.self_attn.v_proj": { "snr": 23.80082893371582, "type": "self_attn.v_proj" }, "model.layers.5.self_attn.v_proj": { "snr": 7.957369804382324, "type": "self_attn.v_proj" }, "model.layers.6.self_attn.v_proj": { "snr": 12.01815414428711, "type": "self_attn.v_proj" }, "model.layers.7.self_attn.v_proj": { "snr": 5.095500469207764, "type": "self_attn.v_proj" }, "model.layers.8.self_attn.v_proj": { "snr": 11.719332695007324, "type": "self_attn.v_proj" }, "model.layers.9.self_attn.v_proj": { "snr": 555.0869750976562, "type": "self_attn.v_proj" }, "model.layers.10.self_attn.v_proj": { "snr": 22.95538330078125, "type": "self_attn.v_proj" }, "model.layers.11.self_attn.v_proj": { "snr": 30.042158126831055, "type": "self_attn.v_proj" }, "model.layers.12.self_attn.v_proj": { "snr": 9.577271461486816, "type": "self_attn.v_proj" }, "model.layers.13.self_attn.v_proj": { "snr": 18.176361083984375, "type": "self_attn.v_proj" }, "model.layers.14.self_attn.v_proj": { "snr": 1.5695856809616089, "type": "self_attn.v_proj" }, "model.layers.15.self_attn.v_proj": { "snr": 2.7235565185546875, "type": "self_attn.v_proj" } } ================================================ FILE: src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-1B.json ================================================ { "model.layers.0.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.1.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.2.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, 
"model.layers.3.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.4.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.5.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.6.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.7.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.8.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.9.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.10.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.11.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.12.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.13.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.14.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.15.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "lm_head": { "snr": Infinity, "type": "lm_head" }, "model.layers.0.mlp.down_proj": { "snr": 57.09797286987305, "type": "mlp.down_proj" }, "model.layers.1.mlp.down_proj": { "snr": 9.538983345031738, "type": "mlp.down_proj" }, "model.layers.2.mlp.down_proj": { "snr": 6.227016925811768, "type": "mlp.down_proj" }, "model.layers.3.mlp.down_proj": { "snr": 5.660686492919922, "type": "mlp.down_proj" }, "model.layers.4.mlp.down_proj": { "snr": 5.178432464599609, "type": "mlp.down_proj" }, "model.layers.5.mlp.down_proj": { "snr": 3.5638349056243896, "type": "mlp.down_proj" }, "model.layers.6.mlp.down_proj": { "snr": 3.0918056964874268, "type": "mlp.down_proj" }, "model.layers.7.mlp.down_proj": { "snr": 2.456392288208008, "type": "mlp.down_proj" }, "model.layers.8.mlp.down_proj": { "snr": 4.525328636169434, "type": "mlp.down_proj" }, "model.layers.9.mlp.down_proj": { "snr": 3.9409055709838867, "type": "mlp.down_proj" }, 
"model.layers.10.mlp.down_proj": { "snr": 5.447249412536621, "type": "mlp.down_proj" }, "model.layers.11.mlp.down_proj": { "snr": 4.807600975036621, "type": "mlp.down_proj" }, "model.layers.12.mlp.down_proj": { "snr": 3.915374517440796, "type": "mlp.down_proj" }, "model.layers.13.mlp.down_proj": { "snr": 3.4820363521575928, "type": "mlp.down_proj" }, "model.layers.14.mlp.down_proj": { "snr": 2.6045074462890625, "type": "mlp.down_proj" }, "model.layers.15.mlp.down_proj": { "snr": 3.7237701416015625, "type": "mlp.down_proj" }, "model.layers.0.mlp.gate_proj": { "snr": 22.160131454467773, "type": "mlp.gate_proj" }, "model.layers.1.mlp.gate_proj": { "snr": 6.072206020355225, "type": "mlp.gate_proj" }, "model.layers.2.mlp.gate_proj": { "snr": 3.2467362880706787, "type": "mlp.gate_proj" }, "model.layers.3.mlp.gate_proj": { "snr": 1.4111896753311157, "type": "mlp.gate_proj" }, "model.layers.4.mlp.gate_proj": { "snr": 0.7405938506126404, "type": "mlp.gate_proj" }, "model.layers.5.mlp.gate_proj": { "snr": 0.5916463136672974, "type": "mlp.gate_proj" }, "model.layers.6.mlp.gate_proj": { "snr": 0.6149423718452454, "type": "mlp.gate_proj" }, "model.layers.7.mlp.gate_proj": { "snr": 0.48369669914245605, "type": "mlp.gate_proj" }, "model.layers.8.mlp.gate_proj": { "snr": 0.6047574877738953, "type": "mlp.gate_proj" }, "model.layers.9.mlp.gate_proj": { "snr": 0.5092479586601257, "type": "mlp.gate_proj" }, "model.layers.10.mlp.gate_proj": { "snr": 0.5999670624732971, "type": "mlp.gate_proj" }, "model.layers.11.mlp.gate_proj": { "snr": 0.8980127573013306, "type": "mlp.gate_proj" }, "model.layers.12.mlp.gate_proj": { "snr": 1.4252448081970215, "type": "mlp.gate_proj" }, "model.layers.13.mlp.gate_proj": { "snr": 1.509937047958374, "type": "mlp.gate_proj" }, "model.layers.14.mlp.gate_proj": { "snr": 1.0066585540771484, "type": "mlp.gate_proj" }, "model.layers.15.mlp.gate_proj": { "snr": 0.6413647532463074, "type": "mlp.gate_proj" }, "model.layers.0.mlp.up_proj": { "snr": 
26.08852195739746, "type": "mlp.up_proj" }, "model.layers.1.mlp.up_proj": { "snr": 13.382951736450195, "type": "mlp.up_proj" }, "model.layers.2.mlp.up_proj": { "snr": 20.088768005371094, "type": "mlp.up_proj" }, "model.layers.3.mlp.up_proj": { "snr": 23.0632381439209, "type": "mlp.up_proj" }, "model.layers.4.mlp.up_proj": { "snr": 16.07433319091797, "type": "mlp.up_proj" }, "model.layers.5.mlp.up_proj": { "snr": 8.00507640838623, "type": "mlp.up_proj" }, "model.layers.6.mlp.up_proj": { "snr": 9.538354873657227, "type": "mlp.up_proj" }, "model.layers.7.mlp.up_proj": { "snr": 6.286602973937988, "type": "mlp.up_proj" }, "model.layers.8.mlp.up_proj": { "snr": 10.092820167541504, "type": "mlp.up_proj" }, "model.layers.9.mlp.up_proj": { "snr": 7.193963527679443, "type": "mlp.up_proj" }, "model.layers.10.mlp.up_proj": { "snr": 7.320116996765137, "type": "mlp.up_proj" }, "model.layers.11.mlp.up_proj": { "snr": 4.8728532791137695, "type": "mlp.up_proj" }, "model.layers.12.mlp.up_proj": { "snr": 3.596583366394043, "type": "mlp.up_proj" }, "model.layers.13.mlp.up_proj": { "snr": 3.166161298751831, "type": "mlp.up_proj" }, "model.layers.14.mlp.up_proj": { "snr": 1.5600818395614624, "type": "mlp.up_proj" }, "model.layers.15.mlp.up_proj": { "snr": 0.8726214170455933, "type": "mlp.up_proj" }, "model.embed_tokens": { "snr": Infinity, "type": "model.embed_tokens" }, "model.norm": { "snr": Infinity, "type": "model.norm" }, "model.layers.0.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.1.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.2.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.3.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.4.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.5.post_attention_layernorm": { "snr": Infinity, "type": 
"post_attention_layernorm" }, "model.layers.6.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.7.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.8.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.9.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.10.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.11.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.12.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.13.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.14.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.15.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.0.self_attn.k_proj": { "snr": 0.1154392883181572, "type": "self_attn.k_proj" }, "model.layers.1.self_attn.k_proj": { "snr": 0.24299409985542297, "type": "self_attn.k_proj" }, "model.layers.2.self_attn.k_proj": { "snr": 0.3624322712421417, "type": "self_attn.k_proj" }, "model.layers.3.self_attn.k_proj": { "snr": 0.29509487748146057, "type": "self_attn.k_proj" }, "model.layers.4.self_attn.k_proj": { "snr": 0.32953736186027527, "type": "self_attn.k_proj" }, "model.layers.5.self_attn.k_proj": { "snr": 0.2908833622932434, "type": "self_attn.k_proj" }, "model.layers.6.self_attn.k_proj": { "snr": 0.2488437294960022, "type": "self_attn.k_proj" }, "model.layers.7.self_attn.k_proj": { "snr": 0.27847856283187866, "type": "self_attn.k_proj" }, "model.layers.8.self_attn.k_proj": { "snr": 0.27143892645835876, "type": "self_attn.k_proj" }, "model.layers.9.self_attn.k_proj": { "snr": 0.28804272413253784, "type": "self_attn.k_proj" }, 
"model.layers.10.self_attn.k_proj": { "snr": 0.31197959184646606, "type": "self_attn.k_proj" }, "model.layers.11.self_attn.k_proj": { "snr": 0.3203586935997009, "type": "self_attn.k_proj" }, "model.layers.12.self_attn.k_proj": { "snr": 0.30905747413635254, "type": "self_attn.k_proj" }, "model.layers.13.self_attn.k_proj": { "snr": 0.46828722953796387, "type": "self_attn.k_proj" }, "model.layers.14.self_attn.k_proj": { "snr": 0.24205778539180756, "type": "self_attn.k_proj" }, "model.layers.15.self_attn.k_proj": { "snr": 0.2559327781200409, "type": "self_attn.k_proj" }, "model.layers.0.self_attn.o_proj": { "snr": 0.2638678550720215, "type": "self_attn.o_proj" }, "model.layers.1.self_attn.o_proj": { "snr": 0.21109595894813538, "type": "self_attn.o_proj" }, "model.layers.2.self_attn.o_proj": { "snr": 0.24751724302768707, "type": "self_attn.o_proj" }, "model.layers.3.self_attn.o_proj": { "snr": 0.2728094160556793, "type": "self_attn.o_proj" }, "model.layers.4.self_attn.o_proj": { "snr": 0.3001374304294586, "type": "self_attn.o_proj" }, "model.layers.5.self_attn.o_proj": { "snr": 0.33903488516807556, "type": "self_attn.o_proj" }, "model.layers.6.self_attn.o_proj": { "snr": 0.3530929982662201, "type": "self_attn.o_proj" }, "model.layers.7.self_attn.o_proj": { "snr": 0.36753255128860474, "type": "self_attn.o_proj" }, "model.layers.8.self_attn.o_proj": { "snr": 0.3373180329799652, "type": "self_attn.o_proj" }, "model.layers.9.self_attn.o_proj": { "snr": 0.2970578670501709, "type": "self_attn.o_proj" }, "model.layers.10.self_attn.o_proj": { "snr": 0.3076324760913849, "type": "self_attn.o_proj" }, "model.layers.11.self_attn.o_proj": { "snr": 0.2766900658607483, "type": "self_attn.o_proj" }, "model.layers.12.self_attn.o_proj": { "snr": 0.20973259210586548, "type": "self_attn.o_proj" }, "model.layers.13.self_attn.o_proj": { "snr": 0.18185566365718842, "type": "self_attn.o_proj" }, "model.layers.14.self_attn.o_proj": { "snr": 0.18329747021198273, "type": "self_attn.o_proj" }, 
"model.layers.15.self_attn.o_proj": { "snr": 0.2437991499900818, "type": "self_attn.o_proj" }, "model.layers.0.self_attn.q_proj": { "snr": 0.038040731102228165, "type": "self_attn.q_proj" }, "model.layers.1.self_attn.q_proj": { "snr": 0.0707998052239418, "type": "self_attn.q_proj" }, "model.layers.2.self_attn.q_proj": { "snr": 0.0787411704659462, "type": "self_attn.q_proj" }, "model.layers.3.self_attn.q_proj": { "snr": 0.08089710026979446, "type": "self_attn.q_proj" }, "model.layers.4.self_attn.q_proj": { "snr": 0.08591937273740768, "type": "self_attn.q_proj" }, "model.layers.5.self_attn.q_proj": { "snr": 0.09852176159620285, "type": "self_attn.q_proj" }, "model.layers.6.self_attn.q_proj": { "snr": 0.09690654277801514, "type": "self_attn.q_proj" }, "model.layers.7.self_attn.q_proj": { "snr": 0.11181341856718063, "type": "self_attn.q_proj" }, "model.layers.8.self_attn.q_proj": { "snr": 0.12042108923196793, "type": "self_attn.q_proj" }, "model.layers.9.self_attn.q_proj": { "snr": 0.09799323976039886, "type": "self_attn.q_proj" }, "model.layers.10.self_attn.q_proj": { "snr": 0.10901063680648804, "type": "self_attn.q_proj" }, "model.layers.11.self_attn.q_proj": { "snr": 0.09307146072387695, "type": "self_attn.q_proj" }, "model.layers.12.self_attn.q_proj": { "snr": 0.0880950540304184, "type": "self_attn.q_proj" }, "model.layers.13.self_attn.q_proj": { "snr": 0.08886399120092392, "type": "self_attn.q_proj" }, "model.layers.14.self_attn.q_proj": { "snr": 0.09955056011676788, "type": "self_attn.q_proj" }, "model.layers.15.self_attn.q_proj": { "snr": 0.08929339051246643, "type": "self_attn.q_proj" }, "model.layers.0.self_attn.v_proj": { "snr": 2.5501928329467773, "type": "self_attn.v_proj" }, "model.layers.1.self_attn.v_proj": { "snr": 9.449499130249023, "type": "self_attn.v_proj" }, "model.layers.2.self_attn.v_proj": { "snr": 7.9920830726623535, "type": "self_attn.v_proj" }, "model.layers.3.self_attn.v_proj": { "snr": 50.69462585449219, "type": "self_attn.v_proj" }, 
"model.layers.4.self_attn.v_proj": { "snr": 19.083511352539062, "type": "self_attn.v_proj" }, "model.layers.5.self_attn.v_proj": { "snr": 7.21597146987915, "type": "self_attn.v_proj" }, "model.layers.6.self_attn.v_proj": { "snr": 11.27744197845459, "type": "self_attn.v_proj" }, "model.layers.7.self_attn.v_proj": { "snr": 4.579711437225342, "type": "self_attn.v_proj" }, "model.layers.8.self_attn.v_proj": { "snr": 10.940719604492188, "type": "self_attn.v_proj" }, "model.layers.9.self_attn.v_proj": { "snr": 553.4417724609375, "type": "self_attn.v_proj" }, "model.layers.10.self_attn.v_proj": { "snr": 20.59434700012207, "type": "self_attn.v_proj" }, "model.layers.11.self_attn.v_proj": { "snr": 26.636865615844727, "type": "self_attn.v_proj" }, "model.layers.12.self_attn.v_proj": { "snr": 8.614749908447266, "type": "self_attn.v_proj" }, "model.layers.13.self_attn.v_proj": { "snr": 17.722007751464844, "type": "self_attn.v_proj" }, "model.layers.14.self_attn.v_proj": { "snr": 1.48500657081604, "type": "self_attn.v_proj" }, "model.layers.15.self_attn.v_proj": { "snr": 2.5776851177215576, "type": "self_attn.v_proj" } } ================================================ FILE: src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-3B-Instruct.json ================================================ { "model.layers.0.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.1.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.2.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.3.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.4.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.5.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.6.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.7.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, 
"model.layers.8.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.9.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.10.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.11.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.12.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.13.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.14.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.15.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.16.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.17.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.18.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.19.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.20.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.21.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.22.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.23.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.24.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.25.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.26.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.27.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "lm_head": { "snr": Infinity, "type": "lm_head" }, "model.layers.0.mlp.down_proj": { "snr": 2.306217670440674, "type": "mlp.down_proj" }, "model.layers.1.mlp.down_proj": { "snr": 2.2327167987823486, "type": "mlp.down_proj" }, "model.layers.2.mlp.down_proj": { "snr": 1.4501516819000244, "type": "mlp.down_proj" }, 
"model.layers.3.mlp.down_proj": { "snr": 1.363667607307434, "type": "mlp.down_proj" }, "model.layers.4.mlp.down_proj": { "snr": 1.4520279169082642, "type": "mlp.down_proj" }, "model.layers.5.mlp.down_proj": { "snr": 1.4664665460586548, "type": "mlp.down_proj" }, "model.layers.6.mlp.down_proj": { "snr": 1.4122329950332642, "type": "mlp.down_proj" }, "model.layers.7.mlp.down_proj": { "snr": 1.0504299402236938, "type": "mlp.down_proj" }, "model.layers.8.mlp.down_proj": { "snr": 0.9837537407875061, "type": "mlp.down_proj" }, "model.layers.9.mlp.down_proj": { "snr": 0.8659006357192993, "type": "mlp.down_proj" }, "model.layers.10.mlp.down_proj": { "snr": 0.7936406135559082, "type": "mlp.down_proj" }, "model.layers.11.mlp.down_proj": { "snr": 0.9000886678695679, "type": "mlp.down_proj" }, "model.layers.12.mlp.down_proj": { "snr": 1.1559213399887085, "type": "mlp.down_proj" }, "model.layers.13.mlp.down_proj": { "snr": 1.3054672479629517, "type": "mlp.down_proj" }, "model.layers.14.mlp.down_proj": { "snr": 1.196791410446167, "type": "mlp.down_proj" }, "model.layers.15.mlp.down_proj": { "snr": 1.3163655996322632, "type": "mlp.down_proj" }, "model.layers.16.mlp.down_proj": { "snr": 1.3388997316360474, "type": "mlp.down_proj" }, "model.layers.17.mlp.down_proj": { "snr": 1.592497706413269, "type": "mlp.down_proj" }, "model.layers.18.mlp.down_proj": { "snr": 1.5399079322814941, "type": "mlp.down_proj" }, "model.layers.19.mlp.down_proj": { "snr": 1.5683293342590332, "type": "mlp.down_proj" }, "model.layers.20.mlp.down_proj": { "snr": 1.4739630222320557, "type": "mlp.down_proj" }, "model.layers.21.mlp.down_proj": { "snr": 1.2608393430709839, "type": "mlp.down_proj" }, "model.layers.22.mlp.down_proj": { "snr": 1.2087301015853882, "type": "mlp.down_proj" }, "model.layers.23.mlp.down_proj": { "snr": 1.1851829290390015, "type": "mlp.down_proj" }, "model.layers.24.mlp.down_proj": { "snr": 1.0537594556808472, "type": "mlp.down_proj" }, "model.layers.25.mlp.down_proj": { "snr": 
1.1649317741394043, "type": "mlp.down_proj" }, "model.layers.26.mlp.down_proj": { "snr": 1.2376821041107178, "type": "mlp.down_proj" }, "model.layers.27.mlp.down_proj": { "snr": 1.147771954536438, "type": "mlp.down_proj" }, "model.layers.0.mlp.gate_proj": { "snr": 0.9385462999343872, "type": "mlp.gate_proj" }, "model.layers.1.mlp.gate_proj": { "snr": 0.8528683185577393, "type": "mlp.gate_proj" }, "model.layers.2.mlp.gate_proj": { "snr": 0.761657178401947, "type": "mlp.gate_proj" }, "model.layers.3.mlp.gate_proj": { "snr": 0.6598325371742249, "type": "mlp.gate_proj" }, "model.layers.4.mlp.gate_proj": { "snr": 0.44578588008880615, "type": "mlp.gate_proj" }, "model.layers.5.mlp.gate_proj": { "snr": 0.4053060710430145, "type": "mlp.gate_proj" }, "model.layers.6.mlp.gate_proj": { "snr": 0.3588462769985199, "type": "mlp.gate_proj" }, "model.layers.7.mlp.gate_proj": { "snr": 0.35667839646339417, "type": "mlp.gate_proj" }, "model.layers.8.mlp.gate_proj": { "snr": 0.3106202781200409, "type": "mlp.gate_proj" }, "model.layers.9.mlp.gate_proj": { "snr": 0.2821919322013855, "type": "mlp.gate_proj" }, "model.layers.10.mlp.gate_proj": { "snr": 0.29143741726875305, "type": "mlp.gate_proj" }, "model.layers.11.mlp.gate_proj": { "snr": 0.29830989241600037, "type": "mlp.gate_proj" }, "model.layers.12.mlp.gate_proj": { "snr": 0.2862427532672882, "type": "mlp.gate_proj" }, "model.layers.13.mlp.gate_proj": { "snr": 0.2797018587589264, "type": "mlp.gate_proj" }, "model.layers.14.mlp.gate_proj": { "snr": 0.2679217755794525, "type": "mlp.gate_proj" }, "model.layers.15.mlp.gate_proj": { "snr": 0.2782425880432129, "type": "mlp.gate_proj" }, "model.layers.16.mlp.gate_proj": { "snr": 0.3503592610359192, "type": "mlp.gate_proj" }, "model.layers.17.mlp.gate_proj": { "snr": 0.3968559205532074, "type": "mlp.gate_proj" }, "model.layers.18.mlp.gate_proj": { "snr": 0.4318574070930481, "type": "mlp.gate_proj" }, "model.layers.19.mlp.gate_proj": { "snr": 0.4693693220615387, "type": "mlp.gate_proj" }, 
"model.layers.20.mlp.gate_proj": { "snr": 0.5051979422569275, "type": "mlp.gate_proj" }, "model.layers.21.mlp.gate_proj": { "snr": 0.5675955414772034, "type": "mlp.gate_proj" }, "model.layers.22.mlp.gate_proj": { "snr": 0.5861843824386597, "type": "mlp.gate_proj" }, "model.layers.23.mlp.gate_proj": { "snr": 0.4759417772293091, "type": "mlp.gate_proj" }, "model.layers.24.mlp.gate_proj": { "snr": 0.38529056310653687, "type": "mlp.gate_proj" }, "model.layers.25.mlp.gate_proj": { "snr": 0.3180919587612152, "type": "mlp.gate_proj" }, "model.layers.26.mlp.gate_proj": { "snr": 0.2695689797401428, "type": "mlp.gate_proj" }, "model.layers.27.mlp.gate_proj": { "snr": 0.21765239536762238, "type": "mlp.gate_proj" }, "model.layers.0.mlp.up_proj": { "snr": 1.4919718503952026, "type": "mlp.up_proj" }, "model.layers.1.mlp.up_proj": { "snr": 1.7983858585357666, "type": "mlp.up_proj" }, "model.layers.2.mlp.up_proj": { "snr": 2.1709094047546387, "type": "mlp.up_proj" }, "model.layers.3.mlp.up_proj": { "snr": 2.751326560974121, "type": "mlp.up_proj" }, "model.layers.4.mlp.up_proj": { "snr": 3.063521385192871, "type": "mlp.up_proj" }, "model.layers.5.mlp.up_proj": { "snr": 2.4026951789855957, "type": "mlp.up_proj" }, "model.layers.6.mlp.up_proj": { "snr": 2.3890223503112793, "type": "mlp.up_proj" }, "model.layers.7.mlp.up_proj": { "snr": 2.3861353397369385, "type": "mlp.up_proj" }, "model.layers.8.mlp.up_proj": { "snr": 2.0745043754577637, "type": "mlp.up_proj" }, "model.layers.9.mlp.up_proj": { "snr": 1.8550645112991333, "type": "mlp.up_proj" }, "model.layers.10.mlp.up_proj": { "snr": 1.6184496879577637, "type": "mlp.up_proj" }, "model.layers.11.mlp.up_proj": { "snr": 1.9287559986114502, "type": "mlp.up_proj" }, "model.layers.12.mlp.up_proj": { "snr": 1.7427546977996826, "type": "mlp.up_proj" }, "model.layers.13.mlp.up_proj": { "snr": 1.9872609376907349, "type": "mlp.up_proj" }, "model.layers.14.mlp.up_proj": { "snr": 2.0224087238311768, "type": "mlp.up_proj" }, 
"model.layers.15.mlp.up_proj": { "snr": 1.7851638793945312, "type": "mlp.up_proj" }, "model.layers.16.mlp.up_proj": { "snr": 1.7160604000091553, "type": "mlp.up_proj" }, "model.layers.17.mlp.up_proj": { "snr": 1.6870195865631104, "type": "mlp.up_proj" }, "model.layers.18.mlp.up_proj": { "snr": 1.6585396528244019, "type": "mlp.up_proj" }, "model.layers.19.mlp.up_proj": { "snr": 1.5509096384048462, "type": "mlp.up_proj" }, "model.layers.20.mlp.up_proj": { "snr": 1.4310423135757446, "type": "mlp.up_proj" }, "model.layers.21.mlp.up_proj": { "snr": 1.5009464025497437, "type": "mlp.up_proj" }, "model.layers.22.mlp.up_proj": { "snr": 1.4866929054260254, "type": "mlp.up_proj" }, "model.layers.23.mlp.up_proj": { "snr": 1.332513689994812, "type": "mlp.up_proj" }, "model.layers.24.mlp.up_proj": { "snr": 1.073512077331543, "type": "mlp.up_proj" }, "model.layers.25.mlp.up_proj": { "snr": 0.7472100257873535, "type": "mlp.up_proj" }, "model.layers.26.mlp.up_proj": { "snr": 0.4880162179470062, "type": "mlp.up_proj" }, "model.layers.27.mlp.up_proj": { "snr": 0.2527681589126587, "type": "mlp.up_proj" }, "model.embed_tokens": { "snr": Infinity, "type": "model.embed_tokens" }, "model.norm": { "snr": Infinity, "type": "model.norm" }, "model.layers.0.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.1.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.2.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.3.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.4.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.5.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.6.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.7.post_attention_layernorm": { "snr": Infinity, 
"type": "post_attention_layernorm" }, "model.layers.8.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.9.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.10.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.11.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.12.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.13.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.14.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.15.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.16.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.17.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.18.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.19.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.20.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.21.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.22.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.23.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.24.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.25.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.26.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.27.post_attention_layernorm": 
{ "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.0.self_attn.k_proj": { "snr": 0.08262510597705841, "type": "self_attn.k_proj" }, "model.layers.1.self_attn.k_proj": { "snr": 0.1441459059715271, "type": "self_attn.k_proj" }, "model.layers.2.self_attn.k_proj": { "snr": 0.21418076753616333, "type": "self_attn.k_proj" }, "model.layers.3.self_attn.k_proj": { "snr": 0.22496014833450317, "type": "self_attn.k_proj" }, "model.layers.4.self_attn.k_proj": { "snr": 0.23101305961608887, "type": "self_attn.k_proj" }, "model.layers.5.self_attn.k_proj": { "snr": 0.23644132912158966, "type": "self_attn.k_proj" }, "model.layers.6.self_attn.k_proj": { "snr": 0.23666173219680786, "type": "self_attn.k_proj" }, "model.layers.7.self_attn.k_proj": { "snr": 0.19791515171527863, "type": "self_attn.k_proj" }, "model.layers.8.self_attn.k_proj": { "snr": 0.22062039375305176, "type": "self_attn.k_proj" }, "model.layers.9.self_attn.k_proj": { "snr": 0.21218444406986237, "type": "self_attn.k_proj" }, "model.layers.10.self_attn.k_proj": { "snr": 0.24218571186065674, "type": "self_attn.k_proj" }, "model.layers.11.self_attn.k_proj": { "snr": 0.21870514750480652, "type": "self_attn.k_proj" }, "model.layers.12.self_attn.k_proj": { "snr": 0.22160987555980682, "type": "self_attn.k_proj" }, "model.layers.13.self_attn.k_proj": { "snr": 0.22726823389530182, "type": "self_attn.k_proj" }, "model.layers.14.self_attn.k_proj": { "snr": 0.20256873965263367, "type": "self_attn.k_proj" }, "model.layers.15.self_attn.k_proj": { "snr": 0.24100735783576965, "type": "self_attn.k_proj" }, "model.layers.16.self_attn.k_proj": { "snr": 0.23794010281562805, "type": "self_attn.k_proj" }, "model.layers.17.self_attn.k_proj": { "snr": 0.2913324534893036, "type": "self_attn.k_proj" }, "model.layers.18.self_attn.k_proj": { "snr": 0.28093472123146057, "type": "self_attn.k_proj" }, "model.layers.19.self_attn.k_proj": { "snr": 0.31062793731689453, "type": "self_attn.k_proj" }, 
"model.layers.20.self_attn.k_proj": { "snr": 0.2942160367965698, "type": "self_attn.k_proj" }, "model.layers.21.self_attn.k_proj": { "snr": 0.28014805912971497, "type": "self_attn.k_proj" }, "model.layers.22.self_attn.k_proj": { "snr": 0.3512437045574188, "type": "self_attn.k_proj" }, "model.layers.23.self_attn.k_proj": { "snr": 0.2837671637535095, "type": "self_attn.k_proj" }, "model.layers.24.self_attn.k_proj": { "snr": 0.2960015535354614, "type": "self_attn.k_proj" }, "model.layers.25.self_attn.k_proj": { "snr": 0.5086414813995361, "type": "self_attn.k_proj" }, "model.layers.26.self_attn.k_proj": { "snr": 0.24054698646068573, "type": "self_attn.k_proj" }, "model.layers.27.self_attn.k_proj": { "snr": 0.247616246342659, "type": "self_attn.k_proj" }, "model.layers.0.self_attn.o_proj": { "snr": 0.18390265107154846, "type": "self_attn.o_proj" }, "model.layers.1.self_attn.o_proj": { "snr": 0.14759540557861328, "type": "self_attn.o_proj" }, "model.layers.2.self_attn.o_proj": { "snr": 0.15726515650749207, "type": "self_attn.o_proj" }, "model.layers.3.self_attn.o_proj": { "snr": 0.16903570294380188, "type": "self_attn.o_proj" }, "model.layers.4.self_attn.o_proj": { "snr": 0.17953157424926758, "type": "self_attn.o_proj" }, "model.layers.5.self_attn.o_proj": { "snr": 0.2351229190826416, "type": "self_attn.o_proj" }, "model.layers.6.self_attn.o_proj": { "snr": 0.22804339230060577, "type": "self_attn.o_proj" }, "model.layers.7.self_attn.o_proj": { "snr": 0.24786025285720825, "type": "self_attn.o_proj" }, "model.layers.8.self_attn.o_proj": { "snr": 0.21847976744174957, "type": "self_attn.o_proj" }, "model.layers.9.self_attn.o_proj": { "snr": 0.2092437595129013, "type": "self_attn.o_proj" }, "model.layers.10.self_attn.o_proj": { "snr": 0.23278094828128815, "type": "self_attn.o_proj" }, "model.layers.11.self_attn.o_proj": { "snr": 0.20468176901340485, "type": "self_attn.o_proj" }, "model.layers.12.self_attn.o_proj": { "snr": 0.2353818416595459, "type": "self_attn.o_proj" }, 
"model.layers.13.self_attn.o_proj": { "snr": 0.2702614367008209, "type": "self_attn.o_proj" }, "model.layers.14.self_attn.o_proj": { "snr": 0.19177420437335968, "type": "self_attn.o_proj" }, "model.layers.15.self_attn.o_proj": { "snr": 0.18293911218643188, "type": "self_attn.o_proj" }, "model.layers.16.self_attn.o_proj": { "snr": 0.20286045968532562, "type": "self_attn.o_proj" }, "model.layers.17.self_attn.o_proj": { "snr": 0.20763878524303436, "type": "self_attn.o_proj" }, "model.layers.18.self_attn.o_proj": { "snr": 0.190629780292511, "type": "self_attn.o_proj" }, "model.layers.19.self_attn.o_proj": { "snr": 0.22044304013252258, "type": "self_attn.o_proj" }, "model.layers.20.self_attn.o_proj": { "snr": 0.21491236984729767, "type": "self_attn.o_proj" }, "model.layers.21.self_attn.o_proj": { "snr": 0.23289704322814941, "type": "self_attn.o_proj" }, "model.layers.22.self_attn.o_proj": { "snr": 0.21457163989543915, "type": "self_attn.o_proj" }, "model.layers.23.self_attn.o_proj": { "snr": 0.1949365884065628, "type": "self_attn.o_proj" }, "model.layers.24.self_attn.o_proj": { "snr": 0.1606779545545578, "type": "self_attn.o_proj" }, "model.layers.25.self_attn.o_proj": { "snr": 0.13892440497875214, "type": "self_attn.o_proj" }, "model.layers.26.self_attn.o_proj": { "snr": 0.1407029926776886, "type": "self_attn.o_proj" }, "model.layers.27.self_attn.o_proj": { "snr": 0.16027599573135376, "type": "self_attn.o_proj" }, "model.layers.0.self_attn.q_proj": { "snr": 0.0534212663769722, "type": "self_attn.q_proj" }, "model.layers.1.self_attn.q_proj": { "snr": 0.06873775273561478, "type": "self_attn.q_proj" }, "model.layers.2.self_attn.q_proj": { "snr": 0.07522258907556534, "type": "self_attn.q_proj" }, "model.layers.3.self_attn.q_proj": { "snr": 0.06616844981908798, "type": "self_attn.q_proj" }, "model.layers.4.self_attn.q_proj": { "snr": 0.06809444725513458, "type": "self_attn.q_proj" }, "model.layers.5.self_attn.q_proj": { "snr": 0.0758095383644104, "type": "self_attn.q_proj" 
}, "model.layers.6.self_attn.q_proj": { "snr": 0.07800278812646866, "type": "self_attn.q_proj" }, "model.layers.7.self_attn.q_proj": { "snr": 0.07535763084888458, "type": "self_attn.q_proj" }, "model.layers.8.self_attn.q_proj": { "snr": 0.09488166123628616, "type": "self_attn.q_proj" }, "model.layers.9.self_attn.q_proj": { "snr": 0.09709945321083069, "type": "self_attn.q_proj" }, "model.layers.10.self_attn.q_proj": { "snr": 0.09381720423698425, "type": "self_attn.q_proj" }, "model.layers.11.self_attn.q_proj": { "snr": 0.08205580711364746, "type": "self_attn.q_proj" }, "model.layers.12.self_attn.q_proj": { "snr": 0.10723169893026352, "type": "self_attn.q_proj" }, "model.layers.13.self_attn.q_proj": { "snr": 0.10166660696268082, "type": "self_attn.q_proj" }, "model.layers.14.self_attn.q_proj": { "snr": 0.08822792023420334, "type": "self_attn.q_proj" }, "model.layers.15.self_attn.q_proj": { "snr": 0.0814041867852211, "type": "self_attn.q_proj" }, "model.layers.16.self_attn.q_proj": { "snr": 0.07586681097745895, "type": "self_attn.q_proj" }, "model.layers.17.self_attn.q_proj": { "snr": 0.07040166854858398, "type": "self_attn.q_proj" }, "model.layers.18.self_attn.q_proj": { "snr": 0.0728282704949379, "type": "self_attn.q_proj" }, "model.layers.19.self_attn.q_proj": { "snr": 0.06912193447351456, "type": "self_attn.q_proj" }, "model.layers.20.self_attn.q_proj": { "snr": 0.06646180897951126, "type": "self_attn.q_proj" }, "model.layers.21.self_attn.q_proj": { "snr": 0.06960278004407883, "type": "self_attn.q_proj" }, "model.layers.22.self_attn.q_proj": { "snr": 0.06566876918077469, "type": "self_attn.q_proj" }, "model.layers.23.self_attn.q_proj": { "snr": 0.07412787526845932, "type": "self_attn.q_proj" }, "model.layers.24.self_attn.q_proj": { "snr": 0.07131384313106537, "type": "self_attn.q_proj" }, "model.layers.25.self_attn.q_proj": { "snr": 0.07768437266349792, "type": "self_attn.q_proj" }, "model.layers.26.self_attn.q_proj": { "snr": 0.0809575766324997, "type": 
"self_attn.q_proj" }, "model.layers.27.self_attn.q_proj": { "snr": 0.06796683371067047, "type": "self_attn.q_proj" }, "model.layers.0.self_attn.v_proj": { "snr": 1.4029983282089233, "type": "self_attn.v_proj" }, "model.layers.1.self_attn.v_proj": { "snr": 3.123720169067383, "type": "self_attn.v_proj" }, "model.layers.2.self_attn.v_proj": { "snr": 2.4177253246307373, "type": "self_attn.v_proj" }, "model.layers.3.self_attn.v_proj": { "snr": 5.588768005371094, "type": "self_attn.v_proj" }, "model.layers.4.self_attn.v_proj": { "snr": 4.395562648773193, "type": "self_attn.v_proj" }, "model.layers.5.self_attn.v_proj": { "snr": 3.2982685565948486, "type": "self_attn.v_proj" }, "model.layers.6.self_attn.v_proj": { "snr": 3.2798449993133545, "type": "self_attn.v_proj" }, "model.layers.7.self_attn.v_proj": { "snr": 2.109200954437256, "type": "self_attn.v_proj" }, "model.layers.8.self_attn.v_proj": { "snr": 3.229325532913208, "type": "self_attn.v_proj" }, "model.layers.9.self_attn.v_proj": { "snr": 1.7349927425384521, "type": "self_attn.v_proj" }, "model.layers.10.self_attn.v_proj": { "snr": 1.5926740169525146, "type": "self_attn.v_proj" }, "model.layers.11.self_attn.v_proj": { "snr": 1.9097802639007568, "type": "self_attn.v_proj" }, "model.layers.12.self_attn.v_proj": { "snr": 2.5654332637786865, "type": "self_attn.v_proj" }, "model.layers.13.self_attn.v_proj": { "snr": 3.536489963531494, "type": "self_attn.v_proj" }, "model.layers.14.self_attn.v_proj": { "snr": 8.366667747497559, "type": "self_attn.v_proj" }, "model.layers.15.self_attn.v_proj": { "snr": 7.348303318023682, "type": "self_attn.v_proj" }, "model.layers.16.self_attn.v_proj": { "snr": 2.815748691558838, "type": "self_attn.v_proj" }, "model.layers.17.self_attn.v_proj": { "snr": 4.048776149749756, "type": "self_attn.v_proj" }, "model.layers.18.self_attn.v_proj": { "snr": 4.426101207733154, "type": "self_attn.v_proj" }, "model.layers.19.self_attn.v_proj": { "snr": 7.098501682281494, "type": "self_attn.v_proj" }, 
"model.layers.20.self_attn.v_proj": { "snr": 3.700288772583008, "type": "self_attn.v_proj" }, "model.layers.21.self_attn.v_proj": { "snr": 2.1859049797058105, "type": "self_attn.v_proj" }, "model.layers.22.self_attn.v_proj": { "snr": 3.6953284740448, "type": "self_attn.v_proj" }, "model.layers.23.self_attn.v_proj": { "snr": 11.148802757263184, "type": "self_attn.v_proj" }, "model.layers.24.self_attn.v_proj": { "snr": 2.4171905517578125, "type": "self_attn.v_proj" }, "model.layers.25.self_attn.v_proj": { "snr": 4.404144287109375, "type": "self_attn.v_proj" }, "model.layers.26.self_attn.v_proj": { "snr": 2.340604782104492, "type": "self_attn.v_proj" }, "model.layers.27.self_attn.v_proj": { "snr": 3.284160614013672, "type": "self_attn.v_proj" } } ================================================ FILE: src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-3B.json ================================================ { "model.layers.0.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.1.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.2.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.3.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.4.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.5.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.6.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.7.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.8.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.9.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.10.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.11.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.12.input_layernorm": { "snr": Infinity, 
"type": "input_layernorm" }, "model.layers.13.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.14.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.15.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.16.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.17.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.18.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.19.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.20.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.21.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.22.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.23.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.24.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.25.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.26.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "model.layers.27.input_layernorm": { "snr": Infinity, "type": "input_layernorm" }, "lm_head": { "snr": Infinity, "type": "lm_head" }, "model.layers.0.mlp.down_proj": { "snr": 2.364603281021118, "type": "mlp.down_proj" }, "model.layers.1.mlp.down_proj": { "snr": 2.229910373687744, "type": "mlp.down_proj" }, "model.layers.2.mlp.down_proj": { "snr": 1.4312117099761963, "type": "mlp.down_proj" }, "model.layers.3.mlp.down_proj": { "snr": 1.3216407299041748, "type": "mlp.down_proj" }, "model.layers.4.mlp.down_proj": { "snr": 1.4183496236801147, "type": "mlp.down_proj" }, "model.layers.5.mlp.down_proj": { "snr": 1.4453660249710083, "type": "mlp.down_proj" }, "model.layers.6.mlp.down_proj": { "snr": 1.4030662775039673, "type": "mlp.down_proj" }, "model.layers.7.mlp.down_proj": { "snr": 1.042332649230957, 
"type": "mlp.down_proj" }, "model.layers.8.mlp.down_proj": { "snr": 0.9530982375144958, "type": "mlp.down_proj" }, "model.layers.9.mlp.down_proj": { "snr": 0.849862277507782, "type": "mlp.down_proj" }, "model.layers.10.mlp.down_proj": { "snr": 0.7704945206642151, "type": "mlp.down_proj" }, "model.layers.11.mlp.down_proj": { "snr": 0.8871145844459534, "type": "mlp.down_proj" }, "model.layers.12.mlp.down_proj": { "snr": 1.1408143043518066, "type": "mlp.down_proj" }, "model.layers.13.mlp.down_proj": { "snr": 1.2769343852996826, "type": "mlp.down_proj" }, "model.layers.14.mlp.down_proj": { "snr": 1.1703068017959595, "type": "mlp.down_proj" }, "model.layers.15.mlp.down_proj": { "snr": 1.2794467210769653, "type": "mlp.down_proj" }, "model.layers.16.mlp.down_proj": { "snr": 1.3154453039169312, "type": "mlp.down_proj" }, "model.layers.17.mlp.down_proj": { "snr": 1.5596749782562256, "type": "mlp.down_proj" }, "model.layers.18.mlp.down_proj": { "snr": 1.4949405193328857, "type": "mlp.down_proj" }, "model.layers.19.mlp.down_proj": { "snr": 1.5329173803329468, "type": "mlp.down_proj" }, "model.layers.20.mlp.down_proj": { "snr": 1.4396660327911377, "type": "mlp.down_proj" }, "model.layers.21.mlp.down_proj": { "snr": 1.217085838317871, "type": "mlp.down_proj" }, "model.layers.22.mlp.down_proj": { "snr": 1.150472640991211, "type": "mlp.down_proj" }, "model.layers.23.mlp.down_proj": { "snr": 1.1166225671768188, "type": "mlp.down_proj" }, "model.layers.24.mlp.down_proj": { "snr": 0.9966591000556946, "type": "mlp.down_proj" }, "model.layers.25.mlp.down_proj": { "snr": 1.0938347578048706, "type": "mlp.down_proj" }, "model.layers.26.mlp.down_proj": { "snr": 1.1505423784255981, "type": "mlp.down_proj" }, "model.layers.27.mlp.down_proj": { "snr": 1.1156749725341797, "type": "mlp.down_proj" }, "model.layers.0.mlp.gate_proj": { "snr": 0.9329171776771545, "type": "mlp.gate_proj" }, "model.layers.1.mlp.gate_proj": { "snr": 0.8513413667678833, "type": "mlp.gate_proj" }, 
"model.layers.2.mlp.gate_proj": { "snr": 0.7584061026573181, "type": "mlp.gate_proj" }, "model.layers.3.mlp.gate_proj": { "snr": 0.65835040807724, "type": "mlp.gate_proj" }, "model.layers.4.mlp.gate_proj": { "snr": 0.436420738697052, "type": "mlp.gate_proj" }, "model.layers.5.mlp.gate_proj": { "snr": 0.39712461829185486, "type": "mlp.gate_proj" }, "model.layers.6.mlp.gate_proj": { "snr": 0.3530206084251404, "type": "mlp.gate_proj" }, "model.layers.7.mlp.gate_proj": { "snr": 0.34982794523239136, "type": "mlp.gate_proj" }, "model.layers.8.mlp.gate_proj": { "snr": 0.30338960886001587, "type": "mlp.gate_proj" }, "model.layers.9.mlp.gate_proj": { "snr": 0.27569833397865295, "type": "mlp.gate_proj" }, "model.layers.10.mlp.gate_proj": { "snr": 0.28934162855148315, "type": "mlp.gate_proj" }, "model.layers.11.mlp.gate_proj": { "snr": 0.2929173707962036, "type": "mlp.gate_proj" }, "model.layers.12.mlp.gate_proj": { "snr": 0.28263387084007263, "type": "mlp.gate_proj" }, "model.layers.13.mlp.gate_proj": { "snr": 0.27778616547584534, "type": "mlp.gate_proj" }, "model.layers.14.mlp.gate_proj": { "snr": 0.26527827978134155, "type": "mlp.gate_proj" }, "model.layers.15.mlp.gate_proj": { "snr": 0.27635642886161804, "type": "mlp.gate_proj" }, "model.layers.16.mlp.gate_proj": { "snr": 0.35072311758995056, "type": "mlp.gate_proj" }, "model.layers.17.mlp.gate_proj": { "snr": 0.4002636671066284, "type": "mlp.gate_proj" }, "model.layers.18.mlp.gate_proj": { "snr": 0.4319891333580017, "type": "mlp.gate_proj" }, "model.layers.19.mlp.gate_proj": { "snr": 0.47527065873146057, "type": "mlp.gate_proj" }, "model.layers.20.mlp.gate_proj": { "snr": 0.5112077593803406, "type": "mlp.gate_proj" }, "model.layers.21.mlp.gate_proj": { "snr": 0.5749644637107849, "type": "mlp.gate_proj" }, "model.layers.22.mlp.gate_proj": { "snr": 0.5967603921890259, "type": "mlp.gate_proj" }, "model.layers.23.mlp.gate_proj": { "snr": 0.48045310378074646, "type": "mlp.gate_proj" }, "model.layers.24.mlp.gate_proj": { 
"snr": 0.3838970363140106, "type": "mlp.gate_proj" }, "model.layers.25.mlp.gate_proj": { "snr": 0.3108249604701996, "type": "mlp.gate_proj" }, "model.layers.26.mlp.gate_proj": { "snr": 0.26704445481300354, "type": "mlp.gate_proj" }, "model.layers.27.mlp.gate_proj": { "snr": 0.20953254401683807, "type": "mlp.gate_proj" }, "model.layers.0.mlp.up_proj": { "snr": 1.5084924697875977, "type": "mlp.up_proj" }, "model.layers.1.mlp.up_proj": { "snr": 1.7789595127105713, "type": "mlp.up_proj" }, "model.layers.2.mlp.up_proj": { "snr": 2.1431775093078613, "type": "mlp.up_proj" }, "model.layers.3.mlp.up_proj": { "snr": 2.762744903564453, "type": "mlp.up_proj" }, "model.layers.4.mlp.up_proj": { "snr": 3.0324745178222656, "type": "mlp.up_proj" }, "model.layers.5.mlp.up_proj": { "snr": 2.3884809017181396, "type": "mlp.up_proj" }, "model.layers.6.mlp.up_proj": { "snr": 2.388005256652832, "type": "mlp.up_proj" }, "model.layers.7.mlp.up_proj": { "snr": 2.339340925216675, "type": "mlp.up_proj" }, "model.layers.8.mlp.up_proj": { "snr": 2.0497021675109863, "type": "mlp.up_proj" }, "model.layers.9.mlp.up_proj": { "snr": 1.822119116783142, "type": "mlp.up_proj" }, "model.layers.10.mlp.up_proj": { "snr": 1.600373387336731, "type": "mlp.up_proj" }, "model.layers.11.mlp.up_proj": { "snr": 1.9298171997070312, "type": "mlp.up_proj" }, "model.layers.12.mlp.up_proj": { "snr": 1.728783369064331, "type": "mlp.up_proj" }, "model.layers.13.mlp.up_proj": { "snr": 1.965298056602478, "type": "mlp.up_proj" }, "model.layers.14.mlp.up_proj": { "snr": 2.023681640625, "type": "mlp.up_proj" }, "model.layers.15.mlp.up_proj": { "snr": 1.7721818685531616, "type": "mlp.up_proj" }, "model.layers.16.mlp.up_proj": { "snr": 1.7068361043930054, "type": "mlp.up_proj" }, "model.layers.17.mlp.up_proj": { "snr": 1.6673219203948975, "type": "mlp.up_proj" }, "model.layers.18.mlp.up_proj": { "snr": 1.6240718364715576, "type": "mlp.up_proj" }, "model.layers.19.mlp.up_proj": { "snr": 1.5169662237167358, "type": "mlp.up_proj" 
}, "model.layers.20.mlp.up_proj": { "snr": 1.4018198251724243, "type": "mlp.up_proj" }, "model.layers.21.mlp.up_proj": { "snr": 1.4556466341018677, "type": "mlp.up_proj" }, "model.layers.22.mlp.up_proj": { "snr": 1.4304454326629639, "type": "mlp.up_proj" }, "model.layers.23.mlp.up_proj": { "snr": 1.2785290479660034, "type": "mlp.up_proj" }, "model.layers.24.mlp.up_proj": { "snr": 1.023495078086853, "type": "mlp.up_proj" }, "model.layers.25.mlp.up_proj": { "snr": 0.6992124915122986, "type": "mlp.up_proj" }, "model.layers.26.mlp.up_proj": { "snr": 0.4549211859703064, "type": "mlp.up_proj" }, "model.layers.27.mlp.up_proj": { "snr": 0.23889905214309692, "type": "mlp.up_proj" }, "model.embed_tokens": { "snr": Infinity, "type": "model.embed_tokens" }, "model.norm": { "snr": Infinity, "type": "model.norm" }, "model.layers.0.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.1.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.2.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.3.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.4.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.5.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.6.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.7.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.8.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.9.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.10.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.11.post_attention_layernorm": { "snr": Infinity, "type": 
"post_attention_layernorm" }, "model.layers.12.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.13.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.14.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.15.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.16.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.17.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.18.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.19.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.20.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.21.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.22.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.23.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.24.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.25.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.26.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.27.post_attention_layernorm": { "snr": Infinity, "type": "post_attention_layernorm" }, "model.layers.0.self_attn.k_proj": { "snr": 0.08150045573711395, "type": "self_attn.k_proj" }, "model.layers.1.self_attn.k_proj": { "snr": 0.1428358554840088, "type": "self_attn.k_proj" }, "model.layers.2.self_attn.k_proj": { "snr": 0.2096949815750122, "type": "self_attn.k_proj" }, "model.layers.3.self_attn.k_proj": { "snr": 0.22633400559425354, 
"type": "self_attn.k_proj" }, "model.layers.4.self_attn.k_proj": { "snr": 0.2293967455625534, "type": "self_attn.k_proj" }, "model.layers.5.self_attn.k_proj": { "snr": 0.23336802423000336, "type": "self_attn.k_proj" }, "model.layers.6.self_attn.k_proj": { "snr": 0.23429904878139496, "type": "self_attn.k_proj" }, "model.layers.7.self_attn.k_proj": { "snr": 0.19610290229320526, "type": "self_attn.k_proj" }, "model.layers.8.self_attn.k_proj": { "snr": 0.2163258045911789, "type": "self_attn.k_proj" }, "model.layers.9.self_attn.k_proj": { "snr": 0.21039333939552307, "type": "self_attn.k_proj" }, "model.layers.10.self_attn.k_proj": { "snr": 0.23533931374549866, "type": "self_attn.k_proj" }, "model.layers.11.self_attn.k_proj": { "snr": 0.21457058191299438, "type": "self_attn.k_proj" }, "model.layers.12.self_attn.k_proj": { "snr": 0.21686571836471558, "type": "self_attn.k_proj" }, "model.layers.13.self_attn.k_proj": { "snr": 0.22398065030574799, "type": "self_attn.k_proj" }, "model.layers.14.self_attn.k_proj": { "snr": 0.20160657167434692, "type": "self_attn.k_proj" }, "model.layers.15.self_attn.k_proj": { "snr": 0.23705022037029266, "type": "self_attn.k_proj" }, "model.layers.16.self_attn.k_proj": { "snr": 0.23254962265491486, "type": "self_attn.k_proj" }, "model.layers.17.self_attn.k_proj": { "snr": 0.2892642617225647, "type": "self_attn.k_proj" }, "model.layers.18.self_attn.k_proj": { "snr": 0.27587130665779114, "type": "self_attn.k_proj" }, "model.layers.19.self_attn.k_proj": { "snr": 0.30891212821006775, "type": "self_attn.k_proj" }, "model.layers.20.self_attn.k_proj": { "snr": 0.28997519612312317, "type": "self_attn.k_proj" }, "model.layers.21.self_attn.k_proj": { "snr": 0.27534863352775574, "type": "self_attn.k_proj" }, "model.layers.22.self_attn.k_proj": { "snr": 0.35139667987823486, "type": "self_attn.k_proj" }, "model.layers.23.self_attn.k_proj": { "snr": 0.2773109972476959, "type": "self_attn.k_proj" }, "model.layers.24.self_attn.k_proj": { "snr": 
0.2853511571884155, "type": "self_attn.k_proj" }, "model.layers.25.self_attn.k_proj": { "snr": 0.5030262470245361, "type": "self_attn.k_proj" }, "model.layers.26.self_attn.k_proj": { "snr": 0.2317112237215042, "type": "self_attn.k_proj" }, "model.layers.27.self_attn.k_proj": { "snr": 0.24419328570365906, "type": "self_attn.k_proj" }, "model.layers.0.self_attn.o_proj": { "snr": 0.17767645418643951, "type": "self_attn.o_proj" }, "model.layers.1.self_attn.o_proj": { "snr": 0.14102177321910858, "type": "self_attn.o_proj" }, "model.layers.2.self_attn.o_proj": { "snr": 0.1523692011833191, "type": "self_attn.o_proj" }, "model.layers.3.self_attn.o_proj": { "snr": 0.16522075235843658, "type": "self_attn.o_proj" }, "model.layers.4.self_attn.o_proj": { "snr": 0.17483487725257874, "type": "self_attn.o_proj" }, "model.layers.5.self_attn.o_proj": { "snr": 0.227921262383461, "type": "self_attn.o_proj" }, "model.layers.6.self_attn.o_proj": { "snr": 0.2196175903081894, "type": "self_attn.o_proj" }, "model.layers.7.self_attn.o_proj": { "snr": 0.24270132184028625, "type": "self_attn.o_proj" }, "model.layers.8.self_attn.o_proj": { "snr": 0.2118290364742279, "type": "self_attn.o_proj" }, "model.layers.9.self_attn.o_proj": { "snr": 0.20525991916656494, "type": "self_attn.o_proj" }, "model.layers.10.self_attn.o_proj": { "snr": 0.22847208380699158, "type": "self_attn.o_proj" }, "model.layers.11.self_attn.o_proj": { "snr": 0.19665324687957764, "type": "self_attn.o_proj" }, "model.layers.12.self_attn.o_proj": { "snr": 0.23233532905578613, "type": "self_attn.o_proj" }, "model.layers.13.self_attn.o_proj": { "snr": 0.2624332308769226, "type": "self_attn.o_proj" }, "model.layers.14.self_attn.o_proj": { "snr": 0.1868327558040619, "type": "self_attn.o_proj" }, "model.layers.15.self_attn.o_proj": { "snr": 0.17706255614757538, "type": "self_attn.o_proj" }, "model.layers.16.self_attn.o_proj": { "snr": 0.19422705471515656, "type": "self_attn.o_proj" }, "model.layers.17.self_attn.o_proj": { "snr": 
0.2000615894794464, "type": "self_attn.o_proj" }, "model.layers.18.self_attn.o_proj": { "snr": 0.1874573826789856, "type": "self_attn.o_proj" }, "model.layers.19.self_attn.o_proj": { "snr": 0.21297843754291534, "type": "self_attn.o_proj" }, "model.layers.20.self_attn.o_proj": { "snr": 0.2100859135389328, "type": "self_attn.o_proj" }, "model.layers.21.self_attn.o_proj": { "snr": 0.22561520338058472, "type": "self_attn.o_proj" }, "model.layers.22.self_attn.o_proj": { "snr": 0.20994484424591064, "type": "self_attn.o_proj" }, "model.layers.23.self_attn.o_proj": { "snr": 0.18978221714496613, "type": "self_attn.o_proj" }, "model.layers.24.self_attn.o_proj": { "snr": 0.1571759581565857, "type": "self_attn.o_proj" }, "model.layers.25.self_attn.o_proj": { "snr": 0.1349896937608719, "type": "self_attn.o_proj" }, "model.layers.26.self_attn.o_proj": { "snr": 0.1368866115808487, "type": "self_attn.o_proj" }, "model.layers.27.self_attn.o_proj": { "snr": 0.1571887582540512, "type": "self_attn.o_proj" }, "model.layers.0.self_attn.q_proj": { "snr": 0.05295897275209427, "type": "self_attn.q_proj" }, "model.layers.1.self_attn.q_proj": { "snr": 0.06835605204105377, "type": "self_attn.q_proj" }, "model.layers.2.self_attn.q_proj": { "snr": 0.0746372863650322, "type": "self_attn.q_proj" }, "model.layers.3.self_attn.q_proj": { "snr": 0.06615085154771805, "type": "self_attn.q_proj" }, "model.layers.4.self_attn.q_proj": { "snr": 0.06788161396980286, "type": "self_attn.q_proj" }, "model.layers.5.self_attn.q_proj": { "snr": 0.07514483481645584, "type": "self_attn.q_proj" }, "model.layers.6.self_attn.q_proj": { "snr": 0.07777862250804901, "type": "self_attn.q_proj" }, "model.layers.7.self_attn.q_proj": { "snr": 0.07534090429544449, "type": "self_attn.q_proj" }, "model.layers.8.self_attn.q_proj": { "snr": 0.09494179487228394, "type": "self_attn.q_proj" }, "model.layers.9.self_attn.q_proj": { "snr": 0.09699037671089172, "type": "self_attn.q_proj" }, "model.layers.10.self_attn.q_proj": { "snr": 
0.09426294267177582, "type": "self_attn.q_proj" }, "model.layers.11.self_attn.q_proj": { "snr": 0.08260341733694077, "type": "self_attn.q_proj" }, "model.layers.12.self_attn.q_proj": { "snr": 0.10650420933961868, "type": "self_attn.q_proj" }, "model.layers.13.self_attn.q_proj": { "snr": 0.10250870138406754, "type": "self_attn.q_proj" }, "model.layers.14.self_attn.q_proj": { "snr": 0.08775162696838379, "type": "self_attn.q_proj" }, "model.layers.15.self_attn.q_proj": { "snr": 0.08071447163820267, "type": "self_attn.q_proj" }, "model.layers.16.self_attn.q_proj": { "snr": 0.07530857622623444, "type": "self_attn.q_proj" }, "model.layers.17.self_attn.q_proj": { "snr": 0.06964966654777527, "type": "self_attn.q_proj" }, "model.layers.18.self_attn.q_proj": { "snr": 0.07150755077600479, "type": "self_attn.q_proj" }, "model.layers.19.self_attn.q_proj": { "snr": 0.0676807165145874, "type": "self_attn.q_proj" }, "model.layers.20.self_attn.q_proj": { "snr": 0.06511317938566208, "type": "self_attn.q_proj" }, "model.layers.21.self_attn.q_proj": { "snr": 0.06773187220096588, "type": "self_attn.q_proj" }, "model.layers.22.self_attn.q_proj": { "snr": 0.06400436162948608, "type": "self_attn.q_proj" }, "model.layers.23.self_attn.q_proj": { "snr": 0.0726117342710495, "type": "self_attn.q_proj" }, "model.layers.24.self_attn.q_proj": { "snr": 0.06882446259260178, "type": "self_attn.q_proj" }, "model.layers.25.self_attn.q_proj": { "snr": 0.07506493479013443, "type": "self_attn.q_proj" }, "model.layers.26.self_attn.q_proj": { "snr": 0.07797915488481522, "type": "self_attn.q_proj" }, "model.layers.27.self_attn.q_proj": { "snr": 0.06680692732334137, "type": "self_attn.q_proj" }, "model.layers.0.self_attn.v_proj": { "snr": 1.326789379119873, "type": "self_attn.v_proj" }, "model.layers.1.self_attn.v_proj": { "snr": 3.043806791305542, "type": "self_attn.v_proj" }, "model.layers.2.self_attn.v_proj": { "snr": 2.295107841491699, "type": "self_attn.v_proj" }, "model.layers.3.self_attn.v_proj": { 
"snr": 5.2584614753723145, "type": "self_attn.v_proj" }, "model.layers.4.self_attn.v_proj": { "snr": 4.038785934448242, "type": "self_attn.v_proj" }, "model.layers.5.self_attn.v_proj": { "snr": 3.0907773971557617, "type": "self_attn.v_proj" }, "model.layers.6.self_attn.v_proj": { "snr": 3.114994525909424, "type": "self_attn.v_proj" }, "model.layers.7.self_attn.v_proj": { "snr": 1.9747973680496216, "type": "self_attn.v_proj" }, "model.layers.8.self_attn.v_proj": { "snr": 3.0469374656677246, "type": "self_attn.v_proj" }, "model.layers.9.self_attn.v_proj": { "snr": 1.602966547012329, "type": "self_attn.v_proj" }, "model.layers.10.self_attn.v_proj": { "snr": 1.489019513130188, "type": "self_attn.v_proj" }, "model.layers.11.self_attn.v_proj": { "snr": 1.7490826845169067, "type": "self_attn.v_proj" }, "model.layers.12.self_attn.v_proj": { "snr": 2.451310396194458, "type": "self_attn.v_proj" }, "model.layers.13.self_attn.v_proj": { "snr": 3.250821590423584, "type": "self_attn.v_proj" }, "model.layers.14.self_attn.v_proj": { "snr": 7.944663047790527, "type": "self_attn.v_proj" }, "model.layers.15.self_attn.v_proj": { "snr": 7.013208389282227, "type": "self_attn.v_proj" }, "model.layers.16.self_attn.v_proj": { "snr": 2.68644118309021, "type": "self_attn.v_proj" }, "model.layers.17.self_attn.v_proj": { "snr": 3.9063122272491455, "type": "self_attn.v_proj" }, "model.layers.18.self_attn.v_proj": { "snr": 4.1816816329956055, "type": "self_attn.v_proj" }, "model.layers.19.self_attn.v_proj": { "snr": 6.794488906860352, "type": "self_attn.v_proj" }, "model.layers.20.self_attn.v_proj": { "snr": 3.401334285736084, "type": "self_attn.v_proj" }, "model.layers.21.self_attn.v_proj": { "snr": 2.051994562149048, "type": "self_attn.v_proj" }, "model.layers.22.self_attn.v_proj": { "snr": 3.614379405975342, "type": "self_attn.v_proj" }, "model.layers.23.self_attn.v_proj": { "snr": 11.180968284606934, "type": "self_attn.v_proj" }, "model.layers.24.self_attn.v_proj": { "snr": 
2.3629775047302246, "type": "self_attn.v_proj" }, "model.layers.25.self_attn.v_proj": { "snr": 4.137593746185303, "type": "self_attn.v_proj" }, "model.layers.26.self_attn.v_proj": { "snr": 2.3465518951416016, "type": "self_attn.v_proj" }, "model.layers.27.self_attn.v_proj": { "snr": 3.10064697265625, "type": "self_attn.v_proj" } } ================================================ FILE: src/axolotl/integrations/swanlab/README.md ================================================ # SwanLab Integration for Axolotl SwanLab is an open-source, lightweight AI experiment tracking and visualization tool that provides a platform for tracking, recording, comparing, and collaborating on experiments. This integration enables seamless experiment tracking and visualization of Axolotl training runs using SwanLab. ## Features - 📊 **Automatic Metrics Logging**: Training loss, learning rate, and other metrics are automatically logged - 🎯 **Hyperparameter Tracking**: Model configuration and training parameters are tracked - 📈 **Real-time Visualization**: Monitor training progress in real-time through SwanLab dashboard - ☁️ **Cloud & Local Support**: Works in both cloud-synced and offline modes - 🔄 **Experiment Comparison**: Compare multiple training runs easily - 🤝 **Team Collaboration**: Share experiments with team members - 🎭 **RLHF Completion Logging**: Automatically log model outputs during DPO/KTO/ORPO/GRPO training for qualitative analysis - ⚡ **Performance Profiling**: Built-in profiling decorators to measure and optimize training performance - 🔔 **Lark Notifications**: Send real-time training updates to team chat (Feishu/Lark integration) ## Installation ```bash pip install swanlab ``` ## Quick Start ### 1. Register for SwanLab (Optional for cloud mode) If you want to use cloud sync features, register at [https://swanlab.cn](https://swanlab.cn) to get your API key. ### 2. 
Configure Axolotl Config File Add SwanLab configuration to your Axolotl YAML config: ```yaml # Enable SwanLab plugin plugins: - axolotl.integrations.swanlab.SwanLabPlugin # SwanLab configuration use_swanlab: true swanlab_project: my-llm-project swanlab_experiment_name: qwen-finetune-v1 swanlab_mode: cloud # Options: cloud, local, offline, disabled swanlab_workspace: my-team # Optional: organization name swanlab_api_key: YOUR_API_KEY # Optional: can also use env var SWANLAB_API_KEY ``` ### 3. Run Training ```bash # Set API key via environment variable (recommended) export SWANLAB_API_KEY=your-api-key-here # Or login once swanlab login # Run training as usual accelerate launch -m axolotl.cli.train your-config.yaml ``` ## Configuration Options ### Basic Configuration | Parameter | Type | Default | Description | |-----------|------|---------|-------------| | `use_swanlab` | bool | `false` | Enable SwanLab tracking | | `swanlab_project` | str | `None` | Project name (required) | | `swanlab_experiment_name` | str | `None` | Experiment name | | `swanlab_description` | str | `None` | Experiment description | | `swanlab_mode` | str | `cloud` | Sync mode: `cloud`, `local`, `offline`, `disabled` | ### Advanced Configuration | Parameter | Type | Default | Description | |-----------|------|---------|-------------| | `swanlab_workspace` | str | `None` | Workspace/organization name | | `swanlab_api_key` | str | `None` | API key (prefer env var) | | `swanlab_web_host` | str | `None` | Private deployment web host | | `swanlab_api_host` | str | `None` | Private deployment API host | | `swanlab_log_model` | bool | `false` | Log model checkpoints (coming soon) | | `swanlab_lark_webhook_url` | str | `None` | Lark (Feishu) webhook URL for team notifications | | `swanlab_lark_secret` | str | `None` | Lark webhook HMAC secret for authentication | | `swanlab_log_completions` | bool | `true` | Enable RLHF completion table logging (DPO/KTO/ORPO/GRPO) | | `swanlab_completion_log_interval` | 
int | `100` | Steps between completion logging | | `swanlab_completion_max_buffer` | int | `128` | Max completions to buffer (memory bound) | ## Configuration Examples ### Example 1: Basic Cloud Sync ```yaml plugins: - axolotl.integrations.swanlab.SwanLabPlugin use_swanlab: true swanlab_project: llama-finetune swanlab_experiment_name: llama-3-8b-instruct-v1 swanlab_mode: cloud ``` ### Example 2: Offline/Local Mode ```yaml plugins: - axolotl.integrations.swanlab.SwanLabPlugin use_swanlab: true swanlab_project: local-experiments swanlab_experiment_name: test-run-1 swanlab_mode: local # or 'offline' ``` ### Example 3: Team Workspace ```yaml plugins: - axolotl.integrations.swanlab.SwanLabPlugin use_swanlab: true swanlab_project: research-project swanlab_experiment_name: experiment-42 swanlab_workspace: my-research-team swanlab_mode: cloud ``` ### Example 4: Private Deployment ```yaml plugins: - axolotl.integrations.swanlab.SwanLabPlugin use_swanlab: true swanlab_project: internal-project swanlab_experiment_name: secure-training swanlab_mode: cloud swanlab_web_host: https://swanlab.yourcompany.com swanlab_api_host: https://api.swanlab.yourcompany.com ``` ## Team Notifications with Lark (Feishu) SwanLab supports sending real-time training notifications to your team chat via Lark (Feishu), ByteDance's enterprise collaboration platform. This is especially useful for: - **Production training monitoring**: Get alerts when training starts, completes, or encounters errors - **Team collaboration**: Keep your ML team informed about long-running experiments - **Multi-timezone teams**: Team members can check training progress without being online ### Prerequisites 1. **Lark Bot Setup**: Create a custom bot in your Lark group chat 2. **Webhook URL**: Get the webhook URL from your Lark bot settings 3. 
**HMAC Secret** (recommended): Enable signature verification in your Lark bot for security For detailed Lark bot setup instructions, see [Lark Custom Bot Documentation](https://open.feishu.cn/document/ukTMukTMukTM/ucTM5YjL3ETO24yNxkjN). ### Example 5: Basic Lark Notifications Send training notifications to a Lark group chat: ```yaml plugins: - axolotl.integrations.swanlab.SwanLabPlugin use_swanlab: true swanlab_project: production-training swanlab_experiment_name: llama-3-finetune-v2 swanlab_mode: cloud # Lark notification (basic, no HMAC verification) swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx ``` **Note**: This configuration will work, but you'll see a security warning recommending HMAC secret configuration. ### Example 6: Lark Notifications with HMAC Security (Recommended) For production use, enable HMAC signature verification: ```yaml plugins: - axolotl.integrations.swanlab.SwanLabPlugin use_swanlab: true swanlab_project: production-training swanlab_experiment_name: llama-3-finetune-v2 swanlab_mode: cloud # Lark notification with HMAC authentication swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx swanlab_lark_secret: your-webhook-secret-key ``` **Why HMAC secret matters**: - Prevents unauthorized parties from sending fake notifications to your Lark group - Ensures notifications genuinely come from your training jobs - Required for production deployments with sensitive training data ### Example 7: Team Workspace + Lark Notifications Combine team workspace collaboration with Lark notifications: ```yaml plugins: - axolotl.integrations.swanlab.SwanLabPlugin use_swanlab: true swanlab_project: research-project swanlab_experiment_name: multimodal-experiment-42 swanlab_workspace: ml-research-team swanlab_mode: cloud # Notify team via Lark when training starts/completes swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx swanlab_lark_secret: your-webhook-secret-key ``` 
### What Notifications Are Sent? SwanLab's Lark integration sends notifications for key training events: - **Training Start**: When your experiment begins - **Training Complete**: When training finishes successfully - **Training Errors**: If training crashes or encounters critical errors - **Metric Milestones**: Configurable alerts for metric thresholds (if configured in SwanLab) Each notification includes: - Experiment name and project - Training status - Key metrics (loss, learning rate) - Direct link to SwanLab dashboard ### Lark Configuration Validation The plugin validates your Lark configuration at startup: #### ✅ Valid Configurations ```yaml # Option 1: No Lark (default) use_swanlab: true swanlab_project: my-project # No swanlab_lark_webhook_url → Lark disabled, no warnings # Option 2: Lark with HMAC secret (recommended) use_swanlab: true swanlab_project: my-project swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxx swanlab_lark_secret: your-secret # ✅ Logs: "Registered Lark notification callback with HMAC authentication" # Option 3: Lark without secret (works but not recommended) use_swanlab: true swanlab_project: my-project swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxx # ⚠️ Logs: "Registered Lark notification callback (no HMAC secret)" # ⚠️ Warning: "Lark webhook has no secret configured. For production use, set 'swanlab_lark_secret'..." ``` ### Security Best Practices 1. **Always use HMAC secret in production**: ```yaml swanlab_lark_webhook_url: https://open.feishu.cn/... swanlab_lark_secret: your-secret-key # ✅ Add this! ``` 2. **Store secrets in environment variables** (even better): ```yaml # In your training script/environment export SWANLAB_LARK_WEBHOOK_URL="https://open.feishu.cn/..." 
export SWANLAB_LARK_SECRET="your-secret-key" ``` Then in config: ```yaml # SwanLab plugin will auto-detect environment variables use_swanlab: true swanlab_project: my-project # Lark URL and secret read from env vars ``` 3. **Rotate webhook secrets periodically**: Update your Lark bot's secret every 90 days 4. **Use separate webhooks for dev/prod**: Don't mix development and production notifications ### Distributed Training Lark notifications are automatically deduplicated in distributed training: - Only **rank 0** sends notifications - Other GPU ranks skip Lark registration - Prevents duplicate messages in multi-GPU training ```bash # Running on 4 GPUs torchrun --nproc_per_node=4 -m axolotl.cli.train config.yml # Expected logs: # [Rank 0] Registered Lark notification callback with HMAC authentication # [Rank 1-3] (no Lark registration messages) ``` ## RLHF Completion Table Logging For RLHF (Reinforcement Learning from Human Feedback) training methods like DPO, KTO, ORPO, and GRPO, SwanLab can log model completions (prompts, chosen/rejected responses, rewards) to a visual table for qualitative analysis. 
This helps you: - **Inspect model behavior**: See actual model outputs during training - **Debug preference learning**: Compare chosen vs rejected responses - **Track reward patterns**: Monitor how rewards evolve over training - **Share examples with team**: Visual tables in SwanLab dashboard ### Features - ✅ **Automatic detection**: Works with DPO, KTO, ORPO, GRPO trainers - ✅ **Memory-safe buffering**: Bounded buffer prevents memory leaks in long training runs - ✅ **Periodic logging**: Configurable logging interval to reduce overhead - ✅ **Rich visualization**: SwanLab tables show prompts, responses, and metrics side-by-side ### Configuration | Parameter | Type | Default | Description | |-----------|------|---------|-------------| | `swanlab_log_completions` | bool | `true` | Enable completion logging for RLHF trainers | | `swanlab_completion_log_interval` | int | `100` | Log completions to SwanLab every N training steps | | `swanlab_completion_max_buffer` | int | `128` | Maximum completions to buffer (memory bound) | ### Example: DPO Training with Completion Logging ```yaml plugins: - axolotl.integrations.swanlab.SwanLabPlugin use_swanlab: true swanlab_project: dpo-training swanlab_experiment_name: llama-3-dpo-v1 swanlab_mode: cloud # RLHF completion logging (enabled by default) swanlab_log_completions: true swanlab_completion_log_interval: 100 # Log every 100 steps swanlab_completion_max_buffer: 128 # Keep last 128 completions # DPO-specific config rl: dpo datasets: - path: /path/to/preference_dataset type: chatml.intel ``` ### Example: Disable Completion Logging If you're doing a quick test run or don't need completion tables: ```yaml plugins: - axolotl.integrations.swanlab.SwanLabPlugin use_swanlab: true swanlab_project: dpo-training # Disable completion logging swanlab_log_completions: false ``` ### Supported RLHF Trainers The completion logging callback automatically activates for these trainer types: - **DPO (Direct Preference Optimization)**: Logs 
prompts, chosen, rejected, reward_diff - **KTO (Kahneman-Tversky Optimization)**: Logs prompts, completions, labels, rewards - **ORPO (Odds Ratio Preference Optimization)**: Logs prompts, chosen, rejected, log_odds_ratio - **GRPO (Group Relative Policy Optimization)**: Logs prompts, completions, rewards, advantages - **CPO (Constrained Policy Optimization)**: Logs prompts, chosen, rejected For non-RLHF trainers (standard supervised fine-tuning), the completion callback is automatically skipped. ### How It Works 1. **Auto-detection**: Plugin detects trainer type at initialization 2. **Buffering**: Completions are buffered in memory (up to `swanlab_completion_max_buffer`) 3. **Periodic logging**: Every `swanlab_completion_log_interval` steps, buffer is logged to SwanLab 4. **Memory safety**: Old completions are automatically dropped when buffer is full (uses `collections.deque`) 5. **Final flush**: Remaining completions are logged when training completes ### Viewing Completion Tables After training starts, you can view completion tables in your SwanLab dashboard: 1. Navigate to your experiment in SwanLab 2. Look for the "rlhf_completions" table in the metrics panel 3. 
The table shows: - **step**: Training step when completion was generated - **prompt**: Input prompt - **chosen**: Preferred response (DPO/ORPO) - **rejected**: Non-preferred response (DPO/ORPO) - **completion**: Model output (KTO/GRPO) - **reward_diff/reward**: Reward metrics - Trainer-specific metrics (e.g., log_odds_ratio for ORPO) ### Memory Management The completion buffer is **memory-bounded** to prevent memory leaks: ```python # Internal implementation uses deque with maxlen from collections import deque buffer = deque(maxlen=128) # Old completions automatically dropped ``` **Memory usage estimate**: - Average completion: ~500 characters (prompt + responses) - Buffer size 128: ~64 KB (negligible) - Buffer size 1024: ~512 KB (still small) **Recommendation**: Default buffer size (128) works well for most cases. Increase to 512-1024 only if you need to review more historical completions. ### Performance Impact Completion logging has minimal overhead: - **Buffering**: O(1) append operation, negligible CPU/memory - **Logging**: Only happens every N steps (default: 100) - **Network**: SwanLab batches table uploads efficiently **Expected overhead**: < 0.5% per training step ### Troubleshooting #### Completions not appearing in SwanLab **Cause**: Trainer may not be logging completion data in the expected format. **Diagnostic steps**: 1. Check trainer type detection in logs: ```text INFO: SwanLab RLHF completion logging enabled for DPOTrainer (type: dpo) ``` 2. Verify your trainer is an RLHF trainer (DPO/KTO/ORPO/GRPO) 3. Check if trainer logs completion data (this depends on TRL version) **Note**: The current implementation expects trainers to log completion data in the `logs` dict during `on_log()` callback. Some TRL trainers may not expose this data by default. You may need to patch the trainer to expose completions. #### Buffer fills up too quickly **Cause**: High logging frequency with small buffer size. 
**Solution**: Increase buffer size or logging interval: ```yaml swanlab_completion_log_interval: 200 # Log less frequently swanlab_completion_max_buffer: 512 # Larger buffer ``` #### Memory usage growing over time **Cause**: Buffer should be bounded, so this indicates a bug. **Solution**: 1. Verify `swanlab_completion_max_buffer` is set 2. Check SwanLab version is up to date 3. Report issue with memory profiling data ## Performance Profiling SwanLab integration includes profiling utilities to measure and log execution time of trainer methods. This helps you: - **Identify bottlenecks**: Find slow operations in your training loop - **Optimize performance**: Track improvements after optimization changes - **Monitor distributed training**: See per-rank timing differences - **Debug hangs**: Detect methods that take unexpectedly long ### Features - ✅ **Zero-config profiling**: Automatic timing of key trainer methods - ✅ **Decorator-based**: Easy to add profiling to custom methods with `@swanlab_profile` - ✅ **Context manager**: Fine-grained profiling with `swanlab_profiling_context()` - ✅ **Advanced filtering**: `ProfilingConfig` for throttling and minimum duration thresholds - ✅ **Exception-safe**: Logs duration even if function raises an exception ### Basic Usage: Decorator Add profiling to any trainer method with the `@swanlab_profile` decorator: ```python from axolotl.integrations.swanlab.profiling import swanlab_profile class MyCustomTrainer(AxolotlTrainer): @swanlab_profile def training_step(self, model, inputs): # Your training step logic return super().training_step(model, inputs) @swanlab_profile def prediction_step(self, model, inputs, prediction_loss_only): # Your prediction logic return super().prediction_step(model, inputs, prediction_loss_only) ``` The decorator automatically: 1. Measures execution time with high-precision timer 2. Logs to SwanLab as `profiling/Time taken: ClassName.method_name` 3. Only logs if SwanLab is enabled (`use_swanlab: true`) 4. 
Gracefully handles exceptions (logs duration, then re-raises) ### Advanced Usage: Context Manager For fine-grained profiling within a method: ```python from axolotl.integrations.swanlab.profiling import swanlab_profiling_context class MyTrainer(AxolotlTrainer): def complex_training_step(self, model, inputs): # Profile just the forward pass with swanlab_profiling_context(self, "forward_pass"): outputs = model(**inputs) # Profile just the backward pass with swanlab_profiling_context(self, "backward_pass"): loss = outputs.loss loss.backward() return outputs ``` ### Advanced Usage: ProfilingConfig Filter and throttle profiling logs with `ProfilingConfig`: ```python from axolotl.integrations.swanlab.profiling import ( swanlab_profiling_context_advanced, ProfilingConfig, ) # Create custom profiling config profiling_config = ProfilingConfig( enabled=True, min_duration_ms=1.0, # Only log if duration > 1ms log_interval=10, # Log every 10th call ) class MyTrainer(AxolotlTrainer): def frequently_called_method(self, data): with swanlab_profiling_context_advanced( self, "frequent_op", config=profiling_config ): # This only logs every 10th call, and only if it takes > 1ms result = expensive_computation(data) return result ``` **ProfilingConfig Parameters**: - `enabled`: Enable/disable profiling globally (default: `True`) - `min_duration_ms`: Minimum duration to log in milliseconds (default: `0.1`) - `log_interval`: Log every Nth function call (default: `1` = log all) **Use cases**: - **High-frequency methods**: Use `log_interval=100` to reduce logging overhead - **Filter noise**: Use `min_duration_ms=1.0` to skip very fast operations - **Debugging**: Use `log_interval=1, min_duration_ms=0.0` to log everything ### Viewing Profiling Metrics In your SwanLab dashboard, profiling metrics appear under the "profiling" namespace: ```text profiling/Time taken: AxolotlTrainer.training_step profiling/Time taken: AxolotlTrainer.prediction_step profiling/Time taken: MyTrainer.forward_pass 
profiling/Time taken: MyTrainer.backward_pass ``` You can: - **Track over time**: See if methods get faster/slower during training - **Compare runs**: Compare profiling metrics across experiments - **Identify regressions**: Detect if a code change slowed down training ### Configuration in Axolotl Config Profiling is automatically enabled when SwanLab is enabled. No additional config needed: ```yaml plugins: - axolotl.integrations.swanlab.SwanLabPlugin use_swanlab: true swanlab_project: my-project # Profiling is automatically enabled # Add @swanlab_profile decorators to your custom trainer methods ``` To disable profiling while keeping SwanLab enabled: ```python # In your custom trainer code from axolotl.integrations.swanlab.profiling import DEFAULT_PROFILING_CONFIG # Disable profiling globally DEFAULT_PROFILING_CONFIG.enabled = False ``` ### Performance Impact - **Decorator overhead**: ~2-5 microseconds per call (negligible) - **Context manager overhead**: ~1-3 microseconds (negligible) - **Logging overhead**: Only when SwanLab is enabled and method duration exceeds threshold - **Network overhead**: SwanLab batches metrics efficiently **Expected overhead**: < 0.1% per training step (effectively zero) ### Best Practices 1. **Profile bottlenecks first**: Start by profiling suspected slow operations 2. **Use min_duration_ms**: Filter out fast operations (< 1ms) to reduce noise 3. **Throttle high-frequency calls**: Use `log_interval` for methods called > 100 times/step 4. **Profile across runs**: Compare profiling metrics before/after optimization 5. 
**Monitor distributed training**: Check for rank-specific slowdowns ### Example: Complete Profiling Setup ```python from axolotl.integrations.swanlab.profiling import ( swanlab_profile, swanlab_profiling_context, swanlab_profiling_context_advanced, ProfilingConfig, ) class OptimizedTrainer(AxolotlTrainer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Custom profiling config for high-frequency operations self.fast_op_config = ProfilingConfig( enabled=True, min_duration_ms=0.5, log_interval=50, ) @swanlab_profile def training_step(self, model, inputs): """Main training step - always profile.""" return super().training_step(model, inputs) @swanlab_profile def compute_loss(self, model, inputs, return_outputs=False): """Loss computation - always profile.""" return super().compute_loss(model, inputs, return_outputs) def _prepare_inputs(self, inputs): """High-frequency operation - throttled profiling.""" with swanlab_profiling_context_advanced( self, "prepare_inputs", config=self.fast_op_config, ): return super()._prepare_inputs(inputs) ``` ### Troubleshooting #### Profiling metrics not appearing in SwanLab **Cause**: SwanLab is not enabled or not initialized. **Solution**: ```yaml # Ensure SwanLab is enabled use_swanlab: true swanlab_project: my-project ``` Check logs for: ```text INFO: SwanLab initialized for project: my-project ``` #### Too many profiling metrics cluttering dashboard **Cause**: Profiling every function call for high-frequency operations. **Solution**: Use `ProfilingConfig` with throttling: ```python config = ProfilingConfig( min_duration_ms=1.0, # Skip fast ops log_interval=100, # Log every 100th call ) ``` #### Profiling overhead impacting training speed **Cause**: Profiling itself should have negligible overhead (< 0.1%). If you see > 1% slowdown, this indicates a bug. **Solution**: 1. Disable profiling temporarily to confirm: ```python DEFAULT_PROFILING_CONFIG.enabled = False ``` 2.
Report issue with profiling data and trainer details #### Profiling shows inconsistent timing **Cause**: Normal variation due to GPU warmup, data loading, or system load. **Solution**: - Ignore first few steps (warmup period) - Look at average/median timing over many steps - Use `log_interval` to reduce noise from individual outliers ## Complete Config Example Here's a complete example integrating SwanLab with your RVQ-Alpha training: ```yaml base_model: /path/to/your/model model_type: Qwen2ForCausalLM # SwanLab Integration plugins: - axolotl.integrations.swanlab.SwanLabPlugin - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin use_swanlab: true swanlab_project: RVQ-Alpha-Training swanlab_experiment_name: Qwen2.5-7B-MetaQA-Perturb-P020 swanlab_description: "Training on MetaQA and Perturbation datasets with NEW-RVQ encoding" swanlab_mode: cloud swanlab_workspace: single-cell-genomics # Training configuration sequence_len: 32768 micro_batch_size: 1 gradient_accumulation_steps: 1 num_epochs: 2 learning_rate: 2e-5 optimizer: adamw_torch_fused # Datasets datasets: - path: /path/to/dataset type: chat_template # Output output_dir: ./outputs ``` ## Modes Explained ### `cloud` Mode (Default) - Syncs experiments to SwanLab cloud in real-time - Requires API key and internet connection - Best for: Team collaboration, remote monitoring ### `local` Mode - Saves experiments locally only - No cloud sync - Best for: Local development, air-gapped environments ### `offline` Mode - Saves metadata locally - Can sync to cloud later using `swanlab sync` - Best for: Unstable internet, sync later ### `disabled` Mode - Turns off SwanLab completely - No logging or tracking - Best for: Debugging, testing ## Configuration Validation & Conflict Detection SwanLab integration includes comprehensive validation and conflict detection to help you catch configuration errors early and avoid performance issues. 
### Required Fields Validation The plugin validates your configuration at startup and provides clear error messages with solutions: #### Missing Project Name ```yaml # ❌ INVALID: use_swanlab enabled but no project use_swanlab: true # Error: SwanLab enabled but 'swanlab_project' is not set. ``` **Solution**: ```yaml # ✅ VALID: Provide project name use_swanlab: true swanlab_project: my-project ``` #### Invalid Mode ```yaml # ❌ INVALID: Unknown mode use_swanlab: true swanlab_project: my-project swanlab_mode: invalid-mode # Error: Invalid swanlab_mode: 'invalid-mode'. Valid options: cloud, local, offline, disabled ``` **Solution**: ```yaml # ✅ VALID: Use one of the valid modes use_swanlab: true swanlab_project: my-project swanlab_mode: cloud # or: local, offline, disabled ``` #### Empty Project Name ```yaml # ❌ INVALID: Empty string project name use_swanlab: true swanlab_project: "" # Error: swanlab_project cannot be an empty string. ``` **Solution**: ```yaml # ✅ VALID: Provide non-empty project name use_swanlab: true swanlab_project: my-project ``` ### Cloud Mode API Key Warning When using `cloud` mode without an API key, you'll receive a warning with multiple solutions: ```yaml use_swanlab: true swanlab_project: my-project swanlab_mode: cloud # No API key set # Warning: SwanLab cloud mode enabled but no API key found. ``` **Solutions**: 1. Set environment variable: `export SWANLAB_API_KEY=your-api-key` 2. Add to config (less secure): `swanlab_api_key: your-api-key` 3. Run `swanlab login` before training 4. Use `swanlab_mode: local` for offline tracking ### Multi-Logger Performance Warnings Using multiple logging tools simultaneously (SwanLab + WandB + MLflow + Comet) can impact training performance: #### Two Loggers - Warning ```yaml use_swanlab: true swanlab_project: my-project use_wandb: true wandb_project: my-project # Warning: Multiple logging tools enabled: SwanLab, WandB # Expected overhead: ~3.0% per training step. 
``` **Impact**: - Performance overhead: ~1-2% per logger (cumulative) - Increased memory usage - Longer training time per step - Potential config/callback conflicts **Recommendations**: - Choose ONE primary logging tool for production training - Use multiple loggers only for: - Migration period (transitioning between tools) - Short comparison runs - Debugging specific tool issues - Monitor system resources (CPU, memory) during training #### Three+ Loggers - Error-Level Warning ```yaml use_swanlab: true swanlab_project: my-project use_wandb: true wandb_project: my-project use_mlflow: true mlflow_tracking_uri: http://localhost:5000 # ERROR: 3 logging tools enabled simultaneously! # Expected overhead: ~4.5% per training step. # STRONGLY RECOMMEND: Disable all but ONE logging tool ``` **Why This Matters**: - With 3 loggers: ~4-5% overhead per step → significant slowdown over long training - Example: 10,000 steps at 2s/step (20,000 s total) → ~800-1,000 seconds extra (13-17 minutes) - Memory overhead scales with number of loggers - Rare edge cases with callback ordering conflicts ### Auto-Enable Logic For convenience, SwanLab will auto-enable if you specify a project without setting `use_swanlab`: ```yaml # This configuration: swanlab_project: my-project # Automatically becomes: use_swanlab: true swanlab_project: my-project ``` ### Distributed Training Detection In distributed training scenarios (multi-GPU), the plugin automatically detects and reports: ```yaml use_swanlab: true swanlab_project: my-project swanlab_mode: cloud # When running with torchrun --nproc_per_node=4: # Info: Distributed training detected (world_size=4) # Info: SwanLab mode: cloud # Info: Only rank 0 will initialize SwanLab # Info: Other ranks will skip SwanLab to avoid conflicts ``` **Why Only Rank 0**: - Avoids duplicate experiment runs - Reduces network/cloud API overhead on worker ranks - Prevents race conditions in metric logging ## Authentication ### Method 1: Environment Variable (Recommended) ```bash export
SWANLAB_API_KEY=your-api-key-here ``` ### Method 2: Login Command ```bash swanlab login # Enter your API key when prompted ``` ### Method 3: Config File ```yaml swanlab_api_key: your-api-key-here ``` ## What Gets Logged? ### Automatically Logged Metrics - Training loss - Learning rate - Gradient norm - Training steps - Epoch progress ### Automatically Logged Config - Model configuration (base_model, model_type) - Training hyperparameters (learning_rate, batch_size, etc.) - Optimizer settings - Parallelization settings (FSDP, DeepSpeed, Context Parallel) - Axolotl configuration file - DeepSpeed configuration (if used) ## Viewing Your Experiments ### Cloud Mode Visit [https://swanlab.cn](https://swanlab.cn) and navigate to your project to view: - Real-time training metrics - Hyperparameter comparison - System resource usage - Configuration files ### Local Mode ```bash # Start local dashboard swanlab watch ./swanlog # Open browser to http://localhost:5092 ``` ## Integration with Existing Tools SwanLab can work alongside other tracking tools: ```yaml plugins: - axolotl.integrations.swanlab.SwanLabPlugin # Use both SwanLab and Wandb use_swanlab: true swanlab_project: my-project use_wandb: true wandb_project: my-project ``` ## Troubleshooting ### Configuration Errors #### Error: "SwanLab enabled but 'swanlab_project' is not set" **Cause**: You enabled SwanLab (`use_swanlab: true`) but forgot to specify a project name. **Solution**: ```yaml use_swanlab: true swanlab_project: my-project # Add this line ``` #### Error: "Invalid swanlab_mode: 'xxx'" **Cause**: You provided an invalid mode value. **Solution**: Use one of the valid modes: ```yaml swanlab_mode: cloud # or: local, offline, disabled ``` #### Error: "swanlab_project cannot be an empty string" **Cause**: You set `swanlab_project: ""` (empty string). 
**Solution**: Either provide a valid name or remove the field: ```yaml # Option 1: Provide valid name swanlab_project: my-project # Option 2: Remove the field entirely # swanlab_project: "" <- Remove this line ``` ### Import Errors #### Error: "SwanLab is not installed" **Cause**: SwanLab package is not installed in your environment. **Solution**: ```bash pip install swanlab # or (quote the specifier so the shell does not treat '>' as a redirect) pip install 'swanlab>=0.3.0' ``` ### Performance Issues #### Warning: "Multiple logging tools enabled" **Cause**: You have multiple experiment tracking tools enabled (e.g., SwanLab + WandB + MLflow). **Impact**: ~1-2% performance overhead per logger, cumulative. **Solution**: For production training, disable all but one logger: ```yaml # Option 1: Keep only SwanLab use_swanlab: true swanlab_project: my-project use_wandb: false # Disable others use_mlflow: false # Option 2: Keep only WandB use_swanlab: false use_wandb: true wandb_project: my-project ``` **Exception**: Multiple loggers are acceptable for: - Short comparison runs (< 100 steps) - Migration testing between logging tools - Debugging logger-specific issues ### Distributed Training Issues #### SwanLab creates duplicate runs in multi-GPU training **Cause**: All ranks are initializing SwanLab instead of just rank 0. **Expected Behavior**: The plugin automatically ensures only rank 0 initializes SwanLab. You should see: ```text Info: Distributed training detected (world_size=4) Info: Only rank 0 will initialize SwanLab Info: Other ranks will skip SwanLab to avoid conflicts ``` **If you see duplicates**: 1. Check your plugin is loaded correctly 2. Verify you're using the latest SwanLab integration code 3. Check logs for initialization messages on all ranks ### SwanLab not logging metrics **Solution**: Ensure SwanLab is initialized before training starts. The plugin automatically handles this in `pre_model_load`.
### API Key errors **Solution**: ```bash # Verify API key echo $SWANLAB_API_KEY # Re-login swanlab login ``` ### Cloud sync issues **Solution**: Use `offline` mode and sync later: ```yaml swanlab_mode: offline ``` Then sync when ready: ```bash swanlab sync ./swanlog ``` ### Plugin not loaded **Solution**: Verify plugin path in config: ```yaml plugins: - axolotl.integrations.swanlab.SwanLabPlugin # Correct path ``` ### Lark Notification Issues #### Error: "Failed to import SwanLab Lark plugin" **Cause**: Your SwanLab version doesn't include the Lark plugin (requires SwanLab >= 0.3.0). **Solution**: ```bash # Upgrade SwanLab to latest version pip install --upgrade swanlab # Or install specific version pip install 'swanlab>=0.3.0' ``` #### Warning: "Lark webhook has no secret configured" **Cause**: You provided `swanlab_lark_webhook_url` but no `swanlab_lark_secret`. **Impact**: Lark notifications will work, but without HMAC authentication (security risk). **Solution**: Add HMAC secret for production use: ```yaml swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxx swanlab_lark_secret: your-webhook-secret # Add this line ``` **When it's OK to skip secret**: - Local development and testing - Internal networks with restricted access - Non-sensitive training experiments **When secret is required**: - Production training jobs - Training with proprietary data - Multi-team shared Lark groups #### Error: "Failed to register Lark callback" **Cause**: Invalid webhook URL or network connectivity issues. **Diagnostic steps**: ```bash # 1. Test webhook URL manually curl -X POST "YOUR_WEBHOOK_URL" \ -H 'Content-Type: application/json' \ -d '{"msg_type":"text","content":{"text":"Test from Axolotl"}}' # 2. Check SwanLab version pip show swanlab # 3. Verify webhook URL format # Should start with: https://open.feishu.cn/open-apis/bot/v2/hook/ ``` **Solution**: 1. Verify webhook URL is correct (copy from Lark bot settings) 2. 
Check network connectivity to Lark API 3. Ensure webhook is not expired (Lark webhooks can expire) 4. Regenerate webhook URL in Lark bot settings if needed #### Lark notifications not received **Cause**: Multiple possible causes. **Diagnostic checklist**: 1. **Check training logs** for Lark registration confirmation: ```text # Expected log message (rank 0 only): INFO: Registered Lark notification callback with HMAC authentication ``` 2. **Verify webhook in Lark**: Test webhook manually (see above) 3. **Check distributed training**: Only rank 0 sends notifications ```bash # If running multi-GPU, check rank 0 logs specifically grep "Registered Lark" logs/rank_0.log ``` 4. **Verify SwanLab is initialized**: Lark callback needs SwanLab to be running ```yaml use_swanlab: true # Must be enabled swanlab_project: my-project # Must be set ``` 5. **Check Lark bot permissions**: Ensure bot is added to the target group chat #### Duplicate Lark notifications in multi-GPU training **Expected Behavior**: Should NOT happen - only rank 0 sends notifications. **If you see duplicates**: 1. Check that all GPUs are using the same config file 2. Verify plugin is loaded correctly on all ranks 3. Check logs for unexpected Lark initialization on non-zero ranks 4. Ensure `RANK` or `LOCAL_RANK` environment variables are set correctly **Solution**: This is a bug if it occurs. 
Report with: - Full training command - Logs from all ranks - Config file ## Comparison: SwanLab vs WandB | Feature | SwanLab | WandB | |---------|---------|-------| | Open Source | ✅ Yes | ❌ No | | Self-Hosting | ✅ Easy | ⚠️ Complex | | Free Tier | ✅ Generous | ⚠️ Limited | | Chinese Support | ✅ Native | ⚠️ Limited | | Offline Mode | ✅ Full support | ✅ Supported | | Integration | 🆕 New | ✅ Mature | ## Advanced Usage ### Custom Logging You can add custom metrics in your callbacks: ```python import swanlab # In your custom callback swanlab.log({ "custom_metric": value, "epoch": epoch_num }) ``` ### Experiment Comparison ```bash # Compare multiple experiments swanlab compare run1 run2 run3 ``` ## Support - **Documentation**: [https://docs.swanlab.cn](https://docs.swanlab.cn) - **GitHub**: [https://github.com/SwanHubX/SwanLab](https://github.com/SwanHubX/SwanLab) - **Issues**: Report bugs at [GitHub Issues](https://github.com/SwanHubX/SwanLab/issues) ## License This integration follows the Axolotl Community License Agreement. 
class SwanLabConfig(BaseModel):
    """Pydantic schema for the SwanLab-related keys of an Axolotl config.

    Every field is optional. Three validators run on top of the plain type
    checks:

    * ``swanlab_mode`` must be one of ``cloud``/``local``/``offline``/``disabled``
      when it is provided.
    * ``swanlab_project`` must not be blank when it is provided.
    * ``use_swanlab`` being exactly ``True`` requires a truthy
      ``swanlab_project``.
    """

    # NOTE(review): the default is True, so the model-level validator below
    # rejects any config that omits swanlab_project even if the user never
    # wrote `use_swanlab` -- confirm this is the intended default.
    use_swanlab: bool | None = Field(
        default=True,
        json_schema_extra={
            "description": "Enable SwanLab experiment tracking and visualization"
        },
    )
    swanlab_project: str | None = Field(
        default=None,
        json_schema_extra={"description": "Your SwanLab project name"},
    )
    swanlab_experiment_name: str | None = Field(
        default=None,
        json_schema_extra={"description": "Set the name of your SwanLab experiment"},
    )
    swanlab_description: str | None = Field(
        default=None,
        json_schema_extra={"description": "Description for your SwanLab experiment"},
    )
    swanlab_mode: str | None = Field(
        default=None,
        json_schema_extra={
            "description": '"cloud" to sync to SwanLab cloud, "local" for local only, "offline" to save metadata locally, "disabled" to turn off SwanLab'
        },
    )
    swanlab_workspace: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "SwanLab workspace name (organization or username)"
        },
    )
    swanlab_api_key: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "SwanLab API key for authentication. Can also be set via SWANLAB_API_KEY environment variable"
        },
    )
    swanlab_log_model: bool | None = Field(
        default=False,
        json_schema_extra={
            "description": "Whether to log model checkpoints to SwanLab (feature coming soon)"
        },
    )
    swanlab_web_host: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Web address for SwanLab cloud environment (for private deployment)"
        },
    )
    swanlab_api_host: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "API address for SwanLab cloud environment (for private deployment)"
        },
    )
    swanlab_lark_webhook_url: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Lark (Feishu) webhook URL for sending training notifications to team chat"
        },
    )
    swanlab_lark_secret: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Secret for Lark webhook HMAC signature authentication (optional)"
        },
    )
    swanlab_log_completions: bool | None = Field(
        default=True,
        json_schema_extra={
            "description": "Enable logging RLHF completions to SwanLab for qualitative analysis (DPO/KTO/ORPO/GRPO)"
        },
    )
    swanlab_completion_log_interval: int | None = Field(
        default=100,
        json_schema_extra={
            "description": "Number of training steps between completion table logging to SwanLab"
        },
    )
    swanlab_completion_max_buffer: int | None = Field(
        default=128,
        json_schema_extra={
            "description": "Maximum number of completions to buffer before logging (prevents memory leaks)"
        },
    )

    @field_validator("swanlab_mode")
    @classmethod
    def validate_swanlab_mode(cls, v):
        """Accept ``None`` or one of the four known modes; reject anything else.

        Raises:
            ValueError: if ``v`` is set to a value outside the allowed modes.
        """
        allowed = ("cloud", "local", "offline", "disabled")
        if v is None or v in allowed:
            return v

        message = (
            f"Invalid swanlab_mode: '{v}'.\n\n"
            f"Valid options: {', '.join(allowed)}\n\n"
            "Examples:\n"
            " swanlab_mode: cloud # Sync to SwanLab cloud\n"
            " swanlab_mode: local # Local only, no cloud sync\n"
            " swanlab_mode: offline # Save metadata locally\n"
            " swanlab_mode: disabled # Turn off SwanLab\n"
        )
        raise ValueError(message)

    @field_validator("swanlab_project")
    @classmethod
    def validate_swanlab_project(cls, v):
        """Reject project names that are empty or whitespace-only strings.

        Raises:
            ValueError: if ``v`` is a string that is blank after stripping.
        """
        is_blank_string = isinstance(v, str) and not v.strip()
        if is_blank_string:
            raise ValueError(
                "swanlab_project cannot be an empty string.\n\n"
                "Either:\n"
                " 1. Provide a valid project name: swanlab_project: my-project\n"
                " 2. Remove the swanlab_project field entirely\n"
            )
        return v

    @model_validator(mode="after")
    def validate_swanlab_enabled_requires_project(self):
        """Require a project name whenever ``use_swanlab`` is exactly ``True``.

        Raises:
            ValueError: if SwanLab is enabled and ``swanlab_project`` is falsy.
        """
        # Nothing to enforce unless SwanLab is explicitly on and the project is missing.
        if self.use_swanlab is not True or self.swanlab_project:
            return self

        raise ValueError(
            "SwanLab enabled (use_swanlab: true) but 'swanlab_project' is not set.\n\n"
            "Solutions:\n"
            " 1. Add 'swanlab_project: your-project-name' to your config\n"
            " 2. Set 'use_swanlab: false' to disable SwanLab\n\n"
            "Example:\n"
            " use_swanlab: true\n"
            " swanlab_project: my-llm-training\n"
        )
def on_log(
    self,
    args: TrainingArguments,
    state: TrainerState,
    control: TrainerControl,
    logs: dict | None = None,
    **kwargs,
):
    """Buffer completions found in ``logs`` and periodically flush them to SwanLab.

    The keys looked up depend on ``self.trainer_type``:

    - DPO: ``dpo/prompt``, ``dpo/chosen``, ``dpo/rejected`` (+ optional ``dpo/reward_diff``)
    - KTO: ``kto/prompt``, ``kto/completion`` (+ optional ``kto/label``, ``kto/reward``)
    - ORPO: ``orpo/prompt``, ``orpo/chosen``, ``orpo/rejected`` (+ optional ``orpo/log_odds_ratio``)
    - GRPO: ``grpo/prompt``, ``grpo/completion`` (+ optional ``grpo/reward``, ``grpo/advantage``)

    Note: This is a placeholder implementation. Actual log keys depend on the
    TRL trainer implementation. You may need to patch the trainers to expose
    completion data in logs.

    Args:
        args: Training arguments (unused here).
        state: Trainer state; only ``state.global_step`` is read.
        control: Trainer control object (unused here).
        logs: The log dict emitted by the trainer, or ``None``.
        **kwargs: Extra callback keyword arguments (unused here).
    """
    # Nothing to do without logs, or before a trainer type has been detected.
    if logs is None or self.trainer_type is None:
        return

    step = state.global_step

    # Required keys are checked first, so they can be indexed directly;
    # optional keys keep their .get() fallbacks.
    if self.trainer_type == "dpo":
        if all(key in logs for key in ("dpo/prompt", "dpo/chosen", "dpo/rejected")):
            self.logger.add_dpo_completion(
                step=step,
                prompt=logs["dpo/prompt"],
                chosen=logs["dpo/chosen"],
                rejected=logs["dpo/rejected"],
                reward_diff=logs.get("dpo/reward_diff"),
            )
    elif self.trainer_type == "kto":
        if all(key in logs for key in ("kto/prompt", "kto/completion")):
            self.logger.add_kto_completion(
                step=step,
                prompt=logs["kto/prompt"],
                completion=logs["kto/completion"],
                label=logs.get("kto/label", False),
                reward=logs.get("kto/reward"),
            )
    elif self.trainer_type == "orpo":
        if all(
            key in logs for key in ("orpo/prompt", "orpo/chosen", "orpo/rejected")
        ):
            self.logger.add_orpo_completion(
                step=step,
                prompt=logs["orpo/prompt"],
                chosen=logs["orpo/chosen"],
                rejected=logs["orpo/rejected"],
                log_odds_ratio=logs.get("orpo/log_odds_ratio"),
            )
    elif self.trainer_type == "grpo":
        if all(key in logs for key in ("grpo/prompt", "grpo/completion")):
            self.logger.add_grpo_completion(
                step=step,
                prompt=logs["grpo/prompt"],
                completion=logs["grpo/completion"],
                reward=logs.get("grpo/reward"),
                advantage=logs.get("grpo/advantage"),
            )

    # Periodically flush the buffer to SwanLab.
    if step - self._last_logged_step >= self.log_interval:
        if len(self.logger) > 0:
            # log_to_swanlab() returns False when SwanLab is missing, not yet
            # initialized, or the upload raised. Only drop the buffer once it
            # was actually accepted, so completions are not silently lost; the
            # buffer is a bounded deque, so retaining it cannot grow memory.
            if self.logger.log_to_swanlab(table_name=self.table_name):
                self.logger.clear()
        self._last_logged_step = step
self._last_logged_step = state.global_step ================================================ FILE: src/axolotl/integrations/swanlab/completion_logger.py ================================================ """SwanLab completion logger for RLHF/DPO/KTO/ORPO/GRPO training. This module provides utilities for logging model completions during preference training to SwanLab for qualitative analysis. """ from collections import deque from collections.abc import Mapping from typing import Any from axolotl.utils.logging import get_logger LOG = get_logger(__name__) class CompletionLogger: """Memory-bounded logger for RLHF completions. Stores prompts, completions, and rewards in fixed-size deques to prevent memory leaks during long training runs. Logs completion tables to SwanLab for qualitative analysis of model outputs. Example usage: >>> logger = CompletionLogger(maxlen=128) >>> logger.add_dpo_completion( ... step=0, ... prompt="What is AI?", ... chosen="Artificial Intelligence is...", ... rejected="AI means...", ... reward_diff=0.5 ... ) >>> logger.log_to_swanlab() Attributes: maxlen: Maximum number of completions to store (older ones are dropped) data: Deque storing completion dictionaries """ def __init__(self, maxlen: int = 128): """Initialize completion logger with bounded buffer. Args: maxlen: Maximum number of completions to store. When the buffer is full, oldest completions are automatically discarded. Default: 128 (sufficient for most RLHF runs without memory issues) """ self.maxlen = maxlen self.data: deque[Mapping[str, Any]] = deque(maxlen=maxlen) def add_dpo_completion( self, step: int, prompt: str, chosen: str, rejected: str, reward_diff: float | None = None, ) -> None: """Add a DPO completion to the buffer. 
Args: step: Training step number prompt: Input prompt chosen: Chosen (preferred) completion rejected: Rejected (non-preferred) completion reward_diff: Reward difference (chosen - rejected), if available """ entry = { "step": step, "prompt": prompt, "chosen": chosen, "rejected": rejected, } if reward_diff is not None: entry["reward_diff"] = reward_diff self.data.append(entry) def add_kto_completion( self, step: int, prompt: str, completion: str, label: bool, reward: float | None = None, ) -> None: """Add a KTO completion to the buffer. Args: step: Training step number prompt: Input prompt completion: Model-generated completion label: True if desirable, False if undesirable reward: Reward score, if available """ entry = { "step": step, "prompt": prompt, "completion": completion, "label": "desirable" if label else "undesirable", } if reward is not None: entry["reward"] = reward self.data.append(entry) def add_orpo_completion( self, step: int, prompt: str, chosen: str, rejected: str, log_odds_ratio: float | None = None, ) -> None: """Add an ORPO completion to the buffer. Args: step: Training step number prompt: Input prompt chosen: Chosen (preferred) completion rejected: Rejected (non-preferred) completion log_odds_ratio: Log odds ratio between chosen and rejected """ entry = { "step": step, "prompt": prompt, "chosen": chosen, "rejected": rejected, } if log_odds_ratio is not None: entry["log_odds_ratio"] = log_odds_ratio self.data.append(entry) def add_grpo_completion( self, step: int, prompt: str, completion: str, reward: float | None = None, advantage: float | None = None, ) -> None: """Add a GRPO completion to the buffer. 
Args: step: Training step number prompt: Input prompt completion: Model-generated completion reward: Reward score from reward model advantage: Advantage estimate (reward - baseline) """ entry = { "step": step, "prompt": prompt, "completion": completion, } if reward is not None: entry["reward"] = reward if advantage is not None: entry["advantage"] = advantage self.data.append(entry) def log_to_swanlab(self, table_name: str = "completions") -> bool: """Log buffered completions to SwanLab as a table. Creates a SwanLab echarts Table with all buffered completions. Only logs if SwanLab is initialized and data is available. Args: table_name: Name of the table in SwanLab dashboard. Default: "completions" Returns: True if logging succeeded, False otherwise """ if not self.data: LOG.debug("No completions to log to SwanLab") return False try: import swanlab if swanlab.get_run() is None: LOG.debug("SwanLab not initialized, skipping completion logging") return False # Convert deque to list of dicts completions = list(self.data) # Extract headers from first entry (all entries should have same structure) headers = list(completions[0].keys()) # Build rows: each completion becomes one row rows = [] for completion in completions: row = [completion.get(header, "") for header in headers] rows.append(row) # Log to SwanLab as echarts Table swanlab.log({table_name: swanlab.echarts.Table().add(headers, rows)}) LOG.info(f"Logged {len(rows)} completions to SwanLab table '{table_name}'") return True except ImportError: LOG.warning( "SwanLab not installed, cannot log completions. 
" "Install with: pip install swanlab" ) return False except Exception as err: # pylint: disable=broad-except LOG.exception("Failed to log completions to SwanLab: %s", err) return False def clear(self) -> None: """Clear all buffered completions.""" self.data.clear() def __len__(self) -> int: """Return number of buffered completions.""" return len(self.data) def __repr__(self) -> str: """String representation showing buffer status.""" return ( f"CompletionLogger(maxlen={self.maxlen}, " f"buffered={len(self.data)}/{self.maxlen})" ) ================================================ FILE: src/axolotl/integrations/swanlab/plugins.py ================================================ """SwanLab Plugin for Axolotl""" from __future__ import annotations from typing import TYPE_CHECKING from axolotl.integrations.base import BasePlugin from axolotl.utils.logging import get_logger if TYPE_CHECKING: from transformers import TrainerCallback from axolotl.utils.dict import DictDefault LOG = get_logger(__name__) class SwanLabPlugin(BasePlugin): """ SwanLab integration plugin for Axolotl. Provides experiment tracking, visualization, and logging capabilities using SwanLab (https://swanlab.cn). Usage in config.yaml: plugins: - axolotl.integrations.swanlab.SwanLabPlugin use_swanlab: true swanlab_project: my-project swanlab_experiment_name: my-experiment swanlab_mode: cloud # or 'local', 'offline', 'disabled' """ def __init__(self): super().__init__() self.swanlab_initialized = False LOG.info("SwanLab plugin initialized") def get_input_args(self) -> str: """Returns the configuration model for SwanLab integration.""" return "axolotl.integrations.swanlab.SwanLabConfig" def register(self, cfg: dict): """Register SwanLab plugin with configuration and conflict detection.""" LOG.info("Registering SwanLab plugin") # === Conflict Detection: Required Fields === # Check if SwanLab is enabled if cfg.get("use_swanlab"): # 1. 
Validate project name is set if not cfg.get("swanlab_project"): raise ValueError( "SwanLab enabled but 'swanlab_project' is not set.\n\n" "Solutions:\n" " 1. Add 'swanlab_project: your-project-name' to your config\n" " 2. Set 'use_swanlab: false' to disable SwanLab\n\n" "See: src/axolotl/integrations/swanlab/README.md for examples" ) # 2. Validate swanlab_mode value valid_modes = ["cloud", "local", "offline", "disabled"] mode = cfg.get("swanlab_mode") if mode and mode not in valid_modes: raise ValueError( f"Invalid swanlab_mode: '{mode}'.\n\n" f"Valid options: {', '.join(valid_modes)}\n\n" f"Example:\n" f" swanlab_mode: cloud # Sync to SwanLab cloud\n" f" swanlab_mode: local # Local only, no cloud sync\n" ) # 3. Check API key for cloud mode import os mode = cfg.get("swanlab_mode", "cloud") # Default is cloud if mode == "cloud": api_key = cfg.get("swanlab_api_key") or os.environ.get( "SWANLAB_API_KEY" ) if not api_key: LOG.warning( "SwanLab cloud mode enabled but no API key found.\n" "SwanLab may fail to initialize during training.\n\n" "Solutions:\n" " 1. Set SWANLAB_API_KEY environment variable:\n" " export SWANLAB_API_KEY=your-api-key\n" " 2. Add 'swanlab_api_key: your-api-key' to config (less secure)\n" " 3. Run 'swanlab login' before training\n" " 4. 
Use 'swanlab_mode: local' for offline tracking\n" ) # === Conflict Detection: Multi-Logger Performance Warning === # Detect all active logging tools active_loggers = [] if cfg.get("use_wandb"): active_loggers.append("WandB") if cfg.get("use_mlflow"): active_loggers.append("MLflow") if cfg.get("comet_api_key") or cfg.get("comet_project_name"): active_loggers.append("Comet") if cfg.get("use_swanlab"): active_loggers.append("SwanLab") if len(active_loggers) > 1: LOG.warning( f"\n{'=' * 70}\n" f"Multiple logging tools enabled: {', '.join(active_loggers)}\n" f"{'=' * 70}\n" f"This may cause:\n" f" - Performance overhead (~1-2% per logger, cumulative)\n" f" - Increased memory usage\n" f" - Longer training time per step\n" f" - Potential config/callback conflicts\n\n" f"Recommendations:\n" f" - Choose ONE primary logging tool for production training\n" f" - Use multiple loggers only for:\n" f" * Migration period (transitioning between tools)\n" f" * Short comparison runs\n" f" * Debugging specific tool issues\n" f" - Monitor system resources (CPU, memory) during training\n" f"{'=' * 70}\n" ) if len(active_loggers) >= 3: LOG.error( f"\n{'!' * 70}\n" f"WARNING: {len(active_loggers)} logging tools enabled simultaneously!\n" f"{'!' * 70}\n" f"This is likely unintentional and WILL significantly impact performance.\n" f"Expected overhead: ~{len(active_loggers) * 1.5:.1f}% per training step.\n\n" f"STRONGLY RECOMMEND:\n" f" - Disable all but ONE logging tool\n" f" - Use config inheritance to manage multiple configs\n" f"{'!' 
* 70}\n" ) # === Auto-Enable Logic === # Enable SwanLab if project is specified if cfg.get("swanlab_project") and not cfg.get("use_swanlab"): cfg["use_swanlab"] = True LOG.info("Automatically enabled use_swanlab because swanlab_project is set") def pre_model_load(self, cfg: DictDefault): """Initialize SwanLab before model loading with runtime checks.""" if not cfg.use_swanlab: return # === Runtime Check: Import Availability === try: import swanlab except ImportError as err: raise ImportError( "SwanLab is not installed.\n\n" "Install with:\n" " pip install swanlab\n\n" "Or add to requirements:\n" " swanlab>=0.3.0\n\n" f"Original error: {err}" ) from err # Log SwanLab version try: swanlab_version = swanlab.__version__ LOG.info(f"SwanLab version: {swanlab_version}") except AttributeError: LOG.warning("Could not determine SwanLab version") # === Runtime Check: Distributed Training Setup === from axolotl.utils.distributed import get_world_size, is_main_process world_size = get_world_size() if world_size > 1: mode = getattr(cfg, "swanlab_mode", "cloud") LOG.info( f"\n{'=' * 70}\n" f"Distributed training detected (world_size={world_size})\n" f"SwanLab mode: {mode}\n" f"{'=' * 70}\n" f"Behavior:\n" f" - Only rank 0 will initialize SwanLab\n" f" - Other ranks will skip SwanLab to avoid conflicts\n" ) if mode == "cloud": LOG.info( f" - Only rank 0 will upload to SwanLab cloud\n" f" - Other ranks run without SwanLab overhead\n" f"{'=' * 70}\n" ) # Only initialize SwanLab on the main process (rank 0) # to avoid creating multiple runs in distributed training if not is_main_process(): LOG.debug("Skipping SwanLab initialization on non-main process") return # Initialize SwanLab run (passing all params directly to init) try: init_kwargs = self._get_swanlab_init_kwargs(cfg) swanlab.init(**init_kwargs) self.swanlab_initialized = True LOG.info(f"SwanLab initialized with project: {cfg.swanlab_project}") # Register Lark notification callback (if configured) 
def post_trainer_create(self, cfg: DictDefault, trainer):
    """Record trainer-level settings in SwanLab once the trainer exists.

    When ``cfg.use_swanlab`` is set and this plugin has initialized a SwanLab
    run, a small summary of the trainer setup (total steps, epochs, batch
    size, gradient accumulation steps) is added to the SwanLab run config.
    Any failure while doing so is logged at debug level and otherwise
    ignored.

    Registration of the RLHF completion-logging callback is attempted
    afterwards in every case.

    Args:
        cfg: Axolotl configuration; ``use_swanlab`` is read from it.
        trainer: Trainer instance whose ``state`` and ``args`` are summarized.
    """
    if cfg.use_swanlab and self.swanlab_initialized:
        try:
            import swanlab

            state = trainer.state
            args = trainer.args

            # Collect only the values that are actually available; falsy
            # step/epoch/accumulation values are left out entirely.
            summary = {}
            if state.max_steps:
                summary["total_steps"] = int(state.max_steps)
            if args.num_train_epochs:
                summary["num_train_epochs"] = float(args.num_train_epochs)
            if hasattr(args, "train_batch_size"):
                summary["train_batch_size"] = int(args.train_batch_size)
            if args.gradient_accumulation_steps:
                summary["gradient_accumulation_steps"] = int(
                    args.gradient_accumulation_steps
                )

            if summary:
                swanlab.config.update(summary)
                LOG.info("Logged trainer configuration to SwanLab")

        except Exception as err:  # pylint: disable=broad-except
            LOG.debug(f"Failed to log trainer config to SwanLab: {err}")

    # Register RLHF completion logging callback if enabled
    self._register_completion_callback(cfg, trainer)
def _prepare_config_for_logging(self, cfg: DictDefault) -> dict:
    """Build the subset of the training config that is logged to SwanLab.

    Selected attributes are read from ``cfg`` with ``getattr`` and reduced to
    plain values: ``None``, numbers, booleans and strings pass through, and
    anything else is converted with ``str()``. Entries whose value is
    ``None``, ``""`` or the string ``"None"`` are dropped from the result.

    If anything goes wrong while collecting the values, a warning is logged
    and a minimal dict with only ``base_model`` and ``learning_rate`` is
    returned instead.

    Args:
        cfg: Configuration object to read the attributes from.

    Returns:
        dict: Flat mapping of config names to loggable values.
    """

    def to_loggable(raw):
        """Pass through None/int/float/bool/str; stringify everything else."""
        if raw is None or isinstance(raw, (int, float, bool, str)):
            return raw
        return str(raw)

    # (attribute name, default used when the attribute is missing)
    tracked_fields = (
        ("base_model", ""),
        ("model_type", ""),
        ("sequence_len", None),
        ("micro_batch_size", None),
        ("gradient_accumulation_steps", None),
        ("num_epochs", None),
        ("max_steps", None),
        ("learning_rate", None),
        ("lr_scheduler", ""),
        ("optimizer", ""),
        ("warmup_ratio", None),
        ("weight_decay", None),
        ("seed", None),
        ("bf16", None),
        ("tf32", None),
        ("flash_attention", None),
        ("sample_packing", None),
    )
    # Parallelism sizes are only reported when the attribute exists at all
    parallel_fields = (
        "context_parallel_size",
        "tensor_parallel_size",
        "dp_shard_size",
    )

    try:
        collected = {
            name: to_loggable(getattr(cfg, name, fallback))
            for name, fallback in tracked_fields
        }

        # FSDP / DeepSpeed are reported as simple "enabled" flags
        if hasattr(cfg, "fsdp_config") and cfg.fsdp_config:
            collected["fsdp_enabled"] = True
            collected["fsdp_version"] = to_loggable(
                getattr(cfg, "fsdp_version", None)
            )

        if hasattr(cfg, "deepspeed") and cfg.deepspeed:
            collected["deepspeed_enabled"] = True

        for name in parallel_fields:
            if hasattr(cfg, name):
                collected[name] = to_loggable(getattr(cfg, name, None))

        # Drop unset values: None, empty strings and stringified None
        return {
            key: value
            for key, value in collected.items()
            if value is not None and value != "" and value != "None"
        }

    except Exception as err:  # pylint: disable=broad-except
        LOG.warning(f"Failed to prepare config for logging: {err}")

        # Fall back to a minimal config
        try:
            raw_lr = getattr(cfg, "learning_rate", None)
            fallback_lr = float(raw_lr) if raw_lr is not None else None
        except (TypeError, ValueError):
            fallback_lr = None

        return {
            "base_model": str(getattr(cfg, "base_model", "unknown")),
            "learning_rate": fallback_lr,
        }
" "For production use, set 'swanlab_lark_secret' to enable HMAC signature verification." ) except ImportError as err: LOG.warning( f"Failed to import SwanLab Lark plugin: {err}\n\n" "Lark notifications require SwanLab >= 0.3.0 with plugin support.\n" "Install with: pip install 'swanlab>=0.3.0'\n\n" "Continuing without Lark notifications..." ) except Exception as err: # pylint: disable=broad-except LOG.exception( "Failed to register Lark callback: %s\n\n" "Check your Lark webhook URL and secret configuration.\n" "Continuing without Lark notifications...", err, ) def _register_completion_callback(self, cfg: DictDefault, trainer): """Register RLHF completion logging callback if enabled and applicable. This callback logs model completions (prompts, chosen/rejected responses, rewards) to SwanLab during RLHF training for qualitative analysis. Args: cfg: Configuration object with completion logging settings trainer: The trainer instance to add callback to """ # Check if completion logging is enabled log_completions = getattr(cfg, "swanlab_log_completions", True) if not log_completions: LOG.debug("SwanLab completion logging disabled by config") return # Check if trainer is an RLHF trainer trainer_name = trainer.__class__.__name__ rlhf_trainers = ["DPO", "KTO", "ORPO", "GRPO", "CPO"] is_rlhf_trainer = any(name in trainer_name for name in rlhf_trainers) if not is_rlhf_trainer: LOG.debug( f"Trainer {trainer_name} is not an RLHF trainer, " "skipping completion logging callback" ) return try: from axolotl.integrations.swanlab.callbacks import ( SwanLabRLHFCompletionCallback, ) # Get configuration parameters log_interval = getattr(cfg, "swanlab_completion_log_interval", 100) max_buffer = getattr(cfg, "swanlab_completion_max_buffer", 128) # Create and register callback completion_callback = SwanLabRLHFCompletionCallback( log_interval=log_interval, max_completions=max_buffer, table_name="rlhf_completions", ) trainer.add_callback(completion_callback) LOG.info( f"Registered SwanLab 
RLHF completion logging callback for {trainer_name} " f"(log_interval={log_interval}, max_buffer={max_buffer})" ) except ImportError as err: LOG.warning( f"Failed to import SwanLab completion callback: {err}\n\n" "This is a bug - the callback should be available.\n" "Please report this issue.\n\n" "Continuing without completion logging..." ) except Exception as err: # pylint: disable=broad-except LOG.exception( "Failed to register SwanLab completion callback: %s\n\n" "Continuing without completion logging...", err, ) ================================================ FILE: src/axolotl/integrations/swanlab/profiling.py ================================================ """SwanLab profiling utilities for Axolotl trainers. This module provides decorators and context managers for profiling trainer methods and logging execution times to SwanLab. """ import time from contextlib import contextmanager from functools import wraps from typing import Any, Callable from axolotl.utils.logging import get_logger LOG = get_logger(__name__) @contextmanager def swanlab_profiling_context(trainer: Any, func_name: str): """Context manager for profiling trainer methods. Measures execution time and logs to SwanLab if enabled. Example usage: >>> with swanlab_profiling_context(self, "training_step"): ... 
class ProfilingConfig:
    """Configuration for SwanLab profiling.

    This class provides a centralized way to control profiling behavior.

    Attributes:
        enabled: Whether profiling is enabled globally
        min_duration_ms: Minimum duration (in ms) to log (filters out very fast ops)
        log_interval: Log every N function calls (to reduce overhead). Only
            calls that pass the ``min_duration_ms`` filter are counted. A
            value of 0 is treated as 1 (log every call).
    """

    def __init__(
        self,
        enabled: bool = True,
        min_duration_ms: float = 0.1,
        log_interval: int = 1,
    ):
        """Initialize profiling configuration.

        Args:
            enabled: Enable profiling. Default: True
            min_duration_ms: Minimum duration to log (ms). Default: 0.1
            log_interval: Log every N calls. Default: 1 (log all)
        """
        self.enabled = enabled
        self.min_duration_ms = min_duration_ms
        self.log_interval = log_interval
        # Per-function count of calls that passed the duration filter
        self._call_counts: dict[str, int] = {}

    def should_log(self, func_name: str, duration_seconds: float) -> bool:
        """Check if a profiling measurement should be logged.

        Args:
            func_name: Name of the profiled function
            duration_seconds: Execution duration in seconds

        Returns:
            True if should log, False otherwise
        """
        if not self.enabled:
            return False

        # Check minimum duration threshold (measurements below it are not
        # counted towards the interval either)
        if duration_seconds * 1000 < self.min_duration_ms:
            return False

        count = self._call_counts.get(func_name, 0) + 1
        self._call_counts[func_name] = count

        # A zero interval would raise ZeroDivisionError in the modulo below.
        # This method runs inside the ``finally`` of the profiling context
        # managers, so such an error would propagate into the profiled code.
        # Fall back to "log every call" instead.
        interval = self.log_interval or 1

        # Always log on first call OR at intervals
        return count == 1 or count % interval == 0
If None, uses DEFAULT_PROFILING_CONFIG Yields: None """ if config is None: config = DEFAULT_PROFILING_CONFIG start_time = time.perf_counter() try: yield finally: duration = time.perf_counter() - start_time # Check if should log based on config if config.should_log(func_name, duration): # Check if SwanLab is enabled use_swanlab = getattr(getattr(trainer, "cfg", None), "use_swanlab", False) if use_swanlab: try: import swanlab if swanlab.get_run() is not None: trainer_class = trainer.__class__.__name__ metric_name = ( f"profiling/Time taken: {trainer_class}.{func_name}" ) swanlab.log({metric_name: duration}) except ImportError: pass except Exception as err: # pylint: disable=broad-except LOG.debug(f"Failed to log profiling metric for {func_name}: {err}") ================================================ FILE: src/axolotl/kernels/__init__.py ================================================ ================================================ FILE: src/axolotl/kernels/geglu.py ================================================ """Module for definition of GEGLU Triton kernels. See "GLU Variants Improve Transformer" (https://arxiv.org/abs/2002.05202). Credit to `unsloth` (https://unsloth.ai/) for inspiration for this implementation. """ import torch import triton import triton.language as tl @triton.jit def _geglu_fwd_kernel( gate_ptr, up_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr, ): """GEGLU forward kernel. Args: gate_ptr: Pointer to gate tensor [*, hidden_dim]. up_ptr: Pointer to up-projection tensor [*, hidden_dim]. out_ptr: Pointer to output tensor [*, hidden_dim]. n_elements: Total number of elements in the input tensors. BLOCK_SIZE: Size of thread blocks for parallel computation. 
""" block_idx = tl.program_id(0) offsets = block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) mask = offsets < n_elements gate = tl.load(gate_ptr + offsets, mask=mask, other=0).to(tl.float32) up = tl.load(up_ptr + offsets, mask=mask, other=0) # Compute activation in fp32 then convert back gelu_gate = 0.5 * gate * (tl.math.erf(tl.math.rsqrt(2.0) * gate) + 1.0) gelu_gate = gelu_gate.to(up.dtype) result = gelu_gate * up tl.store(out_ptr + offsets, result, mask=mask) def geglu_forward(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor: """GEGLU forward pass. Args: gate: Input gate tensor of shape [batch, seq_len, hidden_dim]. up: Up-projection tensor of shape [batch, seq_len, hidden_dim]. Returns: torch.Tensor: Output tensor of shape [batch, seq_len, hidden_dim]. """ batch, seq_len, hidden_dim = gate.shape n_elements = gate.numel() out = torch.empty((batch, seq_len, hidden_dim), dtype=gate.dtype, device="cuda") grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) # noqa: E731 _geglu_fwd_kernel[grid]( gate_ptr=gate, up_ptr=up, out_ptr=out, n_elements=n_elements, BLOCK_SIZE=1024, ) return out @triton.jit def _geglu_bwd_kernel( grad_out_ptr, gate_ptr, up_ptr, n_elements, BLOCK_SIZE: tl.constexpr, ): """GEGLU backward kernel. Stores gradient results in-place. Args: grad_out_ptr: Pointer to gradient output tensor [*, hidden_dim]. gate_ptr: Pointer to gate tensor [*, hidden_dim]. up_ptr: Pointer to up-projection tensor [*, hidden_dim]. n_elements: Total number of elements in the input tensors. BLOCK_SIZE: Size of thread blocks for parallel computation. 
def geglu_backward(
    grad_output: torch.Tensor, gate: torch.Tensor, up: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Run the GEGLU backward kernel, reusing the inputs as output buffers.

    Args:
        grad_output: Gradient of loss with respect to output, shape
            `[batch, seq_len, hidden_dim]`.
        gate: Gate tensor from forward pass, shape
            `[batch, seq_len, hidden_dim]`.
        up: Up-projection tensor from forward pass, shape
            `[batch, seq_len, hidden_dim]`.

    Returns:
        The same three tensor objects that were passed in, which after the
        kernel has run hold, in order:
            - GEGLU activation output (`h`)
            - Gradient with respect to gate (`grad_gate`)
            - Gradient with respect to up (`grad_up`)

    Note:
        The original contents of `grad_output`, `gate` and `up` are
        overwritten.
    """
    total = grad_output.numel()

    def launch_grid(meta):
        # One program per BLOCK_SIZE-sized chunk of the flattened tensors
        return (triton.cdiv(total, meta["BLOCK_SIZE"]),)

    _geglu_bwd_kernel[launch_grid](
        grad_out_ptr=grad_output,
        gate_ptr=gate,
        up_ptr=up,
        n_elements=total,
        BLOCK_SIZE=1024,
    )

    return grad_output, gate, up
""" # For DPO or disabled adapters base_layer = proj.base_layer if hasattr(proj, "base_layer") else proj W = base_layer.weight b = base_layer.bias if not hasattr(proj, "disable_adapters") or proj.disable_adapters or proj.merged: quant_state = getattr(W, "quant_state", None) if quant_state is None and W.dtype == torch.float8_e4m3fn: quant_state = getattr(base_layer, "weight_scale_inv", None) return W, b, quant_state, None, None, None quant_state = getattr(W, "quant_state", None) if quant_state is None and W.dtype == torch.float8_e4m3fn: quant_state = getattr(base_layer, "weight_scale_inv", None) active_adapter = ( proj.active_adapters[0] if hasattr(proj, "active_adapters") else proj.active_adapter ) linear_A = proj.lora_A[active_adapter] linear_B = proj.lora_B[active_adapter] # This manual unsharding is needed for FSDP2 + LoRA kernels compatibility. # We fuse linear layers + LoRA adapters calculations into a single # torch.autograd.Function, bypassing the registered unshard / reshard behavior. # Note that we don't apply resharding later in this module (it gets messy quickly), # but LoRA parameters are generally small enough that this is not an issue. if isinstance(linear_A.weight, DTensor): linear_A.unshard() linear_B.unshard() A = linear_A.weight B = linear_B.weight s = proj.scaling[active_adapter] return W, b, quant_state, A, B, s def matmul_lora( X: torch.Tensor, W: torch.Tensor, b: torch.Tensor | None, W_quant: QuantState | torch.Tensor | None, A: torch.Tensor | None, B: torch.Tensor | None, s: float | None, out: torch.Tensor | None = None, ) -> torch.Tensor: """ Efficient fused matmul + LoRA computation. 
Args: X: Input tensor [*, in_features] W: Base weight matrix [out_features, in_features] W_quant: Quantization state for W A: LoRA A matrix [rank, in_features] B: LoRA B matrix [out_features, rank] s: LoRA scaling factor out: Optional output tensor for inplace operations Returns: Result of X @ W + X @ A @ B """ dtype = X.dtype W = dequantize(W.t(), W_quant) reshape = False if X.dim() == 3: batch, seq_len, _ = X.shape X = X.view(-1, X.shape[-1]) reshape = True out = torch.matmul(X, W, out=out) if W_quant is not None: del W if A is not None: A, B = A.t().to(dtype), B.t().to(dtype) # type: ignore[union-attr] out += s * X @ A @ B if b is not None: out += b return out.view(batch, seq_len, -1) if reshape else out class LoRA_MLP(torch.autograd.Function): """Optimized LoRA MLP implementation.""" @staticmethod @torch_amp_custom_fwd def forward( ctx, X: torch.Tensor, gate_weight: torch.Tensor, gate_bias: torch.Tensor | None, gate_quant: QuantState | None, gate_A: torch.Tensor | None, gate_B: torch.Tensor | None, gate_scale: float, up_weight: torch.Tensor, up_bias: torch.Tensor | None, up_quant: QuantState | None, up_A: torch.Tensor | None, up_B: torch.Tensor | None, up_scale: float, down_weight: torch.Tensor, down_bias: torch.Tensor | None, down_quant: QuantState | None, down_A: torch.Tensor | None, down_B: torch.Tensor | None, down_scale: float, activation_fn: Callable, activation_fn_backward: Callable, inplace: bool | None = True, ) -> torch.Tensor: """ Forward pass for LoRA MLP. 
@staticmethod
@torch_amp_custom_bwd
def backward(
    ctx: torch.autograd.function.FunctionCtx,
    grad_output: torch.Tensor,
) -> tuple[
    torch.Tensor | None,
    None,
    None,
    None,
    torch.Tensor | None,
    torch.Tensor | None,
    None,
    None,
    None,
    None,
    torch.Tensor | None,
    torch.Tensor | None,
    None,
    None,
    None,
    None,
    torch.Tensor | None,
    torch.Tensor | None,
    None,
    None,
    None,
    None,
    None,
]:
    """
    Performs backward pass computation for LoRA MLP.

    Gradients are produced for the input `X` (only if requested through
    `ctx.needs_input_grad[0]`) and for the LoRA A/B matrices of the gate, up
    and down projections. Base weights, biases, quantization states, scales
    and the activation callables receive `None`.

    Args:
        ctx: Context object storing tensors saved during forward pass
        grad_output: Gradient of loss with respect to layer output

    Returns:
        Tuple containing gradients for all inputs from forward pass:
        - Input gradient tensor (or `None`)
        - `None` for weights/biases/quantization states
        - LoRA A/B matrix gradients (or `None`)
        - `None` for scaling factors
        - `None` for activation functions and flags

    Note:
        When `ctx.inplace` is set, the saved `X` tensor is reused as the
        output buffer for the input gradient, so its contents are
        overwritten.
    """
    (
        X,
        gate,
        up,
        gate_A,
        gate_B,
        up_A,
        up_B,
        down_A,
        down_B,
    ) = ctx.saved_tensors
    gate_scale, up_scale, down_scale = ctx.scales
    gate_quant, up_quant, down_quant = ctx.quants
    gate_weight, up_weight, down_weight = ctx.weights

    # Transpose all LoRA matrices
    # (from here on every *_A / *_B name refers to the transposed matrix)
    gate_A, gate_B = (
        gate_A.t() if gate_A is not None else None,
        gate_B.t() if gate_B is not None else None,
    )
    up_A, up_B = (
        up_A.t() if up_A is not None else None,
        up_B.t() if up_B is not None else None,
    )
    down_A, down_B = (
        down_A.t() if down_A is not None else None,
        down_B.t() if down_B is not None else None,
    )

    # Reshape inputs
    # Everything below works on 2-D views; the original 3-D shape of X is
    # kept so dX can be reshaped back at the end.
    batch, seq_len, hd = X.shape
    grad_output = grad_output.view(-1, grad_output.shape[-1])
    X = X.view(-1, X.shape[-1])
    gate = gate.view(-1, gate.shape[-1])
    up = up.view(-1, up.shape[-1])
    dtype = X.dtype

    # Down projection
    # Backpropagate through the down projection by reusing matmul_lora with
    # the transposed weight and the (already transposed) B/A swapped.
    grad_down = matmul_lora(
        grad_output,
        down_weight.t(),
        None,
        down_quant,
        down_B,
        down_A,
        down_scale,
    )

    # Activation backward
    # Returns the recomputed activation output `h` plus the gradients with
    # respect to gate and up.
    # NOTE(review): the geglu/swiglu backward helpers appear to overwrite
    # their inputs in place — confirm before reusing grad_down/gate/up below.
    h, grad_gate, grad_up = ctx.activation_fn_backward(grad_down, gate, up)

    # Initialize and compute LoRA gradients
    # These are gradients for the transposed matrices; they are transposed
    # back in the return statement.
    d_down_A = d_down_B = d_up_A = d_up_B = d_gate_A = d_gate_B = None

    if down_A is not None and down_B is not None:
        d_down_A = h.t() @ (grad_output @ down_B.t())
        d_down_B = (down_A.t() @ h.t()) @ grad_output
        d_down_A *= down_scale
        d_down_B *= down_scale

    if up_A is not None and up_B is not None:
        d_up_A = X.t() @ (grad_up @ up_B.t())
        d_up_B = (up_A.t() @ X.t()) @ grad_up
        d_up_A *= up_scale
        d_up_B *= up_scale

    if gate_A is not None and gate_B is not None:
        d_gate_A = X.t() @ (grad_gate @ gate_B.t())
        d_gate_B = (gate_A.t() @ X.t()) @ grad_gate
        d_gate_A *= gate_scale
        d_gate_B *= gate_scale

    # Compute input gradients
    # The zeros tensor only serves as a "gradient requested" marker; it is
    # replaced by the matmul result below.
    dX = torch.zeros_like(X) if ctx.needs_input_grad[0] else None

    if dX is not None:
        # Up projection gradients
        up_weight = dequantize(up_weight.t(), up_quant)
        if ctx.inplace:
            # Writes the result into X's storage. All reads of X for the
            # LoRA gradients happen above this point.
            dX = torch.matmul(grad_up, up_weight.t(), out=X)
        else:
            dX = torch.matmul(grad_up, up_weight.t())
        del up_weight

        # Note the .to(dtype) only where mixing LoRA with base weights
        if up_A is not None and up_B is not None:
            dX += grad_up @ up_B.to(dtype).t() @ (up_scale * up_A.to(dtype).t())

        # Gate projection gradients
        gate_weight = dequantize(gate_weight, gate_quant)
        dX += grad_gate @ gate_weight
        del gate_weight

        if gate_A is not None and gate_B is not None:
            dX += (
                grad_gate
                @ gate_B.to(dtype).t()
                @ (gate_scale * gate_A.to(dtype).t())
            )

        # Reshape back
        dX = dX.view(batch, seq_len, hd)

    # Return gradients in correct order matching forward inputs
    # NOTE(review): this tuple has 23 entries while forward() takes 22
    # arguments after ctx; the surplus trailing None looks harmless, but
    # confirm against the torch.autograd.Function contract.
    return (
        dX,
        None,
        None,
        None,
        d_gate_A.t() if d_gate_A is not None else None,
        d_gate_B.t() if d_gate_B is not None else None,
        None,
        None,
        None,
        None,
        d_up_A.t() if d_up_A is not None else None,
        d_up_B.t() if d_up_B is not None else None,
        None,
        None,
        None,
        None,
        d_down_A.t() if d_down_A is not None else None,
        d_down_B.t() if d_down_B is not None else None,
        None,
        None,
        None,
        None,
        None,
    )
class LoRA_QKV(torch.autograd.Function):
    """
    Optimized LoRA QKV implementation with quantization support.

    Computes the query, key and value projections of an attention layer inside a
    single autograd node. Every projection is delegated to `matmul_lora` in the
    forward pass; the backward pass computes the LoRA A/B gradients and the input
    gradient by hand, dequantizing one base weight at a time.

    NOTE(review): the backward math assumes `matmul_lora` evaluates
    `X @ W.T + s * (X @ A.T) @ B.T`; `matmul_lora` is defined outside this class.
    """

    @staticmethod
    @torch_amp_custom_fwd
    def forward(
        ctx: torch.autograd.function.FunctionCtx,
        X: torch.Tensor,
        q_weight: torch.Tensor,
        q_bias: torch.Tensor | None,
        q_quant: QuantState | None,
        q_A: torch.Tensor | None,
        q_B: torch.Tensor | None,
        q_scale: float,
        k_weight: torch.Tensor,
        k_bias: torch.Tensor | None,
        k_quant: QuantState | None,
        k_A: torch.Tensor | None,
        k_B: torch.Tensor | None,
        k_scale: float,
        v_weight: torch.Tensor,
        v_bias: torch.Tensor | None,
        v_quant: QuantState | None,
        v_A: torch.Tensor | None,
        v_B: torch.Tensor | None,
        v_scale: float,
        inplace: bool = True,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Forward pass computing Q, K, V projections with LoRA.

        Args:
            ctx: Autograd context
            X: Input tensor
            q_weight: Query projection weight
            q_bias: Query projection bias
            q_quant: Query quantization state
            q_A: Query LoRA A matrix
            q_B: Query LoRA B matrix
            q_scale: Query LoRA scale
            k_weight: Key projection weight
            k_bias: Key projection bias
            k_quant: Key quantization state
            k_A: Key LoRA A matrix
            k_B: Key LoRA B matrix
            k_scale: Key LoRA scale
            v_weight: Value projection weight
            v_bias: Value projection bias
            v_quant: Value quantization state
            v_A: Value LoRA A matrix
            v_B: Value LoRA B matrix
            v_scale: Value LoRA scale
            inplace: Whether the backward pass may overwrite `X` with the input
                gradient

        Returns:
            Tuple of (Query, Key, Value) projection tensors
        """
        Q = matmul_lora(X, q_weight, q_bias, q_quant, q_A, q_B, q_scale)
        K = matmul_lora(X, k_weight, k_bias, k_quant, k_A, k_B, k_scale)
        V = matmul_lora(X, v_weight, v_bias, v_quant, v_A, v_B, v_scale)

        # Tensors go through save_for_backward; everything else is stashed on ctx.
        ctx.save_for_backward(X, q_A, q_B, k_A, k_B, v_A, v_B)
        ctx.scales = (q_scale, k_scale, v_scale)
        ctx.quants = (q_quant, k_quant, v_quant)
        ctx.weights = (q_weight, k_weight, v_weight)
        ctx.biases = (q_bias, k_bias, v_bias)
        ctx.inplace = inplace

        return Q, K, V

    @staticmethod
    @torch_amp_custom_bwd
    def backward(
        ctx: torch.autograd.function.FunctionCtx,
        q_grad: torch.Tensor,
        k_grad: torch.Tensor,
        v_grad: torch.Tensor,
    ) -> tuple[
        torch.Tensor,
        None,
        None,
        None,
        torch.Tensor | None,
        torch.Tensor | None,
        None,
        None,
        None,
        None,
        torch.Tensor | None,
        torch.Tensor | None,
        None,
        None,
        None,
        None,
        torch.Tensor | None,
        torch.Tensor | None,
        None,
        None,
    ]:
        """
        Backward pass computing gradients for LoRA QKV.

        Args:
            ctx: Autograd context
            q_grad: Gradient for query projection
            k_grad: Gradient for key projection
            v_grad: Gradient for value projection

        Returns:
            Tuple containing gradients for all forward inputs, in forward-argument
            order: the input gradient, then for each of q/k/v `None` for
            weight/bias/quant state, the LoRA A and B gradients (or `None` when the
            adapter is absent) and `None` for the scale; finally `None` for
            `inplace`.
        """
        X, A_q, B_q, A_k, B_k, A_v, B_v = ctx.saved_tensors
        q_weight, k_weight, v_weight = ctx.weights
        q_quant, k_quant, v_quant = ctx.quants
        q_scale, k_scale, v_scale = ctx.scales

        dtype = X.dtype

        # Flatten the leading (batch, seq) dims. `reshape` is used for every
        # incoming gradient because upstream grads are not guaranteed to be
        # contiguous and `view` would raise on them.
        batch, seq_len = X.shape[:2]
        q_grad = q_grad.reshape(-1, q_grad.shape[-1])
        k_grad = k_grad.reshape(-1, k_grad.shape[-1])
        v_grad = v_grad.reshape(-1, v_grad.shape[-1])
        X = X.view(-1, X.shape[-1])

        # Pre-transpose X once; it is shared by all three LoRA gradient paths.
        X_t = X.t()

        def lora_grads(grad, A, B, scale):
            """Return (d_A, d_B, grad @ B, scale * A) for one projection, or Nones."""
            if A is None or B is None:
                return None, None, None, None
            A_scaled = (scale * A).to(dtype)
            # grad @ B is kept so the input-gradient step below can reuse it.
            grad_proj = torch.mm(grad, B.to(dtype))
            # Both LoRA gradients carry the LoRA scale exactly once. B is left
            # unscaled (the input-gradient term gets its scale from A_scaled), so
            # the scale is applied to d_A explicitly here.
            d_A = (scale * torch.mm(X_t, grad_proj)).t()
            d_B = torch.mm(torch.mm(A_scaled, X_t), grad).t()
            return d_A, d_B, grad_proj, A_scaled

        # All LoRA gradients must be computed BEFORE the input gradient: with
        # `inplace=True` the input gradient is written into X's storage.
        d_A_q, d_B_q, q_proj, A_q_scaled = lora_grads(q_grad, A_q, B_q, q_scale)
        d_A_k, d_B_k, k_proj, A_k_scaled = lora_grads(k_grad, A_k, B_k, k_scale)
        d_A_v, d_B_v, v_proj, A_v_scaled = lora_grads(v_grad, A_v, B_v, v_scale)

        # Compute input gradient, reusing X memory if possible
        out_buffer = X if ctx.inplace else None

        # Q path. Each dequantized weight is dropped before the next one is
        # materialized to keep peak memory down.
        q_weight_deq = dequantize(q_weight, q_quant)
        grad_X = torch.mm(q_grad, q_weight_deq, out=out_buffer)
        del q_weight_deq
        if q_proj is not None:
            # Stay decomposed: (grad @ B) @ (s * A) instead of materializing B @ A.
            grad_X.addmm_(q_proj, A_q_scaled)

        # K path
        k_weight_deq = dequantize(k_weight, k_quant)
        grad_X.addmm_(k_grad, k_weight_deq)
        del k_weight_deq
        if k_proj is not None:
            grad_X.addmm_(k_proj, A_k_scaled)

        # V path
        v_weight_deq = dequantize(v_weight, v_quant)
        grad_X.addmm_(v_grad, v_weight_deq)
        del v_weight_deq
        if v_proj is not None:
            grad_X.addmm_(v_proj, A_v_scaled)

        return (
            grad_X.view(batch, seq_len, -1),
            None,
            None,
            None,
            d_A_q,
            d_B_q,
            None,
            None,
            None,
            None,
            d_A_k,
            d_B_k,
            None,
            None,
            None,
            None,
            d_A_v,
            d_B_v,
            None,
            None,
        )
class LoRA_O(torch.autograd.Function):
    """
    Optimized LoRA implementation for output projection.

    The forward pass is delegated to `matmul_lora`; the backward pass computes the
    LoRA A/B gradients and the input gradient by hand. Missing adapters
    (`A`/`B` of `None`) are tolerated, matching the other LoRA autograd functions
    in this module.
    """

    @staticmethod
    @torch_amp_custom_fwd
    def forward(
        ctx: torch.autograd.function.FunctionCtx,
        X: torch.Tensor,
        W: torch.Tensor,
        b: torch.Tensor | None,
        W_quant: QuantState | None,
        A: torch.Tensor | None,
        B: torch.Tensor | None,
        s: float | None,
    ) -> torch.Tensor:
        """
        Forward pass for output projection with LoRA.

        Args:
            ctx: Autograd context
            X: Input tensor
            W: Output projection weight
            b: Output projection bias
            W_quant: Weight quantization state
            A: LoRA A matrix (or `None` when no adapter is active)
            B: LoRA B matrix (or `None` when no adapter is active)
            s: LoRA scaling factor

        Returns:
            Output projection result
        """
        XW = matmul_lora(X, W, b, W_quant, A, B, s)
        # Non-tensor state lives on ctx directly; tensors use save_for_backward.
        ctx.custom_saved_tensors = (
            W,
            W_quant,
            s,
        )
        ctx.save_for_backward(A, B, X)

        return XW

    @staticmethod
    @torch_amp_custom_bwd
    def backward(
        ctx: torch.autograd.function.FunctionCtx,
        dY: torch.Tensor,
    ) -> tuple[
        torch.Tensor,
        None,
        None,
        None,
        torch.Tensor | None,
        torch.Tensor | None,
        None,
    ]:
        """
        Backward pass computing gradients for LoRA output projection.

        Args:
            ctx: Autograd context
            dY: Gradient of loss with respect to output

        Returns:
            Tuple containing gradients for all forward inputs, in order
            `(X, W, b, W_quant, A, B, s)`. Only `X`, `A` and `B` receive
            gradients; the `A`/`B` entries are `None` when no adapter was given.
        """
        W, W_quant, s = ctx.custom_saved_tensors
        A, B, X = ctx.saved_tensors

        batch, seq_len, hd = X.shape
        dY = dY.reshape(-1, dY.shape[-1])
        X = X.reshape(-1, X.shape[-1])
        dtype = X.dtype

        has_lora = A is not None and B is not None

        # LoRA weight gradients (skipped entirely when the adapter is absent).
        d_A = d_B = None
        if has_lora:
            dY_X = X.t() @ dY
            d_A = (s * dY_X @ B).t()
            d_B = (s * A @ dY_X).t()

        # Base-weight contribution to dX; drop the dequantized copy right away.
        W = dequantize(W.t(), W_quant)
        dX = dY @ W.t()
        del W

        if has_lora:
            A, B = A.to(dtype), B.to(dtype)
            # Stay decomposed: (dY @ B) @ A scaled by s, never materializing B @ A.
            dX.addmm_(torch.mm(dY, B), A, alpha=s)

        # X, W, b, W_quant, A, B, s
        return dX.view(batch, seq_len, hd), None, None, None, d_A, d_B, None
def dequantize(
    W: torch.Tensor,
    quant_state: QuantState | list | torch.Tensor | None = None,
    out: torch.Tensor | None = None,
) -> torch.Tensor:
    """
    Fast NF4 dequantization using `bitsandbytes` CUDA kernels.

    Dispatches on the inputs:

    - `quant_state is None`: `W` is returned unchanged.
    - `W.dtype == torch.float8_e4m3fn`: `quant_state` is treated as the FP8
      inverse-scale tensor and the work is delegated to `dequantize_fp8`
      (`out` is not used on this path).
    - otherwise: the `bitsandbytes` blockwise kernels are called directly, first to
      dequantize the absmax statistics and then the weights themselves.

    Args:
        W: Quantized weight tensor to dequantize
        quant_state: Quantization state containing metadata needed for
            dequantization. Can be either a `QuantState` object or legacy list
            format. If None, returns `W` unchanged.
        out: Optional output tensor for storing dequantized results. Must match
            expected shape and dtype if provided.

    Returns:
        Dequantized tensor in the dtype recorded in the quantization state. The
        result is returned transposed when `W.shape[0] == 1`.

    Raises:
        AssertionError: If provided output tensor doesn't match expected shape /
            dtype.

    Note:
        With `bitsandbytes` > 0.43.3 the kernels take a CUDA stream argument. The
        current stream of `W`'s device is looked up on every call, so the kernels
        are enqueued on the right device and stream even when the process touches
        several devices or runs under a non-default stream.
    """
    if quant_state is None:
        return W

    # FP8 path: quant_state is actually scale_inv tensor
    if W.dtype == torch.float8_e4m3fn:
        scale_inv = quant_state
        # Caller may pass W.t() (non-contiguous) — dequantize in original
        # layout then transpose back so the result shape matches the input.
        if not W.is_contiguous() and W.dim() == 2:
            return dequantize_fp8(W.t(), scale_inv).t()
        return dequantize_fp8(W, scale_inv)

    # Everything below runs on the device that holds W.
    target_device = W.device

    # Extract quantization state
    if not isinstance(quant_state, list):
        # New style quant_state class
        absmax = quant_state.absmax.to(target_device)
        shape = quant_state.shape
        dtype = quant_state.dtype
        blocksize = quant_state.blocksize
        offset = quant_state.offset.to(target_device)
        state2 = quant_state.state2
        absmax2 = state2.absmax.to(target_device)
        code2 = state2.code.to(target_device)
        blocksize2 = state2.blocksize
    else:
        # Legacy list format
        absmax, shape, dtype, blocksize, compressed_stats, _, _ = quant_state
        absmax = absmax.to(target_device)
        offset, state2 = compressed_stats
        offset = offset.to(target_device)
        absmax2, code2, blocksize2, _, _, _, _ = state2
        absmax2 = absmax2.to(target_device)
        code2 = code2.to(target_device)

    # Setup output tensor on the same device as input
    if out is None:
        out = torch.empty(shape, dtype=dtype, device=target_device)
    else:
        assert out.shape == shape and out.dtype == dtype
        out = out.to(target_device)

    # Scratch buffer receiving the dequantized (fp32) absmax statistics
    n_elements_absmax: int = absmax.numel()
    out_absmax: torch.Tensor = torch.empty(
        n_elements_absmax, dtype=torch.float32, device=target_device
    )
    ptr_out_absmax: int = get_ptr(out_absmax)

    # Newer bitsandbytes kernels take a trailing stream argument. Resolve it per
    # call instead of caching one stream for the whole process: a cached stream
    # belongs to whichever device/stream was current on the very first call.
    stream_args: tuple = (
        (torch.cuda.current_stream(target_device),) if HAS_CUDA_STREAM else ()
    )

    # Step 1: dequantize the nested absmax statistics, then add the stored offset
    cdequantize_blockwise_fp32(
        get_ptr(code2),
        get_ptr(absmax),
        get_ptr(absmax2),
        ptr_out_absmax,
        ctypes.c_int(blocksize2),
        ctypes.c_int(n_elements_absmax),
        *stream_args,
    )
    out_absmax += offset

    # Step 2: pick the NF4 kernel matching the output dtype (fp16, else bf16)
    fx = (
        cdequantize_blockwise_fp16_nf4
        if dtype == torch.float16
        else cdequantize_blockwise_bf16_nf4
    )
    fx(
        get_ptr(None),
        get_ptr(W),
        ptr_out_absmax,
        get_ptr(out),
        ctypes.c_int(blocksize),
        ctypes.c_int(out.numel()),
        *stream_args,
    )

    # Handle transposed data
    is_transposed: bool = W.shape[0] == 1
    return out.t() if is_transposed else out
""" block_idx = tl.program_id(0) offsets = block_idx * block_size + tl.arange(0, block_size) mask = offsets < n_elements # Load gate in fp32, keep up in original dtype gate = tl.load(gate_ptr + offsets, mask=mask, other=0).to(tl.float32) up = tl.load(up_ptr + offsets, mask=mask, other=0) # Compute activation in fp32 then convert back f = gate * tl.sigmoid(gate) f = f.to(up.dtype) result = f * up tl.store(out_ptr + offsets, result, mask=mask) @triton.jit def _swiglu_bwd_kernel( grad_out_ptr, gate_ptr, up_ptr, n_elements, block_size: tl.constexpr, ): """ SwiGLU backward kernel. Stores gradient results in-place. Args: grad_out_ptr: Pointer to gradient output tensor `[*, hidden_dim]`. gate_ptr: Pointer to gate tensor `[*, hidden_dim]`. up_ptr: Pointer to up-projection tensor `[*, hidden_dim]`. n_elements: Total number of elements in the input tensors. block_size: Size of thread blocks for parallel computation. Note: After kernel execution, tensors are modified in-place: - `grad_out_ptr` contains forward output (`h`) - `gate_ptr` contains gradient w.r.t gate (`grad_gate`) - `up_ptr` contains gradient w.r.t up (`grad_up`) """ block_idx = tl.program_id(0) offsets = block_idx * block_size + tl.arange(0, block_size) mask = offsets < n_elements # Load values - only convert gate to fp32 grad_out = tl.load(grad_out_ptr + offsets, mask=mask, other=0) gate = tl.load(gate_ptr + offsets, mask=mask, other=0).to(tl.float32) up = tl.load(up_ptr + offsets, mask=mask, other=0) # Compute SiLU and forward output sigmoid_gate = tl.sigmoid(gate) silu_gate = sigmoid_gate * gate silu_gate = silu_gate.to(grad_out.dtype) h = silu_gate * up # Compute gradients grad_up = grad_out * silu_gate # gradient for up is grad_out * SiLU(gate) # Compute gate gradient temp = grad_out * up grad_gate = temp.to(tl.float32) * sigmoid_gate * (1.0 + gate * (1.0 - sigmoid_gate)) grad_gate = grad_gate.to(grad_out.dtype) # Store results with correct gradient ordering tl.store(grad_out_ptr + offsets, h, mask=mask) 
def swiglu_forward(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
    """
    SwiGLU forward pass.

    Computes SwiGLU activation: `x * sigmoid(x) * up`, where `x` is the gate
    tensor. The kernel is purely elementwise over the flattened tensors, so any
    input shape is accepted; `[batch, seq_len, hidden_dim]` is the usual one.

    Args:
        gate: Input gate tensor, typically of shape `[batch, seq_len, hidden_dim]`.
        up: Up-projection tensor with the same shape as `gate`.

    Returns:
        Newly allocated output tensor with the same shape, dtype and device as
        `gate`.
    """
    n_elements = gate.numel()

    # Allocate on gate's own device rather than the current default CUDA device,
    # so inputs living on e.g. `cuda:1` do not get an output on `cuda:0`.
    out = torch.empty(gate.shape, dtype=gate.dtype, device=gate.device)

    def grid(meta):
        # One program per `block_size` chunk of the flattened tensor.
        return (triton.cdiv(n_elements, meta["block_size"]),)

    _swiglu_fwd_kernel[grid](
        gate_ptr=gate,
        up_ptr=up,
        out_ptr=out,
        n_elements=n_elements,
        block_size=1024,
    )

    return out
# The AMP decorators for custom autograd functions are taken from
# `torch.cuda.amp` on torch < 2.4.0 and from `torch.amp` (bound to the "cuda"
# device type) on newer releases.
_USE_LEGACY_CUDA_AMP = Version(torch.__version__) < Version("2.4.0")

torch_amp_custom_fwd = (
    torch.cuda.amp.custom_fwd
    if _USE_LEGACY_CUDA_AMP
    else torch.amp.custom_fwd(device_type="cuda")
)
torch_amp_custom_bwd = (
    torch.cuda.amp.custom_bwd
    if _USE_LEGACY_CUDA_AMP
    else torch.amp.custom_bwd(device_type="cuda")
)
def setup_quantized_peft_meta_for_training(model: torch.nn.Module):
    """Replaces dummy `quant_state.to` method with the original function to allow training to continue

    For every `Params4bit` parameter whose quant state carries a saved `_orig_to`
    method, that method is put back as `quant_state.to` and `_orig_to` is reset to
    `None`. Parameters that were never patched, that have no quant state, or that
    were already restored are left untouched, so calling this function more than
    once is safe.

    Args:
        model: Module whose parameters are scanned.
    """
    for param in model.parameters():
        if not isinstance(param, Params4bit):
            continue

        quant_state = param.quant_state
        # `_orig_to` is left as None after a restore, so test the value rather
        # than mere attribute presence; otherwise a second call would overwrite
        # `quant_state.to` with None.
        original_to = getattr(quant_state, "_orig_to", None)
        if original_to is None:
            continue

        quant_state.to = original_to
        quant_state._orig_to = None
linear_names = find_all_linear_names(model) LOG.info(f"found linear modules: {repr(sorted(linear_names))}") lora_target_modules_as_list = ( lora_target_modules if isinstance(lora_target_modules, list) else [lora_target_modules] ) lora_target_modules = list(set(lora_target_modules_as_list + linear_names)) lora_config_kwargs = {} loftq_bits = cfg.peft and cfg.peft.loftq_config and cfg.peft.loftq_config.loftq_bits if loftq_bits: lora_config_kwargs["loftq_config"] = LoftQConfig(loftq_bits=loftq_bits) lora_config_kwargs["init_lora_weights"] = "loftq" if cfg.peft_init_lora_weights: lora_config_kwargs["init_lora_weights"] = cfg.peft_init_lora_weights if cfg.peft_use_dora: lora_config_kwargs["use_dora"] = cfg.peft_use_dora LOG.info("Initializing LoRA weights using dora. This might take longer.") if cfg.peft_use_rslora: lora_config_kwargs["use_rslora"] = cfg.peft_use_rslora if cfg.peft_layer_replication: lora_config_kwargs["layer_replication"] = cfg.peft_layer_replication if cfg.peft_trainable_token_indices: lora_config_kwargs["trainable_token_indices"] = cfg.peft_trainable_token_indices if cfg.peft_ensure_weight_tying is not None: lora_config_kwargs["ensure_weight_tying"] = cfg.peft_ensure_weight_tying # Determine the correct PEFT task type model_cls = type(model).__name__ if "SequenceClassification" in model_cls: task_type = TaskType.SEQ_CLS elif "TokenClassification" in model_cls: task_type = TaskType.TOKEN_CLS else: task_type = TaskType.CAUSAL_LM lora_config = LoraConfig( r=cfg.lora_r, lora_alpha=cfg.lora_alpha, target_modules=lora_target_modules, target_parameters=lora_target_parameters, layers_to_transform=cfg.peft_layers_to_transform, layers_pattern=cfg.peft_layers_pattern, lora_dropout=cfg.lora_dropout, fan_in_fan_out=cfg.lora_fan_in_fan_out, modules_to_save=cfg.lora_modules_to_save if cfg.lora_modules_to_save else None, bias="none", task_type=task_type, **lora_config_kwargs, ) if config_only: return None, lora_config rank = int(os.environ.get("LOCAL_RANK", 0)) if ( 
@send_errors
def load_adapter(
    model: PreTrainedModel,
    cfg: DictDefault,
    adapter: str | None,
    inference: bool = False,
) -> tuple[PreTrainedModel | PeftModel | PeftMixedModel, PeftConfig | None]:
    """Wrap `model` with the requested PEFT adapter.

    Args:
        model: Base model to attach the adapter to.
        cfg: Configuration forwarded to the adapter-specific loader.
        adapter: Adapter kind: `"lora"`, `"qlora"`, `"llama-adapter"`, or `None`
            to leave the model as is.
        inference: Forwarded to `load_lora` for the LoRA / QLoRA adapters.

    Returns:
        `(model, None)` when `adapter` is `None`; otherwise the pair returned by
        the adapter-specific loader.

    Raises:
        NotImplementedError: If `adapter` is not one of the supported kinds.
    """
    if adapter is None:
        return model, None

    # Done for every non-None adapter value, before the adapter kind is checked.
    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()

    if adapter == "llama-adapter":
        return load_llama_adapter(model, cfg)

    if adapter in ("lora", "qlora"):
        return load_lora(model, cfg, inference=inference)

    raise NotImplementedError(f"{adapter} PEFT adapter not available")
""" import gc import math import os from functools import cached_property from importlib.util import find_spec from typing import Any import peft import torch import transformers import transformers.modeling_utils from accelerate import init_empty_weights from accelerate.parallelism_config import ParallelismConfig from peft import ( PeftConfig, PeftMixedModel, PeftModel, PeftModelForCausalLM, prepare_model_for_kbit_training, ) from torch.distributed import DeviceMesh from transformers import ( AutoModelForCausalLM, AutoModelForImageTextToText, AwqConfig, BitsAndBytesConfig, GPTQConfig, PreTrainedModel, PreTrainedTokenizerBase, ) from transformers.integrations.deepspeed import ( HfTrainerDeepSpeedConfig, is_deepspeed_zero3_enabled, ) from axolotl.common.architectures import MOE_ARCH_BLOCK from axolotl.integrations.base import PluginManager from axolotl.loaders.adapter import load_adapter, load_lora from axolotl.loaders.constants import MULTIMODAL_AUTO_MODEL_MAPPING from axolotl.loaders.patch_manager import PatchManager from axolotl.loaders.utils import ( get_linear_embedding_layers, get_module_class_from_name, load_model_config, ) from axolotl.models.mamba import fix_mamba_attn_for_loss from axolotl.telemetry.errors import send_errors from axolotl.utils.bench import log_gpu_memory_usage from axolotl.utils.dict import DictDefault from axolotl.utils.distributed import ( build_parallelism_config, get_device_count, get_device_type, ) from axolotl.utils.logging import get_logger from axolotl.utils.model_shard_quant import load_sharded_model_quant from axolotl.utils.schemas.enums import RLType LOG = get_logger(__name__) PLUGIN_MANAGER = PluginManager.get_instance() class ModelLoader: """Manages model configuration, initialization and application of patches during model loading. This class orchestrates the entire process of loading a model from configuration to final preparation. 
def __init__(
    self,
    cfg: DictDefault,
    tokenizer: PreTrainedTokenizerBase,
    *,
    inference: bool = False,
    reference_model: bool = False,
    **kwargs,
):
    """Set up loader state; the model itself is built later by `load()`.

    Args:
        cfg: Configuration dictionary with model and training settings.
        tokenizer: Tokenizer instance associated with the model.
        inference: Whether the model is being loaded for inference mode.
            Defaults to False.
        reference_model: Whether this is a reference model (used in setups
            like DPO training). Defaults to False.
        **kwargs: Additional keyword arguments (ignored).
    """
    self.cfg = cfg
    self.tokenizer = tokenizer
    self.inference: bool = inference
    self.reference_model: bool = reference_model

    # Seed the kwargs forwarded to the model constructor with user overrides
    kwarg_overrides = cfg.overrides_of_model_kwargs
    self.model_kwargs: dict[str, Any] = (
        dict(kwarg_overrides.items()) if kwarg_overrides else {}
    )

    # Declared only; assigned once the model has been built
    self.model: PreTrainedModel | PeftModel | PeftMixedModel
    self.base_model, self.model_type = cfg.base_model, cfg.type_of_model

    self.model_config = load_model_config(cfg)
    # Default loader; `_set_auto_model_loader` may swap it for multimodal models
    self.auto_model_loader = AutoModelForCausalLM

    self.patch_manager = PatchManager(
        cfg=cfg,
        model_config=self.model_config,
        inference=inference,
    )
def _apply_pre_model_load_setup(self):
    """Prepare loader state and `model_kwargs` before the model is built.

    Decides whether a parallelism config is needed, then runs the loader,
    device-map, quantization and attention setup steps in a fixed order.
    """
    settings = self.cfg

    # When `use_parallel_config` is None the detection below is skipped.
    if self.use_parallel_config is not None:
        wants_parallel = (
            settings.fsdp_config
            or (settings.tensor_parallel_size and settings.tensor_parallel_size > 1)
            or (
                settings.context_parallel_size
                and settings.context_parallel_size > 1
            )
        )
        # An FSDP config whose version is not 2 switches the parallel config off
        if settings.fsdp_config and settings.fsdp_version != 2:
            wants_parallel = False
        self.use_parallel_config = wants_parallel

    if self.use_parallel_config:
        self._set_parallel_config()

    self._set_auto_model_loader()
    self._set_device_map_config()

    revision = settings.revision_of_model
    if revision:
        self.model_kwargs["revision"] = revision

    kernels = settings.use_kernels
    if kernels:
        self.model_kwargs["use_kernels"] = kernels
        # Keep an `allow_all_kernels` value that is already present
        self.model_kwargs.setdefault("allow_all_kernels", kernels)

    self._set_quantization_config()
    self._set_attention_config()
    self._check_model_requirements()
def _adjust_model_config(self):
    """Align the loaded model's config with the training config and tokenizer.

    Raises `max_position_embeddings` to `cfg.sequence_len` when the latter is
    larger, and copies the tokenizer's BOS/EOS ids over the model config's
    when the config has a truthy id that differs. Does nothing if the model
    has no `config` attribute.
    """
    if not hasattr(self.model, "config"):
        return
    model_config = self.model.config

    position_limit = getattr(model_config, "max_position_embeddings", None)
    if position_limit and self.cfg.sequence_len > position_limit:
        LOG.warning(
            "increasing model.config.max_position_embeddings from "
            f"{self.model.config.max_position_embeddings} to {self.cfg.sequence_len}"
        )
        model_config.max_position_embeddings = self.cfg.sequence_len

    for token_attr in ("bos_token_id", "eos_token_id"):
        configured_id = getattr(model_config, token_attr, None)
        # A falsy id (None or 0) on the model config is left untouched
        if configured_id and configured_id != getattr(self.tokenizer, token_attr):
            setattr(model_config, token_attr, getattr(self.tokenizer, token_attr))
_configure_embedding_dtypes(self): """Configure embedding module dtypes.""" # Get embedding modules embedding_modules = get_linear_embedding_layers(self.cfg.model_config_type) # Initial dtype conversion if not self.is_fsdp_enabled: # We don't run this during FSDP because this will leave mixed and bfloat16 # dtypes in the model which FSDP doesn't like if self.cfg.load_in_4bit and self.cfg.embeddings_skip_upcast: embedding_modules = [] self._convert_embedding_modules_dtype( embedding_modules, dist_dtype=torch.float32, before_kbit_train_or_finetune=True, ) # Handle DeepSpeed Zero3 if ( is_deepspeed_zero3_enabled() or os.getenv("ACCELERATE_DEEPSPEED_ZERO_STAGE") == "3" ): self._set_z3_leaf_modules() # Apply gradient checkpointing if needed needs_fa2_dtype = self.cfg.adapter or self.is_fsdp_enabled if self.cfg.adapter in ["lora", "qlora"]: needs_fa2_dtype = True if self.cfg.gradient_checkpointing: self.model.gradient_checkpointing_enable( gradient_checkpointing_kwargs=self.cfg.gradient_checkpointing_kwargs ) self._prepare_model_for_quantization() # Convert dtypes if needed should_convert = ( # LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so # we need to convert them back to fp16/bf16 for flash-attn compatibility. 
def _configure_qat(self):
    """Prepare the model for quantization-aware training if `cfg.qat` is set."""
    qat_settings = self.cfg.qat
    if not qat_settings:
        return

    # Deferred import: only executed when QAT is configured
    from axolotl.utils.quantization import prepare_model_for_qat

    prepare_model_for_qat(
        self.model,
        qat_settings.weight_dtype,
        qat_settings.group_size,
        qat_settings.activation_dtype,
        qat_settings.quantize_embedding,
    )
def _apply_post_lora_load_setup(self, skip_move_to_device: bool):
    """Finalize the model after adapters have been loaded.

    Moves the model to the local accelerator when appropriate, flags it as
    model-parallel on single-process multi-device runs, warns if nothing is
    trainable, optionally applies BetterTransformer, and frees cached memory.

    Args:
        skip_move_to_device: When True the model is not moved to the local
            device here.
    """
    cfg = self.cfg

    # Place model on accelerator, unless quantization or the caller forbids it
    if cfg.ddp and not (
        cfg.load_in_8bit or (cfg.rl and cfg.load_in_4bit) or skip_move_to_device
    ):
        self.model.to(f"{str(get_device_type())}:{self.cfg.local_rank}")

    # Several devices visible to a single process
    if get_device_count() > 1 and int(os.getenv("WORLD_SIZE", "1")) == 1:
        self.model.is_parallelizable = True
        self.model.model_parallel = True

    has_trainable_params = any(
        parameter.requires_grad
        for _, parameter in self.model.named_parameters(recurse=True)
    )
    if not has_trainable_params:
        LOG.warning("There are no parameters that require gradient updates")

    if cfg.flash_optimum:
        from optimum.bettertransformer import BetterTransformer

        self.model = BetterTransformer.transform(self.model)

    if cfg.adapter is not None:
        log_gpu_memory_usage(LOG, "after adapters", self.model.device)

    # Three collection passes, each followed by an empty of the CUDA cache
    for _ in range(3):
        gc.collect()
        torch.cuda.empty_cache()
def _set_device_map_config(self):
    """Populate dtype and `device_map` entries of `self.model_kwargs` from config.

    Steps, in order:
      1. If `cfg.gpu_memory_limit` is set, build a per-device `max_memory` dict.
      2. If a `max_memory` is available, infer a device map from an empty-weight
         copy of the model.
      3. Always set `torch_dtype` and `dtype` to `cfg.torch_dtype`.
      4. Choose `device_map` depending on FSDP / DeepSpeed ZeRO-3 / MoE expert
         quantization.
      5. Override `device_map` when the current device type is mps or npu.
    """
    device_map = self.cfg.device_map
    max_memory = self.cfg.max_memory

    if self.cfg.gpu_memory_limit:
        # An int limit is interpreted as GiB; any other value is used as given
        gpu_memory_limit = (
            str(self.cfg.gpu_memory_limit) + "GiB"
            if isinstance(self.cfg.gpu_memory_limit, int)
            else self.cfg.gpu_memory_limit
        )

        # Replaces any `cfg.max_memory` read above
        max_memory = {}
        num_device = get_device_count()
        for i in range(num_device):
            max_memory[i] = gpu_memory_limit
        max_memory["cpu"] = "256GiB"  # something sufficiently large to fit anything

    if max_memory is not None:
        # Based on https://github.com/togethercomputer/OpenChatKit/blob/main/inference/bot.py
        from accelerate import infer_auto_device_map

        # Instantiate under `init_empty_weights` so only the structure is built
        with init_empty_weights():
            model_canvas = self.auto_model_loader.from_config(
                self.model_config,
                trust_remote_code=self.cfg.trust_remote_code or False,
            )
        model_canvas.tie_weights()
        device_map = infer_auto_device_map(
            model_canvas,
            max_memory=max_memory,
            dtype=self.cfg.torch_dtype,
        )
        # We can discard max_memory now as we have a device map set up
        max_memory = None

    # Both spellings of the dtype kwarg are populated
    self.model_kwargs["torch_dtype"] = self.cfg.torch_dtype
    self.model_kwargs["dtype"] = self.cfg.torch_dtype

    is_ds_zero3 = is_deepspeed_zero3_enabled()

    # FSDP requires control over device placement, so don't set device_map when FSDP is enabled
    if self.is_fsdp_enabled:
        # For QLoRA + FSDP a device_map is still set: the root module ("") is
        # mapped to this process's LOCAL_RANK device index (not "auto").
        if self.is_qlora_and_fsdp_enabled:
            self.model_kwargs["device_map"] = {
                "": int(os.environ.get("LOCAL_RANK", 0))
            }
        # For other FSDP cases, don't set device_map at all
    elif not is_ds_zero3:
        self.model_kwargs["device_map"] = device_map

        # quantize_moe_experts quantizes expert weights on-the-fly during loading,
        # so the actual VRAM usage is much less than bf16 estimates.
        # When device_map is "auto", accelerate's infer_auto_device_map computes
        # the device map at bf16 size (before quantization), causing it to offload
        # layers to CPU, which BnB then rejects. Force single-GPU placement to
        # prevent this. Only applies to the non-FSDP, non-ZeRO3 path (DDP/single).
        if getattr(self.cfg, "quantize_moe_experts", False) and device_map in (
            "auto",
            None,
        ):
            self.model_kwargs["device_map"] = {
                "": int(os.environ.get("LOCAL_RANK", 0))
            }

    # mps / npu device types override whatever was chosen above
    cur_device = get_device_type()
    if "mps" in str(cur_device):
        self.model_kwargs["device_map"] = "mps:0"
    elif "npu" in str(cur_device):
        self.model_kwargs["device_map"] = "npu:0"

    # TODO: can we put the reference model on its own gpu? I think we have to move
    # logits around to calculate loss
    # if cfg.rl:
    #     if torch.cuda.device_count() > 1:
    #         if reference_model:
    #             model_kwargs["device_map"] = "cuda:" + str(
    #                 torch.cuda.current_device() + 1
    #             )
    #         else:
    #             model_kwargs["device_map"] = "cuda:" + str(torch.cuda.current_device())
def _set_attention_config(self):
    """Select the attention implementation passed to the model constructor.

    An explicit `cfg.attn_implementation` wins and is forwarded through
    `model_kwargs` only. Otherwise the first enabled flag, checked in the
    order flex > flash > sdp > sage > eager, picks the implementation, which
    is written to both `model_kwargs["attn_implementation"]` and
    `model_config._attn_implementation`. If nothing is enabled, neither is
    touched.

    Also forwards `low_cpu_mem_usage=True` when `cfg.low_cpu_mem_usage` is set.
    """
    # (config flag, implementation name), in priority order
    flag_to_implementation = (
        ("flex_attention", "flex_attention"),
        ("flash_attention", "flash_attention_2"),
        ("sdp_attention", "sdpa"),
        # sets FA2 attention to re-use same internal handling like masking
        ("sage_attention", "flash_attention_2"),
        ("eager_attention", "eager"),
    )

    if self.cfg.attn_implementation:
        self.model_kwargs["attn_implementation"] = self.cfg.attn_implementation
    else:
        for flag, implementation in flag_to_implementation:
            if getattr(self.cfg, flag):
                self.model_kwargs["attn_implementation"] = implementation
                self.model_config._attn_implementation = implementation
                break

    if self.cfg.low_cpu_mem_usage:
        self.model_kwargs["low_cpu_mem_usage"] = True
def _build_model(self) -> bool:
    """Instantiate `self.model` using the load strategy selected by the config.

    Adjusts `self.model_kwargs` for tensor parallelism and FSDP, then takes one
    of three load paths: sharded quantized loading, `MambaLMHeadModel`, or the
    regular transformers loader (random init when `cfg.reinit_weights` is set,
    pretrained weights otherwise).

    Returns:
        Whether the caller should skip moving the model to the local device.
        `cfg.experimental_skip_move_to_device`, when not None, overrides the
        value computed here.
    """
    skip_move_to_device = False

    # NOTE(review): `tensor_parallel_size` is compared with `>` / `<=` directly
    # in this method, while `_apply_pre_model_load_setup` guards against a falsy
    # value first; presumably it is normalized to an int beforehand. Confirm.
    if self.cfg.tensor_parallel_size > 1:
        self.model_kwargs["tp_size"] = self.cfg.tensor_parallel_size
        self.model_kwargs["tp_plan"] = "auto"
        self.model_kwargs["device_mesh"] = self.device_mesh
        if "device_map" in self.model_kwargs:
            del self.model_kwargs["device_map"]  # not compatible with `tp_plan`

    if self.is_fsdp_enabled:
        if self.cfg.fsdp_config.cpu_ram_efficient_loading:
            skip_move_to_device = True
            # Don't delete device_map for QLoRA + FSDP - it was set correctly in
            # _set_device_map_config
            if (
                "device_map" in self.model_kwargs
                and not self.is_qlora_and_fsdp_enabled
            ):
                del self.model_kwargs["device_map"]
        elif self.is_qlora_and_fsdp_enabled:
            skip_move_to_device = True

        if (
            self.cfg.tensor_parallel_size <= 1
            and self.cfg.fsdp_config.cpu_ram_efficient_loading
            and self.cfg.fsdp_version == 2
        ):
            # setting device_map for TP is not supported
            # Local rank 0 loads on CPU; every other rank uses the meta device
            local_rank = int(os.getenv("LOCAL_RANK", "0"))
            if local_rank == 0:
                self.model_kwargs["device_map"] = "cpu"
            else:
                self.model_kwargs["device_map"] = "meta"

    # Path 1: sharded quantized loading (QLoRA + FSDP with RAM-efficient loading)
    if (
        self.is_qlora_and_fsdp_enabled
        and self.cfg.fsdp_config.cpu_ram_efficient_loading
        and (
            self.cfg.model_config_type == "dbrx"
            or self.cfg.qlora_sharded_model_loading
        )
    ):
        if self.cfg.reinit_weights:
            LOG.warning(
                "reinit_weights is not supported with sharded quantized loading. "
                "Loading from pretrained weights instead."
            )
        quant_storage = self.cfg.torch_dtype
        # Prefer the quantization config on the model config; fall back to the
        # one prepared in `model_kwargs`
        quantization_config = getattr(
            self.model_config, "quantization_config", None
        )
        quantization_config = (
            quantization_config or self.model_kwargs["quantization_config"]
        )
        self.model = load_sharded_model_quant(
            self.base_model,
            self.model_config,
            self.cfg,
            quant_storage=quant_storage,
            quantization_config=quantization_config,
        )
        skip_move_to_device = True
    # Path 2: Mamba models are loaded through their own class
    elif self.model_type == "MambaLMHeadModel":
        if self.cfg.reinit_weights:
            LOG.warning(
                "reinit_weights is not supported with MambaLMHeadModel. "
                "Loading from pretrained weights instead."
            )
        # FIXME this is janky at best and hacked together to make it work
        MambaLMHeadModel = fix_mamba_attn_for_loss()

        # This loader is given `dtype` and `device`; `torch_dtype` and
        # `device_map` are removed from the kwargs before the call
        self.model_kwargs["dtype"] = self.model_kwargs["torch_dtype"]
        self.model_kwargs["device"] = torch.cuda.current_device()
        self.model_kwargs.pop("torch_dtype", None)
        self.model_kwargs.pop("device_map", None)

        self.model = MambaLMHeadModel.from_pretrained(
            self.base_model,
            **self.model_kwargs,
        )
    # Path 3: regular transformers loading
    else:
        # Please don't remove underscore binding without reading the fn docstring
        _ = self._configure_zero3_memory_efficient_loading()

        if (
            self.model_type
            and self.model_type != "AutoModelForCausalLM"
            and not self.cfg.trust_remote_code
            and not self.cfg.gptq
        ):
            # Use model type from transformers
            model_loader_class = getattr(transformers, self.model_type)
        else:
            # Use auto model loader (handles gptq and default cases)
            model_loader_class = self.auto_model_loader

        self.model_kwargs["dtype"] = self.model_kwargs["torch_dtype"]
        if self.cfg.reinit_weights:
            self.model = self._load_model_from_config(model_loader_class)
        else:
            self.model = self._load_model_from_pretrained(model_loader_class)

    if is_deepspeed_zero3_enabled():
        skip_move_to_device = True

    if self.cfg.tensor_parallel_size > 1:
        # workaround for upstream 4.54.0 not setting _tp_size or _device_mesh
        # TODO(wing): remove once 4.54.1 is released
        if self.model._tp_size != self.cfg.tensor_parallel_size:
            self.model._tp_size = self.cfg.tensor_parallel_size
            self.model._device_mesh = self.model_kwargs["device_mesh"]

    # An explicit config value has the final say
    if self.cfg.experimental_skip_move_to_device is not None:
        skip_move_to_device = self.cfg.experimental_skip_move_to_device

    return skip_move_to_device
def _convert_embedding_modules_dtype(
    self,
    embedding_modules: list[str],
    dist_dtype: torch.dtype,
    before_kbit_train_or_finetune: bool,
):
    """Cast norm, gate and embedding submodules of the model to `dist_dtype`.

    Args:
        embedding_modules: Name fragments; a submodule whose name contains any
            of them and that has a `weight` attribute is converted.
        dist_dtype: Target dtype.
        before_kbit_train_or_finetune: When True, submodules whose name ends in
            `.gate` are also cast, and for `btlm` models the embedding
            conversion is skipped.
    """
    to_kwargs = {"dtype": dist_dtype}
    if self.cfg.lora_on_cpu:
        # Embedding-like modules are additionally moved to CPU
        to_kwargs["device"] = "cpu"

    for module_name, submodule in self.model.named_modules():
        if "norm" in module_name:
            submodule.to(dist_dtype)

        if before_kbit_train_or_finetune:
            if module_name.endswith(".gate"):
                submodule.to(dist_dtype)
            if self.model_config.model_type == "btlm":
                # don't upcast lm_head for btlm
                continue

        matches_embedding = any(
            fragment in module_name for fragment in embedding_modules
        )
        if matches_embedding and hasattr(submodule, "weight"):
            submodule.to(**to_kwargs)
@staticmethod
def apply_pre_config_load_patches(cfg: DictDefault):
    """Install patches that have to exist before the model config is loaded.

    This is for patches that intercept remote code loading from HuggingFace,
    which needs to be in place before AutoConfig.from_pretrained() is called.
    Currently only Kimi-Linear is handled, selected by a case-insensitive
    "kimi-linear" substring in `cfg.base_model_config`.

    Args:
        cfg: Configuration dictionary with model and training settings.
    """
    config_source = getattr(cfg, "base_model_config", None)
    if not config_source or "kimi-linear" not in config_source.lower():
        return

    from axolotl.monkeypatch.models.kimi_linear.patch_kimi_linear import (
        patch_kimi_config,
    )

    patch_kimi_config()
def apply_pre_model_load_patches(self):
    """Apply every pre-model-load patch, in a fixed order.

    Each helper decides from `self.cfg` whether it has anything to do, so this
    method calls all of them unconditionally. The order is significant: see the
    note on the flash-attention helpers below.
    """
    self._deactivate_hf_async_load()
    self._apply_transformers_patches()
    # NOTE: `_apply_flex_attention_patches` is currently not part of this sequence.
    # `_apply_flash_attention_patches` can switch `cfg.flash_attention` on (xformers
    # with sample packing), and `_apply_flash_attn_4_patches` is a no-op unless that
    # flag is set, so the former has to run first.
    self._apply_flash_attention_patches()
    self._apply_chunked_cross_entropy_patch()
    self._apply_sageattn_patches()
    self._apply_flash_attn_4_patches()
    self._apply_fsdp_patches()
    self._apply_adapter_patches()
    self._apply_model_specific_patches()
    self._apply_fp8_patches()
    self._apply_flash_attention_peft_patches()
    self._apply_gradient_checkpointing_patches()
    self._patch_attention()
    self._apply_multipack_patches()
    self._patch_loss_llama()
    self._patch_llama_derived_model()
    self._apply_mistral_cross_entropy_patch()
    self._apply_self_attention_lora_patch()
    self._apply_fsdp2_bnb_patches()
    self._apply_patch_deepspeed_zero3()
    self._apply_voxtral_patches()
    self._apply_apertus_patches()
    self._apply_trl_vllm_patches()
    self._apply_trl_trainer_utils_patches()
def _apply_chunked_cross_entropy_patch(self):
    """Patch in the chunked cross-entropy loss when `cfg.chunked_cross_entropy` is set.

    `cfg.chunked_cross_entropy_num_chunks`, when truthy, is passed to the patch
    function; otherwise the patch function is called without arguments.
    """
    if not self.cfg.chunked_cross_entropy:
        return

    from axolotl.monkeypatch.loss.chunked import patch_chunked_ce_loss_fn

    chunk_count = self.cfg.chunked_cross_entropy_num_chunks
    patch_args = (chunk_count,) if chunk_count else ()
    patch_chunked_ce_loss_fn(*patch_args)
axolotl.monkeypatch.accelerate.fsdp2 import ( patch_accelerate_fsdp2, patch_tied_keys_for_meta_device, ) patch_accelerate_fsdp2() if self.cfg.fsdp_config.cpu_ram_efficient_loading: patch_tied_keys_for_meta_device() if self.cfg.rl: from axolotl.monkeypatch.trainer.trl import patch_trl_prepare_fsdp2 patch_trl_prepare_fsdp2() # if self.cfg.fsdp_config: # # see transformers#39152 # from axolotl.monkeypatch.trainer_fsdp_optim import ( # patch_training_loop_for_fsdp, # ) # # patch_training_loop_for_fsdp() def _apply_adapter_patches(self): """Apply patches for adapter configurations.""" if self.cfg.adapter and self.cfg.embeddings_skip_upcast: from axolotl.monkeypatch.peft.utils import patch_peft_prep_code patch_peft_prep_code() def _apply_flex_attention_patches(self): """Apply patches for flexible attention.""" if self.cfg.flex_attention: from axolotl.monkeypatch.attention.flex_attn import ( patch_flex_wrapper, ) flex_attn_compile_kwargs = self.cfg.flex_attn_compile_kwargs or {} patch_flex_wrapper(**flex_attn_compile_kwargs) def _apply_sageattn_patches(self): """Apply patches for SageAttention.""" if self.cfg.sage_attention: from axolotl.monkeypatch.attention.sage_attn import patch_sageattn patch_sageattn() def _apply_flash_attn_4_patches(self): """Auto-apply FA4 when flash_attention is enabled and FA4 is available on SM90+.""" if not self.cfg.flash_attention: return from axolotl.monkeypatch.attention.flash_attn_4 import patch_flash_attn_4 patch_flash_attn_4(self.model_config) def _apply_model_specific_patches(self): """Apply patches specific to model architectures.""" if ( self.cfg.model_config_type == "llama4" and self.cfg.llama4_linearized_experts ): from axolotl.monkeypatch.models.llama4.modeling import ( patch_llama4_linearized_modeling, ) patch_llama4_linearized_modeling() if self.cfg.model_config_type == "qwen3_next" and self.cfg.sample_packing: from axolotl.monkeypatch.models.qwen3_next.modeling import ( patch_qwen3_next_modeling_packing, ) 
patch_qwen3_next_modeling_packing() if self.cfg.model_config_type == "qwen3_5" and self.cfg.sample_packing: from axolotl.monkeypatch.models.qwen3_5.modeling import ( patch_qwen3_5_modeling_packing, ) patch_qwen3_5_modeling_packing() if self.cfg.model_config_type == "qwen3_5_moe" and self.cfg.sample_packing: from axolotl.monkeypatch.models.qwen3_5.modeling import ( patch_qwen3_5_moe_modeling_packing, ) patch_qwen3_5_moe_modeling_packing() if ( self.cfg.model_config_type in ["qwen3_5", "qwen3_5_moe"] and self.cfg.is_multimodal and self.cfg.flash_attention ): from axolotl.monkeypatch.models.qwen3_5.modeling import ( patch_qwen3_5_vlm_flash_attention, ) patch_qwen3_5_vlm_flash_attention() if self.cfg.model_config_type == "kimi_linear": from axolotl.monkeypatch.models.kimi_linear.patch_kimi_linear import ( patch_kimi_model, ) patch_kimi_model() def _apply_fp8_patches(self): """Apply patches for FP8 support.""" if self.cfg.fp8: from axolotl.monkeypatch.trainer_accelerator_args import ( patch_create_accelerate_code_for_fp8, ) patch_create_accelerate_code_for_fp8( self.cfg.fp8_enable_fsdp_float8_all_gather ) def _apply_flash_attention_peft_patches(self): """Apply patches for Flash Attention with PEFT.""" if self.cfg.adapter: from axolotl.monkeypatch.transformers_fa_utils import ( patch_fa_peft_integration, ) patch_fa_peft_integration() def _apply_gradient_checkpointing_patches(self): """Apply patches for gradient checkpointing.""" if ( self.cfg.gradient_checkpointing and self.cfg.activation_offloading == "legacy" ): from axolotl.monkeypatch.gradient_checkpointing import ( hf_grad_checkpoint_offload_wrapper, ) transformers.modeling_utils.checkpoint = hf_grad_checkpoint_offload_wrapper elif ( self.cfg.gradient_checkpointing and self.cfg.activation_offloading == "offload_disk" ): from axolotl.monkeypatch.gradient_checkpointing import ( hf_grad_checkpoint_disk_offload_wrapper, ) transformers.modeling_utils.checkpoint = ( hf_grad_checkpoint_disk_offload_wrapper ) def 
_apply_mistral_cross_entropy_patch(self): """Apply Mistral cross entropy patch if configured.""" if ( self.cfg.model_config_type == "mistral" and self.cfg.flash_attn_cross_entropy_loss ): from axolotl.monkeypatch.mistral_attn_hijack_flash import ( patch_mistral_cross_entropy, ) patch_mistral_cross_entropy() def _apply_self_attention_lora_patch(self): """Apply self-attention LoRA patches if configured.""" if self.cfg.lora_qkv_kernel or self.cfg.lora_o_kernel: # Only patch if conditions are met can_patch = ( self.cfg.lora_dropout == 0 if hasattr(self.cfg, "lora_dropout") else True ) # default to True if lora_dropout is not set if not can_patch: LOG.warning("Cannot patch self-attention - requires no dropout") return from axolotl.monkeypatch.lora_kernels import patch_self_attn_lora patch_self_attn_lora(self.cfg) def _apply_multipack_patches(self): """Apply multipack patches if necessary.""" if ( self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES and (self.cfg.flash_attention or self.cfg.flex_attention) and self.cfg.sample_packing ): # Get automap config if it exists auto_map_config = None if isinstance(self.model_config, dict) and "auto_map" in self.model_config: auto_map_config = self.model_config["auto_map"] elif hasattr(self.model_config, "auto_map"): auto_map_config = self.model_config.auto_map # Determine if the model has remote code if auto_map_config is not None: has_remote_code = "AutoModelForCausalLM" in auto_map_config else: has_remote_code = False if has_remote_code and self.cfg.trust_remote_code is not None: # If explicitly set in YAML, prefer that has_remote_code = self.cfg.trust_remote_code patch_for_multipack( self.cfg.model_config_type, model_name=self.cfg.base_model, has_remote_code=has_remote_code, ) if self.cfg.sample_packing: from axolotl.monkeypatch.data.batch_dataset_fetcher import ( apply_multipack_dataloader_patch, ) LOG.info("Applying multipack dataloader patch for sample packing...") apply_multipack_dataloader_patch() def 
_apply_fsdp2_bnb_patches(self): """Apply FSDP2 BNB patches.""" if ( self.cfg.fsdp_config and str(self.cfg.fsdp_version) == "2" and (self.cfg.load_in_4bit or self.cfg.load_in_8bit) ): from axolotl.monkeypatch.fsdp2_qlora import ( apply_init_dtype_attrs_patch, apply_init_sharded_param_patch, apply_init_unsharded_param_patch, apply_linear8bitlt_save_patch, ) apply_init_sharded_param_patch() apply_init_unsharded_param_patch() apply_init_dtype_attrs_patch() if self.cfg.load_in_8bit: apply_linear8bitlt_save_patch() def _deactivate_hf_async_load(self): """Load weights synchronously so they can be converted and not OOM.""" if self.cfg.load_in_4bit or self.cfg.load_in_8bit: os.environ["HF_DEACTIVATE_ASYNC_LOAD"] = "1" def _apply_moe_expert_quantization_patch(self): """Patch transformers weight loading and PEFT for MoE expert quantization.""" has_target_params = bool(getattr(self.cfg, "lora_target_parameters", None)) if not self.cfg.quantize_moe_experts and not has_target_params: return from axolotl.monkeypatch.moe_quant import ( patch_peft_target_parameters_matching, ) if self.cfg.quantize_moe_experts: from axolotl.monkeypatch.moe_quant import patch_moe_quantization_on_load patch_moe_quantization_on_load(self.cfg) patch_peft_target_parameters_matching() def _finalize_moe_expert_quantization(self, model: PreTrainedModel): """Log quantization results and set model flag for downstream use.""" import torch model._moe_experts_quantized = False if self.cfg.quantize_moe_experts: from axolotl.monkeypatch.moe_quant import get_moe_quantized_count count = get_moe_quantized_count() if count > 0: import gc model._moe_experts_quantized = True LOG.info( "Quantized %d MoE expert parameter(s) to %s during model loading", count, "4-bit" if self.cfg.load_in_4bit else "8-bit", ) gc.collect() torch.cuda.empty_cache() def _apply_tiled_mlp(self, model_type: str): if self.cfg.tiled_mlp: from axolotl.monkeypatch.tiled_mlp import ( patch_tiled_mlp, ) patch_tiled_mlp( model_type, 
use_original_mlp=self.cfg.tiled_mlp_use_original_mlp, cfg_num_shards=self.cfg.tiled_mlp_num_shards, ) def _apply_voxtral_patches(self): """Apply patches for Voxtral model.""" if self.cfg.model_config_type == "voxtral": from axolotl.monkeypatch.models.voxtral.modeling import ( patch_voxtral_conditional_generation_forward, ) patch_voxtral_conditional_generation_forward() def _patch_attention(self): """Apply attention-specific patches based on model type.""" if not (self.cfg.flash_attention and hasattr(self.model_config, "model_type")): return if self.model_config.model_type == "btlm": from axolotl.monkeypatch.btlm_attn_hijack_flash import ( replace_btlm_attn_with_flash_attn, ) replace_btlm_attn_with_flash_attn(self.cfg.base_model) if self.model_config.model_type == "stablelm_epoch" and self.cfg.sample_packing: from axolotl.monkeypatch.stablelm_attn_hijack_flash import ( replace_stablelm_attn_with_flash_attn, ) replace_stablelm_attn_with_flash_attn(self.cfg.base_model) if self.model_config.model_type in ("mistral3", "llava"): from axolotl.monkeypatch.models.pixtral.modeling_flash_attention_utils import ( apply_patch_is_packed_sequence, ) apply_patch_is_packed_sequence() def _patch_loss_llama(self): """Patch loss functions and other optimizations for LLaMA models.""" if not self.cfg.is_llama_derived_model: return if self.cfg.flash_attn_cross_entropy and self.has_flash_attn: from axolotl.monkeypatch.llama_attn_hijack_flash import ( patch_fa_llama_cross_entropy, ) patch_fa_llama_cross_entropy() elif self.cfg.unsloth_cross_entropy_loss: from axolotl.monkeypatch.unsloth_ import integrate_cross_entropy_loss_patch integrate_cross_entropy_loss_patch(model_type="llama") if self.cfg.flash_attn_rms_norm and self.has_flash_attn: from axolotl.monkeypatch.llama_attn_hijack_flash import patch_llama_rms_norm patch_llama_rms_norm() elif self.cfg.unsloth_rms_norm: from axolotl.monkeypatch.unsloth_ import patch_unsloth_layernorm patch_unsloth_layernorm() if self.cfg.unsloth_lora_qkv or 
def _patch_llama_flash_attention(self):
    """Replace LLaMA attention with the flash-attention implementation.

    With `s2_attention` set, the replacement is made with
    `use_shifted_sparse_attn=True`. Otherwise the replacement only happens
    when `flash_attn_cross_entropy` or `flash_attn_rms_norm` is set. Both
    flags are forwarded as `cross_entropy` / `rms_norm` in either case.
    """
    from axolotl.monkeypatch.llama_attn_hijack_flash import (
        replace_llama_attn_with_flash_attn,
    )

    fused_options = {
        "cross_entropy": self.cfg.flash_attn_cross_entropy,
        "rms_norm": self.cfg.flash_attn_rms_norm,
    }

    if self.cfg.s2_attention:
        LOG.info("patching w/ flash-enabled, shifted-sparse attention")
        replace_llama_attn_with_flash_attn(
            use_shifted_sparse_attn=True,
            **fused_options,
        )
    elif any(fused_options.values()):
        replace_llama_attn_with_flash_attn(**fused_options)
) def _apply_llama_flash_attn_patches(self, model): """Apply LLaMA-specific flash attention patches.""" if ( self.model_config.model_type in ["llama", "llama4"] and not self.cfg.trust_remote_code and not self.cfg.gptq and self.cfg.flash_attention and is_flash_attn_available() and not self.inference ): # TODO(MengqingCao): split these patches separately from axolotl.monkeypatch.llama_attn_hijack_flash import ( is_xformers_swiglu_available, replace_llama_mlp_with_swiglu, ) if self.cfg.flash_attn_fuse_mlp and is_xformers_swiglu_available(): LOG.info("Patching with SwiGLU...") replace_llama_mlp_with_swiglu(model) def _apply_unsloth_patches(self, model): """Apply unsloth optimization patches.""" if self.cfg.unsloth_lora_mlp: from axolotl.monkeypatch.unsloth_ import integrate_lora_mlp_patch integrate_lora_mlp_patch(peft_model=model) if self.cfg.unsloth_lora_qkv or self.cfg.unsloth_lora_o: from axolotl.monkeypatch.unsloth_ import integrate_lora_patch integrate_lora_patch(peft_model=model, cfg=self.cfg) if self.cfg.unsloth_rope: from axolotl.monkeypatch.unsloth_ import integrate_rope_embeddings integrate_rope_embeddings() def _apply_lora_kernel_patch(self, model): """Apply LoRA kernel patches.""" if ( self.cfg.lora_mlp_kernel or self.cfg.lora_qkv_kernel or self.cfg.lora_o_kernel ): from axolotl.monkeypatch.lora_kernels import apply_lora_kernel_patches apply_lora_kernel_patches(model=model, cfg=self.cfg) def _apply_patch_deepspeed_zero3(self): try: from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled from axolotl.monkeypatch.deepspeed_utils import apply_deepspeed_patches if self.cfg.activation_offloading is True and ( is_deepspeed_zero3_enabled() or os.getenv("ACCELERATE_DEEPSPEED_ZERO_STAGE") == "3" ): apply_deepspeed_patches() except ImportError as e: LOG.warning(f"DeepSpeed patches not applied: {e}") def _apply_apertus_patches(self): """Apply patches for Apertus model.""" if self.cfg.model_config_type == "apertus": from 
axolotl.monkeypatch.models.apertus.activation import ( patch_apertus_xielu_activation, ) patch_apertus_xielu_activation() def _apply_trl_vllm_patches(self): """Apply TRL vLLM patches for batched weight sync, NaN logprobs fix, and scalar handling.""" if ( self.cfg.rl and getattr(self.cfg, "trl", None) and getattr(self.cfg.trl, "use_vllm", False) ): from axolotl.monkeypatch.trainer.trl_vllm import patch_trl_vllm patch_trl_vllm() def _apply_trl_trainer_utils_patches(self): """Replace trl.trainer.utils.{selective_log_softmax, entropy_from_logits} with Triton kernels.""" if not self.cfg.rl: return try: from axolotl.monkeypatch.trainer.utils import ( entropy_from_logits, selective_log_softmax, ) except (ImportError, ModuleNotFoundError): LOG.warning("Triton not available — skipping trl.trainer.utils patches") return import trl.trainer.utils # Guard against repeated calls: only stash the original if trl still # points at its own implementation (not our wrapper). if trl.trainer.utils.selective_log_softmax is not selective_log_softmax: from axolotl.monkeypatch.trainer import utils as _axolotl_trainer_utils _axolotl_trainer_utils.selective_log_softmax_original = ( trl.trainer.utils.selective_log_softmax ) trl.trainer.utils.selective_log_softmax = selective_log_softmax if trl.trainer.utils.entropy_from_logits is not entropy_from_logits: trl.trainer.utils.entropy_from_logits = entropy_from_logits LOG.info( "Patched trl.trainer.utils with Triton selective_log_softmax and entropy_from_logits" ) def _apply_scaling_softmax_patch(self, model: PreTrainedModel): """Apply Scaling Softmax (SSMax) patch. 
@send_errors
def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase):
    """Load the processor described by `cfg` and attach `tokenizer` to it.

    Args:
        cfg: Axolotl config. Reads `processor_type`, `processor_config`,
            `revision_of_model`, `tokenizer_use_mistral_common`,
            `trust_remote_code` and `image_size`.
        tokenizer: Tokenizer to bind to the returned processor.

    Returns:
        The loaded processor. With `tokenizer_use_mistral_common` this is a
        `VoxtralProcessor` (when that is the selected processor class) or a
        `Mistral3Processor` wrapping `tokenizer`.

    Side effects:
        When `cfg.image_size` is `None` and the processor exposes a `size`
        containing `width` and/or `height`, `cfg.image_size` is filled in from it.
    """
    processor_cls = (
        getattr(transformers, cfg.processor_type)
        if cfg.processor_type
        else AutoProcessor
    )

    # Keyword arguments shared by the `from_pretrained` calls below.
    processor_kwargs = {}
    if cfg.revision_of_model:
        processor_kwargs["revision"] = cfg.revision_of_model

    if cfg.tokenizer_use_mistral_common:
        # Transformers v5 stops reading the sub-processor, so its
        # mistral-common backend is replaced with our HF-compatible tokenizer
        # to make both processors use it.
        import transformers.tokenization_mistral_common as tokenization_mistral_common

        from axolotl.utils.mistral import HFMistralTokenizer

        tokenization_mistral_common.MistralCommonBackend = HFMistralTokenizer

        from transformers import VoxtralProcessor

        if processor_cls == VoxtralProcessor:
            return VoxtralProcessor.from_pretrained(
                cfg.processor_config,
                **processor_kwargs,
            )

        from axolotl.utils.mistral import Mistral3Processor

        return Mistral3Processor(
            tokenizer=tokenizer,
        )

    processor_kwargs["trust_remote_code"] = cfg.trust_remote_code or False

    processor = processor_cls.from_pretrained(
        cfg.processor_config,
        **processor_kwargs,
    )
    processor.tokenizer = tokenizer

    # Attempt to load image size from processor if available
    if cfg.image_size is None and hasattr(processor, "size"):
        size_spec = processor.size
        if any(dim in size_spec for dim in ("width", "height")):
            width = size_spec["width"] if "width" in size_spec else None
            height = size_spec["height"] if "height" in size_spec else None
            known_dims = [dim for dim in (width, height) if dim is not None]

            if len(known_dims) == 2:
                # Both dimensions known: store them as a (width, height) tuple.
                cfg.image_size = (width, height)
            elif known_dims:
                # Only one dimension known: store it as a plain value.
                cfg.image_size = known_dims[0]

            LOG.debug(f"Loaded image size: {cfg.image_size} from processor")

    return processor
def modify_tokenizer_files(
    tokenizer_path: str,
    token_mappings: dict[int, str],
    output_dir: str,
    revision: str = "main",
) -> str:
    """
    Replace the strings of added tokens and save the edited tokenizer.

    The tokenizer is loaded from `tokenizer_path`, saved under
    `<output_dir>/tokenizer`, and the saved `tokenizer_config.json` and
    `tokenizer.json` are then rewritten in place. This only works with reserved
    tokens that were added to the tokenizer, not tokens already part of the
    vocab. Files are only touched on the local main process; all processes
    meet at a barrier before the function returns.

    Args:
        tokenizer_path: Path or name of the original tokenizer
        token_mappings: Dict mapping {token_id (int): new_token_string}
        output_dir: Directory to save the modified tokenizer
        revision: Model revision/branch/tag/commit to load from (HF Hub)

    Returns:
        Path to the modified tokenizer directory

    Raises:
        ValueError: If a token ID is missing from `added_tokens_decoder`
            (tokenizer_config.json) or from `added_tokens` (tokenizer.json).

    Ref: https://github.com/huggingface/transformers/issues/27974#issuecomment-1854188941
    """
    tokenizer_dir = os.path.join(output_dir, "tokenizer")
    os.makedirs(tokenizer_dir, exist_ok=True)

    if is_local_main_process():
        # Materialize the original tokenizer files in the output directory.
        source_tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_path, use_fast=True, revision=revision
        )
        source_tokenizer.save_pretrained(tokenizer_dir)

        # Normalize the keys to ints so they compare equal to the JSON ids.
        replacements = {int(raw_id): text for raw_id, text in token_mappings.items()}

        # 1. tokenizer_config.json -> added_tokens_decoder (keyed by str id)
        config_file = os.path.join(tokenizer_dir, "tokenizer_config.json")
        if os.path.exists(config_file):
            with open(config_file, "r", encoding="utf-8") as handle:
                config_json = json.load(handle)

            if "added_tokens_decoder" in config_json:
                decoder = config_json["added_tokens_decoder"]
                for wanted_id, text in replacements.items():
                    token_id_str = str(wanted_id)
                    if token_id_str not in decoder:
                        raise ValueError(
                            f"Token ID {token_id_str} not found in added_tokens_decoder"
                        )
                    decoder[token_id_str]["content"] = text

            with open(config_file, "w", encoding="utf-8") as handle:
                json.dump(config_json, handle, indent=2)

        # 2. tokenizer.json -> added_tokens (list of entries) and model.vocab
        tokenizer_json_file = os.path.join(tokenizer_dir, "tokenizer.json")
        if os.path.exists(tokenizer_json_file):
            with open(tokenizer_json_file, "r", encoding="utf-8") as handle:
                tokenizer_json = json.load(handle)

            if "added_tokens" in tokenizer_json:
                added_entries = tokenizer_json["added_tokens"]
                for token_id, text in replacements.items():
                    entry = next(
                        (item for item in added_entries if item["id"] == token_id),
                        None,
                    )
                    if entry is None:
                        # The requested id is not one of the added tokens.
                        raise ValueError(
                            f"Token ID {token_id} not found in added_tokens"
                        )
                    entry["content"] = text

            if "model" in tokenizer_json and "vocab" in tokenizer_json["model"]:
                vocab = tokenizer_json["model"]["vocab"]
                for token_id, text in replacements.items():
                    # First vocab string currently mapped to this id, if any.
                    previous = next(
                        (tok for tok, tok_id in vocab.items() if tok_id == token_id),
                        None,
                    )
                    if previous is not None:
                        del vocab[previous]
                        vocab[text] = token_id

            with open(tokenizer_json_file, "w", encoding="utf-8") as handle:
                json.dump(tokenizer_json, handle, indent=2)

    barrier()
    return tokenizer_dir
the provided config.""" # Apply patches that need to be in place before tokenizer loading from axolotl.loaders.patch_manager import PatchManager PatchManager.apply_pre_tokenizer_load_patches(cfg) def _load_mistral_common_tokenizer(cfg: DictDefault): """Load mistral-common tokenizer""" from axolotl.utils.mistral import HFMistralTokenizer # Load the HF-compatible wrapper around MistralTokenizer kwargs = {} if cfg.revision_of_model: kwargs["revision"] = cfg.revision_of_model tokenizer = HFMistralTokenizer.from_pretrained(cfg.tokenizer_config, **kwargs) return tokenizer if cfg.tokenizer_use_mistral_common: return _load_mistral_common_tokenizer(cfg) model_config = load_model_config(cfg) tokenizer_kwargs = {} use_fast = True # this is the default if cfg.tokenizer_use_fast is not None: use_fast = cfg.tokenizer_use_fast if cfg.tokenizer_legacy is not None: # True is the default w/ https://github.com/huggingface/transformers/pull/25224 tokenizer_kwargs["legacy"] = cfg.tokenizer_legacy if cfg.revision_of_model: tokenizer_kwargs["revision"] = cfg.revision_of_model tokenizer_cls = AutoTokenizer if cfg.tokenizer_type: tokenizer_cls = getattr(transformers, cfg.tokenizer_type) # Set base tokenizer path tokenizer_path = cfg.tokenizer_config # Apply token string overrides if specified if cfg.added_tokens_overrides: # Modify tokenizer files and get path to modified tokenizer modify_kwargs = {"output_dir": cfg.output_dir} if cfg.revision_of_model: modify_kwargs["revision"] = cfg.revision_of_model tokenizer_path = modify_tokenizer_files( tokenizer_path, cfg.added_tokens_overrides, **modify_kwargs ) tokenizer = tokenizer_cls.from_pretrained( tokenizer_path, trust_remote_code=cfg.trust_remote_code or False, use_fast=use_fast, **tokenizer_kwargs, ) if ( tokenizer.__class__.__name__ in [ "LlamaTokenizer", "LlamaTokenizerFast", "CodeLlamaTokenizer", "CodeLlamaTokenizerFast", ] and hasattr(tokenizer, "pad_token") and not tokenizer.pad_token ): # set a pad_token, but use eos_token so we 
don't add a new token tokenizer.pad_token = LLAMA_DEFAULT_EOS_TOKEN if tokenizer.__class__.__name__ == "GPTNeoXTokenizerFast": tokenizer.add_special_tokens({"pad_token": "[PAD]"}) # nosec B105 os.environ["TOKENIZERS_PARALLELISM"] = "false" # Mistral's official FA implementation requires left padding if cfg.is_mistral_derived_model and cfg.flash_attention and not cfg.sample_packing: tokenizer.padding_side = "left" # Qwen base only has single token, so we need to set the special tokens # the following check is for Qwen1 base models if cfg.is_qwen_derived_model and hasattr(tokenizer, "eod_id"): token_ids = ["bos_token_id", "eos_token_id", "pad_token_id", "unk_token_id"] for attr_name in token_ids: if getattr(tokenizer, attr_name) is None: setattr(tokenizer, attr_name, tokenizer.eod_id) token_names = ["bos_token", "eos_token", "pad_token", "unk_token"] for attr_name in token_names: if getattr(tokenizer, attr_name) is None: setattr(tokenizer, attr_name, "<|endoftext|>") additional_special_tokens = None if cfg.special_tokens: special_tokens = cfg.special_tokens.to_dict() additional_special_tokens = special_tokens.pop( "additional_special_tokens", None ) lora_modules_to_save = get_linear_embedding_layers(model_config.model_type) for k, val in special_tokens.items(): # check if new special token is not already in tokenizer and # is adapter training to make sure lora_modules_to_save is set if ( (getattr(tokenizer, k) is None or getattr(tokenizer, k) != val) and (len(tokenizer.encode(val, add_special_tokens=False)) > 2) and cfg.adapter and ( not cfg.lora_modules_to_save or not all( x in cfg.lora_modules_to_save for x in lora_modules_to_save ) ) and k != "pad_token" ): lora_modules_to_save_str = ", ".join( [f"`{x}`" for x in lora_modules_to_save] ) raise ValueError( f"Please set lora_modules_to_save to [{lora_modules_to_save_str}] " "when using an adapter and changing the special tokens." 
) tokenizer.add_special_tokens( {k: AddedToken(val, rstrip=False, lstrip=False, normalized=False)} ) # If we add bos_token and eos_token, we need to update the post processor to # handle them correctly. # https://github.com/huggingface/transformers/pull/24132 bos_or_eos_in_special_tokens = ( "bos_token" in cfg.special_tokens and "eos_token" in cfg.special_tokens ) if ( tokenizer.__class__.__name__ in ( "LlamaTokenizerFast", "CodeLlamaTokenizerFast", ) and bos_or_eos_in_special_tokens ): tokenizer.update_post_processor() if cfg.tokens: tokenizer.add_tokens( [ AddedToken(token, rstrip=False, lstrip=False, normalized=False) for token in cfg.tokens ] ) # Additional special tokens are a List, and need to be treated differently than regular special # tokens. We add them after we have called `add_tokens` in case these additional special tokens # are new tokens. # # Usage: # # ```py # special_tokens: # additional_special_tokens: ["<|im_start|>", "<|im_end|>"] # ``` if additional_special_tokens is not None: tokenizer.add_special_tokens( {"additional_special_tokens": additional_special_tokens} ) if is_main_process(): LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}") LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}") LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}") LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}") if cfg.chat_template: chat_template_string = get_chat_template_from_config( cfg=cfg, tokenizer=tokenizer, ) if cfg.default_system_message and cfg.chat_template == "chatml": chat_template_string = chat_template_string.replace( "You are a helpful assistant.", cfg.default_system_message ) tokenizer.chat_template = chat_template_string elif getattr(tokenizer, "chat_template", None) is None: LOG.info( "No Chat template selected. Consider adding a chat template for easier inference." 
) # make the tokenizer.pad call quieter 🤐 if hasattr(tokenizer, "deprecation_warnings"): tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True return tokenizer ================================================ FILE: src/axolotl/loaders/utils.py ================================================ """Utilities for axolotl.loaders module""" import contextlib from typing import Type import addict import torch import transformers from transformers import AutoConfig, PretrainedConfig, PreTrainedModel from axolotl.utils.dict import DictDefault from axolotl.utils.logging import get_logger LOG = get_logger(__name__) def get_module_class_from_name( module: torch.nn.Module, name: str ) -> Type[torch.nn.Module] | None: """Gets a class from a module by its name. Copied from `accelerate.utils.dataclasses` (https://github.com/huggingface/accelerate/blob/main/src/accelerate/utils/dataclasses.py#L2805). Args: module: The module to get the class from. name: The name of the class. Returns: The class type of the matching module, or `None` if no match is found. """ modules_children = list(module.children()) if module.__class__.__name__ == name: return module.__class__ if len(modules_children) == 0: return None for child_module in modules_children: module_class = get_module_class_from_name(child_module, name) if module_class is not None: return module_class return None def check_model_config(cfg: DictDefault, model_config: PretrainedConfig): """Validates and adjusts model config based on `axolotl` config. This function performs several important checks and adjustments: - Disables model caching for better memory efficiency - Handles multimodal model-specific configurations - Validates quantization settings - Ensures proper LoRA configuration when using adapters with new tokens Args: cfg: Dictionary mapping `axolotl` config keys to values. model_config: The model's configuration object from `transformers`. 
Raises: ValueError: If a multimodal model lacks text configuration, if GPTQ settings are inconsistent, or if LoRA `modules_to_save` is improperly configured with new tokens. """ if hasattr(model_config, "use_cache"): model_config.use_cache = False if cfg.is_multimodal: # For multimodal configs, use_cache is set in the text_config if hasattr(model_config, "get_text_config"): text_config = model_config.get_text_config() if hasattr(text_config, "use_cache"): text_config.use_cache = False else: raise ValueError( "No text config found for multimodal model. Please raise an Issue with model details." ) # Check if image_size is not set and load image size from model config if available if ( cfg.image_size is None and hasattr(model_config, "vision_config") and hasattr(model_config.vision_config, "image_size") ): image_size = model_config.vision_config.image_size if isinstance(image_size, list): cfg.image_size = tuple(image_size) else: cfg.image_size = image_size LOG.debug(f"Loaded image size: {cfg.image_size} from model config") quant_config_exists = ( hasattr(model_config, "quantization_config") and model_config.quantization_config ) # Detect compressed-tensors config is_compressed_tensors_config = ( quant_config_exists and model_config.quantization_config.get("quant_method") == "compressed-tensors" ) if is_compressed_tensors_config: if model_config.quantization_config.get("config_groups"): LOG.warning( "Found `config_groups` in a compressed-tensors config. " "QAT integration with llmcompressor is not tested." ) # Skip further quant checks for compressed-tensors return quant_config_method_is_gptq = ( quant_config_exists and "quant_method" in model_config.quantization_config and model_config.quantization_config["quant_method"] == "gptq" ) if cfg.gptq and not quant_config_method_is_gptq: raise ValueError( "model_config.quantization_config is not set or quant_method is not set to gptq. " "Please make sure to point to a GPTQ model." 
) lora_modules_to_save = get_linear_embedding_layers(model_config.model_type) if ( cfg.adapter and cfg.tokens and ( not cfg.lora_modules_to_save or not all(x in cfg.lora_modules_to_save for x in lora_modules_to_save) ) ): lora_modules_to_save_joined = ", ".join( map(lambda x: f"`{x}`", lora_modules_to_save) ) raise ValueError( "`lora_modules_to_save` not properly set when adding new tokens. " f"Please include [{lora_modules_to_save_joined}] in `lora_modules_to_save`." ) if ( cfg.tensor_parallel_size and cfg.tensor_parallel_size > 1 and hasattr(model_config, "tie_word_embeddings") and model_config.tie_word_embeddings ): raise ValueError( "Tensor parallelism is incompatible with models configured with `tie_word_embeddings` enabled. " "Please use a model without `tie_word_embeddings`, or disable tensor parallelism." ) def load_model_config(cfg: DictDefault) -> PretrainedConfig | addict.Dict: """Loads and configures a model configuration from HuggingFace or local sources. This function determines the appropriate model config source, loads it, applies any necessary overrides, and validates it for compatibility with the `axolotl` config. If `cfg.cls_model_config` is set, a custom config class from transformers will be used instead of `AutoConfig` (e.g., 'LlamaConfig', 'MistralConfig'). Args: cfg: Dictionary mapping `axolotl` config keys to values. Returns: A configured model configuration object (`AutoConfig` instance), or a simple dictionary configuration for special cases like Mamba models. Raises: ValueError: If configuration loading fails for reasons other than special cases that are handled (e.g., Mamba models). 
""" model_config_name = cfg.base_model_config or cfg.base_model if not model_config_name and cfg.tokenizer_config: model_config_name = cfg.tokenizer_config trust_remote_code = cfg.trust_remote_code is True config_kwargs = {} if cfg.revision_of_model: config_kwargs["revision"] = cfg.revision_of_model if cfg.num_labels: # num_labels is used to initialize classifier models config_kwargs["num_labels"] = cfg.num_labels config_cls = AutoConfig if cfg.cls_model_config: config_cls = getattr(transformers, cfg.cls_model_config) try: model_config = config_cls.from_pretrained( model_config_name, trust_remote_code=trust_remote_code, **config_kwargs, ) except ValueError as error: if "mamba" in model_config_name: return addict.Dict( { "model_type": "mamba", } ) raise error if cfg.overrides_of_model_config: for key, val in cfg.overrides_of_model_config.items(): setattr(model_config, key, val) check_model_config(cfg, model_config) return model_config def ensure_dtype(model: PreTrainedModel, dtype: torch.dtype = torch.bfloat16): """Ensures all modules in the model are converted to the specified data type.""" for name, module in model.named_modules(): weight_mismatch = False with contextlib.suppress(AttributeError): weight_mismatch = module.weight.dtype != dtype bias_mismatch = False with contextlib.suppress(AttributeError): bias_mismatch = module.bias.dtype != dtype if weight_mismatch: LOG.debug( f"Converting module {name}.weight: {module.weight.dtype} -> {dtype}" ) if bias_mismatch: LOG.debug(f"Converting module {name}.bias: {module.bias.dtype} -> {dtype}") if weight_mismatch or bias_mismatch: module.to(dtype) def get_linear_embedding_layers(model_type: str) -> list[str]: """Returns layer names of linear embeddings needed for LoRA based on model type.""" if model_type == "gpt_neox": return ["embed_in", "embed_out"] if model_type == "falcon": return ["word_embeddings", "lm_head"] return ["embed_tokens", "lm_head"] ================================================ FILE: 
class AxolotlOrWarnErrorFilter(logging.Filter):
    """
    Record filter with two thresholds, both read from the environment.

    Any record at or above the ``LOG_LEVEL`` threshold (default
    ``DEFAULT_LOG_LEVEL``) passes. Records whose logger name starts with
    ``axolotl`` additionally pass at or above the ``AXOLOTL_LOG_LEVEL``
    threshold (default ``DEFAULT_AXOLOTL_LOG_LEVEL``). Everything else is
    dropped.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        axolotl_name = os.getenv(
            "AXOLOTL_LOG_LEVEL", DEFAULT_AXOLOTL_LOG_LEVEL
        ).upper()
        other_name = os.getenv("LOG_LEVEL", DEFAULT_LOG_LEVEL).upper()

        if hasattr(logging, "getLevelNamesMapping"):
            # Python 3.11+: explicit name -> number mapping.
            name_to_level = logging.getLevelNamesMapping()
            axolotl_level = name_to_level[axolotl_name]
            other_level = name_to_level[other_name]
        else:
            # Python 3.10: getLevelName translates a level name to its number.
            axolotl_level = logging.getLevelName(axolotl_name)
            other_level = logging.getLevelName(other_name)

        self.axolotl_level = axolotl_level
        self.other_level = other_level

    def filter(self, record: LogRecord) -> bool:
        """Return True when ``record`` clears the threshold that applies to it."""
        severity = record.levelno

        # The general threshold applies to every logger.
        if severity >= self.other_level:
            return True

        # Below the general threshold only axolotl loggers may still pass.
        if not record.name.startswith("axolotl"):
            return False
        return severity >= self.axolotl_level
# Default `logging.config.dictConfig` schema used by `configure_logging`.
#
# Layout:
#   * the root logger writes to `console` and `root_file_only`;
#   * the `axolotl` logger writes to `color_console` and `ax_file_only` and does
#     not propagate to the root logger, so its records are not emitted twice.
#
# Both level values are read from the environment when this dict is built, so
# changing LOG_LEVEL / AXOLOTL_LOG_LEVEL afterwards has no effect on this dict.
DEFAULT_LOGGING_CONFIG: Dict[str, Any] = {
    "version": 1,
    # Keep loggers that were created before configure_logging() runs.
    "disable_existing_loggers": False,
    "formatters": {
        # Verbose format, used by the two *_file_only handlers below.
        "simple": {
            "format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] %(message)s",
        },
        # Verbose format with color; `rank_fmt` is set by ColorfulFormatter.format.
        # No handler in this dict references it.
        "colorful": {
            "()": ColorfulFormatter,
            "format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d]%(rank_fmt)s %(message)s",
        },
        # Short format for the plain console handler.
        "concise": {
            "format": "[%(asctime)s] [%(levelname)s] [%(name)s] %(message)s",
        },
        # Short format with color and the rank suffix from ColorfulFormatter.
        "concise_color": {
            "()": ColorfulFormatter,
            "format": "[%(asctime)s] [%(levelname)s] [%(name)s]%(rank_fmt)s %(message)s",
        },
    },
    "filters": {
        # Referenced by dotted path; dictConfig instantiates the filter class.
        "ax_or_warn": {
            "()": "axolotl.logging_config.AxolotlOrWarnErrorFilter",
        },
    },
    "handlers": {
        # Console handlers write to stdout and apply the ax_or_warn filter.
        "console": {
            "class": "logging.StreamHandler",
            "formatter": "concise",
            "filters": ["ax_or_warn"],
            "stream": "ext://sys.stdout",
        },
        "color_console": {
            "class": "logging.StreamHandler",
            "formatter": "concise_color",
            "filters": ["ax_or_warn"],
            "stream": "ext://sys.stdout",
        },
        # The *_file_only handlers are unfiltered and accept DEBUG and above.
        # NOTE(review): `axolotl.utils.tee.file_only_stream` is defined outside
        # this module; presumably it writes to a log file only - confirm there.
        "ax_file_only": {
            "class": "logging.StreamHandler",
            "level": "DEBUG",
            "formatter": "simple",
            "stream": "ext://axolotl.utils.tee.file_only_stream",
        },
        "root_file_only": {
            "class": "logging.StreamHandler",
            "level": "DEBUG",
            "formatter": "simple",
            "stream": "ext://axolotl.utils.tee.file_only_stream",
        },
    },
    "root": {
        "handlers": ["console", "root_file_only"],
        "level": os.getenv("LOG_LEVEL", DEFAULT_LOG_LEVEL).upper(),
    },
    "loggers": {
        "axolotl": {
            "handlers": ["color_console", "ax_file_only"],
            "level": os.getenv("AXOLOTL_LOG_LEVEL", DEFAULT_AXOLOTL_LOG_LEVEL).upper(),
            "propagate": False,
        },
    },
}
def check_mamba_ssm_installed():
    """Verify that the optional ``mamba_ssm`` dependency can be located.

    Raises:
        ImportError: If no import spec for ``mamba_ssm`` can be found.
    """
    # A bare `import importlib` does not guarantee that the `importlib.util`
    # submodule has been loaded, so import the submodule explicitly here
    # instead of relying on some other module having imported it already.
    import importlib.util

    if importlib.util.find_spec("mamba_ssm") is None:
        raise ImportError(
            "MambaLMHeadModel requires mamba_ssm. Please install it with `pip install -e .[mamba-ssm]`"
        )
class MambaLMHeadModel(nn.Module, GenerationMixin):
    """Mamba causal language model: a ``MixerModel`` backbone plus an LM head.

    The LM head weight is tied to the backbone's embedding weight. Unlike a
    logits-only head, ``forward`` also computes a shifted cross-entropy loss
    when ``labels`` are supplied.
    """

    def __init__(
        self,
        d_model: int,
        n_layer: int,
        vocab_size: int,
        initializer_cfg=None,
        pad_vocab_size_multiple: int = 1,
        device=None,
        dtype=None,
        **backbone_kwargs,
    ) -> None:
        """Build the backbone and LM head.

        Args:
            d_model: Hidden size passed to the backbone and the LM head.
            n_layer: Number of backbone layers.
            vocab_size: Vocabulary size; rounded up to a multiple of
                ``pad_vocab_size_multiple``.
            initializer_cfg: Optional keyword arguments forwarded to
                ``_init_weights``.
            pad_vocab_size_multiple: Multiple the vocabulary size is padded to.
            device: Device forwarded to the backbone and the LM head.
            dtype: Dtype forwarded to the backbone and the LM head.
            **backbone_kwargs: Extra keyword arguments for ``MixerModel``.
        """
        factory_kwargs = {"device": device, "dtype": dtype}

        super().__init__()
        # Round the vocabulary size up to the next multiple of the padding unit.
        if vocab_size % pad_vocab_size_multiple != 0:
            vocab_size += pad_vocab_size_multiple - (
                vocab_size % pad_vocab_size_multiple
            )
        self.config = MambaConfig(
            vocab_size=vocab_size,
            d_model=d_model,
            n_layer=n_layer,
            pad_vocab_size_multiple=pad_vocab_size_multiple,
        )
        self.backbone = MixerModel(
            d_model=d_model,
            n_layer=n_layer,
            vocab_size=vocab_size,
            initializer_cfg=initializer_cfg,
            **backbone_kwargs,
            **factory_kwargs,
        )
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False, **factory_kwargs)

        # Initialize weights and apply final processing
        self.apply(
            partial(
                _init_weights,
                n_layer=n_layer,
                **(initializer_cfg if initializer_cfg is not None else {}),
            )
        )
        self.tie_weights()

    def tie_weights(self):
        """Make the LM head share the backbone's embedding weight."""
        self.lm_head.weight = self.backbone.embedding.weight

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        """Delegate inference-cache allocation to the backbone."""
        return self.backbone.allocate_inference_cache(
            batch_size, max_seqlen, dtype=dtype, **kwargs
        )

    def forward(
        self,
        input_ids,
        position_ids=None,
        inference_params=None,
        num_last_tokens=0,
        labels=None,
        **kwargs,
    ):
        """Run the model and optionally compute the causal LM loss.

        Args:
            input_ids: Token ids passed to the backbone.
            position_ids: Accepted only for compatibility with Transformer
                generation; it is not used.
            inference_params: Forwarded to the backbone.
            num_last_tokens: If > 0, only the logits for the last
                ``num_last_tokens`` positions are returned.
            labels: Optional target ids. When given, logits and labels are
                shifted by one position and a cross-entropy loss is computed.
            **kwargs: Ignored.

        Returns:
            A ``CausalLMOutput`` namedtuple. Without ``labels`` it has the
            single field ``logits``; with ``labels`` it has ``logits`` and
            ``loss``.
        """
        hidden_states = self.backbone(input_ids, inference_params=inference_params)
        if num_last_tokens > 0:
            hidden_states = hidden_states[:, -num_last_tokens:]
        lm_logits = self.lm_head(hidden_states)

        # Logits-only result keeps its original single-field shape.
        if labels is None:
            CausalLMOutput = namedtuple("CausalLMOutput", ["logits"])
            return CausalLMOutput(logits=lm_logits)

        # Previously an unconditional return above made this branch unreachable,
        # so `labels` were silently ignored.
        # Shift so that tokens < n predict n
        shift_logits = lm_logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        # Flatten the tokens
        loss_fct = CrossEntropyLoss()
        shift_logits = shift_logits.view(-1, self.config.vocab_size)
        shift_labels = shift_labels.view(-1)
        # Enable model parallelism
        shift_labels = shift_labels.to(shift_logits.device)
        loss = loss_fct(shift_logits, shift_labels)

        CausalLMOutput = namedtuple("CausalLMOutput", ["logits", "loss"])
        return CausalLMOutput(logits=lm_logits, loss=loss)

    def save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        state_dict: Optional[dict] = None,
        **kwargs,
    ):
        """Write the state dict to ``<save_directory>/pytorch_model.bin``.

        Args:
            save_directory: Target directory; created if it does not exist.
            state_dict: State dict to save; defaults to ``self.state_dict()``.
            **kwargs: Ignored.
        """
        if state_dict is None:
            state_dict = self.state_dict()
        # torch.save does not create missing parent directories.
        os.makedirs(save_directory, exist_ok=True)
        torch.save(state_dict, os.path.join(save_directory, "pytorch_model.bin"))

    @classmethod
    def from_pretrained(cls, pretrained_model_name, device=None, dtype=None, **kwargs):
        """Build a model from a pretrained config and load its weights.

        Args:
            pretrained_model_name: Name passed to ``load_config_hf`` and
                ``load_state_dict_hf``.
            device: Device for the model and the loaded weights.
            dtype: Dtype for the model and the loaded weights.
            **kwargs: Extra constructor keyword arguments.

        Returns:
            The model with the pretrained state dict loaded.
        """
        config = load_config_hf(pretrained_model_name)
        model = cls(**config, device=device, dtype=dtype, **kwargs)
        model.load_state_dict(
            load_state_dict_hf(pretrained_model_name, device={"": device}, dtype=dtype)
        )
        return model
def fsdp2_load_full_state_dict(
    _accelerator, model: torch.nn.Module, full_sd: dict, offload_to_cpu: bool = False
) -> torch.nn.Module:
    """
    Loads the full state dict (could be only on rank 0) into the sharded model. This is done by broadcasting the
    parameters from rank 0 to all other ranks. This function modifies the model in-place.

    Entries of `full_sd` are overwritten with `None` as they are consumed, on every rank, so the caller's
    dict is mutated.

    Args:
        _accelerator (`Accelerator`): The accelerator instance; only `is_main_process` is read here
        model (`torch.nn.Module`):
            The model to load the state dict into, expected to be on meta device or a VRAM spike can occur
        full_sd (`dict`): The full state dict to load, can only be on rank 0
        offload_to_cpu (`bool`): If True, each loaded parameter is moved to CPU before being assigned

    Returns:
        `torch.nn.Module`: The same `model`, with the loaded parameters assigned
    """
    from torch.distributed.tensor import distribute_tensor

    LOG.info("Broadcasting full state dict to all ranks...")

    import time

    start_time = time.time()

    # The model's own state dict supplies, per key, the target dtype/size and
    # (for sharded entries) the device mesh and placements.
    meta_sharded_sd = model.state_dict()
    sharded_sd = {}
    for param_name, sharded_meta_param in meta_sharded_sd.items():
        full_tensor = None
        if _accelerator.is_main_process:
            # Only the main process reads real data out of `full_sd`.
            full_tensor = full_sd[param_name]
            full_tensor = full_tensor.to(sharded_meta_param.dtype)

        if hasattr(sharded_meta_param, "device_mesh"):
            # Sharded entry: distribute from rank 0 according to the target placements.
            device_mesh = sharded_meta_param.device_mesh
            if _accelerator.is_main_process:
                full_tensor = full_tensor.to(device_mesh.device_type)
            else:
                # Other ranks only provide a correctly-shaped placeholder.
                full_tensor = torch.empty(
                    sharded_meta_param.size(),
                    device=device_mesh.device_type,
                    dtype=sharded_meta_param.dtype,
                )
            sharded_param = distribute_tensor(
                full_tensor,
                device_mesh,
                sharded_meta_param.placements,
                src_data_rank=0,
            )
        else:
            # Non-sharded parameters
            if _accelerator.is_main_process:
                sharded_param = full_tensor.to(torch.device("cuda"))
            else:
                # broadcast manually
                sharded_param = torch.empty_like(
                    sharded_meta_param,
                    device=torch.device("cuda"),
                    dtype=sharded_meta_param.dtype,
                )
            # Every rank takes part in the broadcast; rank 0 is the source.
            dist.broadcast(sharded_param, src=0)

        if offload_to_cpu:
            sharded_param = sharded_param.cpu()

        sharded_sd[param_name] = nn.Parameter(sharded_param)
        # Drop references to the full tensor as soon as it has been consumed.
        del full_tensor
        full_sd[param_name] = None

    # assign=True installs the new parameters instead of copying into the existing ones.
    model.load_state_dict(sharded_sd, assign=True, strict=True)
    end_time = time.time()
    LOG.debug(
        f"Time taken to load full state dict: {(end_time - start_time):.2f} seconds"
    )
    log_gpu_memory_usage(LOG, "Memory usage after broadcasting full state dict", 0)
    return model
def patch_peft_param_wrapper_for_fsdp2():
    """Replace PEFT's ``_LoraParameterProxy.forward`` with a DTensor-aware version.

    The replacement adds ``self.delta_weight`` to the incoming weight ``W``.
    When exactly one of the two operands is a ``DTensor``, the other one is
    first wrapped with ``DTensor.from_local`` using the DTensor operand's
    device mesh and placements, so the addition never mixes a ``DTensor`` with
    a plain tensor.

    The patch is applied at most once; a marker attribute on the class makes
    repeated calls a no-op.
    """
    from peft.tuners.lora.layer import _LoraParameterProxy

    already_patched = getattr(_LoraParameterProxy, "_axolotl_fsdp2_patched", False)
    if already_patched:
        return

    # Looked up but not called: the new forward replaces the original outright.
    _original_forward = _LoraParameterProxy.forward

    # NOTE: Replaces (not wraps) forward; assumes original is just `W + self.delta_weight`.
    def _patched_forward(self, W):
        from torch.distributed.tensor import DTensor

        lora_delta = self.delta_weight
        base_is_dtensor = isinstance(W, DTensor)
        delta_is_dtensor = isinstance(lora_delta, DTensor)

        with torch.nn.utils.parametrize.cached():
            if base_is_dtensor and not delta_is_dtensor:
                # Promote the plain delta to the base weight's DTensor layout.
                lora_delta = DTensor.from_local(
                    lora_delta, W.device_mesh, W.placements
                )
            elif delta_is_dtensor and not base_is_dtensor:
                # Promote the plain base weight to the delta's DTensor layout.
                W = DTensor.from_local(
                    W, lora_delta.device_mesh, lora_delta.placements
                )
            return W + lora_delta

    _LoraParameterProxy.forward = _patched_forward
    _LoraParameterProxy._axolotl_fsdp2_patched = True
    LOG.info("Patched PEFT _LoraParameterProxy.forward for FSDP2 DTensor compatibility")
def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module:
    """Prepares the model for FSDP2 in-place. Also returns the model to avoid misuse of the original model.

    Steps, in order: return early if the model is already an `FSDPModule`; snapshot the state dict;
    optionally apply activation checkpointing; build the `fully_shard` kwargs; optionally move the model to
    the meta device (skipped when a `Params4bit` parameter is present); shard LoRA layers and auto-wrapped
    submodules bottom-up, then the root; finally, when `cpu_ram_efficient_loading` is set, reload the
    snapshot, re-register non-persistent buffers and re-tie weights.

    Args:
        accelerator (`Accelerator`): The accelerator instance
        model (`torch.nn.Module`): The model to prepare

    Returns:
        `torch.nn.Module`: Prepared model
    """
    from accelerate.utils import get_module_children_bottom_up, is_compiled_module
    from accelerate.utils.fsdp_utils import fsdp2_prepare_auto_wrap_policy
    from accelerate.utils.modeling import get_non_persistent_buffers
    from peft import PeftModel
    from peft.tuners.lora import LoraLayer
    from torch.distributed.fsdp import (
        CPUOffloadPolicy,
        FSDPModule,
        MixedPrecisionPolicy,
        fully_shard,
    )

    # Nothing to do if the model (or the module under a compiled wrapper) is already sharded.
    is_type_fsdp = isinstance(model, FSDPModule) or (
        is_compiled_module(model) and isinstance(model._orig_mod, FSDPModule)
    )
    if is_type_fsdp:
        return model

    fsdp2_plugin = accelerator.state.fsdp_plugin

    # Snapshot taken before any wrapping/moving; reloaded below when `cpu_ram_efficient_loading` is set.
    original_sd = model.state_dict()

    from torch.distributed.fsdp.wrap import (
        size_based_auto_wrap_policy,
        transformer_auto_wrap_policy,
    )

    # We need the `auto_wrap_policy` original type to create a custom policy function for sharding
    # This is because `fully_shard` doesn't support old auto wrap policies, rather we have to imitate the behaviour
    # NOTE: both branches are currently no-ops; the policy type is not recorded.
    if fsdp2_plugin.auto_wrap_policy is transformer_auto_wrap_policy:
        pass  # auto_wrap_policy_type = "transformer"
    elif fsdp2_plugin.auto_wrap_policy is size_based_auto_wrap_policy:
        pass  # auto_wrap_policy_type = "size"

    # We set `auto_wrap_policy` to `functools.partial` to avoid creating it again
    # This is because `apply_activation_checkpointing` can reuse this function
    fsdp2_plugin.set_auto_wrap_policy(model)

    if fsdp2_plugin.activation_checkpointing:
        from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
            CheckpointImpl,
            apply_activation_checkpointing,
            checkpoint_wrapper,
        )

        # Apply activation checkpointing before applying `fully_shard`
        apply_activation_checkpointing(
            model,
            checkpoint_wrapper_fn=functools.partial(
                checkpoint_wrapper,
                checkpoint_impl=CheckpointImpl.NO_REENTRANT,
            ),
            auto_wrap_policy=fsdp2_plugin.auto_wrap_policy,
        )

    mesh = getattr(accelerator.state, "device_mesh", None)

    # Disable memory pinning if requested
    offload_to_cpu = isinstance(fsdp2_plugin.cpu_offload, CPUOffloadPolicy)
    if offload_to_cpu and os.environ.get("FSDP_CPU_OFFLOAD_PIN_MEMORY", "") == "false":
        fsdp2_plugin.cpu_offload.pin_memory = False

    # Keyword arguments shared by every `fully_shard` call below.
    fsdp2_kwargs = {
        "reshard_after_forward": fsdp2_plugin.reshard_after_forward,
        "offload_policy": fsdp2_plugin.cpu_offload,
        # `fully_shard` doesn't accept `None` in case of `MixedPrecisionPolicy`
        "mp_policy": fsdp2_plugin.mixed_precision_policy or MixedPrecisionPolicy(),
        # Select the FSDP sub-mesh by its dimension names when a device mesh exists.
        "mesh": (
            mesh[tuple(accelerator.state.parallelism_config.fsdp_dim_names)]
            if mesh is not None
            else None
        ),
    }
    model_has_params4bit = False
    for _, param in model.named_parameters():
        # this is a temporary fix whereby loading models with bnb params cannot be moved from
        # GPU to a meta device due with FSDP2 because torch operations don't return the original class type
        # bypassing the move to meta will still cause the VRAM spike, but at least it still will load
        if param.__class__.__name__ == "Params4bit":
            model_has_params4bit = True
            break

    if fsdp2_plugin.cpu_ram_efficient_loading and not model_has_params4bit:
        # Context: `fully_shard` moves the model to GPU if it was on CPU, however it can also be on `meta` and then it stays there even after `fully_shard`
        # For this reason, we need to move the model to `meta` device, as then sharding happens on `meta` device
        # If we kept the model on CPU (`cpu_ram_efficient_loading` has model be on CPU on all ranks, though non-main ranks only have `torch.empty`), `fully_shard` would move it to GPU
        # Afterwards, when we call `fsdp2_load_full_state_dict`, us creating the state_dict would result into briefly having two copies of model state_dict on the GPU -> VRAM spike

        # We need to keep the original non-persistent buffers, as those MAY not be in the state_dict, resulting in them staying on meta device
        # Also, these buffers aren't getting sharded by default
        # We get the FQNs of all non-persistent buffers, to re-register them after
        non_persistent_buffer_fqns = get_non_persistent_buffers(
            model, recurse=True, fqns=True
        )
        original_non_persistent_buffers = copy.deepcopy(
            {k: v for k, v in model.named_buffers() if k in non_persistent_buffer_fqns}
        )
        # We move the model to meta device, as then sharding happens on meta device
        model = model.to(torch.device("meta"))
        # We need to re-tie the weights, not exactly sure why, but if we don't do this, reference to `lm_head/embed_tokens` stay hanging -> more VRAM usage
        # We assume `transformers` models have a `tie_weights` method if they support it
        if hasattr(model, "tie_weights"):
            model.tie_weights()

    is_peft_model = isinstance(model, PeftModel)

    # Patch PEFT's _LoraParameterProxy for DTensor compatibility if any
    # ParamWrapper modules exist (used for target_parameters / 3D expert params).
    if is_peft_model:
        from peft.tuners.lora.layer import ParamWrapper

        if any(isinstance(m, ParamWrapper) for m in model.modules()):
            patch_peft_param_wrapper_for_fsdp2()

    auto_wrap_policy = fsdp2_prepare_auto_wrap_policy(fsdp2_plugin, model)
    log_bias_dtype_mismatch = False
    if auto_wrap_policy is not None:
        # `[:-1]` leaves out the last entry of the bottom-up list; the root model is sharded separately below.
        for module in get_module_children_bottom_up(model)[:-1]:
            if is_peft_model and isinstance(module, LoraLayer):
                module_log_bias_mismatch = _process_lora_module_for_fsdp(
                    module, fsdp2_kwargs
                )
                log_bias_dtype_mismatch |= module_log_bias_mismatch
            # Skip modules that were already sharded (e.g. by the LoRA helper above).
            if auto_wrap_policy(module) and not isinstance(module, FSDPModule):
                fully_shard(module, **fsdp2_kwargs)

    fully_shard(model, **fsdp2_kwargs)

    if log_bias_dtype_mismatch:
        LOG.warning(
            "Bias dtype mismatch detected in LoRA base linear layer. Bias parameters have been cast to weight dtype."
        )

    # The reload runs for Params4bit models too; only the meta-device move above was skipped for them.
    if fsdp2_plugin.cpu_ram_efficient_loading:
        fsdp2_load_full_state_dict(
            accelerator, model, original_sd, offload_to_cpu=offload_to_cpu
        )

    if fsdp2_plugin.cpu_ram_efficient_loading and not model_has_params4bit:
        # We re-register the buffers, as they may not be in the state_dict
        for fqn, buffer_tensor in original_non_persistent_buffers.items():
            buffer_tensor = buffer_tensor.to(accelerator.device)

            # Split the fully-qualified name into owning module and local buffer name.
            if "." in fqn:
                parent_fqn, local_buffer_name = fqn.rsplit(".", 1)
                parent_module = model.get_submodule(parent_fqn)
            else:
                local_buffer_name = fqn
                parent_module = model

            parent_module.register_buffer(
                local_buffer_name, buffer_tensor, persistent=False
            )

        # We need to tie the weights again, as call to `load_full_state_dict` breaks the tie
        # Needs to be called both here and above
        # removing this call makes the model have a slightly different loss
        # removing the call above leads to extra memory usage as explained in the comment above
        if hasattr(model, "tie_weights"):
            model.tie_weights()
    return model
""" from collections import defaultdict from transformers import PreTrainedModel def _patched_adjust_tied_keys_with_tied_pointers(self, missing_keys): param_pointers = defaultdict(list) for param_name, param_value in self.state_dict().items(): if param_value.is_meta: continue param_pointers[param_value.data_ptr()].append(param_name) tied_param_names = [ names for names in param_pointers.values() if len(names) > 1 and not any(name in self.all_tied_weights_keys.keys() for name in names) and not all(name in missing_keys for name in names) ] tied_weights_keys_by_pointers = { param_name: group[0] for group in tied_param_names for param_name in group[1:] } self.all_tied_weights_keys.update(tied_weights_keys_by_pointers) PreTrainedModel._adjust_tied_keys_with_tied_pointers = ( _patched_adjust_tied_keys_with_tied_pointers ) def patch_initialize_missing_keys_for_fsdp(): """Patch _initialize_missing_keys to skip re-initialization on FSDP non-rank-0. When using cpu_ram_efficient_loading, non-rank-0 processes load weights on meta device and move them to CPU as empty tensors. Without this patch, initialize_weights() re-initializes ALL parameters (via guarded init functions), which is slow and uses extra RAM per process. The fix marks all params/buffers with _is_hf_initialized=True before calling the original method, so guarded init functions (init.normal_, init.zeros_, etc.) become no-ops on non-rank-0 processes. The real weights arrive later via FSDP broadcast from rank 0. Upstream fix: https://github.com/huggingface/transformers/pull/44473 Remove this patch once transformers includes the fix in a stable release. 
""" from transformers import PreTrainedModel from transformers.modeling_utils import is_fsdp_enabled, is_local_dist_rank_0 if getattr(PreTrainedModel._initialize_missing_keys, "_axolotl_patched", False): return _original_initialize_missing_keys = PreTrainedModel._initialize_missing_keys def _patched_initialize_missing_keys(self, is_quantized: bool) -> None: if is_fsdp_enabled() and not is_local_dist_rank_0(): for key in self.state_dict(): try: param_or_buffer = self.get_parameter_or_buffer(key) param_or_buffer._is_hf_initialized = True except AttributeError: pass # may happen when handling pre-quantized weights self._is_hf_initialized = True _original_initialize_missing_keys(self, is_quantized) PreTrainedModel._initialize_missing_keys = _patched_initialize_missing_keys PreTrainedModel._initialize_missing_keys._axolotl_patched = True def patch_accelerate_fsdp2(): import accelerate accelerate.accelerator.fsdp2_prepare_model = fsdp2_prepare_model accelerate.Accelerator.get_state_dict = get_state_dict setattr( sys.modules["accelerate"], "Accelerator.get_state_dict", get_state_dict, ) ================================================ FILE: src/axolotl/monkeypatch/accelerate/parallelism_config.py ================================================ """ workaround to allow parallelism config for pure CP """ import os import warnings from accelerate import DistributedType def _validate_accelerator(self, accelerator): _warnings = set() if not accelerator.multi_device and self.total_size == 1: # No distributed setup, valid parallelism config return # We need this to ensure DDP works if self.total_size == 1: self._set_size("dp_replicate", accelerator.num_processes) if self.total_size != accelerator.num_processes: raise ValueError( f"ParallelismConfig total_size ({self.total_size}) does not match " f"num_processes ({accelerator.num_processes}). Please adjust dp_replicate_size/ " f"dp_shard_size/tp_size/cp_size." 
) # allow parallelism config when not using fsdp if using pure context parallelism allow_parallelism_config = False if ( self.cp_size > 1 and self.dp_shard_size <= 1 and os.environ.get("ACCELERATE_ALLOW_CP_STANDALONE", "false").lower() == "true" ): allow_parallelism_config = True if ( self.total_size > 1 and not allow_parallelism_config and not (accelerator.is_fsdp2 or accelerator.multi_device) ): raise ValueError( f"ParallelismConfig is only compatible DistributedType.FSDP (version 2) or DistributedType.Multi{{Device}}, but got {accelerator.distributed_type}." ) for parallelism, size in self._sizes.items(): if size == 1 and getattr(self, f"{parallelism}_handler", None) is not None: _warnings.add( f"ParallelismConfig.{parallelism}_handler is set, but {parallelism}_size is set to 1. This handler will be ignored." ) if _warnings and accelerator.is_main_process: warnings.warn( "ParallelismConfig has the following warnings:\n" + "\n".join(_warnings), UserWarning, stacklevel=2, ) def patched_is_fsdp2(self) -> bool: """ Patched version of is_fsdp2 that guards against a None fsdp_plugin. 
def unpatch_xformers_attn_over_fa2():
    """Restore transformers' flash-attention forward for ``flash_attention_2``.

    Reverses ``patch_xformers_attn_over_fa2`` by re-registering the stock
    ``flash_attention_forward`` callable under the ``"flash_attention_2"`` key of
    ``ALL_ATTENTION_FUNCTIONS``.
    """
    from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

    # Register the function object itself. Calling it here (with no arguments)
    # would raise a TypeError instead of restoring the original implementation.
    ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = flash_attention_forward
Qwen3.5 VL) via text_config and MLA models (DeepSeek/Kimi) that have separate Q/V head dimensions. """ cfg = model_config if hasattr(cfg, "text_config"): cfg = cfg.text_config # MLA models: Q head_dim = qk_nope + qk_rope, V head_dim = v_head_dim if hasattr(cfg, "qk_nope_head_dim") and hasattr(cfg, "qk_rope_head_dim"): head_dim = cfg.qk_nope_head_dim + cfg.qk_rope_head_dim head_dim_v = getattr(cfg, "v_head_dim", head_dim) return head_dim, head_dim_v # Standard models if hasattr(cfg, "head_dim"): return cfg.head_dim, cfg.head_dim if hasattr(cfg, "hidden_size") and hasattr(cfg, "num_attention_heads"): head_dim = cfg.hidden_size // cfg.num_attention_heads return head_dim, head_dim return None, None def patch_flash_attn_4(model_config=None): """Patch _lazy_imports to redirect FA2 imports to FA4 if available on supported hardware.""" if not torch.cuda.is_available(): return major, _ = torch.cuda.get_device_capability() # Matches flash_attn/cute/interface.py: arch / 10 in [9, 10, 11] if major not in (9, 10, 11): return try: from flash_attn.cute import ( # noqa: F401 flash_attn_func, flash_attn_varlen_func, ) except ImportError: LOG.info( "Flash Attention 4 is available for your GPU and offers faster training speeds. 
def patch_flex_wrapper(**flex_attn_compile_kwargs):
    """Replace transformers' ``WrappedFlexAttention`` with a patched singleton.

    The patch is only applied when the running torch version string starts with
    ``"2.6"``; on any other version this function returns without side effects.

    Args:
        **flex_attn_compile_kwargs: keyword arguments forwarded to ``torch.compile``
            in the fallback branch (i.e. when neither the ``<= 2.5.1`` nor the
            ``2.6.0`` + training special cases apply).
    """
    # TODO remove this patch when transformers#37285 is merged and in a release
    is_torch_2_6 = torch.__version__.startswith("2.6")

    if not is_torch_2_6:
        return

    from torch.nn.attention.flex_attention import flex_attention

    class WrappedFlexAttention:
        """
        We are doing a singleton class so that flex attention is compiled once when it's first called.
        """

        _instance = None
        _is_flex_compiled = False
        _compiled_flex_attention = None
        # Training flag the current compiled function was built for. Kept as a class-level
        # default so the comparison in ``__init__`` is always well-defined.
        training = None

        def __new__(cls, *args, **kwargs):
            if cls._instance is None:
                # Create a new instance if one doesn't already exist
                cls._instance = super().__new__(cls)
            return cls._instance

        @classmethod
        def del_singleton(cls):
            cls._instance = None

        @torch.compiler.disable(recursive=False)
        def __init__(self, training):
            """
            Initialize or update the singleton instance.
            """
            # ``__init__`` runs on every ``WrappedFlexAttention(...)`` call even though
            # ``__new__`` returns the cached instance. The remembered ``training`` flag
            # must therefore NOT be reset here, otherwise the check below is always true
            # and flex attention is re-compiled on every call.
            if not self._is_flex_compiled or training != self.training:
                self.training = training
                if is_torch_less_or_equal("2.5.1"):
                    self._compiled_flex_attention = torch.compile(
                        flex_attention, dynamic=False
                    )
                # In PyTorch 2.6.0, there's a known issue with flex attention compilation which may
                # cause errors. The suggested fix is to compile with "max-autotune-no-cudagraphs"
                # see https://github.com/pytorch/pytorch/issues/146260 for training
                elif version.parse(_torch_version).base_version == "2.6.0" and training:
                    self._compiled_flex_attention = torch.compile(
                        flex_attention, dynamic=False, mode="max-autotune-no-cudagraphs"
                    )
                # Fallback, usually the most recent torch 2.7.x+ versions
                else:
                    LOG.info(
                        "Compiling flex attention with kwargs: %s. This may take a while...",
                        flex_attn_compile_kwargs,
                    )
                    self._compiled_flex_attention = torch.compile(
                        flex_attention,
                        **flex_attn_compile_kwargs,
                    )
                    LOG.info("Flex attention compiled successfully.")

                self._is_flex_compiled = True

        def __call__(self):
            return self._compiled_flex_attention

    # Install the replacement both on the attribute path and via sys.modules.
    transformers.integrations.flex_attention.WrappedFlexAttention = WrappedFlexAttention
    sys.modules[
        "transformers.integrations.flex_attention"
    ].WrappedFlexAttention = WrappedFlexAttention
def sage_attention_forward(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None = None,
    dropout: float = 0.0,
    scaling: float | None = None,
    is_causal: bool | None = None,
    **kwargs,
) -> tuple[torch.Tensor, None]:
    """
    Forward pass for SageAttention compatible with transformers attention interfaces.

    https://github.com/thu-ml/SageAttention/

    Three execution paths are selected, in this order:

    1. ``position_ids`` is given and either ``max_length_q`` is given or the position
       ids are not monotonically non-decreasing (with ``query_length != 1``):
       variable-length attention via ``sageattn_varlen``.
    2. ``attention_mask`` is given: inputs are unpadded with ``_upad_input``, run
       through ``sageattn_varlen`` and re-padded with flash-attn's ``pad_input``.
    3. Otherwise: dense attention via ``sageattn`` using the "HND" layout.

    Args:
        module: Attention module; ``num_key_value_groups`` is read from it when present.
        query, key, value: Attention inputs; indexed as (batch, heads, seq_len, head_dim)
            per the layout comments below.
        attention_mask: Optional mask; must be 2D when path 2 is taken.
        dropout: Must be ``0.0``.
        scaling: Passed to SageAttention as ``sm_scale``.
        is_causal: Must be ``True`` or ``None``; attention is always run causally.
        **kwargs: Reads ``output_attentions``, ``head_mask``, ``position_ids``,
            ``cu_seqlens_q``, ``cu_seqlens_k``, ``max_length_q`` and ``max_length_k``.

    Returns:
        ``(attn_output, None)`` -- attention weights are never returned.

    Raises:
        ImportError: if SageAttention is not installed.
        NotImplementedError: for ``output_attentions=True``, a ``head_mask``, or
            non-zero ``dropout``.
        AssertionError: if ``is_causal`` is ``False`` or the mask in path 2 is not 2D.
    """
    _check_sageattn_imported()

    if kwargs.get("output_attentions", False) or kwargs.get("head_mask") is not None:
        raise NotImplementedError(
            "SageAttention does not support `output_attentions=True` or `head_mask`."
        )

    # The base sageattn API does not support dropout.
    if dropout > 0.0:
        raise NotImplementedError("SageAttention does not support dropout.")

    # Handle Grouped-Query Attention (GQA) and Multi-Query Attention (MQA)
    if hasattr(module, "num_key_value_groups"):
        key = repeat_kv(key, module.num_key_value_groups)
        value = repeat_kv(value, module.num_key_value_groups)

    # Calculate is_causal following transformers
    # NOTE: this check is an ``assert`` and is therefore skipped under ``python -O``.
    assert is_causal is not False, "is_causal must be True or None"
    is_causal = True

    position_ids = kwargs.get("position_ids", None)
    query_length = query.shape[2]

    # NOTE(review): these keys are spelled ``cu_seqlens_*``; confirm callers do not pass
    # ``cu_seq_lens_*`` instead, in which case the values are recomputed from position_ids.
    cu_seqlens_q = kwargs.get("cu_seqlens_q", None)
    cu_seqlens_k = kwargs.get("cu_seqlens_k", None)
    max_length_q = kwargs.get("max_length_q", None)
    max_length_k = kwargs.get("max_length_k", None)

    # Sample packing uses position_ids, so we check for it first
    if position_ids is not None and (
        max_length_q is not None
        or (query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all())
    ):
        # transpose inputs to NHD layout for use with FA2 utils
        query = query.transpose(1, 2)
        key = key.transpose(1, 2)
        value = value.transpose(1, 2)

        # Remembered so the flattened varlen output can be reshaped back below.
        batch_size = query.size(0)

        from transformers.modeling_flash_attention_utils import (
            prepare_fa2_from_position_ids,
        )

        if cu_seqlens_q is None or cu_seqlens_k is None:
            # Derive the cumulative sequence lengths from position_ids.
            query, key, value, indices_q, cu_seq_lens, max_seq_lens = (
                prepare_fa2_from_position_ids(query, key, value, position_ids)
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_length_q, max_length_k = max_seq_lens

        else:
            # Sequence boundaries were supplied: only flatten batch and sequence dims.
            query = query.reshape(-1, query.size(-2), query.size(-1))
            key = key.reshape(-1, key.size(-2), key.size(-1))
            value = value.reshape(-1, value.size(-2), value.size(-1))

        attn_output_unpad = sageattn_varlen(
            q=query,
            k=key,
            v=value,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_length_q,
            max_seqlen_k=max_length_k,
            is_causal=is_causal,
            sm_scale=scaling,
            smooth_k=False,  # reduces loss 0 / nan grad norms
            tensor_layout="NHD",
        )

        # Restore the leading batch dimension.
        attn_output = attn_output_unpad.view(
            batch_size, -1, attn_output_unpad.size(-2), attn_output_unpad.size(-1)
        )

    elif attention_mask is not None:
        # NOTE: When used without `pad_to_sequence_len`, the loss becomes unstable after a few steps.

        assert attention_mask.ndim == 2, "Attention mask must be 2D"

        from transformers.modeling_flash_attention_utils import (
            _upad_input,
        )

        # transpose inputs to NHD layout for use with FA2 utils
        query = query.transpose(1, 2)
        key = key.transpose(1, 2)
        value = value.transpose(1, 2)

        batch_size = query.shape[0]

        # Strip padded positions according to the attention mask.
        query, key, value, indices_q, cu_seq_lens, max_seq_lens = _upad_input(
            query, key, value, attention_mask, query_length
        )
        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
        max_seqlen_q, max_seqlen_k = max_seq_lens

        # NOTE(review): unlike the packed path above, ``smooth_k`` is left at the
        # library default here -- confirm this difference is intentional.
        attn_output_unpad = sageattn_varlen(
            q=query,
            k=key,
            v=value,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_seqlen_q,
            max_seqlen_k=max_seqlen_k,
            is_causal=is_causal,
            sm_scale=scaling,
            tensor_layout="NHD",
        )

        # Re-padding relies on flash-attn being installed in addition to SageAttention.
        from flash_attn.bert_padding import pad_input

        attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)

    else:
        # Use standard sageattn
        # The input layout for transformers models is (batch_size, num_heads, seq_len, head_dim),
        # which corresponds to SageAttention's "HND" layout.
        attn_output = sageattn(
            q=query,
            k=key,
            v=value,
            tensor_layout="HND",
            is_causal=is_causal,
            sm_scale=scaling,
        )

        # SageAttention with "HND" returns (batch, heads, seq_len, head_dim)
        # Transformers expects (batch, seq_len, heads, head_dim) for the output
        # So we need to transpose dimensions 1 and 2
        attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, None
cu_seq_lens_k: Optional[torch.LongTensor] = None, max_length_q: Optional[int] = None, max_length_k: Optional[int] = None, **kwargs, ): # Get dimensions # query: [batch, heads, seq_len, hidden_dim] batch_size = query.size(0) query_length = query.shape[2] key_length = key.shape[2] # Default causal mask attn_bias = xformers.ops.LowerTriangularMask() # Check if we have sliding window attention has_sliding_window = sliding_window is not None and sliding_window < query_length # Transpose dimensions for xformers (Q: [b, h, s, d] -> [b, s, h, d]) query = query.transpose(1, 2) key = key.transpose(1, 2) value = value.transpose(1, 2) # Get GQA parameters num_attention_heads = module.config.num_attention_heads num_key_value_heads = module.config.num_key_value_heads head_dim = query.size(-1) is_gqa = num_attention_heads != num_key_value_heads n_groups = num_attention_heads // num_key_value_heads if is_gqa else 1 # If position_ids is provided and check all examples do not contain only 1 sequence, If tensor in increasing # then we probably have one sequence, otherwise it is packed. Additionally check we are in pre-fill/training stage. 
# Use `flash_attn_varlen_func` to prevent cross-example attention and also allow padding free approach if position_ids is not None and ( max_length_q is not None or (query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all()) ): if cu_seq_lens_q is None or cu_seq_lens_k is None: cu_seq_lens_q = get_cu_seqlens_from_pos_ids(position_ids)[0] cu_seq_lens_q = cu_seq_lens_q.squeeze() seq_lengths = cu_seq_lens_q[1:] - cu_seq_lens_q[:-1] attn_bias = ( xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask.from_seqlens( q_seqlen=seq_lengths.tolist(), ) ) else: query = query.reshape(-1, query.size(-2), query.size(-1)) key = key.reshape(-1, key.size(-2), key.size(-1)) value = value.reshape(-1, value.size(-2), value.size(-1)) # Handle GQA if is_gqa: key = key.repeat_interleave(n_groups, dim=2) value = value.repeat_interleave(n_groups, dim=2) elif attention_mask is not None: query, key, value, _, cu_seq_lens, _ = _upad_input( query, key, value, attention_mask, query_length ) cu_seq_lens_q, cu_seq_lens_k = cu_seq_lens seq_lengths = [] for i in range(len(cu_seq_lens_q) - 1): seq_lengths.append(cu_seq_lens_q[i + 1] - cu_seq_lens_q[i]) attn_bias = xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask.from_seqlens( q_seqlen=seq_lengths, kv_seqlen=seq_lengths, ) # Handle GQA if is_gqa: key = key.repeat_interleave(n_groups, dim=2) value = value.repeat_interleave(n_groups, dim=2) else: # Handle Group Query Attention (GQA) using view/expand approach from reference key = key.view(batch_size, key_length, num_key_value_heads, 1, head_dim) value = value.view(batch_size, key_length, num_key_value_heads, 1, head_dim) key = key.expand( batch_size, key_length, num_key_value_heads, n_groups, head_dim ) value = value.expand( batch_size, key_length, num_key_value_heads, n_groups, head_dim ) if module.training: key = key.reshape(batch_size, key_length, num_attention_heads, head_dim) value = value.reshape(batch_size, key_length, num_attention_heads, head_dim) if has_sliding_window: query = 
def replace_btlm_attn_with_flash_attn(model_name="cerebras/btlm-3b-8k-base"):
    """Point ``BTLMAttention._attn`` of the remote BTLM code at ``flashattn_attn``.

    The BTLM modeling module is only importable after transformers has loaded the
    remote code, so the config and a weightless model are loaded first.

    Args:
        model_name: Hub id or path of the BTLM checkpoint providing the remote code.
    """
    # The config class tells us which dynamically created package holds the remote code.
    btlm_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)

    # Loading the model (without allocating weights) makes modeling_btlm available.
    with init_empty_weights():
        AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

    config_module_name = btlm_config.__class__.__module__
    modeling_module_name = config_module_name.replace(
        ".configuration_btlm", ".modeling_btlm"
    )
    btlm_modeling = importlib.import_module(modeling_module_name)
    btlm_modeling.BTLMAttention._attn = flashattn_attn
class _MapDatasetFetcher(_BaseDatasetFetcher):
    """
    Custom dataset fetcher that handles nested batch structures from
    MultipackBatchSampler.

    A "nested" index is a sequence whose first element is a ``list``: each inner list
    is fetched as its own batch, and the list of batches is passed to ``collate_fn``.
    Any other index (flat list of indices, scalar index, empty batch) is handled like
    a regular map-style fetch.
    """

    def _fetch_single(self, index):
        """Fetch the samples for one flat index (or one inner bin of a nested index)."""
        if not self.auto_collation:
            return self.dataset[index]
        # Prefer the dataset's batched accessor when it exists and is truthy.
        batched_getter = getattr(self.dataset, "__getitems__", None)
        if batched_getter:
            return batched_getter(index)
        return [self.dataset[idx] for idx in index]

    def fetch(self, possibly_batched_index):
        """Fetch and collate the data addressed by ``possibly_batched_index``.

        Args:
            possibly_batched_index: a nested list of index lists, a flat list of
                indices, or a single index when automatic batching is disabled.

        Returns:
            Whatever ``self.collate_fn`` returns for the fetched data.
        """
        # Peek at the first element defensively: scalar indices (automatic batching
        # disabled) are not subscriptable and empty batches have no first element.
        # Both must fall through to standard handling rather than raise here.
        try:
            first = possibly_batched_index[0]
        except (TypeError, IndexError, KeyError):
            first = None

        if isinstance(first, list):
            # Handle nested structure from MultipackBatchSampler
            data = [self._fetch_single(bin_index) for bin_index in possibly_batched_index]
        else:
            # Standard batch handling
            data = self._fetch_single(possibly_batched_index)

        return self.collate_fn(data)
def patch_checkpoint_wrapper_setattr():
    """
    Patch CheckpointWrapper to properly forward DeepSpeed attributes to wrapped modules.

    This fixes the issue where CheckpointWrapper doesn't forward ds_* attributes
    (like ds_grads_remaining) to the actual wrapped module, causing DeepSpeed ZeRO-3
    to fail when gradient checkpointing is enabled.

    This issue occurs specifically with:
    - QLoRA + DeepSpeed ZeRO-3
    - gradient_checkpointing: true
    - activation_offloading: true

    The patch is idempotent (guarded by ``CheckpointWrapper._axolotl_setattr_patched``)
    and best-effort: failures are logged instead of raised.

    References:
    - https://github.com/deepspeedai/DeepSpeed/issues/7203
    - https://github.com/deepspeedai/DeepSpeed/blob/38d1a9eb64c9e01e32eccc50b25ba18925287441/deepspeed/runtime/zero/parameter_offload.py#L424-L458
    - https://github.com/axolotl-ai-cloud/axolotl/pull/3102
    """
    try:
        from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
            CheckpointWrapper,
        )

        # Check if already patched
        if hasattr(CheckpointWrapper, "_axolotl_setattr_patched"):
            LOG.debug("CheckpointWrapper already patched")
            return

        original_setattr = CheckpointWrapper.__setattr__

        def new_setattr(self, name: str, value) -> None:
            # Only ds_* attributes are redirected, and only once the wrapped module
            # exists; everything else takes the original path.
            if name.startswith("ds_") and hasattr(self, "_checkpoint_wrapped_module"):
                setattr(self._checkpoint_wrapped_module, name, value)
                # Lazy %-style arguments: this runs on every forwarded attribute
                # write, so the message should only be built when DEBUG is enabled.
                LOG.debug(
                    "Forwarded %s to wrapped module %s",
                    name,
                    type(self._checkpoint_wrapped_module).__name__,
                )
            else:
                original_setattr(self, name, value)

        CheckpointWrapper.__setattr__ = new_setattr
        CheckpointWrapper._axolotl_setattr_patched = True

        LOG.info("CheckpointWrapper patched to forward DeepSpeed attributes")

    except ImportError as e:
        LOG.debug("CheckpointWrapper not available: %s", e)
    except Exception as e:
        # Deliberately broad: a failed patch must not abort startup.
        LOG.warning("Failed to patch CheckpointWrapper: %s", e)
This patch modifies the _init_sharded_param and init_unsharded_param methods in FSDPParam to handle bitsandbytes Params4bit and Int8Params parameters, preserving their quantization metadata through the FSDP2 shard/unshard cycle. """ import importlib import inspect from axolotl.monkeypatch.utils import detab_code from axolotl.utils.logging import get_logger LOG = get_logger(__name__) def apply_init_sharded_param_patch(): """Apply patch to FSDPParam._init_sharded_param to support Params4bit.""" if getattr(apply_init_sharded_param_patch, "_axolotl_patched", False): return from torch.distributed.fsdp._fully_shard._fsdp_param import FSDPParam # Get original source original_source = inspect.getsource(FSDPParam._init_sharded_param) original_source, _ = detab_code(original_source) # Define the replacement original_param_creation = """ self.sharded_param = nn.Parameter(self.to_sharded_dtensor(sharded_param)) self.sharded_param.requires_grad_(param.requires_grad)""" patched_param_creation = """ import bitsandbytes as bnb if isinstance(param, bnb.nn.modules.Params4bit): self.sharded_param = bnb.nn.modules.Params4bit( data=sharded_param, requires_grad=param.requires_grad, quant_state=param.quant_state, blocksize=param.blocksize, compress_statistics=param.compress_statistics, quant_type=param.quant_type, quant_storage=param.quant_storage, module=param.module, bnb_quantized=param.bnb_quantized, ) self.sharded_param = self.to_sharded_dtensor(self.sharded_param) elif isinstance(param, bnb.nn.modules.Int8Params): self.sharded_param = bnb.nn.modules.Int8Params( data=sharded_param, requires_grad=param.requires_grad, has_fp16_weights=param.has_fp16_weights, CB=None, SCB=param.SCB, ) self.sharded_param = self.to_sharded_dtensor(self.sharded_param) else: self.sharded_param = nn.Parameter( self.to_sharded_dtensor(sharded_param), requires_grad=param.requires_grad, )""" # Apply the replacement if original_param_creation in original_source: patched_source = original_source.replace( 
original_param_creation, patched_param_creation ) patched_source = patched_source.replace( "def _init_sharded_param(", "def patched_init_sharded_param(", 1, ) # Load necessary imports module_name = FSDPParam.__module__ module = importlib.import_module(module_name) items_to_import = [] for item in dir(module): if item in patched_source: items_to_import.append(item) exec( # nosec B102 f"from {module_name} import ({', '.join(items_to_import)})", globals(), ) exec(patched_source, globals()) # nosec B102 # Replace the method FSDPParam._init_sharded_param = patched_init_sharded_param apply_init_sharded_param_patch._axolotl_patched = True LOG.info("Successfully applied FSDP _init_sharded_param patch") else: LOG.warning("Could not find target code for _init_sharded_param patching") def apply_init_unsharded_param_patch(): """Apply patch to FSDPParam.init_unsharded_param to support Params4bit.""" if getattr(apply_init_unsharded_param_patch, "_axolotl_patched", False): return from torch.distributed.fsdp._fully_shard._fsdp_param import FSDPParam # Get original source original_source = inspect.getsource(FSDPParam.init_unsharded_param) original_source, _ = detab_code(original_source) # Define the replacement original_param_creation = """ self._unsharded_param = nn.Parameter( unsharded_param, requires_grad=self.sharded_param.requires_grad )""" patched_param_creation = """ import bitsandbytes as bnb local_tensor = self.sharded_param._local_tensor if isinstance(local_tensor, bnb.nn.modules.Params4bit): self._unsharded_param = bnb.nn.modules.Params4bit( data=unsharded_param, requires_grad=self.sharded_param.requires_grad, quant_state=local_tensor.quant_state, blocksize=local_tensor.blocksize, compress_statistics=local_tensor.compress_statistics, quant_type=local_tensor.quant_type, quant_storage=local_tensor.quant_storage, module=local_tensor.module, bnb_quantized=local_tensor.bnb_quantized, ) elif isinstance(local_tensor, bnb.nn.modules.Int8Params): self._unsharded_param = 
def apply_linear8bitlt_save_patch():
    """Make ``Linear8bitLt._save_to_state_dict`` work with DTensor-wrapped weights.

    After FSDP2 sharding, ``Linear8bitLt.weight`` is a DTensor wrapping Int8Params.
    BnB's ``_save_to_state_dict`` reads ``self.weight.SCB`` directly, but DTensor does
    not proxy custom attribute access to its ``_local_tensor``. The patched method
    swaps in the local tensor for the duration of the original save call and then
    puts the DTensor back. Applying the patch more than once is a no-op.
    """
    already_patched = getattr(apply_linear8bitlt_save_patch, "_axolotl_patched", False)
    if already_patched:
        return

    import bitsandbytes as bnb
    from torch.distributed.tensor import DTensor

    linear_cls = bnb.nn.Linear8bitLt
    original_save = linear_cls._save_to_state_dict

    def _patched_save_to_state_dict(self, destination, prefix, keep_vars):
        # Go through the _parameters dict: assigning via attribute access would hit
        # nn.Module.__setattr__'s type check.
        dtensor_weight = self._parameters["weight"]
        is_wrapped = isinstance(dtensor_weight, DTensor) and hasattr(
            dtensor_weight, "_local_tensor"
        )

        if not is_wrapped:
            original_save(self, destination, prefix, keep_vars)
            return

        self._parameters["weight"] = dtensor_weight._local_tensor
        try:
            original_save(self, destination, prefix, keep_vars)
        finally:
            # Always restore the DTensor, even if saving raised.
            self._parameters["weight"] = dtensor_weight

    linear_cls._save_to_state_dict = _patched_save_to_state_dict
    apply_linear8bitlt_save_patch._axolotl_patched = True
    LOG.info("Patched Linear8bitLt._save_to_state_dict for DTensor compatibility")
def hf_grad_checkpoint_offload_wrapper(decoder_layer, *args, use_reentrant=None):
    """Run ``decoder_layer`` through the CPU-offloading gradient checkpointer.

    Args:
        decoder_layer: The callable to checkpoint. It is passed through as-is
            when ``uses_gc_layers`` reports True; otherwise the object it is
            bound to is used (``.func.__self__`` for a ``functools.partial``,
            ``.__self__`` for anything else).
        *args: Positional arguments forwarded to the checkpointer.
        use_reentrant: Accepted for signature compatibility; not used here.

    Returns:
        Whatever ``CPU_Offloaded_Gradient_Checkpointer.apply`` returns.
    """
    if uses_gc_layers(decoder_layer):
        target = decoder_layer
    elif isinstance(decoder_layer, partial):
        target = decoder_layer.func.__self__
    else:
        target = decoder_layer.__self__

    return CPU_Offloaded_Gradient_Checkpointer.apply(target, *args)
class CPU_Offloaded_Gradient_Checkpointer(torch.autograd.Function):
    """
    Gradient checkpointing that keeps the layer input on the CPU.

    The forward pass runs without building a graph and stores only a CPU copy
    of ``hidden_states``; the backward pass moves that copy back to CUDA,
    re-runs the forward function with gradients enabled and backpropagates
    through the recomputed output. Both transfers are issued with
    ``non_blocking=True``.
    """

    @staticmethod
    @torch_cuda_amp_custom_fwd
    def forward(ctx, forward_function, hidden_states, *args):
        # Issue the device-to-host copy before running the layer.
        offloaded_input = hidden_states.to("cpu", non_blocking=True)

        with torch.no_grad():
            result = forward_function(hidden_states, *args)

        # Only the CPU copy is kept as a saved tensor; the callable and its
        # extra arguments ride along on ctx for the recomputation.
        ctx.save_for_backward(offloaded_input)
        ctx.forward_function = forward_function
        ctx.args = args
        return result

    @staticmethod
    @torch_cuda_amp_custom_bwd
    def backward(ctx, dY):
        (offloaded_input,) = ctx.saved_tensors

        # Bring the input back to the GPU as a fresh leaf that tracks grads.
        recompute_input = offloaded_input.to("cuda", non_blocking=True).detach()
        recompute_input.requires_grad = True

        with torch.enable_grad():
            output = ctx.forward_function(recompute_input, *ctx.args)
            # The forward function may hand back either a bare tensor or a
            # one-element tuple/list; the latter is unwrapped here.
            if isinstance(output, (tuple, list)):
                (output,) = output

        torch.autograd.backward(output, dY)

        # One gradient slot per forward() input: the callable gets None, the
        # hidden states get the recomputed grad, the extra args get None.
        grads_for_extra_args = (None,) * len(ctx.args)
        return (None, recompute_input.grad) + grads_for_extra_args
class DiskOffloadManager:
    """
    Manages offloaded tensors and handles prefetching in background threads.
    Includes synchronization to prevent race conditions.

    Tensors are written to a private temporary directory by a thread pool and
    can be loaded back either directly from disk or from an in-memory prefetch
    cache filled by worker threads. Every file has an entry in ``file_status``
    whose value is one of ``"saving"``, ``"ready"``, ``"prefetching"``,
    ``"loaded"``, ``"deleted"``, ``"error"`` or ``"missing"``. All shared
    dictionaries are guarded by ``manager_lock`` (a re-entrant lock).
    """

    def __init__(
        self,
        prefetch_size: int = 3,
        prefetch_to_gpu: bool = True,
        save_workers: int = 4,
    ):
        """
        Args:
            prefetch_size: Maximum number of tensors to prefetch in the background.
            prefetch_to_gpu: Whether to prefetch tensors directly to GPU memory.
            save_workers: Maximum number of concurrent save operations.
        """
        self.temp_dir = tempfile.mkdtemp(prefix="disco_")

        # Track tensor paths and their status
        self.tensor_paths: deque = deque()  # Ordered history of tensor paths (LIFO)
        self.file_locks: Dict[str, threading.Lock] = {}  # file_path -> Lock
        # file_path -> status string (see class docstring for the values)
        self.file_status: Dict[str, str] = {}

        self.max_prefetch = prefetch_size
        self.prefetch_to_gpu = prefetch_to_gpu

        # Thread synchronization
        self.manager_lock = threading.RLock()  # Used for thread-safe operations

        # Prefetch queue and cache
        self.prefetch_queue: queue.Queue = queue.Queue()
        self.prefetch_cache: Dict[str, torch.Tensor] = {}  # file_path -> tensor

        # Save queue and thread pool
        self.save_queue: queue.Queue = queue.Queue()
        self.save_pool = concurrent.futures.ThreadPoolExecutor(max_workers=save_workers)
        self.save_futures: Dict[str, Future] = {}
        # Bounds the number of saves that are queued or in flight.
        self.save_semaphore = threading.Semaphore(save_workers * 2)

        # Start prefetch worker threads
        self.stop_event = threading.Event()
        self.prefetch_worker_count = 2
        self.prefetch_workers = []
        for _ in range(self.prefetch_worker_count):
            worker = threading.Thread(target=self._prefetch_worker, daemon=True)
            worker.start()
            self.prefetch_workers.append(worker)

        # Start save worker thread
        self.save_worker = threading.Thread(target=self._save_worker, daemon=True)
        self.save_worker.start()
        self.idx = 0

        atexit.register(self.cleanup)

    def _save_worker(self):
        """Background thread that hands queued save requests to the thread pool."""
        while not self.stop_event.is_set():
            try:
                save_item = self.save_queue.get(timeout=0.5)
            except queue.Empty:
                time.sleep(0.01)  # Small sleep to prevent CPU spinning
                continue

            if save_item is None:
                continue

            tensor, file_path = save_item

            # Submit the save task to the thread pool
            future = self.save_pool.submit(self._save_tensor_to_disk, tensor, file_path)
            with self.manager_lock:
                self.save_futures[file_path] = future

            self.save_queue.task_done()

    def _save_tensor_to_disk(self, tensor: torch.Tensor, file_path: str):
        """Write ``tensor`` to ``file_path`` and record the outcome.

        Returns:
            True when the file was written (status becomes ``"ready"``), False
            when saving failed (status becomes ``"error"``).
        """
        try:
            cpu_tensor = tensor.detach().cpu()
            torch.save(cpu_tensor, file_path)
            del cpu_tensor
        except Exception as e:
            # Thread-pool boundary: torch.save can fail with more than
            # FileNotFoundError (e.g. RuntimeError for a missing parent
            # directory), and a failure must be recorded instead of leaving
            # the status stuck at "saving".
            logger.error(f"Error saving tensor to {file_path}: {e}")
            with self.manager_lock:
                self.file_status[file_path] = "error"
            return False
        else:
            with self.manager_lock:
                # Mark file as ready
                self.file_status[file_path] = "ready"
            return True
        finally:
            # The permit taken in save_tensor() is returned on every path;
            # otherwise failed saves would eventually block save_tensor().
            self.save_semaphore.release()

    def _prefetch_worker(self):
        """Background thread that loads tensors from disk ahead of time."""
        while not self.stop_event.is_set():
            try:
                file_path = self.prefetch_queue.get(timeout=0.5)
            except queue.Empty:
                time.sleep(0.01)  # Small sleep to prevent CPU spinning
                continue

            if file_path is None:
                continue

            # Decide what to do under the lock; act on it after releasing it.
            with self.manager_lock:
                status = self.file_status.get(file_path)
                skip = (
                    status is None
                    or status == "deleted"
                    or file_path in self.prefetch_cache
                )
                still_saving = (
                    not skip
                    and status == "saving"
                    and file_path in self.save_futures
                )
                if not skip and not still_saving:
                    # Mark file as being prefetched
                    self.file_status[file_path] = "prefetching"

            if skip:
                # Unknown, deleted or already cached: nothing to load.
                self.prefetch_queue.task_done()
                continue

            if still_saving:
                # Re-queue this request after a short delay. The sleep happens
                # outside the lock so the save thread can mark the file ready.
                self.prefetch_queue.task_done()
                time.sleep(0.1)
                self.prefetch_queue.put(file_path)
                continue

            # Load tensor from disk and store in cache
            try:
                if os.path.exists(file_path):
                    if self.prefetch_to_gpu:
                        tensor = torch.load(
                            file_path,
                            map_location=torch.device("cuda"),
                            weights_only=True,
                        )
                    else:
                        tensor = torch.load(file_path, weights_only=True)

                    with self.manager_lock:
                        self.prefetch_cache[file_path] = tensor
                        self.file_status[file_path] = "ready"
                else:
                    with self.manager_lock:
                        if self.file_status.get(file_path) != "deleted":
                            logger.warning(
                                f"Prefetch error: File not found {file_path}"
                            )
                            self.file_status[file_path] = "missing"
            except Exception as e:
                # A failed prefetch is only a lost optimisation: record it and
                # keep the worker alive; load_tensor() still reads from disk.
                with self.manager_lock:
                    if self.file_status.get(file_path) != "deleted":
                        logger.warning(f"Prefetch error for {file_path}: {e}")
                        self.file_status[file_path] = "error"
            finally:
                self.prefetch_queue.task_done()

    def save_tensor(self, tensor: torch.Tensor):
        """Queue ``tensor`` for an asynchronous save and return its file path.

        Blocks while the number of outstanding saves is at the semaphore limit.
        """
        # Generate unique file path
        self.idx += 1
        file_path: str = os.path.join(
            self.temp_dir, f"{self.idx:06d}-{uuid.uuid4()}.pt"
        )

        with self.manager_lock:
            # Mark file as being saved
            self.file_locks[file_path] = threading.Lock()
            self.file_status[file_path] = "saving"
            # Add to history
            self.tensor_paths.append(file_path)

        # Acquire semaphore to limit concurrent save operations
        self.save_semaphore.acquire()

        # Queue tensor for saving in background
        self.save_queue.put((tensor.detach(), file_path))

        return file_path

    def wait_for_save(self, file_path, timeout=None) -> None:
        """Block until the save of ``file_path`` has finished.

        Returns as soon as the status is ``"ready"``, ``"error"``,
        ``"missing"`` or ``"deleted"``, or the recorded save future is done.

        Args:
            file_path: Path previously returned by ``save_tensor``.
            timeout: Maximum number of seconds to wait; ``None`` waits
                indefinitely. On timeout a warning is logged.
        """
        finished = ("ready", "error", "missing", "deleted")
        start_time = time.monotonic()
        while timeout is None or time.monotonic() - start_time < timeout:
            with self.manager_lock:
                if self.file_status.get(file_path) in finished:
                    return
                future = self.save_futures.get(file_path)
                if future is not None and future.done():
                    return

            # Small sleep to prevent CPU spinning
            time.sleep(0.01)

        # Timeout
        logger.warning(f"Timeout waiting for tensor to be saved: {file_path}")

    def load_tensor(self, file_path, target_device="cuda"):
        """Return the tensor stored at ``file_path``.

        The prefetch cache is consulted first; on a miss the file is read from
        disk. The status becomes ``"loaded"`` in both cases.

        Args:
            file_path: Path previously returned by ``save_tensor``.
            target_device: Device the tensor is moved to. A cached tensor is
                only moved when it lives on the CPU and the target is not
                ``"cpu"``.

        Raises:
            FileNotFoundError: If the file does not exist on disk.
            Exception: Any error raised while loading is logged and re-raised.
        """
        # Wait for tensor to be saved if it's still in progress
        self.wait_for_save(file_path)

        tensor = None

        # Try to get from cache first
        with self.manager_lock:
            if file_path in self.prefetch_cache:
                tensor = self.prefetch_cache.pop(file_path)
                self.file_status[file_path] = "loaded"

        if tensor is not None:
            # Ensure tensor is on correct device
            if target_device != "cpu" and tensor.device.type == "cpu":
                tensor = tensor.to(target_device, non_blocking=True)
            return tensor

        # If not in cache, load directly from disk
        try:
            if not os.path.exists(file_path):
                logger.error(f"File not found for loading: {file_path}")
                raise FileNotFoundError(f"File not found: {file_path}")

            tensor = torch.load(file_path, weights_only=True)

            with self.manager_lock:
                self.file_status[file_path] = "loaded"

            if target_device != "cpu":
                tensor = tensor.to(target_device, non_blocking=True)

            return tensor
        except Exception as e:
            logger.error(f"Error loading tensor from {file_path}: {e}")
            raise

    def _safe_delete_file(self, file_path):
        """Delete ``file_path`` from disk if its status allows it.

        Any recorded save future is cancelled (if still pending) and
        forgotten. The file is removed only when its status is ``"ready"``,
        ``"loaded"``, ``"error"`` or ``"missing"``.

        Returns:
            True when the file is gone and marked ``"deleted"``, else False.
        """
        with self.manager_lock:
            # Make sure any save operation is completed or cancelled
            future = self.save_futures.pop(file_path, None)
            if future is not None and not future.done():
                future.cancel()

            # Only delete if the file is not being saved or prefetched
            status = self.file_status.get(file_path)
            if status in ("ready", "loaded", "error", "missing"):
                try:
                    if os.path.exists(file_path):
                        os.remove(file_path)
                except OSError as e:
                    # Best effort: a failed delete must not break the caller.
                    logger.warning(f"Error deleting file {file_path}: {e}")
                    return False
                self.file_status[file_path] = "deleted"
                return True
        return False

    def trigger_prefetch(self, n=None):
        """Queue up to ``n`` of the most recently saved ``"ready"`` tensors.

        Args:
            n: Maximum number of paths to queue; defaults to the
                ``prefetch_size`` given at construction.
        """
        if n is None:
            n = self.max_prefetch

        prefetch_paths = []
        with self.manager_lock:
            # Newest first; skip files already cached or not ready yet.
            for path in reversed(self.tensor_paths):
                if len(prefetch_paths) >= n:
                    break
                if (
                    path not in self.prefetch_cache
                    and self.file_status.get(path) == "ready"
                ):
                    prefetch_paths.append(path)

        # Queue files for prefetching
        for path in prefetch_paths:
            self.prefetch_queue.put(path)

    def cleanup_tensor(self, file_path: str):
        """Forget ``file_path`` and delete its file after it has been used."""
        with self.manager_lock:
            if file_path in self.tensor_paths:
                self.tensor_paths.remove(file_path)

            # Remove from prefetch cache if present
            self.prefetch_cache.pop(file_path, None)

            # The per-file lock is no longer needed once the tensor is gone.
            self.file_locks.pop(file_path, None)

            # Remove from save futures if present
            future = self.save_futures.pop(file_path, None)
            if future is not None and not future.done():
                future.cancel()

            # Try to delete the file
            self._safe_delete_file(file_path)

    def cleanup(self):
        """Stop the worker threads and remove all temporary files."""
        self.stop_event.set()

        # Cancel all pending save operations
        with self.manager_lock:
            for future in self.save_futures.values():
                if not future.done():
                    future.cancel()
            self.save_futures.clear()

        # Drain the save queue
        while not self.save_queue.empty():
            try:
                self.save_queue.get_nowait()
                self.save_queue.task_done()
            except queue.Empty:
                break

        # Shutdown the save pool
        self.save_pool.shutdown(wait=False)

        # Join the save worker thread
        if self.save_worker.is_alive():
            self.save_worker.join(timeout=2.0)

        # Join the prefetch worker threads
        for thread in self.prefetch_workers:
            if thread.is_alive():
                thread.join(timeout=2.0)

        # Clear cache and collect the files that are still tracked
        with self.manager_lock:
            self.prefetch_cache.clear()
            paths_to_delete = list(self.tensor_paths)
            self.tensor_paths.clear()

        # Delete all temporary files
        for path in paths_to_delete:
            self._safe_delete_file(path)

        # Remove temp directory
        try:
            if os.path.exists(self.temp_dir):
                shutil.rmtree(self.temp_dir, ignore_errors=True)
        except OSError as e:
            logger.warning(f"Error removing temporary directory {self.temp_dir}: {e}")
@staticmethod
@torch_cuda_amp_custom_bwd
def backward(ctx, *grad_outputs):
    """Recompute the forward pass from the disk-offloaded input and backpropagate.

    The activation file recorded on ``ctx`` is loaded through the shared
    manager, the forward function is re-run with gradients enabled, and the
    file is removed afterwards (also when an error occurs, in which case the
    error is logged and re-raised).

    Returns:
        One gradient slot per ``forward`` input: ``None`` for the callable, the
        gradient of the reloaded hidden states, ``None`` for every extra
        positional argument and ``None`` for the three manager options.
    """
    manager = Disco._manager

    # Ask the workers to start loading upcoming tensors right away so the
    # reads can overlap with this layer's recomputation.
    manager.trigger_prefetch()

    file_path = ctx.file_path

    try:
        # Make sure the asynchronous save has finished before reading.
        manager.wait_for_save(file_path)
        hidden_states = manager.load_tensor(file_path)
        hidden_states.requires_grad = True

        with torch.enable_grad():
            output = ctx.forward_function(hidden_states, *ctx.args)

            # A tuple output receives all incoming grads when the counts
            # line up; every other case uses only the first incoming grad.
            grads_match_tuple = isinstance(output, tuple) and len(
                grad_outputs
            ) == len(output)
            incoming = grad_outputs if grads_match_tuple else grad_outputs[0]
            torch.autograd.backward(output, incoming)

        # The activation file is no longer needed.
        manager.cleanup_tensor(file_path)

        return (
            None,  # forward_function
            hidden_states.grad,  # hidden_states grad
            *(None for _ in ctx.args),  # one per extra positional arg
            None,  # prefetch_size
            None,  # prefetch_to_gpu
            None,  # save_workers
        )
    except Exception as e:
        logger.error(f"Error in backward pass: {e}")
        # Clean up the file even on error
        manager.cleanup_tensor(file_path)
        raise
def patch_fa_llama_cross_entropy():
    """Swap transformers' ``fixed_cross_entropy`` for flash-attn's Triton kernel.

    The replacement sums the per-element losses returned by
    ``flash_attn.ops.triton.cross_entropy.cross_entropy_loss`` and divides by
    ``num_items_in_batch`` when it is given, otherwise by the number of
    targets that differ from ``ignore_index``.
    """
    LOG.info(
        "patching transformers.loss.loss_utils.fixed_cross_entropy with flash_attn.ops.triton.cross_entropy"
    )
    from flash_attn.ops.triton.cross_entropy import (
        cross_entropy_loss as flash_attn_cross_entropy_loss,
    )

    def fa2_fixed_cross_entropy(
        source,
        target,
        num_items_in_batch: int = None,
        ignore_index: int = -100,
        **kwargs,
    ):
        # The second return value of the kernel is not used here.
        losses, _ = flash_attn_cross_entropy_loss(
            source, target, ignore_index=ignore_index
        )
        summed = losses.sum()

        if num_items_in_batch is not None:
            # Caller-supplied normaliser.
            return summed / num_items_in_batch

        # Otherwise average over the targets that are not ignored.
        return summed / (target != ignore_index).sum()

    transformers.loss.loss_utils.fixed_cross_entropy = fa2_fixed_cross_entropy
def flashattn_forward_with_s2attn(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    padding_mask: Optional[torch.LongTensor] = None,
    cu_seqlens: Optional[torch.Tensor] = None,
    max_seqlen: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """Replacement ``LlamaAttention.forward`` using grouped flash attention.

    Input shape: Batch x Time x Channel

    From: https://github.com/dvlab-research/LongLoRA/blob/main/llama_attn_replace.py

    The heads are split into two halves that are stacked along the batch
    dimension; attention is then computed with
    ``flash_attn_varlen_qkvpacked_func`` over segments whose boundaries are
    spaced ``group_size`` apart, with the boundaries of the second half offset
    by ``group_size // 2``.

    attention_mask: [bsz, q_len]

    `cu_seqlens` will be ignored if provided
    `max_seqlen` will be ignored if provided
    `padding_mask` is not read by this function either.

    Returns:
        ``(attn_output, None, past_key_value)``; the attention weights slot is
        always ``None`` and ``past_key_value`` is ``None`` unless ``use_cache``.

    Raises:
        ValueError: If ``q_len`` is not divisible by the group size.
    """
    if output_attentions:
        warnings.warn(
            "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead.",
            stacklevel=2,
        )

    bsz, q_len, _ = hidden_states.size()

    query_states = (
        self.q_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )
    key_states = (
        self.k_proj(hidden_states)
        .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
        .transpose(1, 2)
    )
    value_states = (
        self.v_proj(hidden_states)
        .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
        .transpose(1, 2)
    )
    # [bsz, q_len, nh, hd]
    # [bsz, nh, q_len, hd]

    cos, sin = self.rotary_emb(value_states, position_ids=position_ids)
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )

    # Past Key value support
    if past_key_value is not None:
        # reuse k, v, self_attention
        key_states = torch.cat([past_key_value[0], key_states], dim=2)
        value_states = torch.cat([past_key_value[1], value_states], dim=2)

    past_key_value = (key_states, value_states) if use_cache else None

    # repeat k/v heads if n_kv_heads < n_heads
    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    # Flash attention codes from
    # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py

    # transform the data into the format required by flash attention
    qkv = torch.stack(
        [query_states, key_states, value_states], dim=2
    )  # [bsz, nh, 3, q_len, hd]
    qkv = qkv.transpose(1, 3)  # [bsz, q_len, 3, nh, hd]

    # We have disabled _prepare_decoder_attention_mask in LlamaModel
    # the attention_mask should be the same as the key_padding_mask

    # The mask is duplicated along the batch axis to match the two head halves
    # that are stacked into the batch dimension below.
    # NOTE(review): attention_mask is dereferenced unconditionally although
    # its default is None; calling without a mask fails here.
    key_padding_mask = attention_mask.repeat(2, 1)
    nheads = qkv.shape[-2]
    # shift

    # NOTE(review): for q_len < 4 this evaluates to 0 (GROUP_SIZE_RATIO is
    # 1 / 4), so the modulo below raises ZeroDivisionError, not ValueError.
    group_size = int(q_len * GROUP_SIZE_RATIO)
    if q_len % group_size > 0:
        raise ValueError(
            f"q_len {q_len} should be divisible by group size {group_size}."
        )

    # Split the head axis into (2, num_heads // 2) and move the factor of two
    # next to the batch axis, giving 2 * bsz sequences with half the heads.
    qkv = (
        qkv.reshape(bsz, q_len, 3, 2, self.num_heads // 2, self.head_dim)
        .permute(0, 3, 1, 2, 4, 5)
        .reshape(bsz * 2, q_len, 3, self.num_heads // 2, self.head_dim)
    )
    x = rearrange(qkv, "b s three h d -> b s (three h d)")
    x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
    # Segment start offsets within one sequence, spaced group_size apart.
    cu_q_len_tmp = torch.arange(
        0, max_s, group_size, device=key_padding_mask.device, dtype=cu_q_lens.dtype
    )
    # Row 0 keeps the plain offsets, row 1 adds group_size // 2; the pair is
    # tiled bsz times and shifted by each sequence's start in the unpadded
    # buffer (cu_q_lens[:-1]).
    cu_q_len_tmp = torch.stack([cu_q_len_tmp, cu_q_len_tmp + group_size // 2]).repeat(
        bsz, 1
    ) + cu_q_lens[:-1].unsqueeze(-1)
    # Append each sequence's end offset and flatten into one boundary vector.
    cu_q_lens = torch.cat([cu_q_len_tmp, cu_q_lens[1:].unsqueeze(-1)], dim=-1).view(-1)

    x_unpad = rearrange(
        x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads // 2
    )
    # Positional arguments after cu_q_lens: group_size is passed as the
    # maximum sequence length and 0.0 as the dropout probability.
    output_unpad = flash_attn_varlen_qkvpacked_func(
        x_unpad, cu_q_lens, group_size, 0.0, softmax_scale=None, causal=True
    )
    # Re-pad to (2 * bsz, q_len, ...) and restore the per-head layout.
    output = rearrange(
        pad_input(
            rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz * 2, q_len
        ),
        "b s (h d) -> b s h d",
        h=nheads // 2,
    )
    # Undo the batch stacking: merge the two halves back into nheads heads.
    output = (
        output.reshape(bsz, 2, q_len, nheads // 2, self.head_dim)
        .transpose(1, 2)
        .reshape(bsz, q_len, nheads, self.head_dim)
    )

    return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, past_key_value
def xformers_forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    padding_mask: Optional[torch.LongTensor] = None,
    **kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """Replacement ``LlamaAttention.forward`` backed by xformers.

    Projects the hidden states to Q/K/V (optionally in ``pretraining_tp``
    weight slices), applies rotary embeddings, concatenates any cached
    keys/values, and computes attention with
    ``xformers.ops.memory_efficient_attention``.

    ``padding_mask`` and ``**kwargs`` are accepted but not read. The contents
    of ``attention_mask`` are not passed to xformers; the mask only selects
    between no bias and ``LowerTriangularMask`` (see the inline comment).

    Returns:
        ``(attn_output, None, past_key_value)``; the attention weights slot is
        always ``None`` and ``past_key_value`` is ``None`` unless ``use_cache``.

    Raises:
        ValueError: If the attention output does not have shape
            ``(bsz, q_len, num_heads, head_dim)``.
    """
    bsz, q_len, _ = hidden_states.size()

    # Attention modules without this attribute are treated as unsliced.
    if not hasattr(self, "pretraining_tp"):
        self.pretraining_tp = 1

    if self.pretraining_tp > 1:
        # Split each projection weight along its output dimension and apply
        # the slices one by one, concatenating the partial results.
        key_value_slicing = (
            self.num_key_value_heads * self.head_dim
        ) // self.pretraining_tp
        query_slices = self.q_proj.weight.split(
            (self.num_heads * self.head_dim) // self.pretraining_tp, dim=0
        )
        key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
        value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)

        query_states = [
            F.linear(hidden_states, query_slices[i])
            for i in range(self.pretraining_tp)
        ]
        query_states = torch.cat(query_states, dim=-1)

        key_states = [
            F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)
        ]
        key_states = torch.cat(key_states, dim=-1)

        value_states = [
            F.linear(hidden_states, value_slices[i])
            for i in range(self.pretraining_tp)
        ]
        value_states = torch.cat(value_states, dim=-1)

    else:
        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

    query_states = query_states.view(
        bsz, q_len, self.num_heads, self.head_dim
    ).transpose(1, 2)
    key_states = key_states.view(
        bsz, q_len, self.num_key_value_heads, self.head_dim
    ).transpose(1, 2)
    value_states = value_states.view(
        bsz, q_len, self.num_key_value_heads, self.head_dim
    ).transpose(1, 2)
    # [bsz, q_len, nh, hd]
    # [bsz, nh, q_len, hd]

    # NOTE(review): rotary_emb is called with value_states only, while
    # position_ids is passed to apply_rotary_pos_emb — confirm this matches
    # the installed transformers version.
    cos, sin = self.rotary_emb(value_states)
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
    # [bsz, nh, t, hd]

    if past_key_value is not None:
        # reuse k, v, self_attention
        key_states = torch.cat([past_key_value[0], key_states], dim=2)
        value_states = torch.cat([past_key_value[1], value_states], dim=2)

    past_key_value = (key_states, value_states) if use_cache else None

    # repeat k/v heads if n_kv_heads < n_heads
    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    if output_attentions:
        warnings.warn(
            "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead.",
            stacklevel=2,
        )

    #
    # xformers-attn start
    #

    # Swap the head and sequence axes back for xformers.
    query_states = query_states.transpose(1, 2)
    key_states = key_states.transpose(1, 2)
    value_states = value_states.transpose(1, 2)

    # This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros.
    # We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros.
    # NOTE(review): the probe reads index 1 of the last mask axis, so a mask
    # whose last axis has length 1 would raise IndexError here.
    if attention_mask is None or attention_mask[0, 0, 0, 1] == 0:
        # input and output should be of form (bsz, q_len, num_heads, head_dim)
        attn_output = xformers.ops.memory_efficient_attention(
            query_states, key_states, value_states, attn_bias=None
        )
    else:
        # input and output should be of form (bsz, q_len, num_heads, head_dim)
        attn_output = xformers.ops.memory_efficient_attention(
            query_states,
            key_states,
            value_states,
            # attn_bias=attention_mask,
            attn_bias=xformers.ops.LowerTriangularMask(),
        )

    if attn_output.size() != (bsz, q_len, self.num_heads, self.head_dim):
        raise ValueError(
            f"`attn_output` should be of size {(bsz, q_len, self.num_heads, self.head_dim)}, but is"
            f" {attn_output.size()}"
        )
    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

    #
    # xformers-attn end
    #

    if self.pretraining_tp > 1:
        # Sliced output projection: split the activations along the hidden
        # axis and the weight along its input axis, then sum the partials.
        attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2)
        o_proj_slices = self.o_proj.weight.split(
            self.hidden_size // self.pretraining_tp, dim=1
        )
        attn_output = sum(
            F.linear(attn_output[i], o_proj_slices[i])
            for i in range(self.pretraining_tp)
        )
    else:
        attn_output = self.o_proj(attn_output)

    return attn_output, None, past_key_value
self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) """.lstrip("\n"), """ query_states, key_states, value_states = self.apply_qkv(hidden_states) query_states = query_states.view(hidden_shape).transpose(1, 2) key_states = key_states.view(hidden_shape).transpose(1, 2) value_states = value_states.view(hidden_shape).transpose(1, 2) """.lstrip("\n"), ), ( """ query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2) key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) """.lstrip("\n"), """ query_states, key_states, value_states = self.apply_qkv(hidden_states) query_states = self.q_norm(query_states.view(hidden_shape)).transpose(1, 2) key_states = self.k_norm(key_states.view(hidden_shape)).transpose(1, 2) value_states = value_states.view(hidden_shape).transpose(1, 2) """.lstrip("\n"), ), ( """ query_states, gate = torch.chunk( self.q_proj(hidden_states).view(*input_shape, -1, self.head_dim * 2), 2, dim=-1 ) gate = gate.reshape(*input_shape, -1) query_states = self.q_norm(query_states.view(hidden_shape)).transpose(1, 2) key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) """.lstrip("\n"), """ query_states, key_states, value_states = self.apply_qkv(hidden_states) query_states, gate = torch.chunk( query_states.view(*input_shape, -1, self.head_dim * 2), 2, dim=-1 ) gate = gate.reshape(*input_shape, -1) query_states = self.q_norm(query_states.view(hidden_shape)).transpose(1, 2) key_states = self.k_norm(key_states.view(hidden_shape)).transpose(1, 2) value_states = value_states.view(hidden_shape).transpose(1, 2) """.lstrip("\n"), ), ] ORIGINAL_O_CODE = """ 
attn_output = self.o_proj(attn_output) """.lstrip("\n") PATCHED_O_CODE = """ attn_output = self.apply_o(attn_output) """.lstrip("\n") SUPPORTED_ACTIVATIONS = ["silu", "gelu"] APPLY_FN_MAPPING = { "silu": apply_lora_mlp_swiglu, "gelu": apply_lora_mlp_geglu, } def original_apply_qkv( self: nn.Module, hidden_states: torch.Tensor ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Original implementation of QKV projection without optimizations. Args: self: The attention module instance. hidden_states: Input tensor of shape [batch_size, seq_len, hidden_dim]. Returns: A tuple `(query_states, key_states, value_states)` containing the projected states for query, key, and value. """ query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) return query_states, key_states, value_states def original_apply_o(self: nn.Module, hidden_states: torch.Tensor) -> torch.Tensor: """ Original implementation of output projection without optimizations. Args: self: The attention module instance. hidden_states: Input tensor of shape `[`batch_size, seq_len, hidden_dim]`. Returns: The output projection result. """ attn_output = self.o_proj(hidden_states) return attn_output def get_attention_cls_from_config(cfg: DictDefault) -> Type[nn.Module]: """ Get the appropriate attention class by inspecting the model config. Uses dynamic import to support any model architecture that follows the standard transformers naming convention. Args: cfg: Dictionary mapping `axolotl` config keys to values. Returns: The appropriate attention class for the model. 
Raises: ValueError: If `base_model` not specified or attention class cannot be imported ImportError: If the model module or attention class doesn't exist """ if "base_model" not in cfg: raise ValueError("base_model must be specified in config") # Get model config without loading the model model_config = AutoConfig.from_pretrained(cfg["base_model"]) model_type = model_config.model_type # Special case for model_type = "qwen2" if model_type == "qwen2": from transformers.models.qwen2.modeling_qwen2 import Qwen2Attention return Qwen2Attention if model_type == "qwen3_vl": from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLTextAttention return Qwen3VLTextAttention if model_type == "mllama": from transformers.models.mllama.modeling_mllama import MllamaTextSelfAttention return MllamaTextSelfAttention if model_type == "llama4": from transformers.models.llama4.modeling_llama4 import Llama4TextAttention return Llama4TextAttention if model_type == "mistral3": from transformers.models.mistral.modeling_mistral import MistralAttention return MistralAttention if model_type == "gemma3_text": from transformers.models.gemma3.modeling_gemma3 import Gemma3Attention return Gemma3Attention try: # Dynamically import the module and attention class module_path = f"transformers.models.{model_type}.modeling_{model_type}" model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type) module = __import__(module_path, fromlist=[f"{model_cls_prefix}Attention"]) attention_cls = getattr(module, f"{model_cls_prefix}Attention") return attention_cls except (ImportError, AttributeError) as e: raise ValueError( f"Axolotl could not import attention class for model_type: {model_type}. " "Please raise an Issue and turn off lora kernels to continue training. " f"Error: {str(e)}" ) from e def patch_self_attn_lora(cfg: DictDefault): """ Given an `axolotl` config, this method patches the inferred attention class forward pass with optimized LoRA implementations. 
def find_self_attn_in_layer(
    layer: nn.Module,
) -> Generator[nn.Module, None, None]:
    """
    Yield the self-attention module of a decoder layer when it can be patched.

    A module is yielded only if `layer.self_attn` exists and exposes all four
    projections `q_proj`, `k_proj`, `v_proj` and `o_proj`; otherwise the generator
    is empty.

    Args:
        layer: A decoder layer of the model.

    Yields:
        The `self_attn` module itself (not a tuple).
    """
    # general case of most models
    self_attn = getattr(layer, "self_attn", None)
    if self_attn is None:
        return

    required_projections = ("q_proj", "k_proj", "v_proj", "o_proj")
    if all(hasattr(self_attn, proj) for proj in required_projections):
        yield self_attn
def _supports_lora_kernel(modules) -> bool:
    """Return True when every module carries a LoRA adapter and no DoRA magnitude vector."""
    return all(
        hasattr(module, "lora_A")
        and len(getattr(module, "lora_magnitude_vector", []) or []) == 0
        for module in modules
    )


def apply_lora_kernel_patches(
    model: PeftModelForCausalLM, cfg: DictDefault
) -> PeftModelForCausalLM:
    """
    Applies optimized Triton kernel patches to a PEFT model.

    Every attention module found in the model's layers first receives the unoptimized
    `apply_qkv` / `apply_o` fallbacks. These are then replaced by the LoRA kernels
    when the matching `cfg.lora_*_kernel` flag is set and the projections are plain
    LoRA layers. MLP blocks get their `forward` replaced by the fused kernel for the
    model's activation under the same conditions.

    Args:
        model: A PEFT model to be patched with optimized kernels.
        cfg: Dictionary mapping `axolotl` config keys to values.

    Returns:
        PeftModelForCausalLM: The patched model with optimized kernels. The model is
        returned unpatched when the adapter uses dropout or bias.

    Raises:
        TypeError: If the provided model is not a `PeftModelForCausalLM`.
        NotImplementedError: If the model type is not supported, or the activation
            function is missing from the config or not one of `SUPPORTED_ACTIVATIONS`.
        AssertionError: If multiple adapters are active (currently unsupported).

    Note:
        The optimizations require LoRA adapters with no dropout and no bias terms.
        The function will skip patching if these conditions aren't met.
    """
    if not isinstance(model, PeftModelForCausalLM):
        raise TypeError("Model must be a PeftModelForCausalLM")

    # Get active LoRA adapter config
    if hasattr(model, "active_adapters"):
        assert len(model.active_adapters) == 1, (
            "Axolotl currently does not support LoRA Triton kernels for multiple adapters"
        )
        active_adapter = model.active_adapters[0]
    else:
        active_adapter = model.active_adapter
    lora_config = model.model.peft_config[active_adapter]

    # Only patch if conditions are met
    can_patch = lora_config.lora_dropout == 0 and lora_config.bias == "none"
    if not can_patch:
        LOG.warning("Cannot patch layers - requires no dropout and no bias")
        LOG.warning("Please specify `lora_dropout: 0` in your axolotl config file")
        return model

    # The level is raised for the duration of patching; `finally` guarantees it is
    # restored even when an unsupported model/activation aborts the patching.
    original_level = LOG.getEffectiveLevel()
    LOG.setLevel(logging.INFO)
    try:
        # Choose activation based on model type
        activation = None
        text_config = (
            model.config.get_text_config()
            if hasattr(model.config, "get_text_config")
            else model.config
        )
        if hasattr(text_config, "hidden_act"):
            activation = text_config.hidden_act
        elif hasattr(text_config, "hidden_activation"):
            activation = text_config.hidden_activation

        # map activation to supported activation
        # (guard against a missing activation so the documented
        # NotImplementedError is raised instead of a TypeError)
        if activation is not None and "gelu" in activation:
            # gemma3 uses gelu_pytorch_tanh
            activation = "gelu"

        if activation not in SUPPORTED_ACTIVATIONS:
            raise NotImplementedError(f"Activation {activation} is not supported")

        layers = get_layers(model)

        # Patch each layer
        for layer in layers:
            # Add QKV, O fallback implementations to start
            # These will be overwritten later (if some conditions apply)
            for self_attn in find_self_attn_in_layer(layer):
                self_attn.apply_qkv = types.MethodType(original_apply_qkv, self_attn)
                self_attn.apply_o = types.MethodType(original_apply_o, self_attn)

                if cfg.lora_qkv_kernel:
                    # Query, key, value patching
                    qkv_modules = [
                        getattr(self_attn, linear_proj)
                        for linear_proj in ["q_proj", "k_proj", "v_proj"]
                    ]
                    if _supports_lora_kernel(qkv_modules):
                        # Add optimized implementation
                        self_attn.apply_qkv = types.MethodType(
                            apply_lora_qkv, self_attn
                        )
                    else:
                        LOG.warning_once(
                            "Cannot patch some attention QKV projections - requires LoRA "
                            "adapters and no lora_magnitude_vector (DoRA)"
                        )

                if cfg.lora_o_kernel:
                    # Output patching
                    if _supports_lora_kernel([self_attn.o_proj]):
                        self_attn.apply_o = types.MethodType(apply_lora_o, self_attn)
                    else:
                        LOG.warning_once(
                            "Cannot patch some attention output projection - requires LoRA "
                            "adapters and no lora_magnitude_vector (DoRA)"
                        )

            for gate_proj, up_proj, down_proj, mlp in find_mlp_in_layer(layer):
                if cfg.lora_mlp_kernel:
                    # MLP patching
                    if _supports_lora_kernel((gate_proj, up_proj, down_proj)):
                        apply_fn = APPLY_FN_MAPPING[activation]
                        # Patch the module the kernel is bound to. `find_mlp_in_layer`
                        # may yield modules other than `layer.mlp` (e.g. a shared
                        # expert), for which `layer.mlp` is absent or a different module.
                        mlp.forward = types.MethodType(apply_fn, mlp)
                    else:
                        LOG.warning_once(
                            "Cannot patch some MLP layers - requires LoRA adapters and no "
                            "lora_magnitude_vector (DoRA)"
                        )
    finally:
        LOG.setLevel(original_level)

    return model
super().__init__() self.gate_proj = gate_proj self.up_proj = up_proj self.down_proj = down_proj ================================================ FILE: src/axolotl/monkeypatch/loss/__init__.py ================================================ ================================================ FILE: src/axolotl/monkeypatch/loss/chunked.py ================================================ """ chunked ce loss """ from typing import List, Optional import torch import torch.nn.functional as F # copied and modified from torchtune.modules.loss.CEWithChunkedOutputLoss class CEWithChunkedOutputLoss(torch.nn.Module): """ Cross-entropy with chunked outputs that saves memory by only upcasting one chunk at a time. For more details, please refer to: https://github.com/pytorch/torchtune/pull/1390 """ def __init__(self, num_output_chunks: int = 8, ignore_index: int = -100): super().__init__() self.num_output_chunks = num_output_chunks self.ignore_index = ignore_index def compute_cross_entropy( self, logits: torch.Tensor, labels: torch.Tensor, normalize: bool = True, ) -> torch.Tensor: """ Upcast logits to fp32 and compute cross entropy loss. """ return F.cross_entropy( logits.float(), labels, ignore_index=self.ignore_index, reduction="sum" ) def forward( self, logits: List[torch.Tensor], labels: torch.Tensor, reduction="sum" ) -> torch.Tensor: """ Args: logits (List[torch.Tensor]): List of chunked logits of length ``self.num_output_chunks``, where each chunk has shape ``(batch_size, num_tokens / num_output_chunks, vocab_size)``. labels (torch.Tensor): Ground truth labels of shape ``(batch_size, num_tokens)``. reduction (str): The reduction to apply to the output. Returns: torch.Tensor: Cross entropy loss of shape (1,). 
""" total_elements = (labels != self.ignore_index).sum() # chunk and reshape labels (bsz, num_tokens, vocab) -> [(bsz*num_tokens/num_chunks, vocab)] labels = [ target_chunk.reshape(-1) for target_chunk in labels.chunk(self.num_output_chunks, dim=1) ] # reshape logits [(bsz, num_tokens/num_chunks, vocab)] -> [(bsz*num_tokens/num_chunks, vocab)] logits = [ logit_chunk.reshape(-1, logit_chunk.size(-1)) for logit_chunk in logits ] # compute one chunk at a time total_loss = 0.0 for logits_chunk, labels_chunk in zip(logits, labels, strict=False): total_loss += self.compute_cross_entropy(logits_chunk, labels_chunk) if reduction == "sum": return total_loss return total_loss / total_elements def _build_chunked_ce_loss_fn(num_output_chunks: int = 8, ignore_index: int = -100): loss_fn_ce = CEWithChunkedOutputLoss(num_output_chunks, ignore_index) loss_fn_ce.compute_cross_entropy = torch.compile( loss_fn_ce.compute_cross_entropy, backend="inductor" ) return loss_fn_ce def get_causal_lm_loss(num_output_chunks: int = 8, ignore_index: int = -100): loss_fn_ce = _build_chunked_ce_loss_fn(num_output_chunks, ignore_index) def chunked_fix_cross_entropy( source, target, num_items_in_batch: int = None, ignore_index: int = -100, **kwargs, ): reduction = "sum" if num_items_in_batch is not None else "mean" logit_chunks = [ chunk for chunk in source.chunk(loss_fn_ce.num_output_chunks, dim=1) ] loss = loss_fn_ce(logit_chunks, target, reduction=reduction) if reduction == "sum": loss = loss / num_items_in_batch return loss def for_causal_lm_chunked_loss( logits, labels, vocab_size: int = None, num_items_in_batch: Optional[int] = None, ignore_index: int = -100, shift_labels: Optional[torch.Tensor] = None, **kwargs, ) -> torch.Tensor: # skip the upcast to float since we handle that in the chunking loss if shift_labels is None: # Shift so that tokens < n predict n labels = F.pad(labels, (0, 1), value=ignore_index) shift_labels = labels[..., 1:].contiguous() # Skip Flattening the tokens # Enable 
def eaft_loss(
    outputs, labels, num_items_in_batch=None, alpha=1.0, k=20, ignore_index=-100
):
    """
    compute eaft loss with entropy weighting

    each supervised token's cross entropy is scaled by ``entropy ** alpha``, where
    the entropy is approximated from a softmax over that token's top-k logits.
    the weights are computed under ``torch.no_grad()`` and so act as constants.

    args:
        outputs: model outputs containing logits
        labels: target labels for computing loss
        num_items_in_batch: for sample packing support; when given, the weighted
            loss is summed and divided by this value instead of averaged
        alpha: exponent for entropy weighting (default 1.0)
        k: number of top logits for entropy approximation (default 20)
        ignore_index: label value excluded from the loss (default -100)

    returns:
        scalar loss tensor; zero when no token is supervised
    """
    logits = outputs.logits
    vocab_size = logits.size(-1)

    # position t predicts token t + 1
    shift_logits_view = logits[..., :-1, :].contiguous().view(-1, vocab_size)
    shift_labels_view = labels[..., 1:].contiguous().view(-1)

    mask = shift_labels_view != ignore_index
    # gather the supervised rows once; boolean indexing copies the (N, vocab) logits
    valid_logits = shift_logits_view[mask]
    valid_labels = shift_labels_view[mask]

    with torch.no_grad():
        top_k_logits, _ = torch.topk(
            valid_logits.float(), k=min(k, vocab_size), dim=-1
        )
        top_k_probs = F.softmax(top_k_logits, dim=-1)
        # epsilon keeps log() finite for probabilities that underflow to zero
        entropy = -(top_k_probs * torch.log(top_k_probs + 1e-10)).sum(dim=-1)
        weights = torch.pow(entropy, alpha)

    per_token_loss = F.cross_entropy(valid_logits, valid_labels, reduction="none")
    weighted_loss = per_token_loss * weights

    if num_items_in_batch is not None:
        return weighted_loss.sum() / num_items_in_batch
    if weighted_loss.numel() == 0:
        # mean() of an empty tensor is NaN; return a graph-connected zero instead
        return weighted_loss.sum()
    return weighted_loss.mean()
def patch_apertus_xielu_activation():
    """
    Replace `XIELUActivation._xielu_cuda` with a variant that casts the alpha
    parameters to the input dtype before calling the CUDA kernel object.

    Returns:
        A zero-argument `unpatch` callable that restores the original method.

    Raises:
        ImportError: If `XIELUActivation` is not available in the installed
            transformers version.
    """
    try:
        from transformers.activations import XIELUActivation
    except ImportError as err:
        raise ImportError(
            "Cannot import XIELUActivation. "
            "Please make sure to update your transformers version >= 4.56.1."
        ) from err

    from transformers.activations import logger

    # Kept so that `unpatch` can put the original method back
    original_xielu_cuda = XIELUActivation._xielu_cuda

    def _xielu_cuda_fixed(self, x: Tensor) -> Tensor:
        """Run the xIELU CUDA kernel on a 3D view of `x` and restore its shape."""
        input_shape = x.shape

        # CUDA kernel expects 3D tensors: prepend axes when there are too few ...
        for _ in range(3 - x.dim()):
            x = x.unsqueeze(0)
        # ... and flatten all leading axes when there are too many
        if x.dim() > 3:
            x = x.view(-1, 1, x.size(-1))

        if input_shape != x.shape:
            logger.warning_once(
                "Warning: xIELU input tensor expects 3 dimensions but got (shape: %s). Reshaping to (shape: %s).",
                input_shape,
                x.shape,
            )

        alpha_p = self.alpha_p.to(x.dtype)
        alpha_n = self.alpha_n.to(x.dtype)
        kernel_output = self._xielu_cuda_obj.forward(
            x,
            alpha_p,
            alpha_n,
            # Temporary until xIELU CUDA fully implemented -> self.{beta,eps}.item()
            self._beta_scalar,
            self._eps_scalar,
            self.with_vector_loads,
        )
        return kernel_output.view(input_shape)

    # Apply the patch
    XIELUActivation._xielu_cuda = _xielu_cuda_fixed

    def unpatch():
        """Restore the original method"""
        XIELUActivation._xielu_cuda = original_xielu_cuda

    return unpatch
class KimiLinearConfig(PretrainedConfig):
    """
    Configuration for Kimi-Linear models.

    Stores the constructor arguments as attributes and exposes helper properties
    that report whether the configuration describes MLA attention, a
    mixture-of-experts model, or linear-attention (KDA) layers.
    """

    model_type = "kimi_linear"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        model_type="kimi_linear",
        vocab_size=163840,
        hidden_size=4096,
        head_dim=None,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        rope_theta=10000.0,
        rope_scaling=None,
        tie_word_embeddings=False,
        moe_intermediate_size: Optional[int] = None,
        moe_renormalize: bool = True,
        moe_router_activation_func: str = "sigmoid",
        num_experts: Optional[int] = None,
        num_experts_per_token: Optional[int] = None,
        num_shared_experts: int = 0,
        routed_scaling_factor: float = 1.0,
        first_k_dense_replace: int = 0,
        moe_layer_freq: int = 1,
        use_grouped_topk: bool = True,
        num_expert_group: int = 1,
        topk_group: int = 1,
        q_lora_rank: Optional[int] = None,
        kv_lora_rank: Optional[int] = None,
        qk_nope_head_dim: Optional[int] = None,
        qk_rope_head_dim: Optional[int] = None,
        v_head_dim: Optional[int] = None,
        mla_use_nope: Optional[bool] = False,
        num_nextn_predict_layers: int = 0,
        linear_attn_config: Optional[dict] = None,
        router_aux_loss_coef: float = 0.01,
        **kwargs,
    ):
        """
        Store all hyper-parameters on the config instance.

        Derived defaults:
            - `head_dim` falls back to `hidden_size // num_attention_heads`.
            - `num_key_value_heads` falls back to `num_attention_heads`.

        Token ids, `tie_word_embeddings` and any extra `kwargs` are forwarded to
        `PretrainedConfig.__init__`.

        Raises:
            AssertionError: If `moe_router_activation_func` is neither "softmax" nor
                "sigmoid", or if `linear_attn_config` is given but its "kda_layers"
                or "full_attn_layers" entry is None.
            KeyError: If `linear_attn_config` is given without a "kda_layers" or
                "full_attn_layers" key.
        """
        self.model_type = model_type
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        # explicit head_dim wins; otherwise split the hidden size evenly across heads
        self.head_dim = (
            head_dim if head_dim is not None else hidden_size // num_attention_heads
        )
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling

        # MLA-related settings; any non-None value makes `is_mla` report True
        self.q_lora_rank = q_lora_rank
        self.kv_lora_rank = kv_lora_rank
        self.qk_nope_head_dim = qk_nope_head_dim
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.mla_use_nope = mla_use_nope
        # moe config
        self.num_experts = num_experts
        self.num_experts_per_token = num_experts_per_token
        self.moe_renormalize = moe_renormalize
        self.num_shared_experts = num_shared_experts
        self.routed_scaling_factor = routed_scaling_factor
        self.moe_router_activation_func = moe_router_activation_func
        assert self.moe_router_activation_func in ("softmax", "sigmoid")
        self.moe_intermediate_size = moe_intermediate_size
        self.first_k_dense_replace = first_k_dense_replace
        self.moe_layer_freq = moe_layer_freq
        self.use_grouped_topk = use_grouped_topk
        self.num_expert_group = num_expert_group
        self.topk_group = topk_group
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.router_aux_loss_coef = router_aux_loss_coef

        # a linear-attention config must name both its KDA and full-attention layers
        if linear_attn_config is not None:
            assert linear_attn_config["kda_layers"] is not None
            assert linear_attn_config["full_attn_layers"] is not None
        self.linear_attn_config = linear_attn_config

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @property
    def is_mla(self) -> bool:
        """True when any MLA setting is provided or `mla_use_nope` is True."""
        return (
            self.q_lora_rank is not None
            or self.kv_lora_rank is not None
            or self.qk_nope_head_dim is not None
            or self.qk_rope_head_dim is not None
            or self.v_head_dim is not None
            or self.mla_use_nope is True
        )

    @property
    def is_moe(self) -> bool:
        """True when `num_experts` is set."""
        return self.num_experts is not None

    @property
    def is_linear_attn(self) -> bool:
        """
        False when there is no `linear_attn_config`, or when it is a dict whose
        "kda_layers" entry is an empty (non-None) collection; True otherwise.
        """
        return not (
            self.linear_attn_config is None
            or (
                isinstance(self.linear_attn_config, dict)
                and self.linear_attn_config["kda_layers"] is not None
                and len(self.linear_attn_config["kda_layers"]) == 0
            )
        )

    def is_kda_layer(self, layer_idx: int) -> bool:
        """
        Return whether the layer at 0-based `layer_idx` is a KDA layer.

        "kda_layers" is matched against `layer_idx + 1`, i.e. its entries are
        compared as 1-based layer numbers.
        """
        return (
            self.linear_attn_config is not None
            and (layer_idx + 1) in self.linear_attn_config["kda_layers"]
        )
def load_balancing_loss_func(
    gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
    num_experts: Optional[int] = None,
    top_k=2,
    attention_mask: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, int]:
    """
    Standard Switch Transformer load balancing loss.

    The router logits of all layers are concatenated along dim 0. The loss is the
    sum over experts of (fraction of top-k selections routed to the expert) times
    (mean router probability of the expert), scaled by `num_experts`.

    Args:
        gate_logits: Tuple with one router-logit tensor per layer, each with the
            experts on the last dimension. Anything that is not a non-empty tuple
            yields a loss of `0`.
        num_experts: Number of experts. When `None`, it is taken from the last
            dimension of the router logits.
        top_k: Number of experts selected per token.
        attention_mask: Accepted for signature compatibility; it is NOT applied, so
            padding tokens contribute to the statistics.

    Returns:
        The scalar auxiliary loss tensor, or the integer `0` when there are no
        router logits.
    """
    if gate_logits is None or not isinstance(gate_logits, tuple):
        return 0
    if len(gate_logits) == 0:
        # torch.cat() rejects an empty list
        return 0

    # Layers may live on different devices (e.g. with a device map), so gather
    # everything on the first layer's device before concatenating.
    compute_device = gate_logits[0].device
    concatenated_gate_logits = torch.cat(
        [layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0
    )

    if num_experts is None:
        num_experts = concatenated_gate_logits.shape[-1]

    routing_weights = F.softmax(concatenated_gate_logits, dim=-1)
    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
    expert_mask = F.one_hot(selected_experts, num_experts)

    # fraction of tokens dispatched to each expert, per top-k slot
    tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
    # average routing probability assigned to each expert
    router_prob_per_expert = torch.mean(routing_weights, dim=0)

    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
    return overall_loss * num_experts
Inspired by Qwen3-Next """ is_compileable = False def __init__(self, config: KimiLinearConfig): super().__init__() self.config = config if config.linear_attn_config is not None: self.layer_types = [] for i in range(config.num_hidden_layers): if config.is_kda_layer(i): self.layer_types.append("linear_attention") else: self.layer_types.append("full_attention") else: self.layer_types = ["full_attention"] * config.num_hidden_layers self.transformer_layers = [ i for i in range(config.num_hidden_layers) if self.layer_types[i] == "full_attention" ] linear_layers = [ i for i in range(config.num_hidden_layers) if self.layer_types[i] == "linear_attention" ] self.last_linear_layer = linear_layers[-1] if linear_layers else -1 self.conv_states = [None for _ in range(config.num_hidden_layers)] self.recurrent_states = [None for _ in range(config.num_hidden_layers)] self.key_cache = [None for _ in range(config.num_hidden_layers)] self.value_cache = [None for _ in range(config.num_hidden_layers)] def __len__(self): return len(self.layer_types) def update( self, key_states: torch.Tensor, value_states: torch.Tensor, layer_idx: int, cache_kwargs: Optional[dict[str, Any]] = None, ) -> tuple[torch.Tensor, torch.Tensor]: if self.key_cache[layer_idx] is None: self.key_cache[layer_idx] = key_states self.value_cache[layer_idx] = value_states else: self.key_cache[layer_idx] = torch.cat( [self.key_cache[layer_idx], key_states], dim=2 ) self.value_cache[layer_idx] = torch.cat( [self.value_cache[layer_idx], value_states], dim=2 ) return self.key_cache[layer_idx], self.value_cache[layer_idx] def reorder_cache(self, beam_idx: torch.LongTensor): """Reorders the cache for beam search, given the selected beam indices.""" for layer_idx in range(len(self.key_cache)): if self.key_cache[layer_idx] is not None: device = self.key_cache[layer_idx].device beam_idx = beam_idx.to(device) self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select( 0, beam_idx ) self.value_cache[layer_idx] = 
def get_mask_sizes(
    self, cache_position: torch.Tensor, layer_idx: int
) -> tuple[int, int]:
    """Return ``(kv_length, kv_offset)`` used to build the mask for a layer.

    ``kv_length`` is the number of query positions in ``cache_position`` plus
    whatever ``get_seq_length(layer_idx)`` reports as already cached.
    ``kv_offset`` is always ``0``.
    """
    num_new_positions = cache_position.shape[0]
    num_cached_positions = self.get_seq_length(layer_idx)
    return num_new_positions + num_cached_positions, 0
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Repeat every key/value head ``n_rep`` times along the head dimension.

    Equivalent to ``torch.repeat_interleave(x, dim=1, repeats=n_rep)``: the
    input goes from ``(batch, num_key_value_heads, seqlen, head_dim)`` to
    ``(batch, num_key_value_heads * n_rep, seqlen, head_dim)``. With
    ``n_rep == 1`` the input tensor itself is returned.
    """
    bsz, kv_heads, seq_len, dim = hidden_states.shape
    if n_rep != 1:
        # Insert a repeat axis after the head axis, broadcast it, then fold it
        # into the head axis.
        tiled = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
        return tiled.reshape(bsz, kv_heads * n_rep, seq_len, dim)
    return hidden_states
def forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    past_key_values: Optional[Cache] = None,
    **kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """Multi-latent attention over ``hidden_states``.

    Queries come from ``q_proj``; keys and values are expanded from the
    compressed ``kv_a_proj_with_mqa`` projection through ``kv_a_layernorm``
    and ``kv_b_proj``. The single "rope" key slice is broadcast to every
    head. No rotary embedding is applied in this method.

    Only the output-projected attention tensor is returned, even though the
    annotation advertises a tuple.
    """
    bsz, q_len = hidden_states.shape[:-1]

    # Queries: (batch, heads, seq, q_head_dim), split into nope / rope parts.
    queries = self.q_proj(hidden_states)
    queries = queries.view(bsz, q_len, -1, self.q_head_dim).transpose(1, 2)
    q_nope, q_rope = torch.split(
        queries, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
    )

    # Compressed KV: a latent part and one shared rope key slice.
    latent_kv = self.kv_a_proj_with_mqa(hidden_states)
    kv_latent, k_rope = torch.split(
        latent_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
    )

    # Expand the latent part into per-head nope keys and values.
    expanded_kv = self.kv_b_proj(self.kv_a_layernorm(kv_latent))
    expanded_kv = expanded_kv.view(
        bsz, q_len, -1, self.qk_nope_head_dim + self.v_head_dim
    ).transpose(1, 2)
    k_nope, value_states = torch.split(
        expanded_kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
    )

    # Broadcast the shared rope slice across all heads.
    k_rope = k_rope.view(bsz, 1, q_len, self.qk_rope_head_dim)
    k_rope = k_rope.expand(*k_nope.shape[:-1], -1)

    query_states = torch.cat((q_nope, q_rope), dim=-1)
    key_states = torch.cat((k_nope, k_rope), dim=-1)

    if past_key_values is not None:
        key_states, value_states = past_key_values.update(
            key_states, value_states, self.layer_idx
        )

    attn_impl = self.config._attn_implementation
    # For flash attention the values are zero-padded up to the query head
    # size when the two differ; the padding is sliced off again below.
    needs_value_padding = (
        attn_impl == "flash_attention_2" and self.q_head_dim != self.v_head_dim
    )
    if needs_value_padding:
        value_states = F.pad(value_states, [0, self.q_head_dim - self.v_head_dim])

    if attn_impl == "eager":
        attention_interface = eager_attention_forward
    else:
        attention_interface = ALL_ATTENTION_FUNCTIONS[attn_impl]

    attn_output, _ = attention_interface(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        dropout=self.attention_dropout if self.training else 0.0,
        scaling=self.scaling,
        **kwargs,
    )

    if needs_value_padding:
        attn_output = attn_output[:, :, :, : self.v_head_dim]

    attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
    return self.o_proj(attn_output)
def forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    cache_params: Optional[KimiDynamicCache] = None,
    **kwargs: Unpack[dict],
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
    """Run Kimi Delta Attention over ``hidden_states``.

    Args:
        hidden_states: 3-D input, unpacked as ``(batch_size, q_len, hidden)``.
        attention_mask: Optional 2-D padding mask ``[batch_size, seq_len]``
            (0 = padding). A mask that is not 2-D is replaced by
            ``kwargs["padding_mask"]`` (or ``None`` when absent).
        cache_params: Optional cache; this layer's ``conv_states`` and
            ``recurrent_states`` entries are read and then overwritten.
        **kwargs: May carry ``padding_mask`` and ``cu_seqlens``.

    Returns:
        The output-projected attention tensor. NOTE(review): the annotation
        advertises a 3-tuple, but only this tensor is returned.

    Raises:
        ValueError: If the mask that ends up being used is not 2-D.
    """
    if attention_mask is not None:
        if attention_mask.dim() != 2:
            attention_mask = kwargs.get("padding_mask", None)

        if attention_mask is not None and attention_mask.dim() != 2:
            raise ValueError(
                "attention_mask must be a 0-1 matrix of shape [batch_size, seq_len] "
                "(0 = padding). 3D masks are not supported here."
            )

    use_cache = cache_params is not None
    batch_size, q_len, _ = hidden_states.shape

    # The recurrent kernel is only picked for short sequences at inference.
    # Training must stay on the chunked kernel; without the `not self.training`
    # guard every training batch with q_len <= 64 tripped the assert below.
    mode = "fused_recurrent" if (q_len <= 64 and not self.training) else self.mode
    if self.training:
        assert mode == "chunk", "Only chunk mode is supported in training."

    cu_seqlens = kwargs.get("cu_seqlens", None)
    indices = None
    if attention_mask is not None:
        # Presumably drops padded positions and packs the remaining tokens
        # into a single row described by `cu_seqlens` -- confirm against the
        # helpers' definitions.
        indices, cu_seqlens, _ = get_unpad_data(attention_mask[:, -q_len:])
        hidden_states = index_first_axis(
            rearrange(hidden_states, "b s ... -> (b s) ..."), indices
        ).unsqueeze(0)

    conv_state_q, conv_state_k, conv_state_v = None, None, None
    recurrent_state = None
    if cache_params is not None:
        if cache_params.conv_states[self.layer_idx] is not None:
            conv_state_q, conv_state_k, conv_state_v = cache_params.conv_states[
                self.layer_idx
            ]
        recurrent_state = cache_params.recurrent_states[self.layer_idx]

    # Short convolutions over the projected q / k / v streams.
    q, conv_state_q = self.q_conv1d(
        x=self.q_proj(hidden_states),
        cache=conv_state_q,
        output_final_state=use_cache,
        cu_seqlens=cu_seqlens,
    )
    k, conv_state_k = self.k_conv1d(
        x=self.k_proj(hidden_states),
        cache=conv_state_k,
        output_final_state=use_cache,
        cu_seqlens=cu_seqlens,
    )
    v, conv_state_v = self.v_conv1d(
        x=self.v_proj(hidden_states),
        cache=conv_state_v,
        output_final_state=use_cache,
        cu_seqlens=cu_seqlens,
    )

    # Gate and beta inputs for the delta-rule kernels.
    g = self.f_b_proj(self.f_a_proj(hidden_states))
    g = fused_kda_gate(g, self.A_log, self.head_dim, g_bias=self.dt_bias)
    beta = self.b_proj(hidden_states).float().sigmoid()

    # Split the flat projection into heads.
    q = rearrange(q, "... (h d) -> ... h d", d=self.head_k_dim)
    k = rearrange(k, "... (h d) -> ... h d", d=self.head_k_dim)
    v = rearrange(v, "... (h d) -> ... h d", d=self.head_dim)

    kda_kernel = chunk_kda if mode == "chunk" else fused_recurrent_kda
    o, recurrent_state = kda_kernel(
        q=q,
        k=k,
        v=v,
        g=g,
        beta=beta,
        initial_state=recurrent_state,
        output_final_state=True,
        use_qk_l2norm_in_kernel=True,
        cu_seqlens=cu_seqlens,
    )

    if cache_params is not None:
        cache_params.recurrent_states[self.layer_idx] = recurrent_state
        cache_params.conv_states[self.layer_idx] = (
            conv_state_q,
            conv_state_k,
            conv_state_v,
        )

    # Output gate feeding the gated RMS norm.
    g = self.g_b_proj(self.g_a_proj(hidden_states))
    g = rearrange(g, "... (h d) -> ... h d", d=self.head_dim)
    o = self.o_norm(o, g)

    o = rearrange(o, "b t h d -> b t (h d)")
    o = self.o_proj(o)
    if attention_mask is not None:
        # Restore the (batch_size, q_len, ...) layout after the unpadding above.
        o = pad_input(o.squeeze(0), indices, batch_size, q_len)

    return o
F.linear( # hidden_states.type(torch.float32), self.weight.type( # torch.float32), None # ) # if self.moe_router_activation_func == "sigmoid": # scores = logits.sigmoid() # elif self.moe_router_activation_func == "softmax": # scores = logits.softmax(dim=1) # else: # raise NotImplementedError( # f"insupportable scoring function for MoE gating: {self.moe_router_activation_func}" # ) # # select top-k experts # assert not self.training # scores_for_choice = scores.view(bsz * seq_len, -1) # scores_for_choice += self.e_score_correction_bias.unsqueeze(0) # group_scores = ( # scores_for_choice.view( # bsz * seq_len, self.num_expert_group, -1).topk(2, dim=-1)[0].sum(dim=-1) # ) # [n, num_expert_group] # group_idx = torch.topk( # group_scores, k=self.topk_group, dim=-1, sorted=False # )[ # 1 # ] # [n, top_k_group] # group_mask = torch.zeros_like(group_scores) # [n, num_expert_group] # group_mask.scatter_(1, group_idx, 1) # [n, num_expert_group] # score_mask = ( # group_mask.unsqueeze(-1) # .expand( # bsz * seq_len, self.num_expert_group, self.num_experts // self.num_expert_group # ) # .reshape(bsz * seq_len, -1) # ) # [n, e] # tmp_scores = scores_for_choice.masked_fill( # ~score_mask.bool(), 0.0) # [n, e] # _, topk_idx = torch.topk( # tmp_scores, k=self.top_k, dim=-1, sorted=False # ) # topk_weight = scores.gather(1, topk_idx) # # norm gate to sum 1 # if self.top_k > 1 and self.moe_renormalize: # denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20 # topk_weight = topk_weight / denominator # # must multiply the scaling factor # topk_weight = topk_weight * self.routed_scaling_factor # return topk_idx, topk_weight # class KimiSparseMoeBlock(nn.Module): # """ # Adapted from Deepseek-V3's MOE implementation # The namings are consistent with Kimi's version. 
# """ # def __init__(self, config: KimiLinearConfig): # super().__init__() # self.config = config # self.hidden_dim = config.hidden_size # self.num_experts = config.num_experts # self.top_k = config.num_experts_per_token # self.moe_renormalize = config.moe_renormalize # self.ep_size = 1 # self.experts_per_rank = config.num_experts # self.ep_rank = 0 # self.experts = nn.ModuleList( # [ # KimiBlockSparseMLP( # config, intermediate_size=config.moe_intermediate_size # ) # for _ in range(config.num_experts) # ] # ) # self.gate = KimiMoEGate(config) # if config.num_shared_experts is not None: # intermediate_size = config.moe_intermediate_size * config.num_shared_experts # self.shared_experts = KimiMLP( # config=config, intermediate_size=intermediate_size # ) # def forward(self, hidden_states): # identity = hidden_states # orig_shape = hidden_states.shape # topk_idx, topk_weight = self.gate(hidden_states) # hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) # flat_topk_idx = topk_idx.view(-1) # if not self.training: # y = self.moe_infer(hidden_states, topk_idx, # topk_weight).view(*orig_shape) # else: # raise NotImplementedError( # "Training mode is not supported in KimiSparseMoeBlock") # if self.config.num_shared_experts is not None: # y = y + self.shared_experts(identity) # return y # @torch.no_grad() # def moe_infer(self, x, topk_ids, topk_weight): # cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts))) # cnts.scatter_(1, topk_ids, 1) # tokens_per_expert = cnts.sum(dim=0) # idxs = topk_ids.view(-1).argsort() # sorted_tokens = x[idxs // topk_ids.shape[1]] # tokens_per_expert = tokens_per_expert.cpu().numpy() # outputs = [] # start_idx = 0 # for i, num_tokens in enumerate(tokens_per_expert): # end_idx = start_idx + num_tokens # if num_tokens == 0: # continue # expert = self.experts[i + self.ep_rank * self.experts_per_rank] # tokens_for_this_expert = sorted_tokens[start_idx:end_idx] # expert_out = expert(tokens_for_this_expert) # 
def route_tokens_to_experts(
    self,
    router_logits: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Turn router logits into per-token expert choices and weights.

    In training mode the top-k experts are taken from a float32 softmax over
    the logits. Otherwise sigmoid scores are used: the correction bias is
    added for selection only, the best ``topk_group`` expert groups are kept
    (each group scored by the sum of its two largest biased scores), and the
    top-k experts are chosen inside those groups; the weights are the
    unbiased sigmoid scores of the chosen experts.

    Args:
        router_logits: [num_tokens, num_experts]

    Returns:
        topk_idx: [num_tokens, top_k]
        topk_weight: [num_tokens, top_k], optionally renormalized to sum to
            one and always multiplied by ``routed_scaling_factor``.
    """
    num_tokens = router_logits.shape[0]

    if self.training:
        scores = F.softmax(router_logits, dim=-1, dtype=torch.float32)
        topk_weight, topk_idx = torch.topk(scores, self.top_k, dim=-1, sorted=False)
    else:
        scores = router_logits.sigmoid()
        biased_scores = scores + self.gate.e_score_correction_bias.unsqueeze(0)

        # Score every group by the sum of its two best biased experts.
        grouped = biased_scores.view(num_tokens, self.num_expert_group, -1)
        group_scores = grouped.topk(2, dim=-1)[0].sum(dim=-1)

        # Keep only the best `topk_group` groups per token.
        _, kept_groups = torch.topk(
            group_scores, k=self.topk_group, dim=-1, sorted=False
        )
        group_mask = torch.zeros_like(group_scores)
        group_mask.scatter_(1, kept_groups, 1)

        # Broadcast the group mask to every expert of each group.
        experts_per_group = self.num_experts // self.num_expert_group
        score_mask = (
            group_mask.unsqueeze(-1)
            .expand(num_tokens, self.num_expert_group, experts_per_group)
            .reshape(num_tokens, -1)
        )

        candidate_scores = biased_scores.masked_fill(~score_mask.bool(), 0.0)
        _, topk_idx = torch.topk(
            candidate_scores, k=self.top_k, dim=-1, sorted=False
        )
        topk_weight = scores.gather(1, topk_idx)

    # Normalize and scale
    if self.top_k > 1 and self.moe_renormalize:
        topk_weight = topk_weight / (topk_weight.sum(dim=-1, keepdim=True) + 1e-20)
    topk_weight = topk_weight * self.routed_scaling_factor

    return topk_idx, topk_weight
router_logits = self.gate(hidden_states) # Get routing decisions topk_idx, topk_weight = self.route_tokens_to_experts(router_logits) if self.training: final_hidden_states = self._training_forward( hidden_states_flat, topk_idx, topk_weight, num_tokens, hidden_dim ) else: final_hidden_states = self._inference_forward( hidden_states_flat, topk_idx, topk_weight ) final_hidden_states = final_hidden_states.view(batch_size, seq_len, hidden_dim) # Add shared experts if present if self.config.num_shared_experts is not None: final_hidden_states = final_hidden_states + self.shared_experts(identity) return final_hidden_states def _training_forward( self, hidden_states: torch.Tensor, topk_idx: torch.Tensor, topk_weight: torch.Tensor, num_tokens: int, hidden_dim: int, ) -> torch.Tensor: """ Differentiable training forward using scatter-gather pattern. """ # Flatten expert indices: [num_tokens * top_k] flat_topk_idx = topk_idx.view(-1) # Sort by expert index to group tokens going to same expert sorted_indices = torch.argsort(flat_topk_idx) inverse_permutation = torch.argsort(sorted_indices) # Each token appears top_k times (once per expert choice) token_indices = torch.arange( num_tokens, device=hidden_states.device ).repeat_interleave(self.top_k) # Gather tokens and weights in sorted order shuffled_tokens = hidden_states[token_indices[sorted_indices]] shuffled_weights = topk_weight.view(-1)[sorted_indices].unsqueeze(-1) # Count tokens per expert tokens_per_expert = F.one_hot(flat_topk_idx, num_classes=self.num_experts).sum( dim=0 ) # Process each expert's batch expert_outputs = [] current_pos = 0 for i in range(self.num_experts): num_tokens_for_expert = tokens_per_expert[i].item() if num_tokens_for_expert == 0: continue expert_input = shuffled_tokens[ current_pos : current_pos + num_tokens_for_expert ] expert_output = self.experts[i](expert_input) expert_outputs.append(expert_output) current_pos += num_tokens_for_expert # Concatenate all outputs if expert_outputs: 
concatenated_outputs = torch.cat(expert_outputs, dim=0) else: concatenated_outputs = torch.zeros( num_tokens * self.top_k, hidden_dim, device=hidden_states.device, dtype=hidden_states.dtype, ) # Apply weights while still in sorted order weighted_outputs = concatenated_outputs * shuffled_weights # Unsort back to original token order unshuffled_outputs = weighted_outputs[inverse_permutation] # Sum contributions from all top_k experts for each token final_hidden_states = unshuffled_outputs.view( num_tokens, self.top_k, hidden_dim ).sum(dim=1) return final_hidden_states @torch.no_grad() def _inference_forward( self, hidden_states: torch.Tensor, topk_idx: torch.Tensor, topk_weight: torch.Tensor, ) -> torch.Tensor: """ Optimized inference forward (original implementation). """ cnts = topk_idx.new_zeros((topk_idx.shape[0], len(self.experts))) cnts.scatter_(1, topk_idx, 1) tokens_per_expert = cnts.sum(dim=0) idxs = topk_idx.view(-1).argsort() sorted_tokens = hidden_states[idxs // topk_idx.shape[1]] tokens_per_expert_list = tokens_per_expert.cpu().numpy() outputs = [] start_idx = 0 for i, num_tokens in enumerate(tokens_per_expert_list): end_idx = start_idx + num_tokens if num_tokens == 0: continue expert = self.experts[i] tokens_for_expert = sorted_tokens[start_idx:end_idx] expert_out = expert(tokens_for_expert) outputs.append(expert_out) start_idx = end_idx outs = torch.cat(outputs, dim=0) if outputs else sorted_tokens.new_empty(0) new_x = torch.empty_like(outs) new_x[idxs] = outs final_out = ( new_x.view(*topk_idx.shape, -1) .type(topk_weight.dtype) .mul_(topk_weight.unsqueeze(dim=-1)) .sum(dim=1) .type(new_x.dtype) ) return final_out class KimiDecoderLayer(nn.Module): def __init__(self, config: KimiLinearConfig, layer_idx: int): super().__init__() self.hidden_size = config.hidden_size self.config = config if config.is_kda_layer(layer_idx): self.is_linear_attn = True self.self_attn = KimiDeltaAttention(config=config, layer_idx=layer_idx) elif config.is_mla: 
def forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: Optional[bool] = False,
    use_cache: Optional[bool] = False,
    **kwargs: Unpack[FlashAttentionKwargs],
) -> Tuple[
    torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
]:
    """Apply one pre-norm decoder layer: attention, then MoE or dense MLP.

    Args:
        hidden_states (`torch.FloatTensor`): layer input.
        attention_mask (`torch.Tensor`, *optional*): forwarded unchanged to
            the attention module.
        position_ids (`torch.LongTensor`, *optional*): forwarded only to the
            non-linear (MLA) attention module.
        past_key_values (*optional*): cache object; handed to MLA attention
            as `past_key_values` and to linear attention as `cache_params`.
        output_attentions (`bool`, *optional*): forwarded to the attention
            module.
        use_cache (`bool`, *optional*): forwarded to the attention module.

    Returns:
        The layer output tensor only (no attention weights or cache), despite
        the tuple return annotation.
    """
    # --- Self attention with residual connection ---
    attn_inputs = {
        "hidden_states": self.input_layernorm(hidden_states),
        "attention_mask": attention_mask,
        "output_attentions": output_attentions,
        "use_cache": use_cache,
    }
    if self.is_linear_attn is False:
        attn_inputs["position_ids"] = position_ids
        attn_inputs["past_key_values"] = past_key_values
    else:
        # Linear attention receives the cache under a different keyword.
        attn_inputs["cache_params"] = past_key_values
    attn_output = self.self_attn(**attn_inputs, **kwargs)
    hidden_states = hidden_states + attn_output

    # --- Feed-forward (sparse MoE when present, dense MLP otherwise) ---
    normed = self.post_attention_layernorm(hidden_states)
    if hasattr(self, "block_sparse_moe"):
        ffn_output = self.block_sparse_moe(normed)
    else:
        ffn_output = self.mlp(normed)

    return hidden_states + ffn_output
self.embed_tokens = nn.Embedding( config.vocab_size, config.hidden_size, self.padding_idx ) self.layers = nn.ModuleList( [ KimiDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers) ] ) self.norm = KimiRMSNorm(config.hidden_size, eps=config.rms_norm_eps) if getattr(config, "_attn_implementation", None) is not None: if config._attn_implementation != "flash_attention_2": logger.warning_once( f"Ignoring the provided attention implementation {config._attn_implementation}" ) logger.warning_once("Using flash_attention_2 backend instead.") config._attn_implementation = "flash_attention_2" else: config._attn_implementation = "flash_attention_2" self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" self.gradient_checkpointing = False # Initialize weights and apply final processing self.post_init() def _update_linear_attn_mask(self, attention_mask, cache_position): """ NOTE: Left-padding is used for linear attention mask. No need for zeroing states when 1. Cached forward 2. 
Attending to all inputs """ linear_attn_mask = attention_mask if cache_position[0] > 0 or ( attention_mask is not None and torch.all(attention_mask == 1) ): linear_attn_mask = None return linear_attn_mask def forward( self, input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, inputs_embeds: Optional[torch.FloatTensor] = None, cache_position: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, **kwargs: Unpack[TransformersKwargs], ) -> Union[Tuple, BaseModelOutputWithPast]: use_cache = use_cache if use_cache is not None else self.config.use_cache if (input_ids is None) and (inputs_embeds is None): raise ValueError( "You must specify exactly one of input_ids or inputs_embeds" ) # Get inputs_embeds if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) if use_cache and past_key_values is None: past_key_values = KimiDynamicCache(config=self.config) if cache_position is None: past_seen_tokens = ( past_key_values.get_seq_length() if past_key_values is not None else 0 ) cache_position: torch.Tensor = torch.arange( past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device, ) if position_ids is None: position_ids = cache_position.unsqueeze(0) causal_mask = create_causal_mask( config=self.config, input_embeds=inputs_embeds, attention_mask=attention_mask, cache_position=cache_position, past_key_values=past_key_values, position_ids=position_ids, ) linear_attn_mask = self._update_linear_attn_mask(attention_mask, cache_position) hidden_states = inputs_embeds if past_key_values is not None: assert isinstance(past_key_values, KimiDynamicCache) for decoder_layer in self.layers: layer_mask = ( linear_attn_mask if decoder_layer.is_linear_attn else causal_mask ) hidden_states = decoder_layer( hidden_states, attention_mask=layer_mask, past_key_values=past_key_values, cache_position=cache_position, 
class KimiLinearForCausalLM(KimiPreTrainedModel, GenerationMixin):
    """`KimiLinearModel` backbone followed by a bias-free linear LM head."""

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = KimiLinearModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        generation_mode: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
            generation_mode (`bool`, *optional*):
                When truthy, only the last position is projected through the LM head.

        Returns:
            A `MoeCausalLMOutputWithPast`. `aux_loss` is only populated when `output_router_logits` is passed
            in `kwargs` *and* the backbone output actually carries `router_logits`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, KimiLinearForCausalLM

        >>> model = KimiLinearForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs[0]
        if generation_mode:
            # Keep only the final position before the (large) vocab projection.
            hidden_states = hidden_states[:, -1:]
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)

        aux_loss = None
        if kwargs.get("output_router_logits", False):
            # The backbone returns a plain BaseModelOutputWithPast, which has no
            # `router_logits` field; reading it unguarded raises AttributeError.
            router_logits = getattr(outputs, "router_logits", None)
            if router_logits is None:
                logger.warning_once(
                    "output_router_logits was requested but the model output has no "
                    "router_logits; skipping the auxiliary load-balancing loss."
                )
            else:
                aux_loss = load_balancing_loss_func(
                    router_logits,
                    num_experts=self.config.num_experts,
                    top_k=self.config.num_experts_per_token,
                    attention_mask=attention_mask,
                )
                if loss is not None:
                    # Move to the loss device so sharded models can add the terms.
                    loss = loss + self.config.router_aux_loss_coef * aux_loss.to(
                        loss.device
                    )

        return MoeCausalLMOutputWithPast(
            loss=loss,
            aux_loss=aux_loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
""" try: return importlib.resources.files(package_dot_path) / filename except ModuleNotFoundError: return None def _load_local_module(module_name: str, filename: str): """Helper to load a local module if not already loaded.""" if module_name in sys.modules: return sys.modules[module_name] patch_path = get_patch_file_path(KIMI_PATCH_PACKAGE, filename) if patch_path and patch_path.exists(): spec = importlib.util.spec_from_file_location(module_name, patch_path) module = importlib.util.module_from_spec(spec) sys.modules[module_name] = module spec.loader.exec_module(module) return module return None def _patch_get_class_in_module(): """ Core patch function that hijacks Transformers' dynamic module loading. """ from transformers.dynamic_module_utils import get_class_in_module if hasattr(get_class_in_module, "_axolotl_patched"): return original_get_class_in_module = get_class_in_module # Mapping of module path patterns to (module_name, filename) KIMI_MODULE_MAP = { "configuration_kimi": ("configuration_kimi", "configuration_kimi.py"), "modeling_kimi": ("modeling_kimi", "modeling_kimi.py"), "tokenization_kimi": ("tokenization_kimi", "tokenization_kimi.py"), } def patched_get_class_in_module(class_name, module_path, **kwargs): """Patched version that returns our local modules instead of remote ones.""" for pattern, (module_name, filename) in KIMI_MODULE_MAP.items(): if pattern in module_path: module = _load_local_module(module_name, filename) if module: return getattr(module, class_name) break # Pattern matched but file not found, fall through return original_get_class_in_module(class_name, module_path, **kwargs) import transformers.dynamic_module_utils transformers.dynamic_module_utils.get_class_in_module = patched_get_class_in_module patched_get_class_in_module._axolotl_patched = True def patch_kimi(): """ Apply all Kimi patches. Must be called BEFORE loading config/tokenizer/model. 
""" _patch_get_class_in_module() LOG.info("Kimi patches applied successfully!") # Keep these for backward compatibility if needed patch_kimi_config = patch_kimi patch_kimi_tokenizer = patch_kimi patch_kimi_model = patch_kimi ================================================ FILE: src/axolotl/monkeypatch/models/kimi_linear/tokenization_kimi.py ================================================ """ Adapted Kimi-Linear tokenizer to use proper template defaults and misc fixes. Source: https://huggingface.co/moonshotai/Kimi-Linear-48B-A3B-Instruct/blob/main/tokenization_kimi.py Revision: 919416f """ import os from logging import getLogger from pathlib import Path from shutil import copyfile from typing import ( Any, Dict, Iterator, List, Optional, Tuple, Union, cast, ) import tiktoken from tiktoken.load import load_tiktoken_bpe from tokenizers import AddedToken from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode from transformers.tokenization_utils import PreTrainedTokenizer logger = getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"} class TikTokenTokenizer(PreTrainedTokenizer): """ Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py. This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. Args: vocab_file (`str`): The path to the Tiktoken model file. bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|begin_of_text|>",`): The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|end_of_text|>"`): The end of sequence token. unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_249|>"`): The unknown token. 
def __init__(
    self,
    vocab_file,
    bos_token: Union[str, AddedToken] = "[BOS]",  # nosec: B107
    eos_token: Union[str, AddedToken] = "[EOS]",  # nosec: B107
    unk_token: Union[str, AddedToken, None] = None,
    pad_token: Union[str, AddedToken, None] = None,
    additional_special_tokens: List[str] = None,
    added_tokens_decoder: Optional[dict] = None,
    **kwargs,
):
    """
    Build the tiktoken encoding and the byte-level token <-> id tables.

    Args:
        vocab_file: Path to the tiktoken BPE file; must be an existing file.
        bos_token: Beginning-of-sequence token; must be one of the special tokens.
        eos_token: End-of-sequence token; must be one of the special tokens.
        unk_token: Optional unknown token. When ``None``, ``unk_id`` is ``None``.
        pad_token: Optional padding token. When ``None``, ``pad_id`` is ``None``.
        additional_special_tokens: Extra special tokens; a built-in chat-marker
            list is used when ``None``.
        added_tokens_decoder: Optional mapping of token id to an object with a
            ``.content`` attribute, used to name the reserved special-token ids.
            Ids without an entry are named ``<|reserved_token_{id}|>``.
        **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.

    Raises:
        AssertionError: If ``vocab_file`` is not an existing file.
        KeyError: If a provided bos/eos/unk/pad token is not a special token.
    """
    assert os.path.isfile(vocab_file), vocab_file

    if additional_special_tokens is None:
        additional_special_tokens = [
            "<|im_end|>",
            "<|im_user|>",
            "<|im_assistant|>",
            "<|start_header_id|>",
            "<|end_header_id|>",
            "[EOT]",
            "<|im_system|>",
            "<|im_middle|>",
        ]

    # `added_tokens_decoder` is optional: treat a missing mapping as empty
    # instead of failing while iterating over None.
    special_tokens_mapping = {
        token_id: added_token.content
        for token_id, added_token in (added_tokens_decoder or {}).items()
    }

    self.vocab_file = vocab_file
    mergeable_ranks = load_tiktoken_bpe(vocab_file)
    num_base_tokens = len(mergeable_ranks)
    # Special tokens occupy the ids directly after the base BPE vocabulary.
    self.special_tokens = {
        special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
        for i in range(
            num_base_tokens, num_base_tokens + self.num_reserved_special_tokens + 2
        )
    }

    self.model = tiktoken.Encoding(
        name=Path(vocab_file).name,
        pat_str=self.pat_str,
        mergeable_ranks=mergeable_ranks,
        special_tokens=self.special_tokens,
    )
    logger.info(f"Reloaded tiktoken model from {vocab_file}")

    self.n_words: int = self.model.n_vocab
    # BOS / EOS token IDs
    self.bos_id: int = self.special_tokens[str(bos_token)]
    self.eos_id: int = self.special_tokens[str(eos_token)]
    logger.info(
        f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
    )

    # pad/unk default to None; looking up str(None) would raise KeyError('None').
    self.pad_id: Optional[int] = (
        self.special_tokens[str(pad_token)] if pad_token is not None else None
    )
    self.unk_id: Optional[int] = (
        self.special_tokens[str(unk_token)] if unk_token is not None else None
    )

    self.byte_encoder = bytes_to_unicode()
    self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}

    # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
    # Each token's raw bytes are mapped byte-by-byte to printable characters.
    self.decoder = {
        i: "".join(
            self.byte_encoder[byte]
            for byte in self.model.decode_single_token_bytes(i)
        )
        for i in range(self.n_words)
    }
    # Inverse table; on duplicate strings the highest id wins, as before.
    self.encoder = {token: i for i, token in self.decoder.items()}

    super().__init__(
        bos_token=bos_token,
        eos_token=eos_token,
        unk_token=unk_token,
        pad_token=pad_token,
        additional_special_tokens=additional_special_tokens,
        **kwargs,
    )
    self.all_special_ids_set = set(self.all_special_ids)
if len(kwargs) > 0: # logger.warning(f"Calling super().encode with {kwargs}") return super().encode(text, **kwargs) assert type(text) is str # The tiktoken tokenizer can handle <=400k chars without # pyo3_runtime.PanicException. TIKTOKEN_MAX_ENCODE_CHARS = 400_000 # https://github.com/openai/tiktoken/issues/195 # Here we iterate over subsequences and split if we exceed the limit # of max consecutive non-whitespace or whitespace characters. MAX_NO_WHITESPACES_CHARS = 25_000 texts = self.pre_tokenizer_process(text) all_substrs = [] for text in texts: substrs = ( substr for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS) for substr in self._split_whitespaces_or_nonwhitespaces( text[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS ) ) all_substrs.extend(substrs) t: List[int] = [] for substr in all_substrs: if allow_special_tokens: t.extend( # we should consider special token as a common token self.model.encode( substr, allowed_special="all", ) ) else: t.extend( # we should consider special token as a common token self.model.encode( substr, disallowed_special=(), ) ) return t def decode(self, token_ids: Union[int, List[int]], **kwargs) -> str: """ Decodes a list of token IDs into a string. Args: token_ids (List[int]): The list of token IDs to be decoded. Returns: str: The decoded string. """ # If there are other args, we should call super().decode because there are a lot of code # to handle those args. supper().encode finally will call convert_tokens_to_string and _convert_id_to_token. if len(kwargs) > 0: return super().decode(token_ids, **kwargs) if type(token_ids) is int: token_ids = [token_ids] return self.model.decode(cast(List[int], token_ids)) @staticmethod def _split_whitespaces_or_nonwhitespaces( s: str, max_consecutive_slice_len: int ) -> Iterator[str]: """ Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len` consecutive whitespaces or consecutive non-whitespaces. 
""" current_slice_len = 0 current_slice_is_space = s[0].isspace() if len(s) > 0 else False slice_start = 0 for i in range(len(s)): is_now_space = s[i].isspace() if current_slice_is_space ^ is_now_space: current_slice_len = 1 current_slice_is_space = is_now_space else: current_slice_len += 1 if current_slice_len > max_consecutive_slice_len: yield s[slice_start:i] slice_start = i current_slice_len = 1 yield s[slice_start:] def pre_tokenizer_process(self, text: str) -> List[str]: """ pre-tokenizes the input text into a list of tokens. This method is used to split the input text into smaller chunks for internal processing. """ return [text] """ ----- Below are the abstract methods required by PreTrainedTokenizer ----- """ @property def vocab_size(self) -> int: return self.n_words def get_vocab(self) -> Dict[str, int]: return self.encoder def _tokenize(self, text: str, **kwargs) -> List[str]: return [self.decoder[t] for t in self.encode(text)] def _convert_token_to_id(self, token: str) -> int: return self.encoder.get(token, self.unk_id) def _convert_id_to_token(self, index: int) -> str: return self.decoder.get(index) @staticmethod def clean_up_tokenization(out_string: str) -> str: return out_string def convert_tokens_to_string(self, tokens: List[str]) -> str: text = "".join(tokens) text = bytearray([self.byte_decoder[c] for c in text]).decode( "utf-8", "replace" ) return text def save_vocabulary( self, save_directory: str, filename_prefix: Optional[str] = None ) -> Tuple[str]: if not os.path.isdir(save_directory): raise ValueError( f"vocabulary path ({save_directory}) should be a directory" ) out_vocab_file = os.path.join( save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"], ) if os.path.abspath(self.vocab_file) != os.path.abspath( out_vocab_file ) and os.path.isfile(self.vocab_file): copyfile(self.vocab_file, out_vocab_file) return (out_vocab_file,) def apply_chat_template( self, conversation, tools: Optional[list[dict]] 
def deep_sort_dict(obj: Any) -> Any:
    """
    Return a copy of `obj` in which every dict, at any depth, is key-sorted.

    Dicts are rebuilt as plain dicts in sorted order, lists are rebuilt
    element by element with their order kept, and any other value is
    returned as-is.
    """
    if isinstance(obj, list):
        return [deep_sort_dict(element) for element in obj]
    if not isinstance(obj, dict):
        return obj

    ordered = {}
    for key, value in sorted(obj.items()):
        ordered[key] = deep_sort_dict(value)
    return ordered
def patch_llama4_linearized_modeling():
    """
    Replace transformers' `Llama4TextExperts` with the linearized variant
    defined in this module (separate Linear layers for each expert).

    The class is swapped on `transformers.models.llama4.modeling_llama4` and
    on the `transformers.models.llama4` entry of `sys.modules`.

    Returns:
        A zero-argument `unpatch` callable that reinstalls the class
        `modeling_llama4` exposed at the time this function was called.
    """
    from transformers.models.llama4 import modeling_llama4

    package_key = "transformers.models.llama4"
    previous_experts_cls = modeling_llama4.Llama4TextExperts

    def _install(experts_cls):
        # Both the modeling module and the package re-export are updated.
        modeling_llama4.Llama4TextExperts = experts_cls
        sys.modules[package_key].Llama4TextExperts = experts_cls

    _install(Llama4TextExperts)

    def unpatch():
        _install(previous_experts_cls)

    return unpatch
def apply_patch_is_packed_sequence():
    """
    Apply patch to FA utils to accept 1D position_ids from Pixtral's position_ids_in_meshgrid

    Returns:
        A zero-argument `unpatch` callable that reinstalls the
        `_is_packed_sequence` function found at patch time.
    """
    from transformers import modeling_flash_attention_utils as fa_utils

    def fixed_is_packed_sequence(position_ids, batch_size):
        """
        Check the position ids whether packed sequences are indicated or not
            1. Position ids exist
            2. Flattened sequences only are supported
            3. Compile-friendly `not (torch.diff(position_ids, dim=-1) >= 0).all()`, i.e. we have multiple increasing sequences
        """
        if position_ids is None:
            return False

        # [N] -> [1, N]
        ids = position_ids.unsqueeze(0) if position_ids.ndim == 1 else position_ids

        # Positions of a single, unpacked sequence: min, min + 1, min + 2, ...
        single_sequence_ids = torch.arange(ids.shape[1], device=ids.device) + ids.min()

        if batch_size == 1:
            # Any deviation from the single increasing run means packing.
            return (single_sequence_ids - ids).abs().sum().bool().item()
        return False

    # Remember what was installed so the patch can be undone.
    previous_fn = fa_utils._is_packed_sequence
    fa_utils._is_packed_sequence = fixed_is_packed_sequence

    def unpatch():
        """Restore the original method"""
        fa_utils._is_packed_sequence = previous_fn

    return unpatch
lengths from position_ids for FLA varlen kernels. Adapted from transformers.modeling_flash_attention_utils.prepare_fa_kwargs_from_position_ids. https://github.com/huggingface/transformers/blob/0f1b128d3359a26bd18be99c26d7f04fb3cba914/src/transformers/modeling_flash_attention_utils.py#L316 Qwen3.5 uses MRoPE: position_ids arrive as [axes, B, T]. All axes carry the same temporal positions, so axis 0 is used to recover the [B, T] layout. See: https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen3_5/modeling_qwen3_5.py """ if position_ids.ndim == 3: position_ids = position_ids[0] tensor_kwargs = {"dtype": torch.int32, "device": position_ids.device} position_ids = position_ids.view(-1) indices_q = (position_ids == 0).nonzero().view(-1) return torch.cat( ( indices_q.to(**tensor_kwargs), torch.tensor(position_ids.size(), **tensor_kwargs), ) ) def _inject_fla_kernels(module) -> None: """Inject FLA kernels into a modeling module, bypassing is_flash_linear_attention_available.""" try: from fla.modules import FusedRMSNormGated from fla.ops.gated_delta_rule import ( chunk_gated_delta_rule, fused_recurrent_gated_delta_rule, ) module.FusedRMSNormGated = FusedRMSNormGated module.chunk_gated_delta_rule = chunk_gated_delta_rule module.fused_recurrent_gated_delta_rule = fused_recurrent_gated_delta_rule module.is_fast_path_available = True except ImportError: module.chunk_gated_delta_rule = None module.fused_recurrent_gated_delta_rule = None module.FusedRMSNormGated = None def _patched_decoder_forward( self, hidden_states: torch.Tensor, position_embeddings: Tuple[torch.Tensor, torch.Tensor], attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values=None, cache_position: Optional[torch.LongTensor] = None, **kwargs, ) -> torch.FloatTensor: """Decoder layer forward that passes position_ids through to linear attention.""" residual = hidden_states hidden_states = self.input_layernorm(hidden_states) if 
self.layer_type == "linear_attention": hidden_states = self.linear_attn( hidden_states=hidden_states, cache_params=past_key_values, cache_position=cache_position, attention_mask=attention_mask, position_ids=position_ids, ) elif self.layer_type == "full_attention": hidden_states, _ = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, position_embeddings=position_embeddings, **kwargs, ) hidden_states = residual + hidden_states residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) hidden_states = self.mlp(hidden_states) if isinstance(hidden_states, tuple): # MoE returns (hidden_states, router_logits) hidden_states, _ = hidden_states hidden_states = residual + hidden_states return hidden_states def _make_qwen3_5_gated_delta_forward(apply_mask_fn): """Factory for patched Qwen3_5/Qwen3_5Moe GatedDeltaNet forward with packing support.""" def patched_forward( self, hidden_states: torch.Tensor, cache_params=None, cache_position: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, ): hidden_states = apply_mask_fn(hidden_states, attention_mask) batch_size, seq_len, _ = hidden_states.shape use_precomputed_states = ( cache_params is not None and cache_params.has_previous_state and seq_len == 1 and cache_position is not None ) cu_seqlens = None if not use_precomputed_states and position_ids is not None: cu_seqlens = get_cu_seqlens(position_ids=position_ids) if cache_params is not None: conv_state = cache_params.conv_states[self.layer_idx] recurrent_state = cache_params.recurrent_states[self.layer_idx] # mixed_qkv stays [B, T, D]; only transposed inside paths that require [B, D, T] mixed_qkv = self.in_proj_qkv(hidden_states) # [B, T, D] z = self.in_proj_z(hidden_states) z = z.reshape(batch_size, seq_len, -1, self.head_v_dim) b = 
self.in_proj_b(hidden_states) a = self.in_proj_a(hidden_states) if use_precomputed_states: mixed_qkv = self.causal_conv1d_update( mixed_qkv.transpose(1, 2), conv_state, self.conv1d.weight.squeeze(1), self.conv1d.bias, self.activation, ).transpose(1, 2) else: if cache_params is not None: mixed_qkv_t = mixed_qkv.transpose(1, 2) cache_params.conv_states[self.layer_idx] = F.pad( mixed_qkv_t, (self.conv_kernel_size - mixed_qkv_t.shape[-1], 0), ) if fla_causal_conv1d is not None and cu_seqlens is not None: # FLA varlen kernel for packed sequences; input must be contiguous [B, T, D] mixed_qkv, _ = fla_causal_conv1d( x=mixed_qkv, weight=self.conv1d.weight.squeeze(1), bias=self.conv1d.bias, activation=self.activation, cu_seqlens=cu_seqlens, ) else: if cu_seqlens is not None and fla_causal_conv1d is None: raise RuntimeError( "Packed sequences require fla.modules.convolution.causal_conv1d " "(cu_seqlens support). Install flash-linear-attention or disable packing." ) mixed_qkv = F.silu( self.conv1d(mixed_qkv.transpose(1, 2))[:, :, :seq_len] ).transpose(1, 2) query, key, value = torch.split( mixed_qkv, [self.key_dim, self.key_dim, self.value_dim], dim=-1, ) query = query.reshape(batch_size, seq_len, -1, self.head_k_dim) key = key.reshape(batch_size, seq_len, -1, self.head_k_dim) value = value.reshape(batch_size, seq_len, -1, self.head_v_dim) beta = b.sigmoid() g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias) if self.num_v_heads // self.num_k_heads > 1: query = query.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2) key = key.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2) if not use_precomputed_states: core_attn_out, last_recurrent_state = self.chunk_gated_delta_rule( query, key, value, g=g.to(dtype=query.dtype), beta=beta, initial_state=None, output_final_state=cache_params is not None, use_qk_l2norm_in_kernel=True, # torch_chunk_gated_delta_rule fallback does not accept cu_seqlens **({"cu_seqlens": cu_seqlens} if cu_seqlens is 
def _apply_packing_patches(model_type: str, cls_prefix: str, forward_factory) -> None:
    """
    Install the packing-aware forwards on one transformers modeling module.

    Args:
        model_type: Module stem; the module imported is
            `transformers.models.<model_type>.modeling_<model_type>`.
        cls_prefix: Prefix of the `<prefix>DecoderLayer` and
            `<prefix>GatedDeltaNet` classes inside that module.
        forward_factory: Called with the module's `apply_mask_to_padding_states`;
            its result becomes the GatedDeltaNet `forward`.

    When the module cannot be imported a warning is logged and nothing is patched.
    """
    try:
        module = importlib.import_module(
            f"transformers.models.{model_type}.modeling_{model_type}"
        )
    except ImportError:
        LOG.warning(f"{model_type} not found in transformers, skipping packing patches")
        return

    _inject_fla_kernels(module)

    decoder_layer_cls = getattr(module, f"{cls_prefix}DecoderLayer")
    decoder_layer_cls.forward = _patched_decoder_forward

    gated_delta_cls = getattr(module, f"{cls_prefix}GatedDeltaNet")
    gated_delta_cls.forward = forward_factory(module.apply_mask_to_padding_states)

    conv_status = "available" if fla_causal_conv1d else "unavailable"
    LOG.info(f"Applied {cls_prefix} packing patch (fla_causal_conv1d={conv_status})")
def get_cu_seqlens(position_ids):
    """
    Build cumulative sequence boundaries from ``position_ids``.

    Adapted from transformers.modeling_flash_attention_utils.prepare_fa_kwargs_from_position_ids.
    https://github.com/huggingface/transformers/blob/0f1b128d3359a26bd18be99c26d7f04fb3cba914/src/transformers/modeling_flash_attention_utils.py#L316

    The tensor is flattened and every element equal to 0 is treated as the
    first token of a sequence.

    Args:
        position_ids: integer tensor of any shape; it does not need to be
            contiguous.

    Returns:
        1-D ``torch.int32`` tensor on ``position_ids.device`` holding the flat
        index of every 0 entry followed by the total number of elements.
    """
    tensor_kwargs = {"dtype": torch.int32, "device": position_ids.device}

    # reshape (rather than view) so transposed / strided inputs are accepted;
    # for contiguous inputs this is the same zero-copy flatten as before.
    flat_ids = position_ids.reshape(-1)
    seq_starts = (flat_ids == 0).nonzero().view(-1)
    total_tokens = torch.tensor([flat_ids.numel()], **tensor_kwargs)

    return torch.cat((seq_starts.to(**tensor_kwargs), total_tokens))
def patch_qwen3_next_gateddelta_layer():
    """Patch Qwen3NextGatedDeltaNet to parse cu_seqlens and pass to chunk_gated_delta_rule

    The replacement ``forward`` derives ``cu_seqlens`` from ``position_ids``
    and hands it to the FLA ``causal_conv1d`` kernel and to
    ``chunk_gated_delta_rule``. When the FLA convolution is unavailable the
    plain ``conv1d`` is used instead; packed inputs are rejected in that case.

    Returns:
        A zero-argument ``unpatch`` callable that restores the original
        ``forward``, or ``None`` when the Qwen3Next modeling module cannot be
        imported.
    """
    try:
        from transformers.models.qwen3_next.modeling_qwen3_next import (
            Qwen3NextDynamicCache,
            Qwen3NextGatedDeltaNet,
            apply_mask_to_padding_states,
        )
    except ImportError:
        LOG.warning("Qwen3Next model not found, skipping patch")
        return

    # Store original forward method
    original_gated_delta_net_forward = Qwen3NextGatedDeltaNet.forward

    def patched_gated_delta_net_forward(
        self,
        hidden_states: torch.Tensor,
        cache_params: Optional[Qwen3NextDynamicCache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ):
        hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask)

        # Set up dimensions for reshapes later
        batch_size, seq_len, _ = hidden_states.shape

        use_precomputed_states = (
            cache_params is not None
            and cache_params.has_previous_state
            and seq_len == 1
            and cache_position is not None
        )

        # Compute cu_seqlens early for use by both causal_conv1d and chunk_gated_delta_rule
        cu_seqlens = None
        if not use_precomputed_states and position_ids is not None:
            cu_seqlens = get_cu_seqlens(position_ids=position_ids)

        # getting projected states from cache if it exists
        if cache_params is not None:
            conv_state = cache_params.conv_states[self.layer_idx]
            recurrent_state = cache_params.recurrent_states[self.layer_idx]

        projected_states_qkvz = self.in_proj_qkvz(hidden_states)
        projected_states_ba = self.in_proj_ba(hidden_states)
        query, key, value, z, b, a = self.fix_query_key_value_ordering(
            projected_states_qkvz, projected_states_ba
        )
        query, key, value = (
            x.reshape(x.shape[0], x.shape[1], -1) for x in (query, key, value)
        )

        mixed_qkv = torch.cat((query, key, value), dim=-1)  # [B, T, D]

        if use_precomputed_states:
            # Inference single-token path: causal_conv1d_update expects [B, D, T]
            mixed_qkv = mixed_qkv.transpose(1, 2)
            mixed_qkv = self.causal_conv1d_update(
                mixed_qkv,
                conv_state,
                self.conv1d.weight.squeeze(1),
                self.conv1d.bias,
                self.activation,
            )
            mixed_qkv = mixed_qkv.transpose(1, 2)
        else:
            if cache_params is not None:
                # Cache state expects [B, D, T] for the inference update path
                mixed_qkv_t = mixed_qkv.transpose(1, 2)
                conv_state = F.pad(
                    mixed_qkv_t,
                    (self.conv_kernel_size - mixed_qkv_t.shape[-1], 0),
                )
                cache_params.conv_states[self.layer_idx] = conv_state

            if fla_causal_conv1d is not None:
                # FLA Triton causal_conv1d: [B, T, D] in/out, with cu_seqlens support
                mixed_qkv, _ = fla_causal_conv1d(
                    x=mixed_qkv,
                    weight=self.conv1d.weight.squeeze(1),
                    bias=self.conv1d.bias,
                    activation=self.activation,
                    cu_seqlens=cu_seqlens,
                )
            else:
                # PyTorch fallback (no cu_seqlens support)
                if cu_seqlens is not None and cu_seqlens.shape[0] > batch_size + 1:
                    raise RuntimeError(
                        "Packed sequences require fla.modules.convolution.causal_conv1d "
                        "(cu_seqlens support). Install flash-linear-attention or disable packing."
                    )
                LOG.warning_once(
                    "FLA causal_conv1d not available. Falling back to PyTorch conv1d."
                )
                # Past the check above the batch is not packed, so the
                # boundaries carry no information; drop them so the delta-rule
                # call below does not forward a keyword the torch fallback
                # implementation does not accept.
                cu_seqlens = None
                mixed_qkv = mixed_qkv.transpose(1, 2)
                mixed_qkv = F.silu(self.conv1d(mixed_qkv)[:, :, :seq_len])
                mixed_qkv = mixed_qkv.transpose(1, 2)

        # mixed_qkv is [B, T, D] in all paths
        query, key, value = torch.split(
            mixed_qkv,
            [
                self.key_dim,
                self.key_dim,
                self.value_dim,
            ],
            dim=-1,
        )
        query = query.reshape(query.shape[0], query.shape[1], -1, self.head_k_dim)
        key = key.reshape(key.shape[0], key.shape[1], -1, self.head_k_dim)
        value = value.reshape(value.shape[0], value.shape[1], -1, self.head_v_dim)

        beta = b.sigmoid()
        # If the model is loaded in fp16, without the .float() here, A might be -inf
        g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
        if self.num_v_heads // self.num_k_heads > 1:
            query = query.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2)
            key = key.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2)

        if not use_precomputed_states:
            # torch_chunk_gated_delta_rule fallback does not accept cu_seqlens,
            # so only forward the keyword when there is something to pass.
            varlen_kwargs = {} if cu_seqlens is None else {"cu_seqlens": cu_seqlens}
            core_attn_out, last_recurrent_state = self.chunk_gated_delta_rule(
                query,
                key,
                value,
                g=g,
                beta=beta,
                initial_state=None,
                output_final_state=cache_params is not None,
                use_qk_l2norm_in_kernel=True,
                **varlen_kwargs,
            )
        else:
            core_attn_out, last_recurrent_state = self.recurrent_gated_delta_rule(
                query,
                key,
                value,
                g=g,
                beta=beta,
                initial_state=recurrent_state,
                output_final_state=cache_params is not None,
                use_qk_l2norm_in_kernel=True,
            )

        # Update cache
        if cache_params is not None:
            cache_params.recurrent_states[self.layer_idx] = last_recurrent_state

        z_shape_og = z.shape
        # reshape input data into 2D tensor
        core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
        z = z.reshape(-1, z.shape[-1])
        core_attn_out = self.norm(core_attn_out, z)
        core_attn_out = core_attn_out.reshape(z_shape_og)
        core_attn_out = core_attn_out.reshape(
            core_attn_out.shape[0], core_attn_out.shape[1], -1
        )

        output = self.out_proj(core_attn_out)
        return output

    # Apply the patches
    Qwen3NextGatedDeltaNet.forward = patched_gated_delta_net_forward

    def unpatch():
        """Restore the original forward method"""
        Qwen3NextGatedDeltaNet.forward = original_gated_delta_net_forward

    return unpatch
LOG.info("Applied Qwen3Next patch for packing") ================================================ FILE: src/axolotl/monkeypatch/models/voxtral/__init__.py ================================================ ================================================ FILE: src/axolotl/monkeypatch/models/voxtral/modeling.py ================================================ """Monkeypatch for voxtral to fix leaf node and dtype mismatch""" from typing import Optional, Union import torch from transformers.cache_utils import Cache from transformers.modeling_outputs import CausalLMOutputWithPast def patch_voxtral_conditional_generation_forward(): from transformers.models.voxtral.modeling_voxtral import ( VoxtralForConditionalGeneration, ) # Store the original forward method old_forward = VoxtralForConditionalGeneration.forward def _forward( self, input_ids: Optional[torch.LongTensor] = None, input_features: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, logits_to_keep: Union[int, torch.Tensor] = 0, **kwargs, ) -> CausalLMOutputWithPast: if inputs_embeds is None: inputs_embeds = self.get_input_embeddings()(input_ids) if input_features is not None: audio_embeds = self.get_audio_embeds(input_features) # Cast audio_embeds to match inputs_embeds dtype audio_embeds = audio_embeds.to(inputs_embeds.dtype) # replace text-audio token placeholders with audio embeddings audio_token_mask = input_ids == self.config.audio_token_id inputs_embeds = inputs_embeds.clone() inputs_embeds[audio_token_mask] = audio_embeds outputs = self.language_model( attention_mask=attention_mask, position_ids=position_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, 
class Bnb8bitParametrization(torch.nn.Module):
    """Parametrization that dequantizes int8 row-wise quantized data when the parameter is read."""

    def __init__(self, row_stats: torch.Tensor):
        super().__init__()
        # Kept as a buffer so the statistics follow the module across devices
        # and are saved with it.
        self.register_buffer("row_stats", row_stats)

    @torch.no_grad()
    def forward(self, quantized_param: torch.Tensor) -> torch.Tensor:
        """Dequantize ``quantized_param`` and return it in its original shape.

        Tensors with more than two dimensions are collapsed to 2-D (keeping the
        last dimension) before being handed to BnB's dequantization routine.
        """
        full_shape = quantized_param.shape
        as_matrix = (
            quantized_param.reshape(-1, full_shape[-1])
            if quantized_param.ndim > 2
            else quantized_param
        )
        dequantized = bnb.functional.int8_vectorwise_dequant(as_matrix, self.row_stats)
        return dequantized.reshape(full_shape)
def patch_moe_quantization_on_load(cfg):
    """Patch transformers' weight loading to quantize MoE expert params on-the-fly.

    Every call resets the per-load counters and refreshes the quantization
    settings held in ``_moe_load_state``; the transformers hooks themselves are
    installed only once. The settings are refreshed *before* the
    "already patched" early return so that a later call with a different
    config (8-bit -> 4-bit, or another ``bnb_4bit_quant_type``) takes effect
    in the hook that is already installed.

    Args:
        cfg: config object; ``load_in_8bit``, ``bnb_4bit_quant_type`` and
            ``bnb_4bit_use_double_quant`` are read with ``getattr`` and may be
            absent.
    """
    mode = "8bit" if getattr(cfg, "load_in_8bit", False) else "4bit"
    _moe_load_state["mode"] = mode
    _moe_load_state["count"] = 0
    _moe_load_state["expert_param_order"] = {}

    if mode == "4bit":
        # Imported here only to fail fast when the installed bitsandbytes lacks
        # the helper; the hook re-resolves it at use time.
        from bitsandbytes.nn.parametrize import replace_parameter_4bit  # noqa: F401

        quant_type = getattr(cfg, "bnb_4bit_quant_type", None) or "nf4"
        compress_statistics = getattr(cfg, "bnb_4bit_use_double_quant", None)
        if compress_statistics is None:
            compress_statistics = True
        _moe_load_state["quant_type"] = quant_type
        _moe_load_state["compress_statistics"] = compress_statistics

    if _moe_load_state["patched"]:
        LOG.debug("MoE loading-time quantization patch already active")
        return

    import transformers.core_model_loading
    import transformers.modeling_utils

    # Disable caching_allocator_warmup — it pre-allocates a huge tensor at bf16
    # size for all params, defeating our on-load quantization VRAM savings.
    def _noop_warmup(*args, **kwargs):
        pass

    transformers.modeling_utils.caching_allocator_warmup = _noop_warmup

    original_set_param = transformers.core_model_loading.set_param_for_module

    def _patched_set_param_for_module(model, target_name, param_value, *args, **kwargs):
        original_set_param(model, target_name, param_value, *args, **kwargs)

        if param_value.ndim >= 3 and param_value.is_cuda:
            mod_path, _, pname = target_name.rpartition(".")
            mod = model.get_submodule(mod_path) if mod_path else model
            if not isinstance(mod, (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt)):
                if "expert" not in target_name.lower():
                    LOG.debug(
                        "Skipping non-expert 3D param: %s (shape=%s)",
                        target_name,
                        list(param_value.shape),
                    )
                    return

                # Record definition order before parametrizations override it
                # with alphabetical order.
                if mod_path not in _moe_load_state["expert_param_order"]:
                    _moe_load_state["expert_param_order"][mod_path] = list(
                        mod._parameters.keys()
                    )

                if _moe_load_state["mode"] == "4bit":
                    # Resolved at call time: the mode is read from shared state
                    # and may have been switched to 4-bit after the hook was
                    # installed in 8-bit mode.
                    from bitsandbytes.nn.parametrize import replace_parameter_4bit

                    replace_parameter_4bit(
                        mod,
                        pname,
                        compress_statistics=_moe_load_state["compress_statistics"],
                        quant_type=_moe_load_state["quant_type"],
                    )
                else:
                    replace_parameter_8bit(mod, pname)

                _moe_load_state["count"] += 1

                # Release the bf16 tensor so CUDA memory is freed immediately.
                param_value.data = torch.empty(0, device="cpu")
                torch.cuda.empty_cache()

    transformers.core_model_loading.set_param_for_module = _patched_set_param_for_module
    _moe_load_state["patched"] = True
""" if getattr(patch_peft_target_parameters_matching, "_axolotl_patched", False): return from contextlib import nullcontext from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer from peft.utils.integrations import init_empty_weights from peft.utils.other import _get_submodules def _patched_inject_parameters( self, peft_config, model, adapter_name, low_cpu_mem_usage ): original_targets = list(peft_config.target_parameters) expanded = set(original_targets) # Expand short suffixes to full paths for parametrized modules. for module_name, module in model.named_modules(): if not hasattr(module, "parametrizations"): continue for target in original_targets: mod_path, _, param_name = target.rpartition(".") if ( module_name == mod_path or module_name.endswith("." + mod_path) ) and hasattr(module, param_name): expanded.add(f"{module_name}.{param_name}") target_names_set = expanded def strip_base_layer_from_name(module_name): name = ".base_layer" while name in module_name: prefix, _, suffix = module_name.rpartition(name) module_name = prefix + suffix return module_name def create_and_replace_param(module_name, key, param_name): parent, target, target_name = _get_submodules(model, module_name) unwrapped_module_name = strip_base_layer_from_name(module_name) unwrapped_module = model.get_submodule(unwrapped_module_name) if ( isinstance(unwrapped_module, BaseTunerLayer) and unwrapped_module.__class__.__name__ != "ParamWrapper" ): raise ValueError( f"Trying to wrap an `nn.Parameter` of layer " f"'{unwrapped_module_name}' of type " f"{type(target).__name__}, which is not a valid target. " f"Make sure that this layer is not also targeted with " f"`target_modules`." 
) self._check_target_module_compatiblity(peft_config, model, target_name) ctx = init_empty_weights if low_cpu_mem_usage else nullcontext with ctx(): self._create_and_replace( peft_config, adapter_name, target, target_name, parent, current_key=key, parameter_name=param_name.rpartition(".")[-1], ) # Use definition order (not alphabetical order) for parametrized modules # so ParamWrapper nesting matches vanilla PEFT on a plain model. expert_param_order = _moe_load_state.get("expert_param_order", {}) for module_name, module in model.named_modules(): if hasattr(module, "parametrizations"): stored_order = expert_param_order.get(module_name) if stored_order is not None: params_iter = [ p for p in stored_order if p in module.parametrizations ] else: # Fallback for paths that bypass model loading (e.g. unit tests). params_iter = list(module.parametrizations.keys()) for param_name in params_iter: key = f"{module_name}.{param_name}" if (key in target_names_set) or any( key.endswith(f".{t}") for t in target_names_set ): create_and_replace_param(module_name, key, param_name) self.targeted_parameter_names.append(key) else: unwrapped_module_name = strip_base_layer_from_name(module_name) for param_name, _ in module.named_parameters(recurse=False): key = f"{unwrapped_module_name}.{param_name}" if (key in target_names_set) or any( key.endswith(f".{t}") for t in target_names_set ): create_and_replace_param(module_name, key, param_name) self.targeted_parameter_names.append(key) BaseTuner._inject_parameters = _patched_inject_parameters patch_peft_target_parameters_matching._axolotl_patched = True LOG.info("Patched PEFT _inject_parameters for consistent ParamWrapper ordering") ================================================ FILE: src/axolotl/monkeypatch/multipack.py ================================================ """multipack patching for v2 of sample packing""" import importlib import transformers from accelerate import init_empty_weights from transformers import AutoConfig, 
def patch_for_multipack(model_type, model_name=None, has_remote_code=False):
    """Swap in the multipack-aware ``get_unpad_data`` for flash attention.

    Args:
        model_type: model type string; ``"mixtral"`` additionally receives the
            MoE forward patch when DeepSpeed ZeRO-3 is enabled.
        model_name: model id/path, only used when ``has_remote_code`` is true.
        has_remote_code: patch the model's own remote ``modeling_*`` module
            (via ``patch_remote``) instead of transformers' shared utilities.

    Raises:
        AssertionError: transformers exposes ``modeling_flash_attention_utils``
            but no longer has ``_get_unpad_data`` in it.
    """
    if has_remote_code:
        patch_remote(model_name)
    elif hasattr(transformers, "modeling_flash_attention_utils"):
        fa_utils = transformers.modeling_flash_attention_utils
        # sanity check in case upstream api changes on this. Raised explicitly
        # (same exception type as before) because an ``assert`` statement is
        # stripped under ``python -O``, which would let the assignment below
        # silently create an attribute nothing reads.
        if not hasattr(fa_utils, "_get_unpad_data"):
            raise AssertionError(
                "transformers api changed for _get_unpad_data for flash attention"
            )
        fa_utils._get_unpad_data = get_unpad_data

    if model_type == "mixtral" and is_deepspeed_zero3_enabled():
        patch_mixtral_moe_forward_zero3()
def patch_peft_prep_code():
    """
    monkeypatch peft's prepare_model_for_kbit_training so it does not upcast everything

    The function's source is fetched, the fp16/bf16 -> fp32 upcast loop
    (``ORIGINAL_PREPARE_CODE``) is replaced by ``PATCHED_PREPARE_CODE``, which
    skips parameters whose name contains ``embed_tokens`` or ``lm_head``, and
    the result is compiled as ``fixed_prepare_model_for_kbit_training`` in this
    module's globals. That function is then installed on ``peft.utils.other``
    and ``axolotl.loaders.model``.

    Nothing is patched when the source is unavailable (``OSError``) or when the
    expected loop is not found in it.
    """
    try:
        prep_code = get_peft_prep_code()
    except OSError:
        return

    # Keeps the unmodified source text around. NOTE(review): the attribute name
    # refers to a different function and looks like a copy/paste leftover; it
    # is left unchanged here in case something reads it.
    peft.utils.other._original_create_accelerator_and_postprocess = prep_code

    prep_code, _ = detab_code(prep_code)
    if ORIGINAL_PREPARE_CODE not in prep_code:
        return

    prep_code = prep_code.replace(ORIGINAL_PREPARE_CODE, PATCHED_PREPARE_CODE)
    prep_code = prep_code.replace(
        "def prepare_model_for_kbit_training(",
        "def fixed_prepare_model_for_kbit_training(",
        1,
    )

    # Bring every name from peft.utils.other that appears in the source text
    # into this module's globals so the exec'd function can resolve them.
    items_to_import = [item for item in dir(peft.utils.other) if item in prep_code]

    exec(
        "from peft.utils.other import (" + ", ".join(items_to_import) + ")",
        globals(),
    )
    exec(prep_code, globals())
    LOG.info("patching prepare_model_for_kbit_training to allow for overrides")
    peft.utils.other.prepare_model_for_kbit_training = (
        fixed_prepare_model_for_kbit_training
    )
    axolotl.loaders.model.prepare_model_for_kbit_training = (
        fixed_prepare_model_for_kbit_training
    )
isinstance(optimizer, ZeroRedundancyOptimizer): optimizer_state = optimizer.optim.state for group in optimizer.param_groups: for param in group["params"]: state = optimizer_state[param] for key, value in state.items(): if key not in optimizer_state_keys: continue if torch.is_tensor(value): try: pruning_fn(value) n_total += value.numel() n_zeros += torch.sum(value == 0).item() except RuntimeError as exc: if "quantile() input tensor is too large" in str(exc): pass else: raise exc _zeroed = n_zeros / (1e-7 + n_total) * 100 LOG.info(f"Percent of optimizer states zeroed: {_zeroed:.2f}") LOG.info(f"absolute n of optimizer states zeroed: {n_zeros}") class ReLoRACallback(TrainerCallback): """Callback to merge LoRA weights into the base model and save full-weight checkpoints""" def __init__(self, cfg: DictDefault): self.relora_steps = cfg.jagged_restart_steps self.cpu_offload = cfg.relora_cpu_offload self.quantized = cfg.load_in_4bit or cfg.load_in_8bit self.last_full_model = cfg.base_model self.resume_from_checkpoint = cfg.resume_from_checkpoint if not os.path.exists(self.last_full_model): self.last_full_model = str(Path(snapshot_download(cfg.base_model))) assert os.path.exists(self.last_full_model), ( "for ReLORA base_model must be a local path" ) self.num_lora_restarts = 0 self.need_full_save = False def on_train_begin( self, _args: TrainingArguments, _state: TrainerState, control: TrainerControl, model: peft.LoraModel, **_kwargs, ): if self.resume_from_checkpoint: weight_path = os.path.join(self.resume_from_checkpoint, "relora") if not os.path.exists(weight_path): LOG.warning( "Resuming ReLoRA from checkpoint, but no full-weight save found" ) else: LOG.info(f"Loading adjusted base weights from {weight_path}") load_weight_checkpoint(model, weight_path) return control def on_step_begin( self, args: TrainingArguments, state: TrainerState, control: TrainerControl, model: peft.LoraModel, optimizer: torch.optim.Optimizer, **_kwargs, ): if not optimizer: optimizer = 
state.optimizer if state.global_step > 0 and state.global_step % self.relora_steps == 0: checkpoint_folder = os.path.join( args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}", "relora", ) if "adam" in args.optim.lower(): optimizer_state_keys = ["exp_avg", "exp_avg_sq"] if "8bit" in args.optim.lower(): optimizer_state_keys.append("state1") optimizer_state_keys.append("state2") else: raise ValueError(f"Optimizer {args.optim} not supported with ReLoRA") lora_params = [ n for n, p in model.named_parameters() if p.requires_grad and "lora_" in n ] model.save_pretrained( os.path.join( args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}", "adapter", ), ) with torch.no_grad(): merge_and_save( model, self.last_full_model, checkpoint_folder, reinit=True, quantized=self.quantized, actually_save=is_main_process(), cpu_offload=self.cpu_offload, ) reset_optimizer( optimizer, reset_params=lora_params, optimizer_state_keys=optimizer_state_keys, optimizer_magnitude_pruning=args.relora_prune_ratio, ) if self.quantized: self.last_full_model = checkpoint_folder self.num_lora_restarts += 1 return control def on_save( self, args: TrainingArguments, state: TrainerState, control: TrainerControl, model: peft.LoraModel, **_kwargs, ): checkpoint_folder = os.path.join( args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}", "relora" ) if ( state.global_step >= self.relora_steps and state.global_step % self.relora_steps != 0 ): if self.quantized: if is_main_process() and self.last_full_model != checkpoint_folder: # ensure the latest full parameter save is in the latest checkpoint # folder, so that automatic pruning of checkpoints does not remove it LOG.info(f"moving last full parameter save to {checkpoint_folder}") os.makedirs(checkpoint_folder, exist_ok=True) chunks = glob.glob( f"{self.last_full_model}/model*.safetensors" ) + glob.glob(f"{self.last_full_model}/model*.index.json") for path in chunks: new_path = os.path.abspath(shutil.move(path, 
def sharded_paths(path: str, module_names: List[str]) -> Dict[str, str]:
    """Map weight keys to the checkpoint file inside ``path`` that stores them.

    ``model.safetensors`` is used when that file or its ``.index.json`` exists
    in ``path``; otherwise ``pytorch_model.bin`` is assumed. If an index file
    exists for the chosen name, its ``weight_map`` is returned as-is. Without
    an index, every ``<module_name>.weight`` is mapped to the single weights
    file.
    """
    root = Path(path)

    weights_file = "model.safetensors"
    has_safetensors = os.path.exists(str(root / weights_file)) or os.path.exists(
        str(root / f"{weights_file}.index.json")
    )
    if not has_safetensors:
        weights_file = "pytorch_model.bin"

    index_file = str(root / f"{weights_file}.index.json")
    if not os.path.exists(index_file):
        # Unsharded checkpoint: everything lives in the one weights file.
        return {f"{name}.weight": weights_file for name in module_names}

    with open(index_file, "r", encoding="utf-8") as handle:
        index = json.load(handle)
    return index["weight_map"]
def find_lora_modules(model: peft.LoraModel) -> Dict[str, peft.tuners.lora.LoraLayer]:
    """Collect every LoRA-wrapped module of ``model``, keyed by its module path.

    Module paths containing ``"lora"`` are skipped so that only the wrapping
    layers (not their adapter sub-modules) are inspected.
    """
    modules: Dict[str, peft.tuners.lora.LoraLayer] = {}

    key_list = [key for key, _ in model.model.named_modules() if "lora" not in key]
    for key in key_list:
        try:
            _parent, target, _target_name = peft.utils._get_submodules(model.model, key)
        except AttributeError:
            # Path could not be resolved on the wrapped model; nothing to merge.
            continue

        if isinstance(target, peft.tuners.lora.LoraLayer):
            modules[key] = target

    return modules


def _reset_lora_adapters(target: peft.tuners.lora.LoraLayer) -> None:
    """Re-initialise every LoRA and LoRA-embedding adapter registered on ``target``."""
    for adapter_name in target.lora_A:
        target.reset_lora_parameters(adapter_name, True)
    for adapter_name in target.lora_embedding_A:
        target.reset_lora_parameters(adapter_name, True)


def _safetensors_shard_name(shard_path: str) -> str:
    """Return the file name under which ``shard_path`` is re-saved as safetensors.

    Only names starting with ``pytorch_model`` are rewritten. The ``.bin``
    extension is removed as an exact suffix: ``str.rstrip(".bin")`` would strip
    any trailing run of the characters ``.``, ``b``, ``i``, ``n`` and corrupt
    names such as ``pytorch_model_main.bin``.
    """
    if not shard_path.startswith("pytorch_model"):
        return shard_path
    stem = shard_path.replace("pytorch_model", "model")
    if stem.endswith(".bin"):
        stem = stem[: -len(".bin")]
    return stem + ".safetensors"


def update_weights(
    target: peft.tuners.lora.LoraLayer, new_weight: torch.Tensor, reinit: bool, device
):
    """Replace the base weight of ``target`` with ``new_weight`` on ``device``.

    Args:
        target: LoRA layer whose base weight is overwritten.
        new_weight: Full (merged) weight to install.
        reinit: When true, the layer's adapters are re-initialised first.
        device: Device the updated weight should end up on.
    """
    if reinit:
        _reset_lora_adapters(target)

    if isinstance(target, peft.tuners.lora.Linear4bit):
        # This could be faster, but the quantization of Linear4bit weights occurs
        # when the module is moved from cpu to gpu. Without meddling *too* deeply in
        # PEFT's innards or maintaining a duplicate of that codepath, this is good
        # enough for now.
        target.weight.quant_state = None
        target.weight.data = new_weight.cpu()
        target.to(device)
    elif isinstance(target, peft.tuners.lora.Linear8bitLt):
        target.weight.data = (
            bnb.nn.Int8Params(new_weight, requires_grad=False).to(device).data
        )
    else:
        target.weight.data = new_weight.to(device)


def merge_and_save(
    model: peft.LoraModel,
    model_src: str,
    model_dst: str,
    reinit: bool = False,
    quantized: bool = False,
    cpu_offload: bool = False,
    actually_save: bool = True,
):
    """Fold the LoRA deltas into the base weights and optionally persist them.

    Unquantized models are merged in place and nothing is written. For quantized
    models the full-precision weights are read shard by shard from ``model_src``,
    the LoRA delta is added, the live layers are updated, and (when
    ``actually_save`` is true) each shard is written to ``model_dst`` as
    half-precision safetensors together with an index file if there are several
    shards.

    Args:
        model: PEFT LoRA model to merge.
        model_src: Directory holding the current full-precision weights.
        model_dst: Directory receiving the merged weights.
        reinit: Re-initialise the adapters after merging.
        quantized: Whether the base weights are quantized (selects the shard path).
        cpu_offload: Do the merge arithmetic on CPU instead of the weight's device.
        actually_save: Write files; other callers only update their live weights.
    """
    modules = find_lora_modules(model)

    if not quantized:
        for target in modules.values():
            active_adapter = target.active_adapter
            if isinstance(active_adapter, list):
                active_adapter = active_adapter[0]
            update = target.get_delta_weight(active_adapter).detach()
            target.weight.data += update

            if reinit:
                _reset_lora_adapters(target)
        return

    os.makedirs(model_dst, exist_ok=True)
    shard_paths = sharded_paths(model_src, modules.keys())
    out_shard_paths = {}

    # Sorted so that shards are processed (and logged) in a reproducible order.
    unique_shards = sorted(set(shard_paths.values()))
    for shard_path in unique_shards:
        out_tensors = {}
        if shard_path.endswith(".safetensors"):
            in_tensors = st.load_file(str(Path(model_src) / shard_path))
        else:
            in_tensors = torch.load(
                Path(model_src) / shard_path,
                weights_only=True,  # to prevent arbitrary code execution
            )
            if "state_dict" in in_tensors:
                in_tensors = in_tensors["state_dict"]

        for module_name, target in modules.items():
            key = module_name + ".weight"
            if shard_paths.get(key) != shard_path:
                continue

            orig_weight = in_tensors[key]
            old_dev = target.weight.device
            math_dev = "cpu" if cpu_offload else old_dev

            delta_weight = lora_delta_weight(target, math_dev)
            new_weight = orig_weight.to(math_dev) + delta_weight
            del delta_weight

            if actually_save:
                out_tensors[key] = new_weight.half().cpu()

            # Every caller updates its live weights, not only the one that saves.
            update_weights(target, new_weight, reinit=reinit, device=old_dev)

        if actually_save:
            out_shard_name = _safetensors_shard_name(shard_path)

            # Carry over tensors that have no LoRA delta and record where every
            # tensor of this shard ends up for the index file.
            for module_name in in_tensors:
                if module_name not in out_tensors:
                    out_tensors[module_name] = in_tensors[module_name].half()
                out_shard_paths[module_name] = out_shard_name

            shard_fn = str(Path(model_dst) / out_shard_name)
            LOG.info(f"saving tensors to {shard_fn}")
            st.save_file(out_tensors, shard_fn, metadata={"format": "pt"})

        barrier()
        del in_tensors
        del out_tensors
        torch.cuda.empty_cache()

    if actually_save and len(unique_shards) > 1:
        with open(
            str(Path(model_dst, "model.safetensors.index.json")), "w", encoding="utf-8"
        ) as file:
            json.dump({"metadata": {}, "weight_map": out_shard_paths}, file)


def load_weight_checkpoint(model: peft.LoraModel, checkpoint_path: str):
    """Load saved full weights from ``checkpoint_path`` into the LoRA base layers.

    Shards are read with the safetensors loader; adapters are left untouched.
    """
    modules = find_lora_modules(model)
    shard_paths = sharded_paths(checkpoint_path, modules.keys())
    unique_shards = sorted(set(shard_paths.values()))

    for shard_path in unique_shards:
        tensors = st.load_file(os.path.join(checkpoint_path, shard_path))

        for module_name, target in modules.items():
            key = module_name + ".weight"
            if shard_paths.get(key) != shard_path:
                continue

            new_weight = tensors[key]
            update_weights(
                target, new_weight, reinit=False, device=target.weight.device
            )
def create_flash_attn_forward_varlen_llama3(
    process_group: dist.ProcessGroup, ring_attn_func: RingAttnFunc
) -> Callable:
    """
    Create a ring flash attention forward function compatible with HuggingFace's
    interface.

    The returned closure looks the kernel up in `RING_ATTN_FUNC_MAPPING` on every
    call, so a `ring_attn_func` that is not a key of that mapping fails with
    `KeyError` when attention is first executed, not when this factory is called.

    Args:
        process_group: A PyTorch distributed process group.
        ring_attn_func: Function from `ring_flash_attention` to replace HF flash
            attention with.

    Returns:
        A function that implements the ring flash attention forward pass with the
            signature expected by HuggingFace Transformers.
    """

    # transformers 4.48+
    # NOTE: the parameter list is compared against the transformers function by
    # `check_params` in `substitute_hf_flash_attn`; keep it in sync with upstream.
    def _flash_attention_forward(
        query_states: torch.Tensor,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        attention_mask: torch.Tensor,
        query_length: int,
        is_causal: bool,
        dropout: float = 0.0,
        position_ids: torch.Tensor | None = None,
        softmax_scale: float | None = None,
        sliding_window: int | None = None,
        use_top_left_mask: bool = False,
        softcap: float | None = None,
        deterministic: bool = None,
        cu_seq_lens_q: torch.LongTensor | None = None,
        cu_seq_lens_k: torch.LongTensor | None = None,
        max_length_q: int | None = None,
        max_length_k: int | None = None,
        target_dtype: torch.dtype | None = None,
        attn_implementation: str | None = None,
        **kwargs,
    ):
        """
        Calls the forward method of Ring Flash Attention.

        Args:
            query_states: Tensor containing the query vectors.
            key_states: Tensor containing the key vectors.
            value_states: Tensor containing the value vectors.
            attention_mask: Not used in this implementation.
            query_length: Integer representing the length of the query sequence.
            is_causal: Boolean indicating whether to apply a causal mask to the attention.
            dropout: Float representing the dropout probability. Default is 0.0.
            position_ids: Not used in this implementation.
            softmax_scale: Optional float value for the softmax scaling factor. Default is None.
            sliding_window: Optional integer defining the size of the sliding attention window.
                Default is None.
            use_top_left_mask: Boolean indicating whether to use a top-left mask for the attention.
                Default is False.
            softcap: Not used in this implementation.
            deterministic: Optional boolean to enforce deterministic computation. Default is None.
            cu_seq_lens_q: Not used in this implementation.
            cu_seq_lens_k: Not used in this implementation.
            max_length_q: Not used in this implementation.
            max_length_k: Not used in this implementation.
            target_dtype: Not used in this implementation.
            attn_implementation: Not used in this implementation.
            **kwargs: Additional keyword arguments. Not used in this implementation.

        Returns:
            torch.Tensor: The output of the attention mechanism, with shape
                `[batch_size, query_length, num_heads, head_dim]`.
        """
        # With a top-left mask, a single-token query must not be treated as causal.
        if not use_top_left_mask:
            causal = is_causal
        else:
            causal = is_causal and query_length != 1

        # Handle sliding window
        # Only windowed when the key length (dim 1) exceeds the window size.
        use_sliding_windows = (
            _flash_supports_window
            and sliding_window is not None
            and key_states.shape[1] > sliding_window
        )
        window_size = (
            (sliding_window, sliding_window) if use_sliding_windows else (-1, -1)
        )

        # Handle deterministic mode
        # NOTE(review): on flash-attn older than 2.4.1 an unset `deterministic`
        # stays None and is forwarded as-is below - confirm the kernel accepts it.
        if is_flash_attn_greater_or_equal("2.4.1"):
            if deterministic is None:
                deterministic = (
                    os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"
                )

        # Call ring flash attention function
        attn_output = RING_ATTN_FUNC_MAPPING[ring_attn_func](
            query_states,
            key_states,
            value_states,
            dropout_p=dropout,
            softmax_scale=softmax_scale,
            causal=causal,
            window_size=window_size,
            alibi_slopes=None,
            deterministic=deterministic,
            return_attn_probs=False,
            group=process_group,
        )

        return attn_output

    return _flash_attention_forward


def substitute_hf_flash_attn(
    process_group: dist.ProcessGroup, ring_attn_func: RingAttnFunc
):
    """
    Substitute HuggingFace's flash attention implementation with ring-based implementation.

    Replaces `transformers.modeling_flash_attention_utils._flash_attention_forward`
    and then points the `"flash_attention_2"` entry of `ALL_ATTENTION_FUNCTIONS` at
    the `ring_flash_attn` HF adapter's `flash_attention_forward`.

    Args:
        process_group: PyTorch distributed process group for communication.
        ring_attn_func: Function from `ring_flash_attention` to replace HF flash
            attention with.

    Raises:
        ValueError: If the replacement could not be installed; any failure inside
            the substitution (including a signature mismatch reported by
            `check_params`) is re-raised as an unsupported-version error with the
            original exception chained.
    """
    try:
        # Substitute flash attention
        old_flash_attention_forward = (
            transformers.modeling_flash_attention_utils._flash_attention_forward
        )
        new_flash_attention_forward = create_flash_attn_forward_varlen_llama3(
            process_group=process_group, ring_attn_func=ring_attn_func
        )

        if check_params(old_flash_attention_forward, new_flash_attention_forward):
            transformers.modeling_flash_attention_utils._flash_attention_forward = (
                new_flash_attention_forward
            )
        else:
            raise ValueError(
                "The signature of the new flash attention forward function does not match the old one."
            )
    except Exception as exception:
        raise ValueError(
            f"The current transformer version {transformers.__version__} is not supported. "
            "Please use pip install -U transformers to upgrade to the latest version. "
            "If the code failed with the latest version, "
            f"please file an issue."
        ) from exception

    # Register with ALL_ATTENTION_FUNCTIONS if available
    if ALL_ATTENTION_FUNCTIONS is not None:
        # Imported lazily, at the point of registration.
        from ring_flash_attn.adapters.hf_adapter import flash_attention_forward

        ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = flash_attention_forward
""" import os from typing import Optional import torch import torch.distributed as dist from torch.distributed import DeviceMesh try: from transformers.modeling_flash_attention_utils import _flash_supports_window except ImportError: try: from transformers.modeling_flash_attention_utils import ( _flash_supports_window_size as _flash_supports_window, ) except ImportError: _flash_supports_window = True from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids from axolotl.utils.logging import get_logger from axolotl.utils.schemas.enums import RingAttnFunc LOG = get_logger(__name__) RING_ATTN_GROUP = None def get_ring_attn_group() -> dist.ProcessGroup: """Getter for ring attention group on this rank.""" if RING_ATTN_GROUP is None: raise RuntimeError("register_ring_attn_from_device_mesh() not yet called") return RING_ATTN_GROUP def set_ring_attn_group(ring_attn_group: dist.ProcessGroup | None): """Setter for ring attention group on this rank.""" global RING_ATTN_GROUP RING_ATTN_GROUP = ring_attn_group def create_ring_flash_attention_forward( process_group: dist.ProcessGroup, heads_k_stride: int ): from ring_flash_attn import llama3_flash_attn_varlen_func from ring_flash_attn.adapters.hf_adapter import DATA_PARAMS def _flash_attention_forward_v3( query_states: torch.Tensor, key_states: torch.Tensor, value_states: torch.Tensor, attention_mask: torch.Tensor, query_length: int, is_causal: bool, dropout: float = 0.0, position_ids: Optional[torch.Tensor] = None, softmax_scale: Optional[float] = None, sliding_window: Optional[int] = None, use_top_left_mask: bool = False, softcap: Optional[float] = None, deterministic: bool = None, cu_seq_lens_q: Optional[torch.LongTensor] = None, cu_seq_lens_k: Optional[torch.LongTensor] = None, max_length_q: Optional[int] = None, max_length_k: Optional[int] = None, target_dtype: Optional[torch.dtype] = None, attn_implementation: Optional[str] = None, **kwargs, ): if not use_top_left_mask: causal = is_causal else: # TODO: Remove the 
def register_ring_attn_from_device_mesh(
    device_mesh: "DeviceMesh",
    context_parallel_dim: tuple[str, ...],
    heads_k_stride: int | None,
    ring_attn_func: RingAttnFunc | None,
):
    """Create ring attention group using DeviceMesh and substitute flash attn with
    ring flash attn.

    Args:
        device_mesh: DeviceMesh object containing the parallelism topology.
        context_parallel_dim: Name(s) of the sequence parallel dimension in the
            device mesh, used to index `device_mesh`.
        heads_k_stride: Sequence parallelism K head stride size. Passed through to
            `varlen_llama3` `ring_flash_attn` implementation; `None` (or 0) falls
            back to 1.
        ring_attn_func: `ring_flash_attn` ring attention implementation. If sample
            packing is enabled, it must be a `varlen` function; otherwise, it must
            be a `batch` function.

    Raises:
        ValueError: If `context_parallel_dim` cannot be looked up in `device_mesh`.
    """
    rank = dist.get_rank()

    LOG.info(
        f"Enabling ring attention sequence parallelism using DeviceMesh "
        f"dimension '{context_parallel_dim}'",
    )

    # Extract the sequence parallel submesh
    try:
        sequence_mesh = device_mesh[context_parallel_dim]
    except (KeyError, IndexError) as e:
        raise ValueError(
            f"Dimension '{context_parallel_dim}' not found in device_mesh. "
            f"Available dimensions: {device_mesh.mesh_dim_names}"
        ) from e

    # Get the process group for context parallelism
    sequence_pg = sequence_mesh.get_group()
    context_parallel_size = sequence_mesh.size()

    if rank == 0:
        LOG.info(
            f"Sequence parallel degree: {context_parallel_size}, "
            f"mesh shape: {sequence_mesh.mesh.shape}"
        )

    # Log which ranks are in the current process group
    if sequence_pg != dist.GroupMember.WORLD:
        ranks_in_group = dist.get_process_group_ranks(sequence_pg)
        LOG.info(f"Current sequence parallel group ranks: {ranks_in_group}")

    # Set the ring attention group
    set_ring_attn_group(sequence_pg)

    if ring_attn_func is RingAttnFunc.VARLEN_LLAMA3:
        # Swap the upstream adapter's forward factory for this module's version
        # before asking the adapter to patch transformers.
        # fmt: off
        import ring_flash_attn.adapters.hf_adapter

        from ring_flash_attn.adapters.hf_adapter import (  # isort: skip
            create_ring_flash_attention_forward as create_ring_flash_attention_forward_orig,
        )

        create_ring_flash_attention_forward_orig = (  # noqa: F811,F841
            create_ring_flash_attention_forward
        )
        ring_flash_attn.adapters.hf_adapter.create_ring_flash_attention_forward = create_ring_flash_attention_forward
        # fmt: on

        ring_flash_attn.adapters.hf_adapter.substitute_hf_flash_attn(
            process_group=get_ring_attn_group(), heads_k_stride=heads_k_stride or 1
        )
    elif ring_attn_func is RingAttnFunc.BATCH_RING:
        from axolotl.monkeypatch.ring_attn.adapters.batch import (
            substitute_hf_flash_attn,
        )

        substitute_hf_flash_attn(
            process_group=get_ring_attn_group(),
            ring_attn_func=ring_attn_func,
        )
    else:
        # The group is registered above, but no attention implementation was
        # swapped in; make that visible instead of failing silently.
        LOG.warning(
            f"ring_attn_func={ring_attn_func!r} is not handled; the ring attention "
            "group was registered but flash attention was not patched"
        )


def update_ring_attn_params(position_ids: torch.Tensor | None):
    """
    Calculate the cumulative sequence lengths for the current forward pass and pass the
    value to the substituted `ring_flash_attn`.

    Args:
        position_ids: Optional tensor of position IDs (for sample packed data).
            NOTE(review): the value is handed to `get_cu_seqlens_from_pos_ids`
            without a `None` check - confirm that helper accepts `None`.
    """
    from ring_flash_attn import update_ring_flash_attn_params

    cu_seqlens, _ = get_cu_seqlens_from_pos_ids(position_ids)
    # Drop size-1 dimensions and move to the current CUDA device.
    cu_seqlens = cu_seqlens.squeeze().to(device=torch.cuda.current_device())
    update_ring_flash_attn_params(cu_seqlens, get_ring_attn_group())
def ssmax_flex_attention_forward(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask,
    scaling: float | None = None,
    softcap: float | None = None,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor | None]:
    """FlexAttention forward with SSMax: score * (s * log(n) + b).

    Args:
        module: Attention module; only its `training` flag is read.
        query: Query tensor.
        key: Key tensor.
        value: Value tensor.
        attention_mask: Either a `BlockMask` (passed to FlexAttention as
            `block_mask`) or a mask that is added to the scores inside
            `score_mod`; may be `None`.
        scaling: Forwarded to FlexAttention as `scale`.
        softcap: Optional tanh soft-capping value applied after the SSMax scale.
        **kwargs: `dropout`, `position_ids` and `kernel_options` are read; anything
            else is ignored.

    Returns:
        Tuple of the attention output (dims 1 and 2 swapped, contiguous) and the
        log-sum-exp tensor, which is `None` when running on CPU.

    Raises:
        ValueError: If a positive `dropout` is requested.
    """
    if kwargs.get("dropout", 0.0) > 0:
        raise ValueError("flex_attention does not support dropout")

    # SSMax parameters come from module-level config, with the same defaults
    # used when the patch is installed.
    ssmax_s = _ssmax_config.get("ssmax_s", 0.43)
    ssmax_b = _ssmax_config.get("ssmax_b", 0.0)

    position_ids = kwargs.get("position_ids", None)
    # NOTE(review): the flattened tensor is indexed by `q_idx` only, so
    # `batch_idx` is ignored in the lookup below - looks correct only when
    # the batch dimension is 1; confirm against callers.
    position_ids_flat = position_ids.view(-1) if position_ids is not None else None

    # A BlockMask goes to the kernel; any other mask is added to the scores.
    block_mask = attention_mask if isinstance(attention_mask, BlockMask) else None
    score_mask = None if block_mask else attention_mask

    if score_mask is not None:
        # Trim the mask's last dimension to the key length.
        score_mask = score_mask[:, :, :, : key.shape[-2]]

    def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):
        """
        Apply SSMax scaling: score * (s * log(n) + b)
        where n is the relative position within each packed sequence.
        """
        if position_ids_flat is not None:
            relative_pos = position_ids_flat[q_idx]
            n = (relative_pos + 1).float()
        else:
            n = (q_idx + 1).float()

        # Clamp so that log(n) is never below log(2).
        n = torch.clamp(n, min=2.0)
        ssmax_scale = ssmax_s * torch.log(n) + ssmax_b
        score = score * ssmax_scale

        if softcap is not None:
            score = softcap * torch.tanh(score / softcap)
        if score_mask is not None:
            score = score + score_mask[batch_idx][0][q_idx][kv_idx]
        return score

    enable_gqa = True
    # True when query.shape[1] is not a power of two: expand K/V explicitly
    # and turn off the kernel's GQA handling.
    # (presumably dim 1 is the head dimension - TODO confirm)
    if (query.shape[1] & (query.shape[1] - 1)) != 0:
        key = repeat_kv(key, query.shape[1] // key.shape[1])
        value = repeat_kv(value, query.shape[1] // value.shape[1])
        enable_gqa = False

    # The log-sum-exp is only requested off-CPU.
    return_lse = query.device.type != "cpu"

    flex_output = compile_friendly_flex_attention(
        query,
        key,
        value,
        score_mod=score_mod,
        block_mask=block_mask,
        enable_gqa=enable_gqa,
        scale=scaling,
        kernel_options=kwargs.get("kernel_options"),
        return_lse=return_lse,
        training=module.training,
    )

    if return_lse:
        attention_output, lse = flex_output
        lse = lse.to(value.dtype)
    else:
        attention_output, lse = flex_output, None

    return attention_output.transpose(1, 2).contiguous(), lse
def replace_stablelm_attn_with_flash_attn(model_name="stabilityai/stablelm-3b-4e1t"):
    """Patch the remotely loaded StableLM Epoch classes with the forwards in this file.

    The config and an empty-weight model are loaded with `trust_remote_code=True`
    so that the remote `modeling_stablelm_epoch` module becomes importable; its
    `Attention`, `StableLMEpochModel` and `DecoderLayer` classes then get their
    `forward` replaced.
    """
    # this is a wonky hack to get the remotely loaded module
    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    # we need to load the model here in order for modeling_stablelm_epoch to be available
    with init_empty_weights():
        AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

    config_module = config.__class__.__module__
    modeling_module = config_module.replace(
        ".configuration_stablelm_epoch", ".modeling_stablelm_epoch"
    )
    modeling_stablelm = importlib.import_module(modeling_module)

    replacements = (
        ("Attention", flashattn_attn),
        ("StableLMEpochModel", stablelm_model_forward),
        ("DecoderLayer", decoder_layer_forward),
    )
    for class_name, forward in replacements:
        getattr(modeling_stablelm, class_name).forward = forward


def rotate_half(x: torch.Tensor):
    """Rotates half the hidden dims of the input."""
    first, second = x.chunk(2, dim=-1)
    return torch.cat((-second, first), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
    """Apply rotary position embeddings to `q` and `k`.

    `cos` and `sin` are squeezed on their first two dimensions, gathered at
    `position_ids` and broadcast over the head dimension.

    Returns:
        Tuple `(q_embed, k_embed)`.
    """

    def _gather(table):
        # The first two dimensions of cos and sin are always 1, so we can
        # `squeeze` them: [seq_len, dim] -> gather -> [batch_size, 1, seq_len, dim]
        return table.squeeze(1).squeeze(0)[position_ids].unsqueeze(1)

    cos_sel = _gather(cos)
    sin_sel = _gather(sin)
    return (
        (q * cos_sel) + (rotate_half(q) * sin_sel),
        (k * cos_sel) + (rotate_half(k) * sin_sel),
    )


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The
    hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to
    (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    expanded = hidden_states.unsqueeze(2).expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return expanded.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
self.num_key_value_heads, self.head_dim ).transpose(1, 2) value_states = value_states.view( bsz, q_len, self.num_key_value_heads, self.head_dim ).transpose(1, 2) query_rot = query_states[..., : self.rotary_ndims] query_pass = query_states[..., self.rotary_ndims :] key_rot = key_states[..., : self.rotary_ndims] key_pass = key_states[..., self.rotary_ndims :] kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb( query_rot, key_rot, cos, sin, position_ids ) # [batch_size, num_heads, seq_len, head_dim] query_states = torch.cat((query_states, query_pass), dim=-1) key_states = torch.cat((key_states, key_pass), dim=-1) if past_key_value is not None: # Reuse k, v, self_attention key_states = torch.cat((past_key_value[0], key_states), dim=2) value_states = torch.cat((past_key_value[1], value_states), dim=2) past_key_value = (key_states, value_states) if use_cache else None # Repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) if cu_seqlens is not None and max_seqlen is not None and cu_seqlens.dim() == 1: # special handling using sample packing qkv = torch.stack( [query_states, key_states, value_states], dim=2 ) # [bsz, nh, 3, q_len, hd] qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd] qkv = rearrange(qkv, "b s ... -> (b s) ...") softmax_scale = None output = flash_attn_varlen_qkvpacked_func( qkv, cu_seqlens, max_seqlen, 0.0, softmax_scale=softmax_scale, causal=True ) attn_output = rearrange(output, "(b s) ... 
def decoder_layer_forward(
    self,
    hidden_states: Optional[torch.FloatTensor],
    attention_mask: Optional[torch.FloatTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: Optional[bool] = False,
    use_cache: Optional[bool] = False,
    cu_seqlens: Optional[torch.Tensor] = None,
    max_seqlen: Optional[torch.Tensor] = None,
) -> Union[
    Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]
]:
    """Run one decoder layer: pre-norm self attention, then pre-norm MLP.

    Both sub-layers are wrapped in a residual connection. `cu_seqlens` and
    `max_seqlen` are forwarded unchanged to the attention module.

    Returns:
        A tuple starting with the layer output, followed by the attention
        weights when `output_attentions` is truthy and by the present key/value
        when `use_cache` is truthy (in that order).
    """
    # Self Attention
    attn_output, self_attn_weights, present_key_value = self.self_attn(
        hidden_states=self.input_layernorm(hidden_states),
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_value=past_key_value,
        output_attentions=output_attentions,
        use_cache=use_cache,
        cu_seqlens=cu_seqlens,
        max_seqlen=max_seqlen,
    )
    hidden_states = hidden_states + attn_output

    # Fully Connected
    mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
    hidden_states = hidden_states + mlp_output

    optional_outputs = (
        (output_attentions, self_attn_weights),
        (use_cache, present_key_value),
    )
    return (hidden_states,) + tuple(
        value for wanted, value in optional_outputs if wanted
    )
past_key_values[0][0].shape[2] seq_length_with_past = seq_length_with_past + past_key_values_length cu_seqlens = None max_seqlen = None if position_ids is None: device = input_ids.device if input_ids is not None else inputs_embeds.device position_ids = torch.arange( past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device, ) position_ids = position_ids.unsqueeze(0).view(-1, seq_length) else: position_ids = position_ids.view(-1, seq_length).long() cu_seqlens, max_seqlen = get_cu_seqlens_from_pos_ids(position_ids) cu_seqlens = cu_seqlens.squeeze() if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) # Embed positions if attention_mask is None: attention_mask = torch.ones( (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device, ) attention_mask = self._prepare_decoder_attention_mask( attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length, ) hidden_states = inputs_embeds if self.gradient_checkpointing and self.training: if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
) use_cache = False # Decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None next_decoder_cache = () if use_cache else None for idx, decoder_layer in enumerate(self.layers): if output_hidden_states: all_hidden_states += (hidden_states,) past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: def create_custom_forward(module): def custom_forward(*inputs): # None for past_key_value return module(*inputs) return custom_forward layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, position_ids, past_key_value, output_attentions, None, cu_seqlens, max_seqlen, ) else: layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, output_attentions=output_attentions, use_cache=use_cache, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, ) hidden_states = layer_outputs[0] if use_cache: next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) if output_attentions: all_self_attns += (layer_outputs[1],) hidden_states = self.norm(hidden_states) # Add hidden states from the last decoder layer if output_hidden_states: all_hidden_states += (hidden_states,) next_cache = next_decoder_cache if use_cache else None if not return_dict: return tuple( v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None ) return BaseModelOutputWithPast( last_hidden_state=hidden_states, past_key_values=next_cache, hidden_states=all_hidden_states, attentions=all_self_attns, ) ================================================ FILE: src/axolotl/monkeypatch/tiled_mlp/__init__.py ================================================ """ TiledMLP monkey patches """ from .patch import ( patch_tiled_mlp, ) __all__ = [ "patch_tiled_mlp", ] ================================================ FILE: 
class DeepSpeedTiledMLPMoE(torch.autograd.Function):
    """
    Tiled MLP autograd function whose wrapped forward may return a tuple.

    The input is split into ``shards`` chunks along dim 1. The forward pass
    runs every chunk without recording a graph; the backward pass recomputes
    each chunk with grad enabled and backpropagates through it one at a time.
    """

    @staticmethod
    def forward(
        ctx,
        fn,
        self,
        x,
        shards,
        compute_params,
    ) -> torch.Tensor:
        """
        Apply ``fn(self, chunk)`` to each dim-1 chunk of ``x`` and re-assemble the result.

        Args:
            ctx: autograd context; stores everything backward needs.
            fn: callable invoked as ``fn(self, x_shard)``.
            self: the module instance passed through to ``fn``.
            x: input tensor, chunked along dim 1.
            shards: number of chunks requested from ``torch.chunk``.
            compute_params: parameters used by ``fn``; only those with
                ``requires_grad`` are kept on ``ctx``.

        Returns:
            The concatenated output. When ``fn`` returns a tuple, a tuple is
            returned with element 0 concatenated along dim 1 and element 1
            along dim 0.
        """
        ctx.fn = fn
        ctx.self = self
        ctx.shards = shards
        ctx.compute_params = [p for p in compute_params if p.requires_grad]
        ctx.save_for_backward(x)

        x_shards = list(torch.chunk(x, chunks=shards, dim=1))
        # No graph is kept here; backward() recomputes each shard.
        with torch.no_grad():
            output_shards = [fn(self, x_shard) for x_shard in x_shards]

        ctx.is_tuple_output = isinstance(output_shards[0], tuple)
        if isinstance(output_shards[0], tuple):
            # Concatenation dim per tuple position. Only two positions are
            # listed, so a tuple with more elements would raise IndexError.
            tuple_dim_idx = [1, 0]
            output_unsharded = tuple(
                torch.cat(
                    [output_shard[i] for output_shard in output_shards],
                    dim=tuple_dim_idx[i],
                )
                for i in range(len(output_shards[0]))
            )
        else:
            output_unsharded = torch.cat(output_shards, dim=1)

        return output_unsharded

    @staticmethod
    def backward(ctx, *grads) -> torch.Tensor:
        """
        Recompute each shard with grad enabled and backpropagate its slice of ``grads[0]``.

        Only ``grads[0]`` is consumed; for tuple outputs only ``output[0]`` is
        backpropagated. Returns one entry per forward argument
        ``(fn, self, x, shards, compute_params)``, with a gradient for ``x`` only.
        """
        fn = ctx.fn
        (x,) = ctx.saved_tensors
        self = ctx.self
        shards = ctx.shards
        compute_params = ctx.compute_params
        is_tuple_output = ctx.is_tuple_output

        x_requires_grad = x.requires_grad
        x = x.detach()
        # detach() unsets `x.requires_grad`, so restore it
        x.requires_grad_(x_requires_grad)

        incoming_grad = grads[0]
        # Full-size buffer; each shard's .grad below is a view into it.
        x_grad = torch.zeros_like(x)

        x_shards = list(torch.chunk(x, chunks=shards, dim=1))
        # NOTE(review): shards are addressed by flat element offsets. That seems
        # to match torch.chunk(dim=1) only when dim 0 has size 1 -- confirm callers.
        shard_step = x_shards[0].numel()
        for i, x_shard in enumerate(x_shards):
            # Tell deepspeed not to add a new grad to its ipg bucket until the last shard is run
            # (compute_params is always a list here, so this check is always true)
            if compute_params is not None:
                # NOTE(review): compares against the requested `shards`, not
                # len(x_shards) -- confirm torch.chunk always returns `shards` chunks here.
                if i + 1 < shards:
                    for param in compute_params:
                        param.ds_grad_is_ready = False
                else:
                    # last shard, can add the grad
                    for param in compute_params:
                        param.ds_grad_is_ready = True

            x_shard.requires_grad_(x_requires_grad)

            shard_offset = i * shard_step
            # Point the shard's .grad at its slice of x_grad so the backward
            # call below writes the input gradient directly into x_grad.
            x_shard.grad = (
                x_grad.view(-1)
                .narrow(0, shard_offset, x_shard.numel())
                .view_as(x_shard)
            )
            incoming_grad_shard = (
                incoming_grad.view(-1)
                .narrow(0, shard_offset, x_shard.numel())
                .view_as(x_shard)
            )
            # Recompute this shard with a graph, then backprop through it.
            with torch.enable_grad():
                output = fn(self, x_shard)
            if is_tuple_output:
                torch.autograd.backward(output[0], incoming_grad_shard)
            else:
                torch.autograd.backward(output, incoming_grad_shard)

        return (None, None, x_grad, None, None)
class GradientAccumulator:
    """
    Manual gradient accumulator for TiledMLP with configurable precision.

    Gradients seen by the installed hooks are cast to the accumulation dtype,
    multiplied by ``1 / total_shards`` and summed into a per-parameter buffer.
    The buffer is written back to ``param.grad`` by a hook installed with
    ``is_last_shard=True``.
    """

    def __init__(
        self,
        params: List[torch.nn.Parameter],
        total_shards: int,
        dtype: torch.dtype | None = None,
    ):
        """
        Args:
            params: parameters whose gradients are accumulated.
            total_shards: number of shards; sets the per-gradient scale ``1 / total_shards``.
            dtype: accumulation dtype; ``torch.float32`` is used when falsy.
        """
        self.params = params
        self.total_shards = total_shards
        self.grad_accumulation_dtype = dtype or torch.float32
        self.accumulated_grads = {}
        self.hooks = []
        # Hook bodies run under this lock.
        self.lock = threading.Lock()
        self.gradient_scale = 1.0 / total_shards

        # Initialize accumulated gradients in the specified dtype
        for param in self.params:
            if param.grad is not None:
                # Seed the buffer with the existing gradient (not rescaled) and
                # clear param.grad.
                self.accumulated_grads[param] = param.grad.to(
                    self.grad_accumulation_dtype
                )
                param.grad = None
            else:
                self.accumulated_grads[param] = torch.zeros_like(
                    param, dtype=self.grad_accumulation_dtype
                )

    def install_hooks(self, is_last_shard: bool):
        """
        Register one accumulation hook on every parameter that requires grad.

        Hooks are appended to ``self.hooks`` and removed only by ``cleanup()``.

        NOTE(review): each call adds hooks without removing earlier ones, so
        calling this once per shard leaves the earlier hooks active -- confirm
        this is intended.
        """

        def create_hook(param):
            # Factory so each hook closes over its own `param`.
            def hook(grad):
                with self.lock:
                    grad_to_accum_dtype = grad.to(self.grad_accumulation_dtype)
                    scaled_grad = grad_to_accum_dtype * self.gradient_scale

                    if param in self.accumulated_grads:
                        self.accumulated_grads[param] += scaled_grad
                    else:
                        self.accumulated_grads[param] = scaled_grad.clone()

                    # Only assign the averaged gradient on the last shard
                    if is_last_shard:
                        # NOTE(review): the buffer is assigned to param.grad and
                        # also returned from the hook -- confirm how autograd's
                        # own accumulation combines with this assignment.
                        param.grad = self.accumulated_grads[param].to(param.dtype)
                        return param.grad
                    # Returning None leaves the incoming gradient unchanged.
                    return None

            return hook

        # Install hooks on all parameters
        for param in self.params:
            if param.requires_grad:
                hook = param.register_hook(create_hook(param))
                self.hooks.append(hook)

    def cleanup(self):
        """
        Remove all installed hooks and drop the accumulation buffers.

        ``accumulated_grads`` is deleted, so the instance cannot be used again.
        """
        for hook in self.hooks:
            hook.remove()
        self.hooks.clear()
        del self.accumulated_grads
self._compute_params ) ) or os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "true": if model_type == "gpt_oss": self._tiled_mlp_dist_impl = DeepSpeedTiledMLPMoE else: self._tiled_mlp_dist_impl = DeepSpeedTiledMLP else: self._tiled_mlp_dist_impl = TiledMLP down_res = self._tiled_mlp_dist_impl.apply( mlp_forward, self, x, num_shards, compute_params, ) return down_res mlp_cls.forward = tiled_mlp_forward mlp_cls._compute_params = [] mlp_cls._tiled_mlp_dist_impl = None LOG.info( f"Successfully monkey-patched TiledMLP for model_type: {model_type}", ) except (ImportError, AttributeError) as e: raise RuntimeError( f"Could not import MLP class for model_type: {model_type}. Error: {str(e)}" ) from e ================================================ FILE: src/axolotl/monkeypatch/trainer/__init__.py ================================================ from .utils import entropy_from_logits, selective_log_softmax __all__ = ["entropy_from_logits", "selective_log_softmax"] ================================================ FILE: src/axolotl/monkeypatch/trainer/lr.py ================================================ """ monkeypatch for Trainer _get_learning_rate method """ import torch from axolotl.utils.logging import get_logger LOG = get_logger(__name__) # TODO remove this patch once https://github.com/huggingface/transformers/pull/37881 is included in a release def _get_learning_rate(self): if self.is_deepspeed_enabled: # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may # not run for the first few dozen steps while loss scale is too large, and thus during # that time `get_last_lr` will fail if called during that warm up stage, so work around it: try: last_lr = self.lr_scheduler.get_last_lr()[0] except AssertionError as e: if "need to call step" in str(e): LOG.warning( "tried to get lr value before scheduler/optimizer started stepping, returning lr=0" ) last_lr = 0 else: raise else: if isinstance(self.lr_scheduler, 
torch.optim.lr_scheduler.ReduceLROnPlateau): last_lr = self.optimizer.param_groups[0]["lr"] else: last_lr = self.lr_scheduler.get_last_lr()[0] if torch.is_tensor(last_lr): last_lr = last_lr.item() return last_lr def patch_trainer_get_lr(): from transformers.trainer import Trainer Trainer._get_learning_rate = _get_learning_rate ================================================ FILE: src/axolotl/monkeypatch/trainer/trl.py ================================================ """Monkeypatch for TRL trainer FSDP preparation.""" def prepare_fsdp(model, accelerator): from axolotl.monkeypatch.accelerate.fsdp2 import fsdp2_prepare_model return fsdp2_prepare_model(accelerator, model) def patch_trl_prepare_fsdp2(): import trl.models.utils trl.models.utils.prepare_fsdp = prepare_fsdp ================================================ FILE: src/axolotl/monkeypatch/trainer/trl_vllm.py ================================================ """Monkeypatches for TRL's vLLM integration and trainer utils. Adds: - VLLMClient.batch_update_named_params: batched weight sync (fewer HTTP round-trips) - extract_logprobs: NaN→0.0 fix (prevents downstream NaN propagation) - VLLMGeneration: weight_sync_chunk_size + batched sync path for non-FSDP/non-ZeRO - split_tensor_dict / shuffle_sequence_dict: scalar type handling (int/float/bool passthrough) """ import logging import math from functools import wraps import torch from torch import nn LOG = logging.getLogger(__name__) def _batch_update_named_params( self, params: list[tuple[str, torch.Tensor]], chunk_size: int | None = None ): """Batched weight sync — sends param metadata via HTTP, tensors via NCCL.""" from transformers import is_torch_xpu_available if chunk_size is None: chunks = [params] else: chunks = [] current_chunk: list[tuple[str, torch.Tensor]] = [] current_elements = 0 for name, weights in params: n_elem = weights.numel() if current_chunk and current_elements + n_elem > chunk_size: chunks.append(current_chunk) current_chunk = [] 
current_elements = 0 current_chunk.append((name, weights)) current_elements += n_elem if current_chunk: chunks.append(current_chunk) for chunk in chunks: param_metadata = [ {"name": name, "dtype": str(weights.dtype), "shape": list(weights.shape)} for name, weights in chunk ] url = f"{self.base_url}/batch_update_named_params/" response = self.session.post(url, json={"params": param_metadata}) if response.status_code != 200: raise Exception(f"Request failed: {response.status_code}, {response.text}") for _name, weights in chunk: if is_torch_xpu_available(): self.communicator.broadcast(weights, root=self.rank) else: self.communicator.broadcast(weights, src=self.rank) if is_torch_xpu_available(): self.communicator.barrier() else: self.communicator.group.barrier() def _update_model_params(self, model: nn.Module, chunk_size: int | None = None): """Updates all model params using batch_update_named_params.""" params = [(name, param.data) for name, param in model.named_parameters()] self.batch_update_named_params(params, chunk_size=chunk_size) def _patched_extract_logprobs(all_outputs): """extract_logprobs with NaN→0.0 fix (stock TRL uses None which causes downstream errors).""" all_logprobs = [] all_token_ids = [] for outputs in all_outputs: for output in outputs.outputs: if output.logprobs is None: return None, None seq_logprobs = [] seq_token_ids = [] for lp in output.logprobs: sorted_items = sorted(lp.items(), key=lambda x: x[1].rank) seq_token_ids.append([token_id for token_id, _ in sorted_items]) seq_logprobs.append( [ 0.0 if math.isnan(item.logprob) else item.logprob for _, item in sorted_items ] ) all_logprobs.append(seq_logprobs) all_token_ids.append(seq_token_ids) return all_logprobs, all_token_ids def _patched_split_tensor_dict(tensor_dict, num_chunks): """split_tensor_dict that handles scalar types (int/float/bool) for num_items_in_batch.""" first_tensor = next( tensor for tensor in tensor_dict.values() if tensor is not None and isinstance(tensor, torch.Tensor) 
and tensor.ndim > 0 ) chunk_size = first_tensor.shape[0] // num_chunks chunks = [] for i in range(num_chunks): chunk_dict = {} for key, tensor in tensor_dict.items(): if isinstance(tensor, (int, float, bool)): chunk_dict[key] = tensor elif tensor is not None and (isinstance(tensor, list) or tensor.ndim > 0): chunk_dict[key] = tensor[i * chunk_size : (i + 1) * chunk_size] elif tensor is not None and tensor.ndim == 0: chunk_dict[key] = tensor else: chunk_dict[key] = None chunks.append(chunk_dict) return chunks def _patched_shuffle_sequence_dict(seq_dict): """shuffle_sequence_dict that handles scalar types (int/float/bool).""" first_seq = next( v for v in seq_dict.values() if v is not None and isinstance(v, (torch.Tensor, list)) and len(v) > 0 ) perm = torch.randperm(len(first_seq)) def permute(v): if v is None: return None if isinstance(v, (int, float, bool)): return v if isinstance(v, torch.Tensor) and v.ndim == 0: return v if isinstance(v, torch.Tensor) and v.ndim >= 1: return v[perm] if isinstance(v, list): return [v[i] for i in perm.tolist()] return v return {k: permute(v) for k, v in seq_dict.items()} def _patch_sync_weights_batched(original_init): """Wrap VLLMGeneration.__init__ to accept weight_sync_chunk_size.""" @wraps(original_init) def patched_init(self, *args, weight_sync_chunk_size=None, **kwargs): original_init(self, *args, **kwargs) self.weight_sync_chunk_size = weight_sync_chunk_size return patched_init def _make_batched_sync_weights(original_sync_weights): """Wrap sync_weights to use batched sync for non-FSDP/non-ZeRO paths.""" @wraps(original_sync_weights) def patched_sync_weights(self): from accelerate.utils import is_peft_model # Check if we're in a non-PEFT, non-FSDP, non-ZeRO scenario where batching helps accelerator = self.accelerator model = self.model is_fsdp_enabled = self.is_fsdp_enabled deepspeed_plugin = accelerator.state.deepspeed_plugin zero_stage_3 = deepspeed_plugin is not None and deepspeed_plugin.zero_stage == 3 is_peft = 
def patch_trl_vllm():
    """
    Install every axolotl monkeypatch for TRL's vLLM integration.

    Touches ``VLLMClient`` (batched weight update, only when the method is not
    already present), ``extract_logprobs``, ``VLLMGeneration.__init__`` and
    ``sync_weights``, and the ``split_tensor_dict`` / ``shuffle_sequence_dict``
    helpers in ``trl.trainer.utils``.
    """
    import trl.generation.vllm_client
    import trl.generation.vllm_generation
    import trl.trainer.utils

    generation_module = trl.generation.vllm_generation
    trainer_utils = trl.trainer.utils
    client_cls = trl.generation.vllm_client.VLLMClient
    generation_cls = generation_module.VLLMGeneration

    # 1. Batched weight sync on the client, unless it is already there
    if not hasattr(client_cls, "batch_update_named_params"):
        client_cls.batch_update_named_params = _batch_update_named_params
        client_cls.update_model_params = _update_model_params
        LOG.info("Patched VLLMClient with batch_update_named_params")

    # 2. Logprob extraction that maps NaN to 0.0
    generation_module.extract_logprobs = _patched_extract_logprobs
    LOG.info("Patched extract_logprobs with NaN→0.0 fix")

    # 3. Constructor accepts weight_sync_chunk_size
    wrapped_init = _patch_sync_weights_batched(generation_cls.__init__)
    generation_cls.__init__ = wrapped_init

    # 4. Batched sync path for non-FSDP/non-ZeRO runs
    wrapped_sync = _make_batched_sync_weights(generation_cls.sync_weights)
    generation_cls.sync_weights = wrapped_sync
    LOG.info("Patched VLLMGeneration with batched sync_weights")

    # 5. Scalar-aware dict helpers
    util_patches = (
        ("split_tensor_dict", _patched_split_tensor_dict),
        ("shuffle_sequence_dict", _patched_shuffle_sequence_dict),
    )
    for attr_name, replacement in util_patches:
        setattr(trainer_utils, attr_name, replacement)
    LOG.info("Patched split_tensor_dict and shuffle_sequence_dict for scalar types")
@triton.jit
def _entropy_online_kernel(
    logits_ptr,
    output_ptr,
    stride_row,
    V: tl.constexpr,
    BLOCK_V: tl.constexpr,
):
    """Online entropy: single pass with running max correction.

    One program handles one row of ``V`` logits, read in tiles of ``BLOCK_V``.
    With running max ``m``, it keeps ``S = sum(exp(x - m))`` and
    ``W = sum(exp(x - m) * x)`` and stores ``log(S) + m - W / S`` at
    ``output_ptr + row``.
    """
    row = tl.program_id(0)
    # Widen the row index to int64 before multiplying by the row stride.
    row_ptr = logits_ptr + tl.cast(row, tl.int64) * stride_row

    running_max = tl.full([], float("-inf"), dtype=tl.float32)
    running_sum_exp = tl.full([], 0.0, dtype=tl.float32)
    running_weighted = tl.full([], 0.0, dtype=tl.float32)

    for v_start in range(0, V, BLOCK_V):
        offs = v_start + tl.arange(0, BLOCK_V)
        mask = offs < V
        # Lanes past V load -inf so they cannot win the max.
        x = tl.load(row_ptr + offs, mask=mask, other=float("-inf")).to(tl.float32)

        block_max = tl.max(x, axis=0)
        new_max = tl.maximum(running_max, block_max)

        # Re-express the sums accumulated so far relative to the new max.
        correction = tl.exp(running_max - new_max)
        running_sum_exp = running_sum_exp * correction
        running_weighted = running_weighted * correction

        exp_x = tl.exp(x - new_max)
        exp_x = tl.where(mask, exp_x, 0.0)
        # Replace the -inf padding with 0 before forming exp_x * x.
        x = tl.where(mask, x, 0.0)
        running_sum_exp += tl.sum(exp_x, axis=0)
        running_weighted += tl.sum(exp_x * x, axis=0)
        running_max = new_max

    # entropy = logsumexp(x) - sum(softmax(x) * x)
    entropy = tl.log(running_sum_exp) + running_max - running_weighted / running_sum_exp
    tl.store(output_ptr + row, entropy)
def entropy_from_logits(logits: torch.Tensor, chunk_size: int = 128) -> torch.Tensor:
    """
    Compute the softmax entropy over the last dimension of ``logits``.

    CUDA tensors go through the Triton online kernels; anything else uses a
    ``log_softmax`` fallback computed in float32. A 3D CUDA tensor that is
    non-contiguous but has a unit stride on its last dim is read in place by
    the strided kernel. Every other CUDA layout is made contiguous first
    (a no-op when it already is).

    Args:
        logits: tensor of shape ``(..., V)``.
        chunk_size: accepted for signature compatibility; not used here.

    Returns:
        Tensor of shape ``logits.shape[:-1]`` in ``logits.dtype``.
    """
    lead_shape = logits.shape[:-1]
    vocab_size = logits.shape[-1]

    if not logits.is_cuda:
        # CPU fallback: stable entropy via log_softmax
        log_probs = F.log_softmax(logits.float(), dim=-1)
        cpu_entropy = -(log_probs.exp() * log_probs).sum(dim=-1)
        return cpu_entropy.to(logits.dtype).reshape(lead_shape)

    total_rows = lead_shape.numel()
    entropies = torch.empty(total_rows, device=logits.device, dtype=torch.float32)

    block_v = 4096
    max_grid_contig = 8192
    max_grid_strided = 2048

    # The strided kernel needs a unit-stride vocab dim and a 3D layout.
    read_in_place = (
        logits.stride(-1) == 1 and not logits.is_contiguous() and logits.ndim == 3
    )

    if read_in_place:
        outer_stride = logits.stride(0)
        inner_stride = logits.stride(1)
        inner_len = logits.shape[1]
        for first_row in range(0, total_rows, max_grid_strided):
            launch_rows = min(max_grid_strided, total_rows - first_row)
            _entropy_online_kernel_strided[(launch_rows,)](
                logits,
                entropies[first_row],
                outer_stride,
                inner_stride,
                inner_len,
                first_row,
                V=vocab_size,
                BLOCK_V=block_v,
            )
    else:
        logits = logits.contiguous()
        rows_2d = logits.reshape(-1, vocab_size)
        row_stride = rows_2d.stride(0)
        for first_row in range(0, total_rows, max_grid_contig):
            launch_rows = min(max_grid_contig, total_rows - first_row)
            _entropy_online_kernel[(launch_rows,)](
                rows_2d[first_row],
                entropies[first_row],
                row_stride,
                V=vocab_size,
                BLOCK_V=block_v,
            )

    return entropies.to(logits.dtype).reshape(lead_shape)
Saves logsumexp for backward.""" row = tl.program_id(0) logits_row_ptr = logits_ptr + tl.cast(row, tl.int64) * stride_logits_row # Online logsumexp running_max = tl.full([], float("-inf"), dtype=tl.float32) running_sum_exp = tl.full([], 0.0, dtype=tl.float32) for v_start in range(0, V, BLOCK_V): offs = v_start + tl.arange(0, BLOCK_V) mask = offs < V x = tl.load(logits_row_ptr + offs, mask=mask, other=float("-inf")).to( tl.float32 ) block_max = tl.max(x, axis=0) new_max = tl.maximum(running_max, block_max) running_sum_exp = running_sum_exp * tl.exp(running_max - new_max) exp_x = tl.exp(x - new_max) exp_x = tl.where(mask, exp_x, 0.0) running_sum_exp += tl.sum(exp_x, axis=0) running_max = new_max lse = tl.log(running_sum_exp) + running_max tl.store(logsumexp_ptr + row, lse) # Gather and subtract index_row_ptr = index_ptr + tl.cast(row, tl.int64) * stride_index_row output_row_ptr = output_ptr + tl.cast(row, tl.int64) * stride_output_row k_offs = tl.arange(0, K_BLOCK) k_mask = k_offs < actual_K indices = tl.load(index_row_ptr + k_offs, mask=k_mask, other=0).to(tl.int64) valid_mask = k_mask & (indices >= 0) & (indices < V) safe_indices = tl.where(valid_mask, indices, 0) selected = tl.load(logits_row_ptr + safe_indices, mask=valid_mask, other=0.0).to( tl.float32 ) tl.store(output_row_ptr + k_offs, selected - lse, mask=valid_mask) @triton.jit def _selective_logsoftmax_bwd_kernel( grad_output_ptr, logits_ptr, index_ptr, logsumexp_ptr, grad_logits_ptr, stride_grad_out_row, stride_logits_row, stride_index_row, stride_grad_logits_row, actual_K, K_BLOCK: tl.constexpr, V: tl.constexpr, BLOCK_V: tl.constexpr, ): """Backward: d_logits[j] = -softmax(x)[j] * sum(grad_out) + (grad_out[k] if j == index[k]). Single fused pass over V. For each tile, computes the base gradient and adds scatter contributions inline by checking which indices fall in the current tile. No separate scatter pass — no read-after-write issues. 
class _SelectiveLogSoftmaxTriton(torch.autograd.Function):
    """
    Autograd wrapper around the fused selective log-softmax Triton kernels.

    Works on 2D inputs: ``flat_logits`` of shape ``(N, V)`` and ``flat_index``
    of shape ``(N, K)``. Rows are processed in launches of at most
    ``MAX_GRID`` programs.
    """

    @staticmethod
    def forward(ctx, flat_logits, flat_index, K, K_BLOCK, V, BLOCK_V, MAX_GRID):
        """
        Gather ``log_softmax(flat_logits)`` at ``flat_index``.

        Args:
            ctx: autograd context.
            flat_logits: ``(N, V)`` logits.
            flat_index: ``(N, K)`` integer indices into the last dim.
            K: number of indices per row.
            K_BLOCK: tile width used by the kernels for the index dim.
            V: size of the last dim of ``flat_logits``.
            BLOCK_V: tile width over the last dim of ``flat_logits``.
            MAX_GRID: maximum number of rows per kernel launch.

        Returns:
            float32 tensor of shape ``(N, K_BLOCK)``. Columns at or beyond ``K``
            and slots whose index lies outside ``[0, V)`` are ``0.0``.
        """
        N = flat_logits.shape[0]
        # The forward kernel only stores slots with an in-range index, so the
        # buffer is zero-filled; torch.empty would leave the skipped slots and
        # the padded columns holding uninitialised memory.
        output = torch.zeros(N, K_BLOCK, device=flat_logits.device, dtype=torch.float32)
        logsumexp = torch.empty(N, device=flat_logits.device, dtype=torch.float32)

        for start in range(0, N, MAX_GRID):
            n_rows = min(MAX_GRID, N - start)
            # Indexing with `start` hands the kernel the address of that row.
            _selective_logsoftmax_fwd_kernel[(n_rows,)](
                flat_logits[start],
                flat_index[start],
                output[start],
                logsumexp[start],
                flat_logits.stride(0),
                flat_index.stride(0),
                output.stride(0),
                K,
                K_BLOCK=K_BLOCK,
                V=V,
                BLOCK_V=BLOCK_V,
            )

        # The per-row logsumexp is reused by backward.
        ctx.save_for_backward(flat_logits, flat_index, logsumexp)
        ctx.K = K
        ctx.K_BLOCK = K_BLOCK
        ctx.V = V
        ctx.BLOCK_V = BLOCK_V
        ctx.MAX_GRID = MAX_GRID
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """
        Compute the gradient with respect to ``flat_logits``.

        Returns one entry per forward argument; only ``flat_logits`` receives
        a gradient.
        """
        flat_logits, flat_index, logsumexp = ctx.saved_tensors
        K, K_BLOCK, V, BLOCK_V, MAX_GRID = (
            ctx.K,
            ctx.K_BLOCK,
            ctx.V,
            ctx.BLOCK_V,
            ctx.MAX_GRID,
        )
        N = flat_logits.shape[0]

        grad_logits = torch.empty_like(flat_logits)

        # grad_output may have K_BLOCK cols; backward kernel reads actual_K
        grad_output_contig = grad_output.contiguous()

        for start in range(0, N, MAX_GRID):
            n_rows = min(MAX_GRID, N - start)
            _selective_logsoftmax_bwd_kernel[(n_rows,)](
                grad_output_contig[start],
                flat_logits[start],
                flat_index[start],
                logsumexp[start],
                grad_logits[start],
                grad_output_contig.stride(0),
                flat_logits.stride(0),
                flat_index.stride(0),
                grad_logits.stride(0),
                K,
                K_BLOCK=K_BLOCK,
                V=V,
                BLOCK_V=BLOCK_V,
            )

        # Return grads for: flat_logits, flat_index, K, K_BLOCK, V, BLOCK_V, MAX_GRID
        return grad_logits, None, None, None, None, None, None
def get_create_accelerate_code() -> str:
    """Return the source text of ``Trainer.create_accelerator_and_postprocess``."""
    return inspect.getsource(Trainer.create_accelerator_and_postprocess)
def patch_create_accelerate_code_for_fp8(enable_fsdp_float8_all_gather: bool):
    """
    Rewrite ``Trainer.create_accelerator_and_postprocess`` to accept extra kwargs.

    The method source is fetched, the ``ORIGINAL_TRAINER_CODE`` snippet is
    replaced by ``PATCHED_TRAINER_CODE`` (formatted with the given flag), and
    the result is executed in this module's globals under the name
    ``fixed_create_accelerator_and_postprocess`` before being installed on
    ``Trainer``.

    Args:
        enable_fsdp_float8_all_gather: value substituted into the patched
            snippet's ``enable_fsdp_float8_all_gather`` placeholder.

    Returns silently, without patching, when the source cannot be read or the
    expected snippet is absent.
    """
    try:
        original_source = get_create_accelerate_code()
    except OSError:
        # No source text available, so there is nothing to rewrite.
        return

    # The raw source is recorded before the patchability check, as before.
    Trainer._original_create_accelerator_and_postprocess = original_source

    source, _ = detab_code(original_source)
    if ORIGINAL_TRAINER_CODE not in source:
        return

    replacement = PATCHED_TRAINER_CODE.format(
        enable_fsdp_float8_all_gather=enable_fsdp_float8_all_gather
    )
    source = source.replace(ORIGINAL_TRAINER_CODE, replacement).replace(
        "def create_accelerator_and_postprocess(",
        "def fixed_create_accelerator_and_postprocess(",
        1,
    )

    # load imports necessary: every public name of transformers.trainer that
    # appears (as a substring) in the rewritten source is pulled into globals
    import transformers.trainer as hf_trainer

    needed_names = [name for name in dir(hf_trainer) if name in source]
    exec(
        f"from transformers.trainer import ({', '.join(needed_names)})",
        globals(),
    )
    exec(source, globals())

    LOG.info("patching create_accelerator_and_postprocess to allow for overrides")
    Trainer.create_accelerator_and_postprocess = globals()[
        "fixed_create_accelerator_and_postprocess"
    ]
def patch_training_loop_for_fsdp():
    """
    Rewrite ``Trainer._inner_training_loop`` for FSDP optimizer saving.

    The method source is fetched, ``ORIGINAL_TRAINER_CODE`` is replaced by
    ``PATCHED_TRAINER_CODE``, and the result is executed in this module's
    globals as ``_fixed_inner_training_loop`` before being installed on
    ``Trainer``. Returns silently, without patching, when the source cannot be
    read or the expected snippet is absent.
    """
    try:
        original_source = get_training_loop_code()
    except OSError:
        # No source text available, so there is nothing to rewrite.
        return

    # The raw source is recorded before the patchability check, as before.
    Trainer._original_inner_training_loop = original_source

    source, _ = detab_code(original_source)
    if ORIGINAL_TRAINER_CODE not in source:
        return

    source = source.replace(ORIGINAL_TRAINER_CODE, PATCHED_TRAINER_CODE).replace(
        "def _inner_training_loop(",
        "def _fixed_inner_training_loop(",
        1,
    )

    # load imports necessary: every public name of transformers.trainer that
    # appears (as a substring) in the rewritten source is pulled into globals
    import transformers.trainer as hf_trainer

    needed_names = [name for name in dir(hf_trainer) if name in source]
    exec(
        f"from transformers.trainer import ({', '.join(needed_names)})",
        globals(),
    )
    exec(source, globals())

    LOG.info("patching _inner_training_loop for fsdp optimizer save")
    Trainer._inner_training_loop = globals()["_fixed_inner_training_loop"]
def patch_prepare_context_parallel_inputs() -> None:
    """
    Let ``Trainer._prepare_context_parallel_inputs`` accept FlashAttention.

    Replaces the ``GUARD_PATTERN`` line in the method's source with
    ``PATCHED_GUARD``, executes the rewritten source in a private namespace,
    and installs the resulting function on ``Trainer``. The original method,
    the patched source and a "patched" flag are stored on ``Trainer`` as well.
    Does nothing when already patched, when the source is unavailable, or when
    the guard line is not found.
    """
    already_patched = getattr(
        Trainer, "_axolotl_prepare_context_parallel_inputs_patched", False
    )
    if already_patched:
        LOG.debug("Trainer._prepare_context_parallel_inputs already patched")
        return

    try:
        source = inspect.getsource(Trainer._prepare_context_parallel_inputs)
    except OSError as exc:  # pragma: no cover - occurs when source is unavailable
        LOG.warning("Unable to patch Trainer._prepare_context_parallel_inputs: %s", exc)
        return

    if GUARD_PATTERN not in source:
        LOG.warning(
            "Expected guard not found in Trainer._prepare_context_parallel_inputs; \n"
            "skipping FlashAttention context parallelism patch"
        )
        return

    # Swap the guard first, then dedent and rename, matching the original order.
    rewritten, _ = detab_code(source.replace(GUARD_PATTERN, PATCHED_GUARD))
    rewritten = rewritten.replace(
        "def _prepare_context_parallel_inputs(",
        "def axolotl_prepare_context_parallel_inputs(",
        1,
    )

    trainer_module_name = Trainer.__module__
    trainer_module = importlib.import_module(trainer_module_name)

    # import symbols referenced in the method so exec can succeed
    needed_names = [name for name in dir(trainer_module) if name in rewritten]

    # A dedicated namespace keeps the exec'd function out of module globals.
    exec_namespace = {}
    exec(
        f"from {trainer_module_name} import ({', '.join(needed_names)})",
        exec_namespace,
    )
    exec(rewritten, exec_namespace)
    patched_method = exec_namespace["axolotl_prepare_context_parallel_inputs"]

    Trainer._original_prepare_context_parallel_inputs = (
        Trainer._prepare_context_parallel_inputs
    )
    Trainer._prepare_context_parallel_inputs = patched_method
    Trainer._axolotl_prepare_context_parallel_inputs_source = rewritten
    Trainer._axolotl_prepare_context_parallel_inputs_patched = True
    LOG.debug(
        "Patched Trainer._prepare_context_parallel_inputs for FlashAttention + CP"
    )
def patch_evaluation_loop():
    """
    Patch ``Trainer.evaluation_loop`` to average evaluation losses with nanmean.

    The method's source is fetched, both ``.mean()`` loss reductions listed in
    ``ORIGINAL_EVAL_CODE`` are replaced by their ``np.nanmean`` counterparts
    from ``PATCHED_EVAL_CODE``, and the result is executed in this module's
    globals as ``axolotl_evaluation_loop`` before being installed on
    ``Trainer``.

    The unpatched source is kept on ``Trainer._original_evaluation_loop``; the
    presence of that attribute is also the "already patched" marker checked on
    entry. Nothing happens when the source cannot be retrieved.
    """
    # Check if already patched
    if hasattr(Trainer, "_original_evaluation_loop"):
        LOG.debug("Trainer.evaluation_loop already patched")
        return

    # inspect.getsource raises OSError when the source text is unavailable
    try:
        evaluation_loop_source = inspect.getsource(Trainer.evaluation_loop)
    except OSError:
        return

    # Store the original under the exact attribute the guard above looks for.
    # This was previously written to `Trainer.evaluation`, so the guard could
    # never trigger and an unrelated attribute was added to Trainer.
    Trainer._original_evaluation_loop = evaluation_loop_source
    evaluation_loop_source, _ = detab_code(evaluation_loop_source)

    # Apply the nanmean patches (str.replace is a no-op for a missing pattern)
    evaluation_loop_source = evaluation_loop_source.replace(
        ORIGINAL_EVAL_CODE["list"], PATCHED_EVAL_CODE["list"]
    )
    evaluation_loop_source = evaluation_loop_source.replace(
        ORIGINAL_EVAL_CODE["array"], PATCHED_EVAL_CODE["array"]
    )

    # Rename the function to avoid conflicts
    evaluation_loop_source = evaluation_loop_source.replace(
        "def evaluation_loop(",
        "def axolotl_evaluation_loop(",
        1,
    )

    # Get the module for necessary imports
    module_name = Trainer.__module__
    module = importlib.import_module(module_name)

    # Import every module-level name that appears (as a substring) in the source
    items_to_import = [item for item in dir(module) if item in evaluation_loop_source]

    # Execute the imports and patched method
    exec(
        f"from {module_name} import ({', '.join(items_to_import)})",
        globals(),
    )
    exec(evaluation_loop_source, globals())

    LOG.debug("Patched Trainer.evaluation_loop with nanmean loss calculation")
    Trainer.evaluation_loop = globals()["axolotl_evaluation_loop"]
def fixed_fa_peft_integration_check(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    target_dtype: Optional[torch.dtype] = None,
    preferred_dtype: Optional[torch.dtype] = None,
):
    """
    Cast float32 attention inputs back to a lower-precision dtype.

    PEFT commonly keeps layer norms in float32 for stability, which silently
    upcasts the hidden states feeding Flash Attention. When any of the three
    inputs is float32, all three are converted to the effective target dtype.

    Args:
        query (`torch.Tensor`):
            Query states to be passed to the Flash Attention API
        key (`torch.Tensor`):
            Key states to be passed to the Flash Attention API
        value (`torch.Tensor`):
            Value states to be passed to the Flash Attention API
        target_dtype (`torch.dtype`, *optional*):
            Dtype to convert to. With neither dtype given, nothing is converted.
        preferred_dtype (`torch.dtype`, *optional*):
            When given, takes precedence over `target_dtype`.

    Returns:
        The `(query, key, value)` tuple, converted or untouched.
    """
    if target_dtype is None and preferred_dtype is None:
        return query, key, value

    # A provided preferred dtype always overrides the target dtype.
    if preferred_dtype:
        target_dtype = preferred_dtype

    states = (query, key, value)
    if torch.float32 not in [state.dtype for state in states]:
        # Nothing was upcast, so the inputs are handed back as they came in.
        return query, key, value

    logger.warning_once(
        f"The input hidden states seems to be silently casted in float32, this might be related to"
        f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
        f" {target_dtype}."
    )
    query, key, value = (state.to(target_dtype) for state in states)
    return query, key, value
def integrate_cross_entropy_loss_patch(model_type: str = "llama") -> None:
    """
    Replace the causal-LM loss in ``transformers.loss.loss_utils`` with unsloth's.

    Args:
        model_type: only ``"llama"`` is accepted.

    Raises:
        ValueError: for any other ``model_type``.
    """
    from unsloth.kernels.cross_entropy_loss import fast_cross_entropy_loss

    def UnslothForCausalLMLoss(
        logits,
        labels,
        vocab_size: int,
        num_items_in_batch: int = None,
        ignore_index: int = -100,
        **kwargs,
    ):
        # Upcast to float to avoid precision issues, then shift so that
        # tokens < n predict n.
        upcast_logits = logits.float()
        return fast_cross_entropy_loss(
            logits=upcast_logits[..., :-1, :].contiguous(),
            labels=labels[..., 1:].contiguous(),
            n_items=num_items_in_batch,
        )

    if model_type != "llama":
        raise ValueError("Unsupported model type")

    from transformers.loss import loss_utils

    loss_utils.ForCausalLMLoss = UnslothForCausalLMLoss  # type: ignore[assignment]
def integrate_lora_mlp_patch(peft_model: PeftModelForCausalLM):
    """
    Bind unsloth's fused LoRA MLP forward onto every eligible decoder layer.

    The kernel is picked from the base model's ``model_type`` (swiglu for
    llama/mistral, approximate geglu for gemma). A layer is eligible when all
    of its gate/up/down projections carry LoRA weights, have no bias and have
    no DoRA magnitude vector; other layers are left alone with a warning.

    Raises:
        NotImplementedError: for any other model type.
    """
    model_type = peft_model.base_model.config.model_type
    if model_type in ["llama", "mistral"]:
        from unsloth.kernels import apply_lora_mlp_swiglu as apply_lora_mlp
    elif model_type == "gemma":
        from unsloth.kernels import apply_lora_mlp_geglu_approx as apply_lora_mlp
    else:
        raise NotImplementedError(f"Model type {model_type} not supported")

    for idx, layer in enumerate(peft_model.model.model.layers):
        projections = [
            getattr(layer.mlp, proj_name)
            for proj_name in ("gate_proj", "up_proj", "down_proj")
        ]

        # All three checks are evaluated up front, exactly as before.
        has_lora = all(hasattr(proj, "lora_A") for proj in projections)
        bias_free = all(
            getattr(proj, "base_layer", proj).bias is None for proj in projections
        )
        dora_free = all(
            len(getattr(proj, "lora_magnitude_vector", []) or []) == 0
            for proj in projections
        )

        if not (has_lora and bias_free and dora_free):
            LOG.warning(f"unable to apply unsloth lora mlp patch to layer {idx}")
            continue

        layer.mlp.forward = types.MethodType(apply_lora_mlp, layer.mlp)
def _fused_lora_applicable(modules) -> bool:
    """Return whether every module is a bias-free, non-DoRA LoRA layer."""
    # All three checks are evaluated before being combined, as in the inline form.
    has_lora = all(hasattr(module, "lora_A") for module in modules)
    bias_free = all(
        getattr(module, "base_layer", module).bias is None for module in modules
    )
    dora_free = all(
        len(getattr(module, "lora_magnitude_vector", []) or []) == 0
        for module in modules
    )
    return has_lora and bias_free and dora_free


def integrate_lora_patch(peft_model: PeftModelForCausalLM, cfg):
    """
    Attach unsloth's fused LoRA projections to each decoder layer's attention.

    For every layer, ``apply_qkv`` is set when ``cfg.unsloth_lora_qkv`` is
    truthy and ``apply_o`` is set when ``cfg.unsloth_lora_o`` is truthy. The
    unsloth kernel is used when the projections involved carry LoRA weights,
    have no bias and have no DoRA magnitude vector; otherwise the plain
    fallback is assigned and a warning is logged.
    """
    from unsloth.kernels import apply_lora_o, apply_lora_qkv

    for idx, layer in enumerate(peft_model.model.model.layers):
        if cfg.unsloth_lora_qkv:
            qkv_modules = [
                getattr(layer.self_attn, name)
                for name in ("q_proj", "k_proj", "v_proj")
            ]
            if _fused_lora_applicable(qkv_modules):
                layer.self_attn.apply_qkv = apply_lora_qkv
            else:
                layer.self_attn.apply_qkv = original_apply_qkv
                LOG.warning(f"unable to apply unsloth lora qkv patch to layer {idx}")

        if cfg.unsloth_lora_o:
            o_modules = [getattr(layer.self_attn, name) for name in ("o_proj",)]
            if _fused_lora_applicable(o_modules):
                layer.self_attn.apply_o = apply_lora_o
            else:
                layer.self_attn.apply_o = original_apply_o
                LOG.warning(f"unable to apply unsloth lora o_proj patch to layer {idx}")
def get_cu_seqlens(attn_mask):
    """
    Build cumulative sequence lengths for flash attention from an attention mask.

    Each row is scanned for runs of equal non-zero values; zeros are dropped
    before the scan. If the row is longer than its non-zero part, the
    difference is appended as one extra trailing segment.

    Args:
        attn_mask: 1-D or 2-D mask tensor; a 1-D input is treated as one row.

    Returns:
        A tuple ``(cu_seqlens, max_seq_lens)``: the stacked per-row cumulative
        lengths as int32 (each row starts at 0), and the stacked per-row
        longest segment length. Rows must yield equally many segments for the
        stacking to succeed.
    """
    if attn_mask.dim() == 1:
        attn_mask = attn_mask.unsqueeze(0)

    device = attn_mask.device

    def as_int32(values):
        return torch.tensor(values, dtype=torch.int32, device=device)

    all_cu_seqlens = []
    all_max_lens = []
    for row in attn_mask:
        # Exclude zeros so their positions do not take part in the run detection
        packed = row[row != 0]

        # 1 wherever the value differs from its predecessor (first position included)
        starts_new_seq = torch.cat([as_int32([1]), packed[1:] != packed[:-1]])

        # Run start offsets, closed off by the number of non-zero entries
        boundaries = torch.cat(
            [
                (starts_new_seq == 1).nonzero(as_tuple=True)[0],
                as_int32([len(packed)]),
            ]
        )
        lengths = boundaries[1:] - boundaries[:-1]

        # Whatever is left of the row counts as a final sequence or padding
        trailing = (len(row) - boundaries[-1]).item()
        if trailing:
            lengths = torch.cat([lengths, as_int32([trailing])])

        cu_seqlens = torch.cat([as_int32([0]), lengths.cumsum(0)])
        all_cu_seqlens.append(cu_seqlens)
        all_max_lens.append((cu_seqlens[1:] - cu_seqlens[:-1]).max())

    return torch.stack(all_cu_seqlens).to(dtype=torch.int32), torch.stack(all_max_lens)
def detab_code(code: str) -> Tuple[str, str]:
    """
    Strip the leading whitespace of ``code`` from the start of every line.

    The whitespace run at the very beginning of ``code`` is taken as the
    indent, and each line that starts with exactly that run has it removed.

    Returns:
        ``(dedented_code, indent)``; ``(code, "")`` when ``code`` does not
        start with whitespace.
    """
    leading = re.match(r"([\s\t]{1,})", code)
    if leading is None:
        return code, ""

    indent = leading.group(0)
    dedented = re.sub(r"^" + indent, "", code, flags=re.MULTILINE)
    return dedented, indent
weights self.swiglu.w12.weight.data = torch.cat( (gate_proj.weight.data, up_proj.weight.data), dim=0 ) self.swiglu.w3.weight.data = down_proj.weight.data def _post_training(self, model, name): w1, w2 = torch.split( self.swiglu.w12.weight.data, self.config.intermediate_size, dim=0 ) # Assign the split weights back to the original layers new_mlp = LlamaMLP(self.config) new_mlp.gate_proj.weight.data = w1 new_mlp.up_proj.weight.data = w2 new_mlp.down_proj.weight.data = self.swiglu.w3.weight.data set_module_name(model, name, new_mlp) def forward(self, x: torch.Tensor) -> torch.Tensor: return self.swiglu(x) ================================================ FILE: src/axolotl/processing_strategies.py ================================================ """Module containing ProcessingStrategy classes and its derivative for different MultiModal Model types""" from copy import deepcopy from typing import Optional from PIL import Image, ImageOps from PIL.Image import Resampling from torch import Tensor, zeros_like from transformers import ProcessorMixin from transformers.image_utils import load_image from transformers.models.internvl import InternVLProcessor from transformers.models.smolvlm import SmolVLMProcessor from transformers.models.voxtral import VoxtralProcessor from axolotl.utils.dict import remove_none_values from axolotl.utils.logging import get_logger LOG = get_logger(__name__) class ProcessingStrategy: """Base Processing Strategy class""" def __init__( self, processor: ProcessorMixin, chat_template: Optional[str] = None, image_size: int | tuple[int, int] | None = None, image_resize_algorithm: Resampling | None = None, ): self.processor = processor self.chat_template = chat_template self.image_token = None self.image_token_id = None self.image_size = image_size self.image_resize_algorithm = ( image_resize_algorithm or Image.Resampling.BILINEAR ) if hasattr(processor, "image_token"): self.image_token = processor.image_token self.image_token_id = 
def __call__(self, examples: list[dict]) -> list[dict]:
    """
    Preprocess conversation examples to ensure consistent format.
    Converts different conversation formats to OpenAI format with 'messages'.
    Supports two formats:
    1. OpenAI format with 'messages'
    2. Legacy format with 'conversations'

    If the example has an `images` or `image` column, the first image is
    loaded, optionally resized, and attached either to the first dangling
    `{"type": "image"}` placeholder or to the end of the first user message.

    The input examples are not modified; new dicts/lists are built instead.

    Args:
        examples: list of conversation dictionaries

    Returns:
        list of dicts in OpenAI format with 'messages' key

    Raises:
        ValueError: If the conversation format is not supported
    """
    role_mapping = {
        "human": "user",
        "gpt": "assistant",
    }
    # keys that mark an image content part as already carrying its payload
    image_payload_keys = ["image", "url", "path", "base64"]

    def normalize_role(role: str) -> str:
        """Normalize role names to OpenAI format. Default to original role if not found."""
        return role_mapping.get(role, role)

    def convert_legacy_format(example: dict) -> dict:
        """Convert legacy 'conversations' format to OpenAI 'messages' format."""
        messages = [
            {"role": normalize_role(convo["from"]), "content": convo["value"]}
            for convo in example["conversations"]
        ]

        # Create new dict without 'conversations' key
        result = deepcopy(example)
        result.pop("conversations")
        result["messages"] = messages
        return result

    def convert_messages_to_multimedia_messages(messages: list[dict]) -> list[dict]:
        """Convert regular messages format to Messages format with content type"""
        new_messages = []
        for message in messages:
            content = message["content"]
            if isinstance(content, str):
                new_messages.append(
                    {
                        "role": message["role"],
                        "content": [{"type": "text", "text": content}],
                    }
                )
            elif isinstance(content, list):
                # Copy the list and its dict parts so that attaching an image
                # below never writes into the caller's data.
                new_messages.append(
                    {
                        "role": message["role"],
                        "content": [
                            dict(part) if isinstance(part, dict) else part
                            for part in content
                        ],
                    }
                )
            # messages with any other content type are dropped, as before
        return new_messages

    processed_examples = []
    for example in examples:
        if "messages" in example and example["messages"] is not None:
            # OpenAI format; shallow copy so the caller's dict is left intact
            processed_example = dict(example)
        elif "conversations" in example and example["conversations"] is not None:
            # Legacy format
            processed_example = convert_legacy_format(example)
        else:
            raise ValueError(
                "Only `messages` and `conversations` message keys are currently supported."
            )

        # convert regular messages format to Messages format with content type
        # for compatibility with apply_chat_template
        processed_example["messages"] = convert_messages_to_multimedia_messages(
            processed_example["messages"]
        )
        messages = processed_example["messages"]

        # find the image key if it exists (first matching key wins)
        image_key = next(
            (key for key in ("images", "image") if key in processed_example), None
        )
        images = processed_example[image_key] if image_key is not None else None
        # A single image (PIL image, path, url, ...) is treated as a list of one
        # instead of being indexed element-wise.
        if images is not None and not isinstance(images, (list, tuple)):
            images = [images]

        # if at least one image exists, add the first one to the conversation
        if images:
            # TODO: check if it's normal to be single image only for common datasets
            # From observation, it's usually a list of single image but some datasets may have several columns for images
            # Temporary solution: take the first image and suggest people convert their datasets to use multi-content Messages
            if len(images) > 1:
                LOG.warning(
                    f"Found {len(images)} images in a sample. Using the first one. "
                    "If you are using a dataset with multiple images per sample, please convert it to use multi-content Messages. "
                    "See https://docs.axolotl.ai/docs/multimodal.html#dataset-format"
                )

            # Handle image loading (Image, url, path, base64)
            image_value = load_image(images[0])

            if self.image_size is not None:
                assert hasattr(image_value, "resize"), (
                    "Image does not have a resize method"
                )

                if isinstance(self.image_size, tuple):
                    image_value = image_value.resize(
                        self.image_size, self.image_resize_algorithm
                    )
                else:
                    # Set the padding value; here we use black (0, 0, 0) for RGB images
                    padding_color = (0, 0, 0)

                    # When image_size is an int (square target), preserve aspect ratio then pad
                    # This is to prevent aspect ratio distortion when resizing to square
                    image_value = ImageOps.pad(
                        image_value,
                        (self.image_size, self.image_size),
                        method=self.image_resize_algorithm,
                        color=padding_color,
                    )

            # Look for the first image placeholder that has no payload yet.
            # Usually datasets created with image columns don't carry the image
            # in the messages themselves.
            placeholder = None
            first_user_idx = None
            for msg_idx, message in enumerate(messages):
                if first_user_idx is None and message["role"] == "user":
                    first_user_idx = msg_idx
                for part_idx, part in enumerate(message["content"]):
                    if part["type"] == "image" and all(
                        key not in part for key in image_payload_keys
                    ):
                        placeholder = (msg_idx, part_idx)
                        break
                if placeholder is not None:
                    # stop at the first match; later placeholders must not override it
                    break

            if placeholder is not None:
                msg_idx, part_idx = placeholder
                messages[msg_idx]["content"][part_idx]["image"] = image_value
            else:
                # no placeholder: append to the first user message, falling back
                # to the very first message when there is no user turn
                if first_user_idx is None:
                    first_user_idx = 0
                messages[first_user_idx]["content"].append(
                    {
                        "type": "image",
                        "image": image_value,
                    }
                )

        processed_examples.append(remove_none_values(processed_example))

    return processed_examples
class Qwen3_5ProcessingStrategy(ProcessingStrategy):
    """Processing Strategy class for Qwen3.5 (early-fusion VLM)

    Registers the image and video placeholder tokens and excludes the video
    placeholder from the loss on top of whatever the parent class masks.
    """

    def __init__(
        self,
        processor: ProcessorMixin,
        chat_template: Optional[str] = None,
        image_size: int | tuple[int, int] | None = None,
        image_resize_algorithm: Resampling | None = None,
    ):
        super().__init__(processor, chat_template, image_size, image_resize_algorithm)
        to_token_id = processor.tokenizer.convert_tokens_to_ids

        # placeholder strings first, then their vocabulary ids
        self.image_token = "<|image_pad|>"  # nosec
        self.video_token = "<|video_pad|>"  # nosec
        self.image_token_id = to_token_id(self.image_token)
        self.video_token_id = to_token_id(self.video_token)

    def process_labels(self, input_ids):
        """Return the parent's labels with video placeholder positions set to -100."""
        masked = super().process_labels(input_ids)
        is_video_placeholder = masked == self.video_token_id
        masked[is_video_placeholder] = -100
        return masked
class Gemma3nProcessingStrategy(ProcessingStrategy):
    """Processing Strategy class for Gemma3n"""

    def _mask_non_assistant(self, labels: Tensor) -> Tensor:
        """
        Set every label outside an assistant turn to -100 (in place).

        An assistant turn starts right after the tokens of `<start_of_turn>model`
        and runs up to and including the next `<end_of_turn>`. The start marker
        itself is not trained on. If a turn is never closed, everything up to
        the end of the row is kept.

        Args:
            labels: batch of label rows; modified in place

        Returns:
            the same `labels` tensor

        Raises:
            ValueError: if either turn marker tokenizes to an empty sequence
        """
        # Gemma chat-format turn delimiters
        assistant_start_str = "<start_of_turn>model"
        assistant_end_str = "<end_of_turn>"
        include_assistant_start_tok = False
        include_assistant_end_tok = True

        # str to tokens
        assistant_start_tok = self.processor.tokenizer.encode(
            assistant_start_str, add_special_tokens=False
        )
        assistant_end_tok = self.processor.tokenizer.encode(
            assistant_end_str, add_special_tokens=False
        )
        if not assistant_start_tok or not assistant_end_tok:
            # an empty marker can never be matched and would break the scan below
            raise ValueError(
                "Assistant turn markers must tokenize to at least one token."
            )
        start_len = len(assistant_start_tok)
        end_len = len(assistant_end_tok)

        mask = zeros_like(labels)
        for i, label in enumerate(labels):
            # plain Python list: avoids one tensor slice + tolist() per position
            ids = label.tolist()
            seq_len = len(ids)

            j = 0
            while j < seq_len:
                # Check until match start seq
                if ids[j : j + start_len] != assistant_start_tok:
                    j += 1
                    continue

                if include_assistant_start_tok:
                    mask[i][j : j + start_len] = 1

                # Find where the assistant response ends
                start_of_content = j + start_len
                end_at = start_of_content
                while (
                    end_at < seq_len
                    and ids[end_at : end_at + end_len] != assistant_end_tok
                ):
                    end_at += 1

                # the response content itself is always trained on
                mask[i][start_of_content:end_at] = 1

                if end_at < seq_len:
                    # end marker found: optionally train on it, then resume after it
                    if include_assistant_end_tok:
                        mask[i][end_at : end_at + end_len] = 1
                    j = end_at + end_len
                else:
                    j = seq_len

            labels[i][mask[i] == 0] = -100

        return labels

    def process_labels(self, input_ids):
        """Clone `input_ids` into labels, keep assistant turns only, and mask pad/media tokens."""
        labels = input_ids.clone()
        labels = self._mask_non_assistant(labels)

        # Follows https://colab.research.google.com/github/huggingface/huggingface-gemma-recipes/blob/main/notebooks/fine_tune_gemma3n_on_t4.ipynb
        tokenizer = self.processor.tokenizer
        labels[labels == tokenizer.pad_token_id] = -100
        # media-related ids are only masked when the tokenizer exposes them
        for attr in (
            "image_token_id",
            "audio_token_id",
            "boi_token_id",
            "eoi_token_id",
        ):
            if hasattr(tokenizer, attr):
                labels[labels == getattr(tokenizer, attr)] = -100

        return labels
class InternVLProcessingStrategy(ProcessingStrategy):
    """Processing Strategy class for InternVL

    Takes the image token ids from the processor's `image_ids` attribute and
    excludes them, together with padding, from the loss.
    """

    def __init__(
        self,
        processor: ProcessorMixin,
        chat_template: Optional[str] = None,
        image_size: int | tuple[int, int] | None = None,
        image_resize_algorithm: Resampling | None = None,
    ):
        super().__init__(processor, chat_template, image_size, image_resize_algorithm)

        if hasattr(processor, "image_ids"):
            self.image_token_ids = processor.image_ids
        else:
            raise ValueError("'image_ids' missing from InternVL Processor.")

    def process_labels(self, input_ids):
        """Return a copy of `input_ids` with pad and image token ids replaced by -100."""
        masked = input_ids.clone()

        # padding first, then every image-related id
        ignored_ids = [self.processor.tokenizer.pad_token_id, *self.image_token_ids]
        for token_id in ignored_ids:
            masked[masked == token_id] = -100

        # Note: Check if need to mask 'video_token' as it gets converted to
        # image patches during media processing

        return masked
def get_processing_strategy(
    processor: ProcessorMixin,
    chat_template,
    chat_template_type,
    image_size: int | tuple[int, int] | None = None,
    image_resize_algorithm: Resampling | None = None,
):
    """
    Pick the processing strategy matching the chat template type or processor.

    Resolution order: explicit `chat_template_type` (qwen2_vl, qwen3_5 /
    qwen3_5_moe, gemma3, gemma3n), then the processor class (Voxtral, SmolVLM,
    Mistral3, GLM-4.6V when available, InternVL), and finally the generic
    `ProcessingStrategy`.

    Args:
        processor: multimodal processor (or tokenizer-like object)
        chat_template: chat template string handed to the strategy
        chat_template_type: template identifier; `None` / "tokenizer_default"
            makes the tokenizer's own `chat_template` take precedence
        image_size: optional target size forwarded to the strategy
        image_resize_algorithm: optional resampling filter forwarded to the strategy

    Returns:
        an instance of `ProcessingStrategy` or one of its subclasses
    """
    from axolotl.utils.mistral.mistral3_processor import Mistral3Processor

    processing_kwargs = {
        "processor": processor,
        "chat_template": chat_template,
        "image_size": image_size,
        "image_resize_algorithm": image_resize_algorithm,
    }

    if chat_template_type in [None, "tokenizer_default"]:
        tokenizer = getattr(processor, "tokenizer", processor)
        if hasattr(tokenizer, "chat_template"):
            processing_kwargs["chat_template"] = tokenizer.chat_template

    if chat_template_type == "qwen2_vl":
        return Qwen2VLProcessingStrategy(
            **processing_kwargs,
        )
    if chat_template_type in ["qwen3_5", "qwen3_5_moe"]:
        return Qwen3_5ProcessingStrategy(
            **processing_kwargs,
        )
    if chat_template_type == "gemma3":
        return Gemma3ProcessingStrategy(
            **processing_kwargs,
        )
    if chat_template_type == "gemma3n":
        return Gemma3nProcessingStrategy(
            **processing_kwargs,
        )

    if isinstance(processor, VoxtralProcessor):
        return VoxtralProcessingStrategy(
            **processing_kwargs,
        )

    if isinstance(processor, SmolVLMProcessor):
        return SmolVLM2ProcessingStrategy(
            **processing_kwargs,
        )

    if isinstance(processor, Mistral3Processor):
        return Mistral3ProcessingStrategy(
            **processing_kwargs,
        )

    # The GLM-4.6V processor may be missing from the installed transformers.
    # Only the import is guarded, so that an ImportError raised while building
    # the strategy itself is not silently swallowed.
    try:
        from transformers.models.glm46v.processing_glm46v import Glm46VProcessor
    except ImportError:
        Glm46VProcessor = None

    if Glm46VProcessor is not None and isinstance(processor, Glm46VProcessor):
        return Glm4vProcessingStrategy(
            **processing_kwargs,
        )

    if isinstance(processor, InternVLProcessor):
        return InternVLProcessingStrategy(
            **processing_kwargs,
        )

    # llama3_2_vision, llama4, llava
    # mistral_v7_tekken, pixtral, lfm2vl
    return ProcessingStrategy(
        **processing_kwargs,
    )
def load(strategy, tokenizer, cfg, ds_cfg, processor=None):
    """
    Resolve a prompt strategy name to its loader function and call it.

    `strategy` may be a module name inside `axolotl.prompt_strategies`, a name
    ending in `.load` / `.load_<something>` selecting a specific loader
    function, or a dotted path whose parent package is importable on its own.

    Returns the loader's result, or `None` when a module cannot be found.
    Any other failure is logged and re-raised.
    """
    try:
        if strategy == "messages":
            from .messages import load as messages_load

            return messages_load(tokenizer, cfg, ds_cfg, processor=processor)

        load_fn = "load"
        package = "axolotl.prompt_strategies"

        *parent_parts, leaf = strategy.split(".")
        if leaf.startswith("load_") or leaf == "load":
            # trailing component names the loader function, not a module
            load_fn = leaf
            strategy = ".".join(parent_parts)
        elif parent_parts:
            parent = ".".join(parent_parts)
            try:
                # probe whether the dotted path is a module outside axolotl
                importlib.import_module("." + leaf, parent)
                package = parent
                strategy = leaf
            except ModuleNotFoundError:
                pass

        mod = importlib.import_module(f".{strategy}", package)
        func = getattr(mod, load_fn)

        if strategy == "user_defined":
            load_kwargs = {"ds_cfg": UserDefinedDatasetConfig(**ds_cfg)}
        else:
            # only pass the optional arguments the loader actually declares
            accepted = inspect.signature(func).parameters
            load_kwargs = {}
            if "ds_cfg" in accepted:
                load_kwargs["ds_cfg"] = ds_cfg
            if "processor" in accepted:
                load_kwargs["processor"] = processor

        return func(tokenizer, cfg, **load_kwargs)
    except ModuleNotFoundError:
        return None
    except Exception as exc:
        LOG.error(f"Failed to load prompt strategy `{strategy}`: {str(exc)}")
        raise exc
class AlpacaChatPrompter(AlpacaPrompter):
    """
    Alpaca Chat Prompter extending the system prompt for chat-instruct answers
    """

    # System text whose wording covers an instruction paired with an input.
    system_prompt = "Below is an instruction from a USER that describes a task, paired with an input that provides further context. The ASSISTANT writes a response that concisely and appropriately completes the request.\n\n"
    # System text whose wording covers an instruction without an input.
    system_no_input_prompt = "Below is an instruction from a USER that describes a task. The ASSISTANT writes a response that appropriately and concisely completes the request.\n\n"

    def __init__(self) -> None:
        """Pin the prompt style to chat; takes no arguments."""
        self.prompt_style = PromptStyle.CHAT.value
        # NOTE(review): presumably sets the turn formats for the chosen style;
        # the method is defined outside this class - confirm in AlpacaPrompter.
        self.match_prompt_style()
class InstructionWSystemPromptTokenizingStrategy(PromptTokenizingStrategy):
    """
    Tokenizing strategy for instruction prompts that carry a per-sample system prompt.
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
        """Return (instruction, input, output, system); a missing `input` becomes ""."""
        instruction = prompt["instruction"]
        extra_input = prompt["input"] if "input" in prompt else ""
        return (instruction, extra_input, prompt["output"], prompt["system"])

    def tokenize_prompt(self, prompt):
        """
        Tokenize the prompt part and the response separately and join them.

        When `train_on_inputs` is off, the prompt part of the labels is
        replaced by -100 so only the response contributes to the loss.
        """
        instruction, extra_input, response, system = self.parse_instruction_fields(
            prompt
        )

        # the prompter yields the fully formatted prompt as its first item
        prompt_texts = self.prompter.build_prompt_w_system(
            system, instruction, extra_input
        )
        user_prompt = next(iter(prompt_texts))

        tokenized = self._tokenize(user_prompt, add_eos_token=False)
        if not self.train_on_inputs:
            # TODO this could be sped up using numpy array slicing
            tokenized["labels"] = [-100] * len(tokenized["input_ids"])

        tokenized_response = self._tokenize(
            response, strip_bos_token=True, add_eos_token=True
        )
        # (target field, response field); labels reuse the response input ids
        for target_key, source_key in (
            ("input_ids", "input_ids"),
            ("attention_mask", "attention_mask"),
            ("labels", "input_ids"),
        ):
            tokenized[target_key] += tokenized_response[source_key]

        return tokenized
class OpenOrcaSystemDataPrompter(SystemDataPrompter):
    """
    Alpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts
    """

    def match_prompt_style(self):
        """Set the turn and system format strings for the configured prompt style."""
        # (style, turn_format, turn_no_input_format, system_format)
        style_formats = (
            (
                PromptStyle.INSTRUCT.value,
                "### Human:\n{instruction}\n### Additional Context:\n{input}\n### Assistant:\n",
                "### Human:\n{instruction}\n### Assistant:\n",
                "### System:\n{system}\n",
            ),
            (
                PromptStyle.CHAT.value,
                "USER: {instruction}\n{input}\nASSISTANT:",
                "USER: {instruction}\nASSISTANT:",
                "SYSTEM: {system}\n",
            ),
            (
                PromptStyle.CHATML.value,
                "<|im_start|>user\n{instruction}\n{input}<|im_end|>\n<|im_start|>assistant\n",
                "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n",
                "<|im_start|>system\n{system}<|im_end|>\n",
            ),
        )

        # every entry is checked in order; unknown styles leave the formats untouched
        for style, turn, turn_no_input, system in style_formats:
            if self.prompt_style == style:
                self.turn_format = turn
                self.turn_no_input_format = turn_no_input
                self.system_format = system
def load(strategy, cfg, module_base=None, **kwargs):
    """
    Resolve `strategy` to a transform factory and call it with `cfg`.

    `strategy` has the form `<module path>.<function>`; a bare module name
    means `<module>.default`. The module path is first tried as an absolute
    import (so strategies living outside axolotl can be referenced by their
    full dotted path) and, if no such module exists, as a path relative to
    `module_base`.

    Args:
        strategy: dotted strategy name
        cfg: configuration object passed to the factory as first argument
        module_base: package used to resolve built-in strategies
        **kwargs: forwarded to the factory

    Returns:
        whatever the factory returns, or `None` if the strategy could not be
        loaded or the factory raised (a warning is logged in that case)
    """
    try:
        parts = strategy.split(".")
        if len(parts) == 1:
            parts.append("default")
        *module_parts, load_fn = parts
        module_path = ".".join(module_parts)

        try:
            # import_module ignores `package` for non-relative names, so the
            # full dotted path has to be given here
            mod = importlib.import_module(module_path)
        except ModuleNotFoundError:
            mod = importlib.import_module(f".{module_path}", module_base)

        func = getattr(mod, load_fn)
        return func(cfg, **kwargs)
    except Exception:
        # best-effort loader: callers treat None as "strategy not available"
        LOG.warning(f"unable to load strategy {strategy}")
        return None
class BTChatTemplateStrategy(ChatTemplateStrategy):
    """
    Bradley-Terry reward model pairwise chat template prompt strategy.

    Each dataset row provides `input`, `chosen`, `rejected` and optionally
    `system`; the chosen and the rejected conversation are tokenized
    separately with the parent chat-template strategy.
    """

    @property
    def supports_batched(self) -> bool:
        return False

    def _tokenize_bt_completion(self, prompt, completion_key: str, max_length: int):
        """
        Tokenize system/user/assistant messages using `prompt[completion_key]`
        as the assistant reply, trimming ids and attention mask to `max_length`.
        """
        messages = []
        # the system column is optional
        system = prompt["system"] if "system" in prompt else None
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt["input"]})
        messages.append({"role": "assistant", "content": prompt[completion_key]})
        prompt["messages"] = messages

        tokenized = super()._tokenize_single_prompt(prompt)

        if len(tokenized["input_ids"]) > max_length:
            LOG.warning(
                f"To-be-trimmed {completion_key} sequence exceeds max sequence length: {len(tokenized['input_ids'])}"
            )
            tokenized["input_ids"] = tokenized["input_ids"][:max_length]
            tokenized["attention_mask"] = tokenized["attention_mask"][:max_length]

        return tokenized

    def _tokenize_single_prompt(self, prompt):
        """
        :param prompt: the actual row of data from the underlying dataset
        :return: dict with the tokenized chosen/rejected pair and their labels
        """
        max_length = self.prompter.max_length

        chosen_tokenized = self._tokenize_bt_completion(prompt, "chosen", max_length)
        rejected_tokenized = self._tokenize_bt_completion(
            prompt, "rejected", max_length
        )

        return {
            "chosen_input_ids": chosen_tokenized["input_ids"],
            "attention_mask_chosen": chosen_tokenized["attention_mask"],
            "labels_chosen": 1.0,
            "rejected_input_ids": rejected_tokenized["input_ids"],
            "attention_mask_rejected": rejected_tokenized["attention_mask"],
            "labels_rejected": 0.0,
        }
https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs """ def transform_fn(sample): if "system" in sample and sample["system"]: prompt = ( f"<|start_header_id|>system<|end_header_id|>\n\n{sample['system']}<|eot_id|>" f"<|start_header_id|>user<|end_header_id|>\n\n{sample['input']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" ) else: prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{sample['input']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" sample["chosen"] = prompt + f"{sample['chosen']}<|eot_id|>" sample["rejected"] = prompt + f"{sample['rejected']}<|eot_id|>" return sample return transform_fn ================================================ FILE: src/axolotl/prompt_strategies/chat_template.py ================================================ """ HF Chat Templates prompt strategy """ import json from collections import defaultdict from typing import TYPE_CHECKING, Any, Dict, List, Set, Union from pydantic import BaseModel from transformers import ProcessorMixin from axolotl.prompt_strategies.jinja_template_analyzer import JinjaTemplateAnalyzer from axolotl.prompt_tokenizers import PromptTokenizingStrategy from axolotl.prompters import IGNORE_TOKEN_ID, Prompter from axolotl.utils.chat_templates import get_chat_template_from_config from axolotl.utils.dict import remove_none_values from axolotl.utils.logging import get_logger from axolotl.utils.schemas.datasets import DatasetConfig if TYPE_CHECKING: from axolotl.utils.mistral import HFMistralTokenizer # Configure the logger LOG = get_logger(__name__) LOG.setLevel("INFO") class ChatTemplatePrompter(Prompter): """Prompter for HF chat templates""" def __init__( self, tokenizer, chat_template: str, processor=None, max_length=2048, message_property_mappings: dict[str, str] | None = None, message_field_training: str | None = None, message_field_training_detail: str | None = None, field_messages: str = "messages", field_system: str = "system", field_tools: str = 
def build_prompt(
    self,
    conversation: list[dict],
    add_generation_prompt=False,
    images=None,
    tools=None,
    real_last_index=None,
):
    """
    Build a prompt from a conversation.

    Args:
        conversation: A list of messages.
        add_generation_prompt: Whether to add a generation prompt.
        images: A list of images. (optional) Only used on the processor path.
        tools: A list of tools. (optional) Forwarded to the template when truthy.
        real_last_index: Index of the last turn of the full conversation.
            (optional) Forwarded to the template whenever it is not None.

    Returns:
        Without a processor, whatever `tokenizer.apply_chat_template` returns
        for `tokenize=True, return_dict=False`. With a processor, a dict built
        from the processor output in which values exposing `tolist` are
        converted to lists.

    Raises:
        TypeError: If a processor is set but is not callable.
    """
    chat_template_kwargs = {
        "chat_template": self.chat_template,
        "add_generation_prompt": add_generation_prompt,
        **self.chat_template_kwargs,
    }

    if tools:
        chat_template_kwargs["tools"] = tools

    # Index 0 is a legitimate value (a single-turn conversation), so compare
    # against None instead of relying on truthiness.
    if real_last_index is not None:
        chat_template_kwargs["real_last_index"] = real_last_index

    if self.processor:
        if not callable(self.processor):
            raise TypeError("Processor must be callable")

        text = self.processor.apply_chat_template(
            conversation,
            tokenize=False,
            **chat_template_kwargs,
        )
        batch = self.processor(
            text=text,
            images=images,
            return_tensors="pt",
        )
        if hasattr(batch, "to_dict"):
            batch = batch.to_dict()
        else:
            batch = dict(batch)

        # workaround since processor works in batches instead of single examples
        out = {}
        for k, val in batch.items():
            if hasattr(val, "tolist"):
                # pixel_values is converted as-is; every other tensor drops
                # its leading dimension first.
                out[k] = (
                    val.tolist() if k == "pixel_values" else val.squeeze(0).tolist()
                )
            else:
                out[k] = val
        return out

    return self.tokenizer.apply_chat_template(
        conversation,
        tokenize=True,
        return_dict=False,
        **chat_template_kwargs,
    )
def adjust_train_details(
    self, train_details: List[Dict], token_offsets: List[tuple]
) -> List[Dict]:
    """
    Snap each train detail's character span onto token boundaries.

    Args:
        train_details: Dicts carrying `begin_offset`, `end_offset` and `train`.
        token_offsets: `(start, end)` character offsets, one pair per token.

    Returns:
        A new list of dicts with the same keys, whose offsets are taken from
        the first and last token selected for the span. Details for which no
        token range can be selected are dropped with a warning.
    """
    token_count = len(token_offsets)
    snapped = []

    for span in train_details:
        begin_offset = span["begin_offset"]
        end_offset = span["end_offset"]

        # First token starting at or after the requested begin offset.
        first_tok = token_count
        for pos, (tok_start, _tok_end) in enumerate(token_offsets):
            if tok_start >= begin_offset:
                first_tok = pos
                break
        # Step back one token when the previous token reaches past the begin offset.
        if first_tok > 0 and token_offsets[first_tok - 1][1] > begin_offset:
            first_tok -= 1

        # Last token ending at or before the requested end offset.
        last_tok = -1
        for pos in reversed(range(token_count)):
            if token_offsets[pos][1] <= end_offset:
                last_tok = pos
                break
        # Step forward one token when the next token starts before the end offset.
        if (
            last_tok < token_count - 1
            and token_offsets[last_tok + 1][0] < end_offset
        ):
            last_tok += 1

        if first_tok > last_tok:
            LOG.warning(
                f"Could not adjust detail offsets: ({begin_offset}, {end_offset}). Skipping this detail."
            )
            continue

        adjusted_begin = token_offsets[first_tok][0]
        adjusted_end = token_offsets[last_tok][1]
        if (adjusted_begin, adjusted_end) != (begin_offset, end_offset):
            LOG.warning(
                f"Adjusting detail offsets: ({begin_offset}, {end_offset}) -> ({adjusted_begin}, {adjusted_end})"
            )

        snapped.append(
            {
                "begin_offset": adjusted_begin,
                "end_offset": adjusted_end,
                "train": span["train"],
            }
        )

    return snapped
following properites on the message: {self.prompter.chat_template_msg_variables}" ) self._validate_eot_and_eos_tokens() def _validate_eot_and_eos_tokens(self): """ - Validates that EOT tokens (or eos_token) are in the chat_template - Checks if EOT tokens are encoded as multiple tokens in the tokenizer. - Checks for potential conflicts between train_on_eos and train_on_eot. """ if self.prompter.chat_template is None: # Usually this should not happen LOG.warning( "No chat template provided, skipping EOT and EOS token validation" ) return # If the EOT token is the same as the EOS token, we need to check differently if len(self.eot_tokens) == 1 and self.eot_tokens[0] == self.tokenizer.eos_token: # Check if the eos_token is in the chat_template or as a variable `eos_token` # Note: we check for `eos_token` in the string, but it could possibly not be a variable if ( self.tokenizer.eos_token not in self.prompter.chat_template and "eos_token" not in self.prompter.chat_template ): LOG.warning( f"EOS token '{self.tokenizer.eos_token}' not found in chat_template. Please check if your template/EOS token is correct." ) return # Create a new list to store tokens that should be kept valid_eot_tokens = [] for token in self.eot_tokens: # Check if EOT token is in the chat_template if token not in self.prompter.chat_template: LOG.warning(f"EOT token '{token}' not found in chat_template.") # Don't add to the valid tokens list continue valid_eot_tokens.append(token) # Replace the original list with the filtered one self.eot_tokens = valid_eot_tokens for token in self.eot_tokens: # If token in template, check if EOT token is in tokenizer and not encoded as multiple tokens token_ids = self.tokenizer.encode(token, add_special_tokens=False) if not token_ids: raise ValueError( "EOT token encoding failed. Please check if the token is valid and can be encoded." ) if token_ids and len(token_ids) > 1: raise ValueError( f"EOT token '{token}' is encoded as multiple tokens: {token_ids}. 
Please add it under `tokens: ` in the config " "or (recommended) override unused added_tokens via `added_tokens_overrides: `." ) # If eos_token is in eot_tokens and conflict between train_on_eos and train_on_eot, raise an error if ( self.tokenizer.eos_token in self.eot_tokens and self.train_on_eos != self.train_on_eot ): raise ValueError( "Conflict between train_on_eos and train_on_eot. eos_token is in eot_tokens and train_on_eos != train_on_eot" f"train_on_eos: {self.train_on_eos}, train_on_eot: {self.train_on_eot}" f"eot_tokens: {self.eot_tokens}" f"eos_token: {self.tokenizer.eos_token}" ) @property def supports_batched(self) -> bool: # Let calling code know we can handle lists of examples return True def is_prompt_batched(self, prompt: dict[str, Any]) -> bool: try: return all(isinstance(v, list) for v in prompt.values()) and all( isinstance(v, list) for v in prompt[self.prompter.field_messages] ) except KeyError: return False def tokenize_prompt(self, prompt: dict[str, Any]): """ Public method that can handle either a single prompt or a batch of prompts. """ prompt = remove_none_values(prompt) if not self.is_prompt_batched(prompt) or not self.supports_batched: return self._tokenize_single_prompt(prompt) res = defaultdict(lambda: []) feature_names = list(prompt.keys()) # Process each prompt individually for row in zip(*prompt.values(), strict=False): tokenized_prompt = self._tokenize_single_prompt( dict(zip(feature_names, row, strict=False)) ) for key, val in tokenized_prompt.items(): res[key].append(val) # If there are no examples left, return an empty dictionary if not res: return {} return dict(res) def _tokenize_single_prompt(self, prompt: dict) -> Dict[str, List[int]]: # Old simple legacy behavior that works reliably. 
if ( not self.roles_to_train and not self.train_on_eos and not self.train_on_eot and not self.prompter.message_field_training # type: ignore and not self.prompter.message_field_training_detail # type: ignore ): turns = self.get_conversation_thread(prompt) images = self._get_images(prompt) prompt_ids = self.prompter.build_prompt( # type: ignore turns[:-1], add_generation_prompt=True, images=images, ) tokenized_res = self.prompter.build_prompt(turns, images=images) # type: ignore tokenized_prompt = {} if isinstance(tokenized_res, list): input_ids = prompt_ids + tokenized_res[len(prompt_ids) :] tokenized_prompt["input_ids"] = input_ids tokenized_prompt["attention_mask"] = [1] * len(input_ids) else: input_ids = tokenized_res["input_ids"] tokenized_prompt = dict(tokenized_res) if not self.train_on_inputs: if isinstance(prompt_ids, dict): user_prompt_len = len(prompt_ids["input_ids"]) else: user_prompt_len = len(prompt_ids) labels = [-100] * user_prompt_len + input_ids[user_prompt_len:] else: labels = input_ids tokenized_prompt["labels"] = labels return tokenized_prompt turns = self.get_conversation_thread(prompt) tools = self._get_tools(prompt) input_ids = self.prompter.build_prompt(turns, tools=tools) # type: ignore labels = [IGNORE_TOKEN_ID] * len(input_ids) last_eos_idx = -1 last_eot_idx = -1 for index, turn in enumerate(turns): role = turn.get("role") content = turn.get("content") train_turn = turn.get("training") train_detail = turn.get("training_detail") LOG.debug( f"Processing turn {index}: role={role}, content={content}, train_turn={train_turn}, train_detail={train_detail}" ) should_train = None if train_turn is not None: should_train = train_turn elif train_detail is not None: should_train = bool(train_detail) else: should_train = self.train_on_inputs or role in self.roles_to_train LOG.debug(f"Should train: {should_train}") # turn not trainable, skip having to find the turn indices # unless last turn and train_on_eos/train_on_eot is all if not should_train and 
( self.train_on_eos != "all" and self.train_on_eot != "all" ): if index == len(turns) - 1: LOG.warning( "Last turn is not trainable, skipping having to find the turn indices. " "This may cause incorrect last EOT/EOS token to be unmasked." "This is likely a dataset design issue. Please ensure last turn is trainable." ) continue turn_start_idx, turn_end_idx = self.find_turn( turns=turns, turn_idx=index, tools=tools ) LOG.debug(f"Turn indices: start={turn_start_idx}, end={turn_end_idx}") if should_train and turn_start_idx != -1 and turn_end_idx != -1: if train_detail: # Block multi-content for now if not isinstance(content, str): raise ValueError( "`train_detail` is not supported when `content` is not a string." ) token_offsets = self.prompter.get_offsets_for_train_detail( # type: ignore content, train_detail ) LOG.debug(f"Token offsets: {token_offsets}") for i, offset in enumerate(token_offsets): if offset != IGNORE_TOKEN_ID and turn_start_idx + i < len( input_ids ): labels[turn_start_idx + i] = input_ids[turn_start_idx + i] LOG.debug( f"Label set at index {turn_start_idx + i}: {input_ids[turn_start_idx + i]}" ) else: labels[turn_start_idx:turn_end_idx] = input_ids[ turn_start_idx:turn_end_idx ] LOG.debug( f"Set labels for training from {turn_start_idx} to {turn_end_idx}" ) LOG.debug(f"Labels after processing turn {index}: {labels}") # Handle special tokens (EOT and EOS) for token_type, find_func, train_option in [ ("EOT", self.find_first_eot_token, self.train_on_eot), ("EOS", self.find_first_eos_token, self.train_on_eos), ]: token_idx = find_func(input_ids, start_idx=turn_end_idx) if ( token_idx != -1 and abs(token_idx - turn_end_idx) <= 3 ): # Allow for some template padding # Update the last token index if token_type == "EOT": # nosec B105 last_eot_idx = token_idx else: last_eos_idx = token_idx # Set labels if needed for this turn if train_option == "all" or ( train_option == "turn" and should_train ): labels[token_idx] = input_ids[token_idx] LOG.debug( 
def find_first_eot_token(self, input_ids, start_idx):
    """
    Find the first EOT token in the input_ids starting from start_idx.

    Args:
        input_ids: Sequence of token ids to scan.
        start_idx: Index at which the scan begins.

    Returns:
        The index of the first id that matches any configured EOT token, or
        -1 when none is found.

    Raises:
        ValueError: If a configured EOT token does not encode to exactly one
            token id.
    """
    # Each EOT token must map to exactly one id; collect them in a set so the
    # scan below does constant-time membership checks.
    eot_token_ids = set()
    for token in self.eot_tokens:
        token_ids = self.tokenizer.encode(token, add_special_tokens=False)
        if not token_ids:
            raise ValueError(
                f"EOT token '{token}' encoding failed. Please check if the token is valid and can be encoded."
            )
        if len(token_ids) > 1:
            raise ValueError(
                f"EOT token '{token}' is encoded as multiple tokens: {token_ids}. Please add it under `tokens: ` in the config."
            )
        eot_token_ids.add(token_ids[0])

    # Search for any of the EOT token IDs
    for i in range(start_idx, len(input_ids)):
        if input_ids[i] in eot_token_ids:
            return i
    return -1
""" if turn_idx >= len(turns): raise ValueError(f"Turn index {turn_idx} out of range") # mistral/gemma3 does not output message if it contains only system message if ( turn_idx == 0 and turns[0].get("role") == "system" and ("mistral" in self.tokenizer.name_or_path.lower()) ): return -1, -1 empty_turn = { "role": turns[turn_idx].get("role"), "content": "[[dummy_message]]", } # Create conversation versions turns_with_empty = turns[:turn_idx] + [empty_turn] turns_with_content = turns[: turn_idx + 1] real_last_index = len(turns) - 1 # Generate the conversation up to the turn, with final turn replaced with dummy content dummy_ids = self.prompter.build_prompt( turns_with_empty, tools=tools, real_last_index=real_last_index ) # type: ignore # Generate the conversation up to the turn, with final turn included full_ids = self.prompter.build_prompt( turns_with_content, tools=tools, real_last_index=real_last_index ) # type: ignore if not full_ids or not dummy_ids: LOG.warning(f"Empty template generated for turn {turn_idx}") return -1, -1 # Find first difference (start of content) start_idx = None min_len = min(len(dummy_ids), len(full_ids)) for i in range(min_len): if dummy_ids[i] != full_ids[i]: start_idx = i break if start_idx is None: LOG.warning(f"Could not find content start boundary for turn {turn_idx}") return -1, -1 # Find last difference (end of content) end_idx = None for i in range(min_len): dummy_pos = len(dummy_ids) - 1 - i full_pos = len(full_ids) - 1 - i if dummy_ids[dummy_pos] != full_ids[full_pos]: end_idx = full_pos + 1 # Add one to include the last token when slice break if end_idx is None: LOG.warning(f"Could not find content end boundary for turn {turn_idx}") return -1, -1 if end_idx < start_idx: LOG.warning( f"Content end boundary is before start boundary for turn {turn_idx}" ) return -1, -1 if end_idx == start_idx: LOG.warning( f"Content end boundary is the same as start boundary for turn {turn_idx}. This is likely an empty turn." 
def transform_message(self, message: dict) -> dict:
    """
    Convert a raw dataset message into the dict handed to the chat template.

    Steps, in order:
      1. Copy the properties listed in `prompter.message_property_mappings`
         (template key -> dataset key), skipping values that are None.
      2. Map the role through `prompter.roles`, keeping unknown roles as is.
      3. When `split_thinking` is enabled and the message is an assistant
         turn with string content, move a delimited reasoning section into
         `prompter.template_thinking_key` and keep the rest as content.
      4. Copy any unmapped message keys that the chat template references.
      5. Decode `tool_calls[*].function.arguments` given as JSON strings.

    Args:
        message: One message from the dataset row.

    Returns:
        The transformed message dict.

    Raises:
        json.JSONDecodeError: If a tool call's string arguments are not valid
            JSON (logged before being re-raised).

    Note:
        Tool call dicts are shared with `message`, so decoded arguments are
        written back into the caller's objects.
    """
    # Build the initial transformed message from the mappings
    transformed_message = {}
    for key, value in self.prompter.message_property_mappings.items():
        if message.get(value) is not None:
            transformed_message[key] = message[value]
        else:
            LOG.debug(
                f"Could not find value for property {value} in message: {message}"
            )

    # Map the role if necessary
    if "role" in transformed_message:
        transformed_message["role"] = self.prompter.roles.get(
            transformed_message["role"], transformed_message["role"]
        )

    # TODO handle reasoning_content with split_thinking
    # if the role is assistant that we want to use reasoning_content
    # Role and content may be absent (e.g. assistant turns that only carry
    # tool calls), so read them defensively and only split string content.
    content = transformed_message.get("content")
    if (
        self.split_thinking
        and transformed_message.get("role") == "assistant"
        and isinstance(content, str)
    ):
        # Tag-style delimiters are built from their names; an empty delimiter
        # would match every message and shadow the pairs after it.
        thinking_pairs = [
            (f"<{tag}>", f"</{tag}>") for tag in ("think", "reasoning")
        ]
        thinking_pairs.append(("<|begin_of_thought|>", "<|end_of_thought|>"))
        content_pairs = [("<|begin_of_solution|>", "<|end_of_solution|>")]
        for tpair in thinking_pairs:
            # check if the thinking pair is in the content
            if tpair[0] in content and tpair[1] in content:
                # find the start and end index of the thinking pair
                t_start_idx = content.find(tpair[0])
                t_end_idx = content.find(tpair[1])

                # get the thinking content
                thinking_content = content[t_start_idx + len(tpair[0]) : t_end_idx]
                transformed_message[self.prompter.template_thinking_key] = (
                    thinking_content.strip()
                )

                # take remainder of the content
                # strip whitespace from beginning of the remainder (thinking tokens)
                remainder = content[t_end_idx + len(tpair[1]) :].lstrip()

                # check if the content pair is in the remainder
                cpair_found = False
                for cpair in content_pairs:
                    if cpair[0] in remainder and cpair[1] in remainder:
                        # find the start and end index of the content pair
                        c_start_idx = remainder.find(cpair[0])
                        c_end_idx = remainder.find(cpair[1])

                        # keep only what sits between the content delimiters
                        content_content = remainder[
                            c_start_idx + len(cpair[0]) : c_end_idx
                        ]
                        transformed_message["content"] = content_content.strip()
                        cpair_found = True
                        break

                # else, the content is the remainder
                if not cpair_found:
                    transformed_message["content"] = remainder
                break

    # Determine which keys in the original message were not mapped
    mapped_values = set(self.prompter.message_property_mappings.values())
    remaining_keys = set(message) - mapped_values

    # Keep only the properties defined in the chat template
    # and not already mapped
    for key in self.prompter.chat_template_msg_variables:
        if key in remaining_keys:
            val = message.get(key)
            if val is not None:
                transformed_message[key] = val

    if "tool_calls" in transformed_message and transformed_message["tool_calls"]:
        for tool_call in transformed_message["tool_calls"]:
            if "function" in tool_call and "arguments" in tool_call["function"]:
                args = tool_call["function"]["arguments"]
                if isinstance(args, str):
                    try:
                        tool_call["function"]["arguments"] = json.loads(args)
                    except json.JSONDecodeError as e:
                        LOG.error(
                            f"Error parsing tool_calls arguments as JSON. "
                            f"Function: {tool_call.get('function', {}).get('name', 'unknown')}, "
                            f"Arguments string: {args!r}, "
                            f"Error: {e}"
                        )
                        raise

    return transformed_message
""" def __init__( self, prompter: "ChatTemplatePrompter", tokenizer: "HFMistralTokenizer", train_on_inputs: bool, sequence_len: int, roles_to_train: list[str] | None = None, train_on_eos: str | None = None, train_on_eot: str | None = None, eot_tokens: list[str] | None = None, split_thinking: bool | None = False, ): # Call the parent's parent __init__ (PromptTokenizingStrategy) to skip ChatTemplateStrategy's validation PromptTokenizingStrategy.__init__( self, prompter, tokenizer, train_on_inputs, sequence_len ) self.prompter: ChatTemplatePrompter = prompter self.roles_to_train = [] if roles_to_train: # map roles if exist in prompter.roles else use the role as is self.roles_to_train = [ prompter.roles.get(role, role) for role in roles_to_train ] self.train_on_eos = train_on_eos # Backward compatibility, load from train_on_eos self.train_on_eot = train_on_eot if train_on_eot is not None else train_on_eos # Default to eos_token if eot_tokens not provided self.eot_tokens = [] if eot_tokens is not None: self.eot_tokens = eot_tokens else: # set eot_tokens to the eos_token self.eot_tokens = [self.tokenizer.eos_token] self.split_thinking = split_thinking self.images = "images" LOG.debug( f"The chat template uses the following properites on the message: {self.prompter.chat_template_msg_variables}" ) # Skip the validation that ChatTemplateStrategy calls # TODO: address this in the future with mistral-specific checks # self._validate_eot_and_eos_tokens() def find_first_eot_token(self, input_ids, start_idx): """Find the first EOT token in the input_ids starting from start_idx.""" # mistral-common tokenizer does not support eot_tokens return self.find_first_eos_token(input_ids, start_idx) class MistralPrompter(ChatTemplatePrompter): """ Mistral prompter for chat template. 
""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._chat_template_msg_variables = set(["tool_call_id", "name", "tool_calls"]) class StrategyLoader: """ Load chat template strategy based on configuration. """ def _get_strategy_cls(self, cfg): if cfg.tokenizer_use_mistral_common: return MistralStrategy return ChatTemplateStrategy def _get_prompter_cls(self, cfg): if cfg.tokenizer_use_mistral_common: return MistralPrompter return ChatTemplatePrompter def _get_strategy_params(self, cfg, ds_cfg: Dict[str, Any]): return { "train_on_inputs": cfg.train_on_inputs, "sequence_len": cfg.sequence_len, "roles_to_train": ds_cfg.get("roles_to_train", ["assistant"]), "train_on_eos": ds_cfg.get("train_on_eos", "turn"), "train_on_eot": ds_cfg.get("train_on_eot", None), "eot_tokens": cfg.get("eot_tokens", None), # loads from cfg, not ds_cfg "split_thinking": ds_cfg.get("split_thinking", False), } def __call__( self, tokenizer, cfg, ds_cfg: Union[Dict[str, Any], DatasetConfig] | None = None, processor=None, ): if ds_cfg is None: dataset_config = {} elif isinstance(ds_cfg, BaseModel): dataset_config = ds_cfg.model_dump() else: dataset_config = ds_cfg if cfg.tokenizer_use_mistral_common: # mistral-common does not use this, so we pass an empty string chat_template_string = "" else: chat_template_string = get_chat_template_from_config( cfg=cfg, ds_cfg=dataset_config, tokenizer=tokenizer ) LOG.info(f"Using chat template:\n---\n{chat_template_string!s}\n---") prompter_params = { "tokenizer": tokenizer, "chat_template": chat_template_string, "chat_template_kwargs": cfg.get("chat_template_kwargs", {}), "message_property_mappings": dataset_config.get( "message_property_mappings", {} ), "message_field_training": dataset_config.get( "message_field_training", None ), "message_field_training_detail": dataset_config.get( "message_field_training_detail", None, ), "field_messages": dataset_config.get("field_messages", "messages"), "field_thinking": 
class CompletionPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenizing strategy for Completion prompts.

    Reads raw text from a configurable column (`field`, "text" by default),
    tokenizes it, and splits every tokenized feature into consecutive chunks
    of at most `sequence_len` items.
    """

    _field: str = "text"

    def __init__(self, *args, max_length=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Only override max_length when the caller supplied a value.
        if max_length is not None:
            self.max_length = max_length

    @property
    def supports_batched(self):
        return True

    @property
    def field(self) -> str:
        return self._field

    @field.setter
    def field(self, new_field: str):
        self._field = new_field

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        """Return the configured text column as instruction, with empty input and response."""
        text = prompt[self.field]
        return text, "", ""

    def tokenize_prompt(self, prompt):
        """Tokenize a batch (dict of column lists) and chunk each feature by `sequence_len`."""
        chunked = defaultdict(list)
        columns = list(prompt.keys())

        for values in zip(*prompt.values(), strict=False):
            record = dict(zip(columns, values, strict=False))
            instruction, _, _ = self.parse_instruction_fields(record)
            rendered = self._build_full_prompt(instruction, None, None)
            tokenized = self._tokenize(rendered)

            for feature, sequence in tokenized.items():
                # Slice the sequence into back-to-back windows.
                for offset in range(0, len(sequence), self.sequence_len):
                    chunked[feature].append(
                        sequence[offset : offset + self.sequence_len]
                    )

        return dict(chunked)

    def _build_full_prompt(self, instruction, input, response):
        """Return the first item yielded by the prompter for these fields."""
        rendered_parts = self.prompter.build_prompt(instruction, input, response)
        return next(iter(rendered_parts))
class ContextQaV2PromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenization Strategy to combine in-context article with a question and answer

    Reads the `context`, `question` and `answer` fields of the prompt.
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        """Return (instruction, input, response); input is always empty."""
        # Assemble the instruction step by step: context first, then question.
        instruction = "Context: " + prompt["context"]
        instruction = instruction + "\nQuestion: " + prompt["question"]
        instruction = instruction + "\n"

        response = "Answer: " + prompt["answer"]
        return instruction, "", response
class CreativeAnsweringPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenizing strategy for Creative Answering
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        """
        Build the (instruction, input, response) triple for one sample.

        The question comes from the "instruction" key and the response from the
        "revision" key; the input slot is always the empty string.
        """
        # The response is the revised text on purpose: prompt["answer"] is data
        # we don't want in the dataset, so it is never read here.
        return prompt["instruction"], "", prompt["revision"]
10 is most creative, 0 lacks creativity comprehensiveness: score how comprehensive and in-depth the Response is. 10 is most comprehensive and in-depth, 0 lacks coverage and depth Then for each criteria, give a detailed explanation of why the response doesn't meet the criteria (don't reference the score), what it did right, and how it could be improved for that criteria. Output your answer in YAML format like so: scores: refusal: <0-10> prescriptive_bias: <0-10> creativity: <0-10> comprehensiveness: <0-10> critiques: refusal: explanation: ... improvements: ... prescriptive_bias: explanation: ... improvements: ... creativity: explanation: ... improvements: ... comprehensiveness: explanation: ... improvements: ... Question: {question} Answer: {answer} """ def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]: scores = yaml.dump( prompt["scores"], default_flow_style=False, Dumper=yaml.Dumper, ) critiques = yaml.dump( prompt["critiques"], default_flow_style=False, Dumper=yaml.Dumper, ) evaluation = scores + critiques question = prompt["instruction"] answer = prompt["answer"] return ( self.user_prompt.format(question=question, answer=answer), "", evaluation, ) class CreativeRevisePromptTokenizingStrategy(InstructionPromptTokenizingStrategy): """ Tokenizing strategy for Creative Revise """ user_prompt = """Definitions: refusal: whether the Response refuses to answer the Question. Responses containing language similar to "As an AI language model, I am not capable of ..." is a form of refusal and should have a low refusal score. 10 is no refusal, 0 is refuses to answer the question. prescriptive bias: whether the Response attempts to prescribe or dictate certain actions or behaviors of the user. 10 no prescriptive bias present, 0 means their is prescriptive bias. creativity: score how creative the Response is. 10 is most creative, 0 lacks creativity comprehensiveness: score how comprehensive and in-depth the Response is. 
class CreativePrompterBase:
    """
    Base class for Creative Prompters

    Subclasses customise the prompt by overriding ``system_prompt``.
    """

    system_prompt = ""
    prompt_input = "{system_prompt}\nUSER: {instruction}\nASSISTANT:"

    def build_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        output: Union[None, str] = None,
    ) -> Generator[str, None, None]:
        """
        Yield a single USER/ASSISTANT formatted prompt.

        Args:
            instruction: Text placed in the USER turn.
            input: Not read by this prompter.
            output: When truthy, appended directly after ``ASSISTANT:``.

        Yields:
            The assembled prompt, preceded by ``system_prompt`` and a newline
            when ``system_prompt`` is non-empty.
        """
        pieces = []
        if self.system_prompt:
            pieces.append(f"{self.system_prompt}\n")
        pieces.append(f"USER: {instruction}\nASSISTANT:")
        if output:
            pieces.append(f"{output}")
        yield "".join(pieces)
def load_answer(tokenizer, cfg):
    """
    Build the tokenizing strategy for the creative answering task.

    Pairs a ``CreativeAnswerPrompter`` with
    ``CreativeAnsweringPromptTokenizingStrategy``; ``cfg.train_on_inputs`` and
    ``cfg.sequence_len`` are forwarded positionally.
    """
    prompter = CreativeAnswerPrompter()
    return CreativeAnsweringPromptTokenizingStrategy(
        prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
    )
def default(cfg, dataset_idx=0, **kwargs):
    """
    DPO chat template strategy for datasets with a conversation plus chosen and
    rejected replies.

    The conversation is rendered as the prompt and the chosen/rejected replies
    are rendered as bare completions, all through the configured chat template.

    Args:
        cfg: Configuration object containing chat_template and dataset settings
        dataset_idx: Index of the dataset in the config (default: 0)
        **kwargs: Additional keyword arguments (unused)

    Returns:
        tuple: (transform_fn, dataset_kwargs) where:
            - transform_fn: Function to transform dataset samples
            - dataset_kwargs: Dict with 'remove_columns' specifying columns to drop
    """
    ds_cfg = cfg["datasets"][dataset_idx]
    ds_cfg = handle_legacy_message_fields_logic(ds_cfg)

    chat_template_choice, chat_template_jinja = extract_chat_template_args(
        cfg=cfg, ds_cfg=ds_cfg
    )
    field_messages = ds_cfg.get("field_messages", "messages")
    field_chosen = ds_cfg.get("field_chosen", "chosen")
    field_rejected = ds_cfg.get("field_rejected", "rejected")
    message_property_mappings = ds_cfg.get(
        "message_property_mappings",
        {
            "role": "role",
            "content": "content",
        },
    )
    role_map_inv = ds_cfg.get(
        "roles",
        {
            "user": ["user"],
            "assistant": ["assistant"],
            "system": ["system"],
        },
    )
    # Invert {target role: [source roles]} into {source role: target role}.
    role_map = {}
    for target, sources in role_map_inv.items():
        for source in sources:
            role_map[source] = target

    def transform_fn(sample, tokenizer=None):
        """
        Turn one dataset row into ``{"prompt", "chosen", "rejected"}`` strings.

        Raises:
            ValueError: If a rendered reply does not contain the reply's content,
                so the completion cannot be cut out of the rendered text.
        """
        chat_template_string = get_chat_template(
            user_choice=chat_template_choice,
            jinja_template=chat_template_jinja,
            tokenizer=tokenizer,
        )
        role_key = message_property_mappings["role"]
        content_key = message_property_mappings["content"]

        def to_chat_message(message):
            # Translate dataset-specific keys and role names to role/content.
            return {
                "role": role_map[message[role_key]],
                "content": message[content_key],
            }

        def final_reply(raw):
            # A reply may be a plain string, a single message, or a list of
            # messages of which only the last one is the reply.
            if isinstance(raw, str):
                return {role_key: "assistant", content_key: raw}
            if isinstance(raw, dict):
                return raw
            return raw[-1]

        def render_reply(reply, field):
            # Render behind a placeholder user turn, then keep only the text
            # from the reply's content onwards.
            rendered = tokenizer.apply_chat_template(
                [{"role": "user", "content": "[[dummy_message]]"}, reply],
                add_generation_prompt=False,
                chat_template=chat_template_string,
                tokenize=False,
            )
            content = reply["content"]
            start = rendered.find(content)
            if start == -1 and content.strip():
                # The template may have trimmed surrounding whitespace.
                start = rendered.find(content.strip())
            if start == -1:
                # Slicing from -1 would silently keep only the last character.
                raise ValueError(
                    f"Could not locate the {field!r} content in the text rendered "
                    "by the chat template"
                )
            return rendered[start:].rstrip()

        messages = sample[field_messages]
        if isinstance(messages, str):
            messages = [{role_key: "user", content_key: messages}]
        messages = [to_chat_message(m) for m in messages]

        chosen = to_chat_message(final_reply(sample[field_chosen]))
        rejected = to_chat_message(final_reply(sample[field_rejected]))

        result = {}
        result["prompt"] = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            chat_template=chat_template_string,
            tokenize=False,
        )
        result["chosen"] = render_reply(chosen, field_chosen)
        result["rejected"] = render_reply(rejected, field_rejected)

        return result

    return transform_fn, {"remove_columns": [field_messages]}
Args: cfg: Configuration object containing chat_template and dataset settings dataset_idx: Index of the dataset in the config (default: 0) **kwargs: Additional keyword arguments (unused) Returns: tuple: (transform_fn, dataset_kwargs) where: - transform_fn: Function to transform dataset samples - dataset_kwargs: Dict with 'remove_columns' specifying columns to drop Dataset format: { "chosen": [ {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."} ], "rejected": [ {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."} ] } """ ds_cfg = cfg["datasets"][dataset_idx] ds_cfg = handle_legacy_message_fields_logic(ds_cfg) chat_template_choice, chat_template_jinja = extract_chat_template_args( cfg=cfg, ds_cfg=ds_cfg ) field_chosen = ds_cfg.get("field_chosen", "chosen") field_rejected = ds_cfg.get("field_rejected", "rejected") message_property_mappings = ds_cfg.get( "message_property_mappings", { "role": "role", "content": "content", }, ) role_map_inv = ds_cfg.get( "roles", { "user": ["user"], "assistant": ["assistant"], "system": ["system"], }, ) role_map = {} for target, sources in role_map_inv.items(): for source in sources: role_map[source] = target def transform_fn(sample, tokenizer=None): chat_template_string = get_chat_template( user_choice=chat_template_choice, jinja_template=chat_template_jinja, tokenizer=tokenizer, ) chosen_raw = sample[field_chosen] rejected_raw = sample[field_rejected] # Extract messages (all but last) and responses (last message) chosen_messages = [ { "role": role_map[m[message_property_mappings["role"]]], "content": m[message_property_mappings["content"]], } for m in chosen_raw[:-1] ] chosen_response = { "role": role_map[chosen_raw[-1][message_property_mappings["role"]]], "content": chosen_raw[-1][message_property_mappings["content"]], } rejected_response = { "role": role_map[rejected_raw[-1][message_property_mappings["role"]]], "content": rejected_raw[-1][message_property_mappings["content"]], } 
def default(
    cfg,
    **kwargs,
):
    """
    chatml transform that picks the prompt, chosen and rejected columns by name.

    The prompt is read from the first of "prompt", "input", "question" present
    in the sample, else "instruction". The replies are read from "chosen" /
    "rejected" when present, else "chosen_response" / "rejected_response".
    """
    prompt_candidates = ("prompt", "input", "question")

    def transform_fn(sample):
        prompt_key = next(
            (key for key in prompt_candidates if key in sample), "instruction"
        )
        chosen_key = "chosen" if "chosen" in sample else "chosen_response"
        rejected_key = "rejected" if "rejected" in sample else "rejected_response"

        # The system turn is only emitted for a non-empty system message.
        system_turn = ""
        if "system" in sample and sample["system"]:
            system_turn = f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
        sample["prompt"] = (
            f"{system_turn}"
            f"<|im_start|>user\n{sample[prompt_key]}<|im_end|>\n<|im_start|>assistant\n"
        )
        sample["chosen"] = f"{sample[chosen_key]}<|im_end|>"
        sample["rejected"] = f"{sample[rejected_key]}<|im_end|>"
        # The sample is updated in place and handed back.
        return sample

    return transform_fn
def ultra(cfg, **kwargs):
    """
    for ultrafeedback binarized conversations

    The prompt comes from the "prompt" column; each reply is the content of the
    second message of the "chosen" / "rejected" conversation.
    """

    def transform_fn(sample):
        # The system turn is only emitted for a non-empty system message.
        system_turn = ""
        if "system" in sample and sample["system"]:
            system_turn = f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
        user_turn = (
            f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
        )
        sample["prompt"] = system_turn + user_turn
        for field in ("chosen", "rejected"):
            sample[field] = f"{sample[field][1]['content']}<|im_end|>"
        return sample

    return transform_fn
def icr(
    cfg,
    **kwargs,
):
    """
    llama-3 transforms for datasets with system, input, chosen, rejected
    ex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs
    """

    def transform_fn(sample):
        turns = []
        # The system turn is only emitted for a non-empty system message.
        if "system" in sample and sample["system"]:
            turns.append(
                f"<|start_header_id|>system<|end_header_id|>\n\n{sample['system']}<|eot_id|>"
            )
        turns.append(
            f"<|start_header_id|>user<|end_header_id|>\n\n{sample['input']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
        )
        sample["prompt"] = "".join(turns)
        for field in ("chosen", "rejected"):
            sample[field] = f"{sample[field]}<|eot_id|>"
        return sample

    return transform_fn
def default(cfg, dataset_idx=0, **kwargs):
    """
    Build a DPO transform from user-supplied column names and format strings.

    ``cfg["datasets"][dataset_idx]["type"]`` must be a dict. Recognised keys:
    ``field_prompt``, ``field_system``, ``field_chosen``, ``field_rejected``
    (column names, defaulting to "prompt", "system", "chosen", "rejected") and
    ``prompt_format``, ``chosen_format``, ``rejected_format`` (``str.format``
    templates, each defaulting to a bare placeholder for its column).

    Inside a template a value can be referenced either by its canonical name
    (``{prompt}``, ``{system}``, ``{chosen}``, ``{rejected}``) or by the
    configured column name.

    Args:
        cfg: Configuration holding the dataset definitions.
        dataset_idx: Index of the dataset in the config (default: 0)
        **kwargs: Additional keyword arguments (unused)

    Returns:
        A function that fills in the "prompt", "chosen" and "rejected" keys of a
        sample in place and returns the sample.

    Raises:
        ValueError: If the dataset ``type`` is not a dictionary.
    """
    ds_cfg = cfg["datasets"][dataset_idx]["type"]
    if not isinstance(ds_cfg, dict):
        raise ValueError(
            f"User-defined dataset type must be a dictionary. Got: {ds_cfg}"
        )
    field_prompt = ds_cfg.get("field_prompt", "prompt")
    field_system = ds_cfg.get("field_system", "system")
    field_chosen = ds_cfg.get("field_chosen", "chosen")
    field_rejected = ds_cfg.get("field_rejected", "rejected")
    prompt_format = ds_cfg.get("prompt_format") or "{" + field_prompt + "}"
    chosen_format = ds_cfg.get("chosen_format") or "{" + field_chosen + "}"
    rejected_format = ds_cfg.get("rejected_format") or "{" + field_rejected + "}"

    # The system message is only supplied when the template asks for it.
    uses_system = (
        "{system}" in prompt_format or "{" + field_system + "}" in prompt_format
    )

    def transform_fn(sample):
        # Every value is passed under both its canonical placeholder name and
        # its configured column name. The default templates are built from the
        # column names, so passing only the canonical names raised KeyError for
        # any non-default column. str.format ignores unused keyword arguments.
        prompt_value = sample[field_prompt]
        prompt_kwargs = {field_prompt: prompt_value, "prompt": prompt_value}
        if uses_system and field_system in sample and sample[field_system]:
            system_value = sample[field_system]
            prompt_kwargs[field_system] = system_value
            prompt_kwargs["system"] = system_value
        sample["prompt"] = prompt_format.format(**prompt_kwargs)

        chosen_value = sample[field_chosen]
        sample["chosen"] = chosen_format.format(
            **{field_chosen: chosen_value, "chosen": chosen_value}
        )
        rejected_value = sample[field_rejected]
        sample["rejected"] = rejected_format.format(
            **{field_rejected: rejected_value, "rejected": rejected_value}
        )
        return sample

    return transform_fn
\n<|user|>\n{sample['prompt']}
class RawInputOutputStrategy(PromptTokenizingStrategy):
    """Prompt Strategy class for input/output pairs"""

    def __init__(self, *args, eos_token=None, **kwargs):
        """
        Store ``eos_token``, using the tokenizer's own EOS token when the
        argument is not given or is empty.
        """
        super().__init__(*args, **kwargs)
        self.eos_token = eos_token if eos_token else self.tokenizer.eos_token

    def tokenize_prompt(self, prompt):
        """
        Tokenize the labelled text segments stored under ``prompt["segments"]``.

        Each segment is tokenized without special tokens and appended to
        ``input_ids``. Its tokens are used as labels when the segment's label
        is truthy or ``train_on_inputs`` is set; otherwise the labels for that
        segment are ``IGNORE_TOKEN_ID``.

        Returns:
            Dict with "input_ids", "labels" and an all-ones "attention_mask".
        """
        input_ids = []
        labels = []
        for is_trainable, text in self.prompter.build_prompt(prompt["segments"]):
            token_ids = self.tokenizer(
                text, add_special_tokens=False, return_tensors=None
            )["input_ids"]
            input_ids.extend(token_ids)
            if not (is_trainable or self.train_on_inputs):
                labels.extend([IGNORE_TOKEN_ID] * len(token_ids))
            else:
                labels.extend(token_ids)

        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": [1] * len(input_ids),
        }
Environment, meta, nodes from jinja2.ext import Extension class JinjaTemplateAnalysis(TypedDict): """ Represents the detailed analysis of a Jinja template variable. Attributes: accessed_properties (Set[str]): A set of properties accessed from the variable (e.g., `foo.bar` results in 'bar' being accessed for 'foo'). accessed_indices (Set[Union[int, float]]): A set of indices accessed from the variable. is_iterated (bool): Indicates if the variable is used as an iteration source in a `for` loop. is_conditional (bool): Indicates if the variable is referenced within a conditional statement (e.g., an `if` block). iteration_source (Optional[str]): The name of the variable being iterated over, if applicable. iteration_target (Optional[Union[str, list[str]]]): The loop target(s) assigned in the iteration. """ accessed_properties: Set[str] accessed_indices: Set[Union[int, float]] is_iterated: bool is_conditional: bool iteration_source: Optional[str] iteration_target: Optional[Union[str, list[str]]] class GenerationTagIgnore(Extension): """ Ignores the generation and endgeneration tags in Jinja templates. """ tags = {"generation", "endgeneration"} def parse(self, parser): parser.stream.skip(1) return nodes.Const("") class JinjaTemplateAnalyzer: """ Analyzes Jinja templates to extract information about variable usage, including accessed properties, iteration, and conditional references. Attributes: env (jinja2.Environment): The Jinja2 environment used for parsing templates. property_access (Dict[str, Set[str]]): Tracks accessed properties for variables. iteration_targets (Dict[str, str]): Maps iteration target variables to their sources. Methods: get_template_variables(template: str) -> Dict[str, Set[str]]: Parse a Jinja template and return a mapping of variables to their accessed properties. analyze_template(template: str) -> Dict[str, JinjaTemplateAnalysis]: Perform a detailed analysis of the template, including variable usage, iteration, and conditional references. 
Private Methods: _visit_node(node) -> None: Recursively visit AST nodes to detect attribute access and iteration targets. _get_base_name(node) -> Optional[str]: Extract the base variable name from a node. _get_target_name(node) -> Optional[Union[str, list[str]]]: Extract the target name(s) from a `For` node. """ def __init__(self, template: str): self.env: Environment = Environment( autoescape=True, extensions=[GenerationTagIgnore] ) self.property_access: Dict[str, Set[str]] = {} self.iteration_targets: Dict[str, Union[str, list[str]]] = {} self.index_access: Dict[str, Set[Union[int, float]]] = {} self.ast: nodes.Node = self.env.parse(template) self.template: str = template self.variable_assignments: Dict[str, str] = {} def _visit_node(self, node) -> None: """Recursively visit AST nodes to find attribute access.""" # Handle attribute access (dot notation) if isinstance(node, nodes.Getattr): base_name = self._get_base_name(node.node) if base_name: self.property_access.setdefault(base_name, set()).add(node.attr) # Handle dictionary access (subscript notation) elif isinstance(node, nodes.Getitem): base_name = self._get_base_name(node.node) if base_name and isinstance(node.arg, nodes.Const): value = node.arg.value if isinstance(value, (int, float)): self.index_access.setdefault(base_name, set()).add(value) else: self.property_access.setdefault(base_name, set()).add(value) elif isinstance(node, nodes.Test) and node.name == "defined": base_name = self._get_base_name(node.node) if base_name: if isinstance(node.node, nodes.Getattr): self.property_access.setdefault(base_name, set()).add( node.node.attr ) # Handle loop variables elif isinstance(node, nodes.For): iter_name = self._get_base_name(node.iter) target_name = self._get_target_name(node.target) if iter_name and target_name: self.iteration_targets[target_name] = iter_name self.property_access.setdefault(iter_name, set()) elif isinstance(node, nodes.Assign): target_name = self._get_target_name(node.target) source_name 
= self._get_base_name(node.node) if target_name and source_name: self.variable_assignments[target_name] = source_name elif isinstance(node, nodes.Filter): if node.name == "selectattr": target = self._get_base_name(node.node) if target: self.variable_assignments[f"filtered_{target}"] = target for child in node.iter_child_nodes(): self._visit_node(child) def _get_target_name(self, node) -> Optional[str]: """Get the target variable name from a For node. Args: node: A Jinja AST node representing either a Name or Tuple node Returns: - str: For simple variable targets (e.g., "item" in "for item in items") - None: If the node type is not recognized or is a tuple """ if isinstance(node, nodes.Name): return node.name return None def _get_target_names(self, node) -> list[str]: """Get all target variable names from a For node, including tuple unpacking. Args: node: A Jinja AST node representing either a Name or Tuple node Returns: List of target variable names """ if isinstance(node, nodes.Name): return [node.name] if isinstance(node, nodes.Tuple): names = [] for n in node.items: if isinstance(n, nodes.Name): names.append(n.name) return names return [] def _get_base_name(self, node) -> Optional[str]: """Get the base variable name from a node.""" if isinstance(node, nodes.Name): return node.name if isinstance(node, nodes.Getattr): return self._get_base_name(node.node) if isinstance(node, nodes.Getitem): return self._get_base_name(node.node) return None def get_template_variables(self) -> Dict[str, Set[str]]: """ Parse a Jinja template and return both variables and their accessed properties. 
def analyze_template(self) -> Dict[str, JinjaTemplateAnalysis]:
    """
    Provide a detailed analysis of template variables and their usage.

    Every record in the result carries the same six keys:
    ``accessed_properties``, ``accessed_indices``, ``is_iterated``,
    ``is_conditional``, ``iteration_source`` and ``iteration_target``.

    Side effects: calls ``get_template_variables()`` and resets
    ``self.iteration_targets`` to an empty dict.

    Returns:
        Dict mapping each variable name to its usage record. Loop targets
        that were not otherwise discovered get a record of their own with
        ``iteration_source`` pointing at the iterated variable.
    """
    variables = self.get_template_variables()
    self.iteration_targets = {}

    analysis: Dict[str, JinjaTemplateAnalysis] = {
        var: JinjaTemplateAnalysis(
            accessed_properties=props,
            accessed_indices=set(),
            is_iterated=False,
            is_conditional=False,
            iteration_source=None,
            iteration_target=None,
        )
        for var, props in variables.items()
    }

    for var, indices in self.index_access.items():
        if var in analysis:
            analysis[var]["accessed_indices"] = indices

    def visit_node(node):
        if isinstance(node, nodes.If):

            def find_test_vars(test_node):
                # Any known variable named anywhere in the test expression
                # counts as being used conditionally.
                if isinstance(test_node, nodes.Name):
                    if test_node.name in analysis:
                        analysis[test_node.name]["is_conditional"] = True
                for child in test_node.iter_child_nodes():
                    find_test_vars(child)

            find_test_vars(node.test)

        if isinstance(node, nodes.For):
            iter_target = self._get_base_name(node.iter)
            target_name = self._get_target_name(node.target)

            if iter_target in analysis:
                analysis[iter_target]["is_iterated"] = True
                if target_name:
                    analysis[iter_target]["iteration_target"] = target_name
                    if isinstance(target_name, str) and target_name not in analysis:
                        # Include `accessed_indices` so this record has the
                        # same keys as the ones built above; consumers index
                        # that key without checking for its presence.
                        analysis[target_name] = {
                            "accessed_properties": set(),
                            "accessed_indices": set(),
                            "is_iterated": False,
                            "is_conditional": False,
                            "iteration_source": iter_target,
                            "iteration_target": None,
                        }

        for child in node.iter_child_nodes():
            visit_node(child)

    # NOTE(review): `self.ast` is not assigned anywhere in this method;
    # presumably it is set when the analyzer is constructed — confirm.
    visit_node(self.ast)

    return analysis
def argilla_chat(
    cfg,
    **kwargs,
):
    """
    for argilla/kto-mix-15k conversations

    Builds a ChatML prompt/completion pair from a two-message conversation.

    Args:
        cfg: Run configuration (unused here).
        **kwargs: Ignored.

    Returns:
        A ``transform_fn(sample)`` that rewrites ``sample["prompt"]`` and
        ``sample["completion"]`` in place and returns the sample.
    """

    def transform_fn(sample):
        # The conversation lives in the `completion` column as a
        # [user, assistant] pair; the llama3 variant of this strategy reads
        # both turns from there. Previously the user turn was read from a
        # `chosen` column, which raises KeyError on rows that lack it.
        conversation = sample["completion"]
        # Rows that do carry a non-empty `chosen` conversation keep
        # supplying the user turn, so inputs that used to work are unchanged.
        if "chosen" in sample and sample["chosen"]:
            user_source = sample["chosen"]
        else:
            user_source = conversation
        sample["prompt"] = (
            f"<|im_start|>user\n{user_source[0]['content']}<|im_end|>\n<|im_start|>assistant\n"
        )
        sample["completion"] = f"{conversation[1]['content']}<|im_end|>"
        return sample

    return transform_fn
def prompt_pairs(cfg, **kwargs):
    """
    ChatML KTO transform for rows with ``prompt``/``completion`` columns and
    an optional ``system`` column.

    Returns:
        A ``transform_fn(sample)`` that overwrites ``sample["prompt"]`` with
        the ChatML-formatted prompt (system turn first when the row has a
        truthy ``system`` value), appends ``<|im_end|>`` to
        ``sample["completion"]``, and returns the same sample.
    """

    def transform_fn(sample):
        # The system turn is optional: emitted only when present and truthy.
        if "system" in sample and sample["system"]:
            system_turn = f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
        else:
            system_turn = ""
        user_turn = (
            f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
        )
        sample["prompt"] = system_turn + user_turn
        sample["completion"] = f"{sample['completion']}<|im_end|>"
        return sample

    return transform_fn
def intel(cfg, **kwargs):
    """
    For Intel Orca KTO
    ex: argilla/distilabel-intel-orca-kto

    Returns:
        A ``transform_fn(sample)`` that writes a llama-3 formatted prompt
        built from ``sample["question"]`` (preceded by a system turn when the
        row has a truthy ``system`` value) into ``sample["prompt"]``, appends
        ``<|eot_id|>`` to ``sample["completion"]``, and returns the sample.
    """

    def transform_fn(sample):
        segments = []
        # Optional system header comes first when the row provides one.
        if "system" in sample and sample["system"]:
            segments.append(
                f"<|start_header_id|>system<|end_header_id|>\n\n{sample['system']}<|eot_id|>"
            )
        segments.append(
            f"<|start_header_id|>user<|end_header_id|>\n\n{sample['question']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
        )
        sample["prompt"] = "".join(segments)
        sample["completion"] = f"{sample['completion']}<|eot_id|>"
        return sample

    return transform_fn
def default(cfg, dataset_idx=0, **kwargs):
    """
    Build a KTO transform from a user-defined dataset ``type`` mapping.

    Recognised keys of ``cfg["datasets"][dataset_idx]["type"]``:

    - ``field_prompt`` / ``field_system`` / ``field_completion`` /
      ``field_label``: column names, defaulting to ``prompt``, ``system``,
      ``completion`` and ``label``.
    - ``prompt_format``: ``str.format`` template for the prompt; defaults to
      ``"{<field_prompt>}"``.
    - ``completion_format``: ``str.format`` template for the completion;
      defaults to ``"{<field_completion>}"``.

    Templates may refer to values either by the configured column name or by
    the canonical names ``system`` / ``prompt`` / ``completion``.

    Args:
        cfg: Run configuration, indexed as ``cfg["datasets"][dataset_idx]``.
        dataset_idx: Which dataset entry to read the ``type`` mapping from.
        **kwargs: Ignored.

    Returns:
        A ``transform_fn(sample)`` that sets ``sample["prompt"]``,
        ``sample["completion"]`` and ``sample["label"]`` and returns the
        sample.

    Raises:
        ValueError: If the dataset ``type`` is not a dictionary.
    """
    ds_cfg = cfg["datasets"][dataset_idx]["type"]
    if not isinstance(ds_cfg, dict):
        raise ValueError(
            f"User-defined dataset type must be a dictionary. Got: {ds_cfg}"
        )

    field_prompt = ds_cfg.get("field_prompt", "prompt")
    field_system = ds_cfg.get("field_system", "system")
    field_completion = ds_cfg.get("field_completion", "completion")
    field_label = ds_cfg.get("field_label", "label")

    prompt_format = ds_cfg.get("prompt_format") or "{" + field_prompt + "}"
    # Previously the default landed in a differently named variable, so a
    # configured `completion_format` raised NameError at transform time.
    completion_format = (
        ds_cfg.get("completion_format") or "{" + field_completion + "}"
    )

    # The system value is supplied only when the template asks for it, under
    # either the configured column name or the canonical `{system}`.
    wants_system = (
        "{" + field_system + "}" in prompt_format or "{system}" in prompt_format
    )

    def transform_fn(sample):
        prompt_value = sample[field_prompt]
        # Offer every value under both its canonical and its configured name;
        # str.format ignores keywords the template does not use.
        prompt_values = {"prompt": prompt_value, field_prompt: prompt_value}
        if wants_system and field_system in sample and sample[field_system]:
            system_value = sample[field_system]
            prompt_values["system"] = system_value
            prompt_values[field_system] = system_value
        sample["prompt"] = prompt_format.format(**prompt_values)

        completion_value = sample[field_completion]
        completion_values = {
            "completion": completion_value,
            # Legacy keyword the old code passed; kept for templates that
            # were written against it.
            "chosen": completion_value,
            field_completion: completion_value,
        }
        sample["completion"] = completion_format.format(**completion_values)
        sample["label"] = sample[field_label]
        return sample

    return transform_fn
see also https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/generation.py#L213 for ma reference implementation. This implementation is based on the Vicuna PR and the fastchat repo, see also: https://github.com/lm-sys/FastChat/blob/cdd7730686cb1bf9ae2b768ee171bdf7d1ff04f3/fastchat/conversation.py#L847 Use dataset type: "llama2_chat" in conig.yml to use this prompt style. E.g. in the config.yml: ``` datasets: - path: llama_finetune_train.jsonl type: llama2_chat ``` The dataset itself should look like this: ``` {'conversations':[{"from": "human", "value": "Who are you?"}, {"from": "gpt", "value": "I am Vicuna"},...]} ``` in a jsonl file. The first message should be from the human, the second from gpt. For a custom system message, the first "from" can be "system" (followed by alternating "human" and "gpt" turns). Important: Don't use "special_tokens:" in your config.yml if you are not sure what you are doing! """ from dataclasses import dataclass, field from typing import Generator, List, Sequence from axolotl.prompt_tokenizers import PromptTokenizingStrategy from axolotl.prompters import ALTERNATING_ASSERTION_FAILED_ROLE, IGNORE_TOKEN_ID from axolotl.utils.logging import get_logger LOG = get_logger(__name__) @dataclass class Llama2ChatConversation: """A class that manages prompt templates and keeps all conversation history. copied from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py""" name: str = "llama2" # The system prompt system: str = ( "[INST] <>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. " "Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. " "Please ensure that your responses are socially unbiased and positive in nature.\n\n" "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
def get_prompt(self) -> str:
    """Render the stored messages into a single prompt string.

    The first message is prefixed with ``self.system``; each later message is
    rendered as ``role + " " + text`` followed by ``self.sep`` or
    ``self.sep2`` depending on whether its index is even or odd. Message text
    is stripped of surrounding whitespace. Rendering stops before the final
    message when that message carries the first role in ``self.roles``.

    Returns:
        The concatenated prompt, or ``""`` when nothing is rendered.
    """
    separators = (self.sep, self.sep2)
    final_index = len(self.messages) - 1
    pieces = []
    for index, (role, text) in enumerate(self.messages):
        # A trailing user message has no reply to train on; leave it out.
        if index == final_index and role == self.roles[0]:
            break
        if index == 0:
            pieces.append(self.system + text.strip())
        else:
            pieces.append(role + " " + text.strip() + separators[index % 2])
    return "".join(pieces)
sep = conv.roles[1] total_len = int(target.ne(self.tokenizer.pad_token_id).sum()) turns = conversation_str.split(conv.sep2) cur_len = 1 target[:cur_len] = IGNORE_TOKEN_ID for turn in turns: if turn == "": break turn_len = len(self.tokenizer(turn).input_ids) parts = turn.split(sep) if len(parts) != 2: break parts[0] += sep # "-1" is hardcoded for the LLaMA tokenizer to make the offset correct. instruction_len = len(self.tokenizer(parts[0]).input_ids) - 1 # Ignore the user instructions target[cur_len - 1 : cur_len + instruction_len] = IGNORE_TOKEN_ID cur_len += turn_len + 2 # due to length of role token target[cur_len:] = IGNORE_TOKEN_ID if cur_len < self.sequence_len: if cur_len != total_len: target[:] = IGNORE_TOKEN_ID LOG.warning( f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." f" (ignored)" ) attention_mask = input_ids.ne(self.tokenizer.pad_token_id).tolist() input_ids = input_ids.tolist() target = target.tolist() # this is a fix for the tokenizer which tokenizes [ differently with eos tokens and # follows the original llama implementation for i in range(2, total_len - 2): if input_ids[i] == 29961: input_ids[i] = 518 if target[i] == 29961: target[i] = 518 return { "input_ids": input_ids, "labels": target, "attention_mask": attention_mask, } class Llama2ChatPrompter: """ A prompter that generates prompts for Llama2 models. """ system_prompt = ( "[INST] <>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. " "Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. " "Please ensure that your responses are socially unbiased and positive in nature.\n\n" "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
def load(tokenizer, cfg, ds_cfg, processor=None):
    """
    Import a message prompt strategy module and call its loader.

    The strategy name comes from ``ds_cfg["input_transform"]`` (default
    ``"chat"``) and is imported relative to
    ``axolotl.prompt_strategies.messages``. When the last dotted segment
    starts with ``load_`` it names the loader function and the rest names the
    module; otherwise the module's ``load`` function is used. ``ds_cfg`` and
    ``processor`` are forwarded only if the loader's signature declares them.

    Args:
        tokenizer: Passed to the loader as its first positional argument.
        cfg: Passed to the loader as its second positional argument.
        ds_cfg: Dataset config mapping providing ``input_transform``.
        processor: Optional processor forwarded to loaders that accept it.

    Returns:
        Whatever the loader returns, or ``None`` when a module cannot be
        found during import.

    Raises:
        Exception: Any other failure is logged and re-raised unchanged.
    """
    # Resolve the name before entering the try block: the error handler
    # below formats it into the log message, and with the lookup inside the
    # try a failure here surfaced as an unbound-variable error instead of the
    # real one.
    strategy = ds_cfg.get("input_transform", "chat")
    try:
        load_fn = "load"
        last_segment = strategy.split(".")[-1]
        if last_segment.startswith("load_"):
            load_fn = last_segment
            strategy = ".".join(strategy.split(".")[:-1])
        mod = importlib.import_module(
            f".{strategy}", "axolotl.prompt_strategies.messages"
        )
        func = getattr(mod, load_fn)
        load_kwargs = {}
        sig = inspect.signature(func)
        if "ds_cfg" in sig.parameters:
            load_kwargs["ds_cfg"] = ds_cfg
        if "processor" in sig.parameters:
            load_kwargs["processor"] = processor
        return func(tokenizer, cfg, **load_kwargs)
    except ModuleNotFoundError:
        return None
    except Exception as exc:
        LOG.error(f"Failed to load prompt strategy `{strategy}`: {str(exc)}")
        # Bare raise keeps the original traceback intact.
        raise
message_property_mappings else None ) message_field_training = ds_cfg.get("message_field_training") builder_kwargs = {} if field_messages: builder_kwargs["conversations_field"] = field_messages if message_field_role: builder_kwargs["message_field_role"] = message_field_role if message_field_content: builder_kwargs["message_field_content"] = message_field_content if message_field_training: builder_kwargs["message_field_training"] = message_field_training chat_template = ds_cfg.get("chat_template", cfg.get("chat_template", "chatml")) def format_message(x): return x if chat_template == "chatml": from axolotl.core.chat.format.chatml import format_message # noqa F811 if chat_template.startswith("llama3"): from axolotl.core.chat.format.llama3x import format_message # noqa F811 message_transform: Callable = chat_message_transform_builder( train_on_inputs=ds_cfg.get("train_on_inputs", False), **builder_kwargs, ) strategy = ChatMessageDatasetWrappingStrategy( tokenizer, message_transform=message_transform, formatter=format_message ) return strategy ================================================ FILE: src/axolotl/prompt_strategies/metharme.py ================================================ """Module containing the MetharmenPromptTokenizingStrategy and MetharmePrompter class""" from typing import Tuple from axolotl.prompt_tokenizers import InstructionPromptTokenizingStrategy from axolotl.prompters import AlpacaPrompter from axolotl.utils.logging import get_logger LOG = get_logger(__name__) IGNORE_TOKEN_ID = -100 class MetharmePromptTokenizingStrategy(InstructionPromptTokenizingStrategy): """ Tokenizing strategy for the Metharme models """ def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]: return (prompt["prompt"], "", prompt["generation"]) def _tokenize( self, prompt: str, add_eos_token: bool = True, strip_bos_token: bool = False, num_eos_tokens: int = 3, ): result = self.tokenizer( prompt, truncation=True, max_length=self.sequence_len, padding=False, 
return_tensors=None, ) if len(result["input_ids"]) == 0: LOG.warning("Tokenizer result is empty. You may want to audit your dataset") # If there's already an EOS token there, subtract from the number added if result["input_ids"][-1] == self.tokenizer.eos_token_id: num_eos_tokens -= 1 if num_eos_tokens > 0 and add_eos_token and len(result["input_ids"]) > 0: for _ in range(num_eos_tokens): if len(result["input_ids"]) < self.sequence_len: result["input_ids"].append(self.tokenizer.eos_token_id) result["attention_mask"].append(1) if result["input_ids"][0] == self.tokenizer.bos_token_id and strip_bos_token: result["input_ids"] = result["input_ids"][1:] result["attention_mask"] = result["attention_mask"][1:] result["labels"] = result["input_ids"].copy() return result class MetharmePrompter(AlpacaPrompter): """ Prompter for the Metharme models. """ system_prompt = "" system_no_input_prompt = "" system_format = "" turn_format = "{instruction}" turn_no_input_format = "{instruction}" def __init__(self, *args, **kwargs): pass def load(tokenizer, cfg): return MetharmePromptTokenizingStrategy( MetharmePrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len ) ================================================ FILE: src/axolotl/prompt_strategies/orcamini.py ================================================ """ Prompt Strategy for finetuning Orca Mini (v2) models see also https://huggingface.co/psmathur/orca_mini_v2_7b for more information Use dataset type: orcamini in conig.yml to use this prompt style. Compared to the alpaca_w_system.open_orca dataset type, this one specifies the system prompt with "### System:". Not suited/tested for multiple-turn conversations without further adjustments. 
def build_prompt_w_system(
    self,
    system: str,
    instruction: str,
    output: Union[None, str] = None,
) -> Generator[str, None, None]:
    """Yield one prompt built from the system text and the instruction.

    ``self.turn_no_input_format`` is filled with ``system`` and
    ``instruction``; a truthy ``output`` is appended directly after it.
    """
    rendered = self.turn_no_input_format.format(
        system=system, instruction=instruction
    )
    # The label (response) is optional and simply concatenated when given.
    yield f"{rendered}{output}" if output else rendered
def get_chosen_conversation_thread(self, prompt) -> MessageList:
    """Build the chosen conversation from a dataset row.

    Produces an optional system message (when ``prompt`` has a truthy
    ``system`` value), then a user message from ``prompt["chosen"][0]`` and
    an assistant message from ``prompt["chosen"][1]``. Only the assistant
    message is flagged with ``label=True``.
    """
    thread: List[Message] = []
    system_text = prompt.get("system", None)
    if system_text:
        thread.append(Message(role="system", content=system_text, label=False))
    # (role, index into prompt["chosen"], label) for the two fixed turns.
    for role, position, is_label in (("user", 0, False), ("assistant", 1, True)):
        thread.append(
            Message(
                role=role,
                content=prompt["chosen"][position]["content"],
                label=is_label,
            )
        )
    return MessageList(messages=thread)
Message( role="user", content=prompt["chosen"][i * 2]["content"], label=False, ) ) if i < total_msg_turns - 1: messages.append( Message( role="assistant", content=prompt["chosen"][i * 2 + 1]["content"], label=False, ) ) return MessageList(messages=messages) def get_chosen(self, prompt) -> MessageList: res = self.get_prompt(prompt) res.messages.append( Message( role="assistant", content=prompt["chosen"][-1]["content"], label=True ) ) return res def get_rejected(self, prompt) -> MessageList: res = self.get_prompt(prompt) res.messages.append( Message( role="assistant", content=prompt["rejected"][-1]["content"], label=True ) ) return res class ORPOTokenizingStrategy(PromptTokenizingStrategy): """ rejected_input_ids input_ids rejected_attention_mask attention_mask rejected_labels labels """ def __init__( self, *args, dataset_parser=None, **kwargs, ): super().__init__(*args, **kwargs) self.dataset_parser = dataset_parser def tokenize_prompt(self, prompt): # pass the rejected prompt/row to the Prompter to get the formatted prompt prompt_len = 0 rejected_message_list: MessageList = ( self.dataset_parser.get_rejected_conversation_thread(prompt) ) input_ids = [] labels = [] for _, (part, label) in enumerate( self.prompter.build_prompt(rejected_message_list) ): if not part: continue _input_ids = self.tokenizer.encode(part, add_special_tokens=False) prev_idx = len(input_ids) input_ids += _input_ids[prev_idx:] if label: labels += input_ids[prev_idx:] else: labels += [IGNORE_INDEX] * (len(input_ids) - prev_idx) prompt_len = len(input_ids) # remap the input_ids, attention_mask and labels rejected_input_ids = input_ids rejected_labels = labels # pass the chosen prompt/row to the Prompter to get the formatted prompt chosen_message_list: MessageList = ( self.dataset_parser.get_chosen_conversation_thread(prompt) ) input_ids = [] labels = [] for _, (part, label) in enumerate( self.prompter.build_prompt(chosen_message_list) ): if not part: continue _input_ids = 
def build_prompt(
    self,
    message_list: "MessageList",
) -> Generator[Tuple[str, bool], None, None]:
    """Yield the rendered conversation after each recognised turn.

    Messages are accumulated one by one. After a ``system``, ``user`` or
    ``assistant`` message is added, the conversation so far is rendered with
    ``self.tokenizer.apply_chat_template`` (``tokenize=False``) and yielded
    together with a flag that is ``True`` only for assistant turns. The
    generation prompt is requested only after user turns. Messages with any
    other role are accumulated but produce no output of their own.
    """
    # role -> (add_generation_prompt, flag yielded alongside the text)
    turn_settings = {
        "system": (False, False),
        "user": (True, False),
        "assistant": (False, True),
    }
    history = []
    for entry in message_list.messages:
        history.append(entry.model_dump())
        settings = turn_settings.get(entry.role)
        if settings is None:
            continue
        add_generation_prompt, is_label = settings
        rendered = self.tokenizer.apply_chat_template(
            history,
            add_generation_prompt=add_generation_prompt,
            chat_template=self.chat_template,
            tokenize=False,
        )
        yield rendered, is_label
add_generation_prompt=False, chat_template=chat_template_string, tokenize=False, )[prompt_str_len:] res["rejected"] = tokenizer.apply_chat_template( [msg.model_dump() for msg in dataset_parser.get_rejected(sample).messages], add_generation_prompt=False, chat_template=chat_template_string, tokenize=False, )[prompt_str_len:] return res return transform_fn ================================================ FILE: src/axolotl/prompt_strategies/pretrain.py ================================================ """pretraining prompt strategies""" from typing import Generator from transformers import BatchEncoding from axolotl.prompt_tokenizers import PromptTokenizingStrategy class PretrainTokenizer: """basic tokenization class for pretraining""" def build_prompt(self, prompt) -> Generator[str, None, None]: yield prompt class PretrainTokenizationStrategy(PromptTokenizingStrategy): """handles tokenization for pretraining with strides""" @property def supports_batched(self): return True def __init__(self, *args, max_length=None, text_column="text", **kwargs): super().__init__(*args, **kwargs) if max_length: self.max_length = max_length self.text_column = text_column def _tokenize( self, prompt: str, add_eos_token: bool = True, strip_bos_token: bool = False ) -> BatchEncoding: res = self.tokenizer( prompt, truncation=True, max_length=self.max_length - 1, add_special_tokens=True, return_overflowing_tokens=True, stride=256, ) res["input_ids"] = [ seq + [self.tokenizer.eos_token_id] for seq in res["input_ids"] ] res["attention_mask"] = [seq + [1] for seq in res["attention_mask"]] return res def tokenize_prompt(self, prompt): return self._tokenize(prompt[self.text_column]) def load(tokenizer, cfg): strat = PretrainTokenizationStrategy( PretrainTokenizer(), tokenizer, cfg.train_on_inputs, cfg.sequence_len, text_column=cfg.pretraining_dataset[0]["text_column"] or "text", max_length=cfg.sequence_len * 64, ) return strat ================================================ FILE: 
class PygmalionPromptTokenizingStrategy(PromptTokenizingStrategy):
    """
    Tokenizing strategy for Pygmalion.

    Each conversation turn is tokenized separately with a role prefix
    (``<|system|>``, ``<|user|>``, ``<|model|>``). System and human turns are
    fully masked in the labels; bot turns are trained on, except for the
    ``<|model|>`` prefix tokens.
    """

    bot_prefix_token_ids: List[int] = []

    def __init__(self, prompter, tokenizer, *args, **kwargs):
        super().__init__(prompter, tokenizer, *args, **kwargs)
        # Token ids of the bot prefix, used to mask it out of the labels.
        res = self._tokenize("<|model|>", add_eos_token=False, strip_bos_token=True)
        self.bot_prefix_token_ids = res["input_ids"]

    def tokenize_prompt(self, prompt):
        """
        Tokenize ``prompt["conversations"]`` turn by turn.

        Returns the accumulated ``input_ids`` / ``attention_mask`` / ``labels``
        dict. Turns with an unknown role are logged and skipped so that they
        contribute neither tokens nor labels.
        """
        system_suffix = "\n<START>"
        result, current_len = tokenize_prompt_default()
        for role, message in self.prompter.build_prompt(prompt["conversations"]):
            if role == "system":
                prefix = "<|system|>"
                # this should include a bos token, no eos token,
                # strip trailing "\n<START>"
                if message.endswith(system_suffix):
                    message = message[: -len(system_suffix)]
                res = self._tokenize(
                    prefix + "Persona: " + message.strip(),
                    add_eos_token=False,
                    strip_bos_token=False,
                )
                # everything from this is masked out from the labels
                labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
            elif role == "human":
                prefix = "<|user|>"
                res = self._tokenize(
                    prefix + " " + message.strip(),
                    add_eos_token=False,
                    strip_bos_token=True,
                )
                # everything from this is masked out from the labels
                labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
            elif role == "bot":
                prefix = "<|model|>"
                res = self._tokenize(
                    prefix + " " + message.strip(),
                    add_eos_token=True,
                    strip_bos_token=True,
                )
                turn_ids = res["input_ids"]
                # mask out the prefix tokens, the rest is trained on; clamp the
                # mask so labels never outgrow the (possibly short) input ids
                masked = min(len(self.bot_prefix_token_ids), len(turn_ids))
                labels = [IGNORE_TOKEN_ID] * masked + list(turn_ids[masked:])
            else:
                # Skip the turn: appending here would reuse the previous turn's
                # labels (or fail on the first turn) with no matching input ids.
                LOG.warning(f"unknown role in conversation: {role}")
                continue

            result, current_len = parse_tokenized_to_result(
                result,
                current_len,
                res,
                labels,
                pad_token_id=self.tokenizer.pad_token_id,
            )
        return result
def tokenize_prompt(
    self, prompt: Dict[str, Union[str, List[str]]]
) -> BatchEncoding:
    """
    Tokenize a stepwise sample into ``input_ids``, ``labels`` and ``attention_mask``.

    The prompt and every completion step are tokenized without special
    tokens, the step separator is appended to each step, and only the last
    token of each step carries that step's label (all other positions are
    ``IGNORE_INDEX``). With ``train_on_last_step_only`` only the final step
    keeps its label. The completion is cut to ``max_completion_length`` and
    the whole sequence to ``sequence_len`` when those are set.
    """
    # Inspired by TRL's PRMTRainer
    # https://github.com/huggingface/trl/blob/ed7de87dc766478c024b68f12530d1b0e7c3ff23/trl/trainer/prm_trainer.py#L206
    tokenizer = self.tokenizer
    prompt_ids = tokenizer(prompt["prompt"], add_special_tokens=False)["input_ids"]
    steps = [
        tokenizer(step_text, add_special_tokens=False)["input_ids"]
        for step_text in prompt["completions"]
    ]

    # One label per step
    raw_labels = prompt["labels"]
    if self.train_on_last_step_only:
        step_labels = [IGNORE_INDEX] * (len(raw_labels) - 1)
        step_labels.append(int(raw_labels[-1]))
    else:
        step_labels = [int(raw) for raw in raw_labels]

    # Terminate every step with the separator tokens
    separator_ids = tokenizer.encode(self.step_separator, add_special_tokens=False)
    steps = [step + separator_ids for step in steps]

    # Token-level labels: only the last token of a step carries its label
    flat_labels = []
    for step, step_label in zip(steps, step_labels, strict=False):
        flat_labels.extend([IGNORE_INDEX] * (len(step) - 1))
        flat_labels.append(step_label)
    completion_ids = list(chain.from_iterable(steps))

    # Completion-length cap
    limit = self.max_completion_length
    if limit:
        completion_ids = completion_ids[:limit]
        flat_labels = flat_labels[:limit]

    # Prepend BOS when the tokenizer defines one
    bos_id = tokenizer.bos_token_id
    if bos_id is not None:
        prompt_ids = [bos_id] + prompt_ids

    input_ids = prompt_ids + completion_ids
    full_labels = [IGNORE_INDEX] * len(prompt_ids) + flat_labels

    # Overall sequence-length cap
    if self.sequence_len:
        input_ids = input_ids[: self.sequence_len]
        full_labels = full_labels[: self.sequence_len]

    return {
        "input_ids": input_ids,
        "labels": full_labels,
        "attention_mask": [1] * len(input_ids),
    }
# Label value written at positions that are masked out of training
# (used in this module as `[IGNORE_INDEX] * n` when building labels).
IGNORE_INDEX = -100

# Default LLaMA special tokens. The literals had been reduced to empty strings
# (angle-bracket tokens stripped); restored to the standard LLaMA values.
LLAMA_DEFAULT_PAD_TOKEN = "<pad>"  # nosec
LLAMA_DEFAULT_EOS_TOKEN = "</s>"  # nosec
LLAMA_DEFAULT_BOS_TOKEN = "<s>"  # nosec
LLAMA_DEFAULT_UNK_TOKEN = "<unk>"  # nosec
class PromptTokenizingStrategy(abc.ABC):
    """
    Abstract class for tokenizing strategies

    Subclasses implement ``tokenize_prompt``; ``_tokenize`` is the shared
    helper that turns a single string into ``input_ids`` / ``attention_mask``
    / ``labels``.
    """

    filter_rows: Optional[Callable] = None

    def __init__(
        self,
        prompter: Prompter,
        tokenizer,
        train_on_inputs: bool = False,
        sequence_len: int = 2048,
    ):
        self.prompter = prompter
        self.tokenizer: PreTrainedTokenizer = tokenizer
        self.train_on_inputs = train_on_inputs
        # sequence_len and max_length can be different for CompletionPromptTokenizingStrategy.
        # TODO: Document how they are different.
        self.sequence_len = sequence_len
        self.max_length = sequence_len

    @abc.abstractmethod
    def tokenize_prompt(self, prompt):
        pass

    @property
    def supports_batched(self):
        return False

    def _tokenize(
        self, prompt: str, add_eos_token: bool = True, strip_bos_token: bool = False
    ) -> BatchEncoding:
        """
        Tokenize ``prompt``, truncated to ``self.max_length``.

        Args:
            prompt: text to tokenize.
            add_eos_token: append the EOS id when the result does not already
                end with it and there is still room below ``max_length``.
            strip_bos_token: drop a leading BOS id when present.

        Returns:
            An encoding with ``input_ids``, ``attention_mask`` and ``labels``
            (a copy of ``input_ids``). For empty text or an empty tokenizer
            result, all three are empty lists.
        """
        # Carries the same keys as a non-empty result so callers can index
        # "labels" without special-casing empty prompts.
        empty = BatchEncoding(
            data={"input_ids": [], "attention_mask": [], "labels": []}
        )

        if not prompt:
            LOG.warning_once("Empty text requested for tokenization.")
            return empty

        result = self.tokenizer(
            prompt,
            truncation=True,
            max_length=self.max_length,
            padding=False,
            return_tensors=None,
        )
        if len(result["input_ids"]) == 0:
            LOG.warning("Tokenizer result is empty. You may want to audit your dataset")
            return empty

        if (
            result["input_ids"][-1] != self.tokenizer.eos_token_id
            and len(result["input_ids"]) < self.max_length
            and add_eos_token
        ):
            result["input_ids"].append(self.tokenizer.eos_token_id)
            result["attention_mask"].append(1)

        if result["input_ids"][0] == self.tokenizer.bos_token_id and strip_bos_token:
            result["input_ids"] = result["input_ids"][1:]
            result["attention_mask"] = result["attention_mask"][1:]

        result["labels"] = result["input_ids"].copy()
        return result
""" def parse_instruction_fields( self, prompt ) -> Union[Tuple[str, str, str], Tuple[str, str, str, str]]: raise NotImplementedError def tokenize_prompt(self, prompt): ( instruction, input, response, ) = self.parse_instruction_fields(prompt) user_prompt = next( iter( self.prompter.build_prompt( instruction, input, ) ) ) tokenized_prompt = self._tokenize(user_prompt, add_eos_token=False) if not self.train_on_inputs: user_prompt_len = len(tokenized_prompt["input_ids"]) # TODO this could be sped up using numpy array slicing tokenized_prompt["labels"] = [IGNORE_INDEX] * user_prompt_len tokenized_res_prompt = self._tokenize( response, strip_bos_token=True, add_eos_token=True ) tokenized_prompt["input_ids"] += tokenized_res_prompt["input_ids"] tokenized_prompt["attention_mask"] += tokenized_res_prompt["attention_mask"] tokenized_prompt["labels"] += tokenized_res_prompt["input_ids"] return tokenized_prompt def _build_full_prompt( self, instruction, input, response, ): return next( iter( self.prompter.build_prompt( instruction, input, response, ) ) ) class AlpacaPromptTokenizingStrategy(InstructionPromptTokenizingStrategy): """ Tokenizing strategy for Alpaca prompts. """ def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]: return ( prompt["instruction"], prompt["input"] if "input" in prompt else "", prompt["output"], ) class AlpacaMultipleChoicePromptTokenizingStrategy(InstructionPromptTokenizingStrategy): """ Tokenizing strategy for Alpaca Multiple Choice prompts. """ def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]: return ( prompt["question"], "\n".join(f'- "{choice}"' for choice in prompt["choices"]), prompt["solution"] if "solution" in prompt else prompt["explanation"], ) class JeopardyPromptTokenizingStrategy(InstructionPromptTokenizingStrategy): """ Tokenizing strategy for Jeopardy prompts. 
""" def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]: return ( prompt["question"], prompt["category"], "what is " + prompt["answer"], ) class OpenAssistantPromptTokenizingStrategy(InstructionPromptTokenizingStrategy): """ Tokenizing strategy for OpenAssistant prompts. """ def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]: return ( prompt["INSTRUCTION"], "", prompt["RESPONSE"], ) class SummarizeTLDRPromptTokenizingStrategy(InstructionPromptTokenizingStrategy): """ Tokenizing strategy for SummarizeTLDR prompts. """ def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]: return ( prompt["article"], "", prompt["summary"], ) class GPTeacherPromptTokenizingStrategy(InstructionPromptTokenizingStrategy): """ Tokenizing strategy for GPTeacher prompts. """ def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]: return ( prompt["instruction"], prompt["input"] if "input" in prompt else "", prompt["response"], ) class NomicGPT4AllPromptTokenizingStrategy(InstructionPromptTokenizingStrategy): """ Tokenizing strategy for NomicGPT4All prompts. """ def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]: return ( prompt["prompt"], "", prompt["response"], ) class ReflectionPromptTokenizingStrategy(PromptTokenizingStrategy): """ Tokenizing strategy for Reflection prompts. 
""" def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str, str]: raise NotImplementedError def tokenize_prompt(self, prompt): ( instruction, input, output, reflection, corrected, ) = self.parse_instruction_fields(prompt) full_prompt = self._build_full_prompt( instruction, input, output, reflection, corrected ) tokenized_full_prompt = self._tokenize(full_prompt) if not self.train_on_inputs: user_prompt = next( iter( self.prompter.build_prompt( instruction, input, ) ) ) tokenized_user_prompt = self._tokenize(user_prompt, add_eos_token=False) user_prompt_len = len(tokenized_user_prompt["input_ids"]) # TODO this could be sped up using numpy array slicing tokenized_full_prompt["labels"] = [ IGNORE_INDEX ] * user_prompt_len + tokenized_full_prompt["labels"][user_prompt_len:] return tokenized_full_prompt def _build_full_prompt(self, instruction, input, output, reflection, corrected): return next( iter( self.prompter.build_prompt( instruction, input, output, reflection, corrected, ) ) ) def _tokenize(self, prompt, add_eos_token=True, strip_bos_token=False): result = self.tokenizer( prompt, truncation=True, max_length=self.sequence_len, padding=False, return_tensors=None, ) if ( result["input_ids"][-1] != self.tokenizer.eos_token_id and len(result["input_ids"]) < self.sequence_len and add_eos_token ): result["input_ids"].append(self.tokenizer.eos_token_id) result["attention_mask"].append(1) result["labels"] = result["input_ids"].copy() return result class AlpacaReflectionPTStrategy(ReflectionPromptTokenizingStrategy): """ Tokenizing strategy for Alpaca Reflection prompts. 
""" def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str, str]: return ( prompt["instruction"], prompt["input"] if "input" in prompt else "", prompt["output"], prompt["reflection"], prompt["corrected"], ) def tokenize_prompt_default() -> Tuple[Dict[str, List[int]], int]: """ Returns the default values for the tokenize prompt function """ result: Dict[str, List[int]] = { "input_ids": [], "attention_mask": [], "labels": [], } current_len = 0 return result, current_len def parse_tokenized_to_result( result: Dict[str, List[int]], current_len: int, res: Dict[str, List[int]], labels: List[int], pad_token_id: Union[int, None] = None, ) -> Tuple[Dict[str, List[int]], int]: """ Parses the tokenized prompt and append the tokenized input_ids, attention_mask and labels to the result """ input_ids = res["input_ids"] input_len = len(input_ids) result["input_ids"][current_len : current_len + input_len] = input_ids result["attention_mask"][current_len : current_len + input_len] = [ 1 if x != pad_token_id else 0 for x in input_ids ] result["labels"][current_len : current_len + input_len] = labels current_len += input_len return result, current_len ================================================ FILE: src/axolotl/prompters.py ================================================ """Module containing prompters""" from enum import Enum from typing import Generator, Optional, Union from colorama import Fore from axolotl.utils.logging import get_logger LOG = get_logger(__name__) IGNORE_TOKEN_ID = -100 REPR_TEMPLATE = "\n\n" + Fore.CYAN + "{full_prompt}" + Fore.RESET + "\n\n" class PromptStyle(Enum): """ Enum for prompt styles """ INSTRUCT = "instruct" CHAT = "chat" CHATML = "chatml" PHI = "phi" class Prompter: """ Base prompter class for all prompters """ class AlpacaPrompter(Prompter): """ Base class for alpaca prompters """ system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request." system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request." system_format: str = "{system}" turn_format: str turn_no_input_format: str prompt_style: Optional[str] = None def __init__(self, prompt_style: Optional[str] = PromptStyle.INSTRUCT.value): self.prompt_style = prompt_style if prompt_style else PromptStyle.INSTRUCT.value self.match_prompt_style() def match_prompt_style(self): if self.prompt_style == PromptStyle.INSTRUCT.value: self.turn_format = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n" self.turn_no_input_format = ( "### Instruction:\n{instruction}\n\n### Response:\n" ) self.system_format = "{system}\n\n" elif self.prompt_style == PromptStyle.CHAT.value: self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:" self.turn_no_input_format = "USER: {instruction}\nASSISTANT:" self.system_format = "SYSTEM: {system}\n" elif self.prompt_style == PromptStyle.CHATML.value: self.turn_format = "<|im_start|>user\n{instruction}\n{input}<|im_end|>\n<|im_start|>assistant\n" self.turn_no_input_format = ( "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n" ) self.system_format = "<|im_start|>system\n{system}<|im_end|>\n" elif self.prompt_style == PromptStyle.PHI.value: self.turn_format = "<|user|>\n{instruction}<|end|>{input}<|assistant|>" self.turn_no_input_format = ( "<|user|>\n{instruction}<|end|>\n<|assistant|>\n" ) self.system_format = "<|system|>\n{system}<|end|>\n" def _build_result(self, instruction, input_text, output): # returns the full prompt from instruction and optional input # if a label (=response, =output) is provided, it's also appended. 
if input_text: res = ( self.system_format.format(system=self.system_prompt) if self.system_prompt else "" ) + self.turn_format.format(instruction=instruction, input=input_text) else: res = ( self.system_format.format(system=self.system_no_input_prompt) if self.system_no_input_prompt else "" ) + self.turn_no_input_format.format(instruction=instruction) if output: res = f"{res}{output}" return res def build_prompt( self, instruction: str, input: Union[None, str] = None, output: Union[None, str] = None, ) -> Generator[str, None, None]: yield self._build_result(instruction, input, output) def __repr__(self) -> str: return REPR_TEMPLATE.format( full_prompt=self._build_result("{instruction}", "{input}", "{output}") ) class UnpromptedPrompter(AlpacaPrompter): """ Prompter for alpaca no system prompt """ system_prompt = "" system_no_input_prompt = "" class JeopardyPrompter(AlpacaPrompter): """ Prompter for Jeopardy """ prompt_input = "Below is a Jeopardy clue paired with input providing the category of the clue. Write a concise response that best answers tbe clue given the category.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n" class MultipleChoiceExplainPrompter(AlpacaPrompter): """ Prompter for multiple choice explain """ system_prompt = ( "Choose the answer that best answers the question. Explain your reasoning.\n" ) system_no_input_prompt = ( "Choose the answer that best answers the question. Explain your reasoning.\n" ) class MultipleChoiceConcisePrompter(AlpacaPrompter): """ Prompter for multiple choice concise """ system_prompt = "Choose the answer that best answers the question. Be concise in your response.\n\n" system_no_input_prompt = "Choose the answer that best answers the question. 
class ReflectAlpacaPrompter(Prompter):
    """
    Prompter for ReflectAlpaca

    Builds a prompt from an instruction (and optional input) and, when
    output, reflection and corrected response are all provided, appends them
    using the style-specific ``agent_label`` template.
    """

    system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. You, the Assistant, should generate a response as if it were an abstract for an academic or technical paper on the query along with a methodology. Then generate an Agent Reflection where you create a long form response as if from subject matter expert, be verbose, diligent, and creative in your application of knowledge, apply it through the lens of the response generated by the assistant. Look for flawed reasoning, faulty logic, or other mistakes in the method. Finally, generate a final response and method for the user with the Assistant abstract and Reflection analysis as augmentations to the generation\n\n"
    system_no_input_prompt = "Below is an instruction that describes a task. You, the Assistant, should generate a response as if it were an abstract for an academic or technical paper on the query along with a methodology. Then generate an Agent Reflection where you create a long form response as if from subject matter expert, be verbose, diligent, and creative in your application of knowledge, apply it through the lens of the response generated by the assistant. Look for flawed reasoning, faulty logic, or other mistakes in the method. Finally, generate a final response and method for the user with the Assistant abstract and Reflection analysis as augmentations to the generation\n\n"

    prompt_input = (
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
    )
    prompt_no_input = "### Instruction:\n{instruction}\n\n### Response:\n"
    agent_label = "### Thought:\n{output}\n\n### Agent Reflection:\n{reflection}\n\n### Final Response:\n{corrected}"
    response_split = "### Response:"

    def __init__(self, prompt_style="instruct"):
        self.prompt_style = prompt_style
        self.match_prompt_style()

    def match_prompt_style(self):
        """Set the prompt/label templates for the configured prompt style."""
        if self.prompt_style == PromptStyle.INSTRUCT.value:
            self.prompt_input = (
                self.system_prompt
                + "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
            )
            self.prompt_no_input = (
                self.system_no_input_prompt
                + "### Instruction:\n{instruction}\n\n### Response:\n"
            )
            self.agent_label = "### Thought:\n{output}\n\n### Agent Reflection:\n{reflection}\n\n### Final Response:\n{corrected}"
            self.response_split = "### Final Response:"
        if self.prompt_style == PromptStyle.CHAT.value:
            self.prompt_input = (
                self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:"
            )
            self.prompt_no_input = (
                self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:"
            )
            # The final answer must follow the last "ASSISTANT:" marker; without
            # the {corrected} placeholder it was silently dropped from the prompt.
            self.agent_label = "\nTHOUGHT: {output}\nASSISTANT REFLECTION: {reflection}\nASSISTANT: {corrected}"
            self.response_split = "ASSISTANT:"

    def _build_result(
        self,
        instruction: str,
        input: Union[None, str] = None,
        output: Union[None, str] = None,
        reflection: Union[None, str] = None,
        corrected: Union[None, str] = None,
    ):
        """
        Return the full prompt for ``instruction`` and optional ``input``.

        The label section is appended only when ``output``, ``reflection``
        and ``corrected`` are all truthy.
        """
        if input:
            res = self.prompt_input.format(instruction=instruction, input=input)
        else:
            res = self.prompt_no_input.format(instruction=instruction)
        if output and reflection and corrected:
            label = self.agent_label.format(
                output=output,
                reflection=reflection,
                corrected=corrected,
            )
            res = f"{res}{label}"

        return res

    def build_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        output: Union[None, str] = None,
        reflection: Union[None, str] = None,
        corrected: Union[None, str] = None,
    ) -> Generator[str, None, None]:
        """Yield the single prompt string built by ``_build_result``."""
        yield self._build_result(
            instruction,
            input,
            output,
            reflection,
            corrected,
        )

    def __repr__(self) -> str:
        # Pass all five placeholders: the label section is only rendered when
        # output, reflection and corrected are all provided.
        return REPR_TEMPLATE.format(
            full_prompt=self._build_result(
                "{instruction}",
                "{input}",
                "{output}",
                "{reflection}",
                "{corrected}",
            )
        )
def llm_worker(
    script_args: LoRAScriptArguments,
    data_parallel_rank: int,
    master_port: int,
    connection: Connection,
) -> None:
    """
    Run a LoRA-enabled vLLM engine and serve commands received over a pipe.

    Sets the ``VLLM_DP_*`` environment variables for this rank, builds the
    ``LLM``, reports ``{"status": "ready"}`` on ``connection`` and then loops:
    ``call`` / ``fire_and_forget`` commands invoke the named ``LLM`` method
    (only ``call`` sends the result back); ``shutdown`` ends the loop. Other
    command types are ignored.
    """
    os.environ.update(
        {
            "VLLM_DP_RANK": str(data_parallel_rank),
            "VLLM_DP_RANK_LOCAL": str(data_parallel_rank),
            "VLLM_DP_SIZE": str(script_args.data_parallel_size),
            "VLLM_DP_MASTER_PORT": str(master_port),
        }
    )

    engine_kwargs = {
        "model": script_args.model,
        "revision": script_args.revision,
        "tensor_parallel_size": script_args.tensor_parallel_size,
        "gpu_memory_utilization": script_args.gpu_memory_utilization,
        "enforce_eager": script_args.enforce_eager,
        "dtype": script_args.dtype,
        "enable_prefix_caching": script_args.enable_prefix_caching,
        "kv_cache_dtype": script_args.kv_cache_dtype,
        "max_model_len": script_args.max_model_len,
        # Use batch-capable worker extension (adds batch_update_named_params + auto-close)
        "worker_extension_cls": "axolotl.scripts.vllm_worker_ext.BatchWeightSyncWorkerExtension",
        "trust_remote_code": script_args.trust_remote_code,
        "model_impl": script_args.vllm_model_impl,
        "logprobs_mode": "processed_logprobs",
        # LoRA
        "enable_lora": script_args.enable_lora,
        "max_lora_rank": script_args.max_lora_rank,
        "max_loras": script_args.max_loras,
        "lora_dtype": script_args.lora_dtype,
    }
    llm = LLM(**engine_kwargs)

    connection.send({"status": "ready"})

    while True:
        try:
            command = connection.recv()
        except KeyboardInterrupt:
            llm.collective_rpc(method="close_communicator")
            break

        command_type = command["type"]
        if command_type == "shutdown":
            break
        if command_type not in ("call", "fire_and_forget"):
            continue

        method_name = command["method"]
        args = command.get("args", ())
        kwargs = command.get("kwargs", {})

        # Reconstruct LoRARequest from serialized dict (can't pickle across pipe)
        lora_spec = kwargs.get("lora_request")
        if lora_spec is not None:
            kwargs["lora_request"] = LoRARequest(
                lora_name=lora_spec["lora_name"],
                lora_int_id=lora_spec["lora_int_id"],
                lora_path=lora_spec["lora_path"],
                load_inplace=lora_spec.get("load_inplace", False),
            )

        result = getattr(llm, method_name)(*args, **kwargs)
        if command_type == "call":
            connection.send(result)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Gate server startup on worker readiness and stop workers on exit.

    Startup polls every worker pipe until each one has sent a
    ``{"status": "ready"}`` message. Teardown asks the workers to leave
    their command loop and then joins them.

    Raises:
        RuntimeError: If a worker process dies during startup, or if not
            every worker reports ready within ``startup_timeout`` seconds.
    """
    import time

    startup_timeout = 300  # 5 minutes
    shutdown_grace = 10  # seconds a worker gets to exit before SIGTERM

    def reap_workers(grace_period: float) -> None:
        """Join every worker, terminating those still alive after the grace."""
        for proc in processes:
            if grace_period:
                proc.join(timeout=grace_period)
            if proc.is_alive():
                proc.terminate()
                proc.join()

    start_time = time.monotonic()
    ready: set[int] = set()
    try:
        while len(ready) < script_args.data_parallel_size:
            elapsed = time.monotonic() - start_time
            if elapsed > startup_timeout:
                raise RuntimeError(
                    f"vLLM workers failed to start within {startup_timeout}s "
                    f"({len(ready)}/{script_args.data_parallel_size} ready)"
                )
            for i, (conn, proc) in enumerate(zip(connections, processes, strict=True)):
                if id(conn) in ready:
                    continue
                if not proc.is_alive():
                    raise RuntimeError(
                        f"vLLM worker {i} exited unexpectedly during startup"
                    )
                if conn.poll():
                    msg = conn.recv()
                    if isinstance(msg, dict) and msg.get("status") == "ready":
                        ready.add(id(conn))
            await asyncio.sleep(0.1)
    except BaseException:
        # The workers are non-daemonic processes: if they are left running
        # when startup fails, interpreter exit blocks joining them. Stop them
        # before propagating the failure.
        reap_workers(0)
        raise

    yield

    # Ask each worker to leave its command loop so the join below can return
    # promptly instead of always running into the grace timeout.
    for conn in connections:
        try:
            conn.send({"type": "shutdown"})
        except OSError:
            # Best effort: the worker (and its pipe end) may already be gone.
            logger.debug("Could not send shutdown command to a vLLM worker")
    reap_workers(shutdown_grace)
@app.post("/generate/", response_model=GenerateResponse)
async def generate(request: GenerateRequest):
    """Generate completions with optional LoRA adapter."""
    import base64
    from io import BytesIO

    import vllm
    from packaging.version import Version

    # Pair every prompt with its optional base64 image. strict=True makes a
    # length mismatch between the two lists an error instead of a silent
    # truncation.
    images: list[str | None] = request.images or [None] * len(request.prompts)  # type: ignore[assignment,list-item]
    prompts: list[dict[str, Any]] = []
    for prompt, image in zip(request.prompts, images, strict=True):
        row: dict[str, Any] = {"prompt": prompt}
        if image is not None:
            from PIL import Image

            row["multi_modal_data"] = {
                "image": Image.open(BytesIO(base64.b64decode(image)))
            }
        prompts.append(row)

    generation_kwargs = {
        "n": request.n,
        "repetition_penalty": request.repetition_penalty,
        "temperature": request.temperature,
        "top_p": request.top_p,
        "top_k": request.top_k,
        "min_p": request.min_p,
        "max_tokens": request.max_tokens,
        "logprobs": request.logprobs,
    }
    # Caller-supplied kwargs are applied last and therefore override the
    # named request fields above.
    generation_kwargs.update(request.generation_kwargs)

    # Each structured-output class is imported only on the version branch
    # that uses it, so a vLLM release lacking one of the two names does not
    # fail every request with an ImportError.
    if Version(vllm.__version__) <= Version("0.10.2"):
        from vllm.sampling_params import GuidedDecodingParams

        key = "guided_decoding"
        if request.structured_outputs_regex is not None:
            generation_kwargs[key] = GuidedDecodingParams(
                regex=request.structured_outputs_regex
            )
        else:
            generation_kwargs.setdefault(key, None)
    else:
        from vllm.sampling_params import StructuredOutputsParams

        key = "structured_outputs"
        if request.structured_outputs_regex is not None:
            generation_kwargs[key] = StructuredOutputsParams(
                regex=request.structured_outputs_regex
            )
        elif isinstance(generation_kwargs.get(key), dict):
            # A plain dict passed through generation_kwargs is promoted to
            # the params object.
            generation_kwargs[key] = StructuredOutputsParams(
                **generation_kwargs[key]
            )
        else:
            generation_kwargs.setdefault(key, None)

    sampling_params = SamplingParams(**generation_kwargs)

    # Fan the prompts out over the data-parallel workers. A worker whose
    # chunk is empty still receives a placeholder prompt so that every
    # connection gets exactly one request and yields exactly one reply.
    chunked_prompts = chunk_list(prompts, script_args.data_parallel_size)
    for conn, chunk in zip(connections, chunked_prompts, strict=True):
        if not chunk:
            chunk = [{"prompt": ""}]
        kwargs = {
            "prompts": chunk,
            "sampling_params": sampling_params,
            "lora_request": active_lora["request"],
        }
        conn.send({"type": "call", "method": "generate", "kwargs": kwargs})

    all_outputs = [conn.recv() for conn in connections]
    # Discard the replies that belong to placeholder prompts.
    all_outputs = [
        o for o, c in zip(all_outputs, chunked_prompts, strict=True) if c
    ]
    all_outputs = list(chain.from_iterable(all_outputs))

    # Walk the outputs once; the result is indexed for both response fields.
    extracted = extract_logprobs(all_outputs)
    return {
        "prompt_ids": [o.prompt_token_ids for o in all_outputs],
        "completion_ids": [
            list(out.token_ids) for o in all_outputs for out in o.outputs
        ],
        "logprobs": extracted[0],
        "logprob_token_ids": extracted[1],
    }
@app.post("/batch_update_named_params/")
async def batch_update_named_params(request: BatchUpdateWeightsRequest):
    # Announce a batch of incoming weight tensors to every worker.
    # Each request entry is reduced to a (name, dtype, shape) triple; the
    # shape is converted to a tuple before it is shipped over the pipe.
    specs = []
    for entry in request.params:
        specs.append((entry["name"], entry["dtype"], tuple(entry["shape"])))

    command = {
        "type": "fire_and_forget",
        "method": "collective_rpc",
        "kwargs": {"method": "batch_update_named_params", "args": (specs,)},
    }

    # The pipe sends run on the default executor; the handler returns once
    # all of them have completed, without waiting for any worker reply.
    event_loop = asyncio.get_running_loop()
    pending_sends = [
        event_loop.run_in_executor(None, worker_conn.send, command)
        for worker_conn in connections
    ]
    await asyncio.gather(*pending_sends)
    return {"message": f"Batch update for {len(specs)} params"}
"close_communicator"} for conn in connections: conn.send( { "type": "fire_and_forget", "method": "collective_rpc", "kwargs": kwargs, } ) return {"message": "Closing communicator"} uvicorn.run( app, host=script_args.host, port=script_args.port, log_level=script_args.log_level, access_log=True, ) ================================================ FILE: src/axolotl/scripts/vllm_worker_ext.py ================================================ """Extended vLLM worker extension with batch weight sync support. Subclasses TRL's WeightSyncWorkerExtension to add: - batch_update_named_params: receives multiple params in one call - Auto-close stale communicator on re-init - _direct_set_weight: proper handling for stacked (qkv_proj, gate_up_proj) params, including LoRA-wrapped models where vLLM inserts base_layer into the hierarchy """ import logging import torch try: from transformers import is_torch_xpu_available except ImportError: is_torch_xpu_available = lambda: False # noqa: E731 from trl.scripts.vllm_serve import WeightSyncWorkerExtension logger = logging.getLogger(__name__) # Stacked param name mapping: shard_name -> (packed_name, shard_order) _STACKED_PARAMS = { "q_proj": ("qkv_proj", 0), "k_proj": ("qkv_proj", 1), "v_proj": ("qkv_proj", 2), "gate_proj": ("gate_up_proj", 0), "up_proj": ("gate_up_proj", 1), } class BatchWeightSyncWorkerExtension(WeightSyncWorkerExtension): """Worker extension that adds batch weight update and direct weight setting.""" def init_communicator(self, host, port, world_size, client_device_uuid): """Auto-close stale communicator before re-initializing.""" if self.communicator is not None: self.close_communicator() super().init_communicator(host, port, world_size, client_device_uuid) def _direct_set_weight(self, name: str, weight: torch.Tensor) -> None: """Directly copy weight data into the model, handling stacked params. 
Bypasses model.load_weights() which may fail on vLLM 0.17's new module-tree weight loader for stacked params (qkv_proj, gate_up_proj). Handles LoRA-wrapped params where vLLM inserts ``base_layer`` into the parameter hierarchy (e.g. ``qkv_proj.base_layer.weight``). """ model = self.model_runner.model params_dict = dict(model.named_parameters()) # Check if this is a simple direct param (exists as-is) if name in params_dict: params_dict[name].data.copy_(weight.to(params_dict[name].dtype)) return # Also check with base_layer inserted: x.y.weight -> x.y.base_layer.weight parts_bl = name.rsplit(".", 1) if len(parts_bl) == 2: base_layer_name = f"{parts_bl[0]}.base_layer.{parts_bl[1]}" if base_layer_name in params_dict: params_dict[base_layer_name].data.copy_( weight.to(params_dict[base_layer_name].dtype) ) return # Handle stacked params: e.g. "model.layers.0.self_attn.q_proj.weight" # -> "model.layers.0.self_attn.qkv_proj.weight" with shard offset parts = name.rsplit(".", 2) # [prefix, layer_name, suffix] if len(parts) == 3: prefix, layer_name, suffix = parts if layer_name in _STACKED_PARAMS: packed_name, shard_idx = _STACKED_PARAMS[layer_name] for packed_full in [ f"{prefix}.{packed_name}.{suffix}", f"{prefix}.{packed_name}.base_layer.{suffix}", ]: if packed_full not in params_dict: continue param = params_dict[packed_full] # Navigate to the packed module to find shard sizes module_path = packed_full.rsplit(".", 1)[0] # strip .weight/.bias if ".base_layer" in module_path: module_path = module_path.replace(".base_layer", "") module = model for attr in module_path.split("."): module = getattr(module, attr, None) if module is None: break # LoRA wrappers don't have output_sizes directly; # check base_layer for the underlying parallel linear if module is not None and not hasattr(module, "output_sizes"): base = getattr(module, "base_layer", None) if base is not None and hasattr(base, "output_sizes"): module = base if module is not None and hasattr(module, "output_sizes"): 
tp_size = getattr(module, "tp_size", 1) sizes = [s // tp_size for s in module.output_sizes] offset = sum(sizes[:shard_idx]) shard_size = sizes[shard_idx] param.data[offset : offset + shard_size].copy_( weight.to(param.dtype) ) return # Fallback: try load_weights (may work for non-stacked params) logger.warning("Falling back to load_weights for param: %s", name) model.load_weights(weights=[(name, weight)]) def update_named_param(self, name, dtype, shape): """Override to use _direct_set_weight instead of load_weights.""" if self.communicator is None: raise RuntimeError("Communicator not initialized.") dtype = getattr(torch, dtype.split(".")[-1]) weight = torch.empty(shape, dtype=dtype, device=self.device) if is_torch_xpu_available(): self.communicator.broadcast(weight, root=self.client_rank) self.communicator.barrier() else: self.communicator.broadcast(weight, src=self.client_rank) self.communicator.group.barrier() self._direct_set_weight(name, weight) def batch_update_named_params(self, params_list: list[tuple[str, str, tuple]]): """Receive and apply multiple weight tensors in sequence. Args: params_list: List of (name, dtype_str, shape) tuples. 
""" if self.communicator is None: raise RuntimeError("Communicator not initialized.") weights_to_load = [] for name, dtype_str, shape in params_list: dtype = getattr(torch, dtype_str.split(".")[-1]) weight = torch.empty(shape, dtype=dtype, device=self.device) if is_torch_xpu_available(): self.communicator.broadcast(weight, root=self.client_rank) else: self.communicator.broadcast(weight, src=self.client_rank) weights_to_load.append((name, weight)) # Single barrier after all broadcasts if is_torch_xpu_available(): self.communicator.barrier() else: self.communicator.group.barrier() # Load weights using direct set (handles stacked params) for name, weight in weights_to_load: self._direct_set_weight(name, weight) ================================================ FILE: src/axolotl/telemetry/__init__.py ================================================ ================================================ FILE: src/axolotl/telemetry/callbacks.py ================================================ """Trainer callbacks for reporting runtime metrics at regular intervals.""" import logging import time from transformers import ( TrainerCallback, TrainerControl, TrainerState, TrainingArguments, ) from axolotl.telemetry.manager import TelemetryManager from axolotl.telemetry.runtime_metrics import RuntimeMetricsTracker LOG = logging.getLogger(__name__) TIME_SINCE_LAST = 60 class TelemetryCallback(TrainerCallback): """ Trainer callback for tracking and reporting runtime metrics. This callback tracks training progress, runtime, and memory usage, sending telemetry at configurable intervals. 
""" report_interval_steps: int = 100 def __init__(self): """Initialize the metrics callback.""" self.tracker = RuntimeMetricsTracker() self.telemetry_manager = TelemetryManager.get_instance() self.current_epoch = -1 self.start_time = time.time() self.last_report_time = None self.last_report_step = 0 # pylint: disable=unused-argument def on_train_begin( self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs, ): """Handle training start.""" self.telemetry_manager.send_event(event_type="train-start") # pylint: disable=unused-argument def on_train_end( self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs, ): """Handle training end.""" # Send training completion event self.telemetry_manager.send_event( event_type="train-end", properties=self._extract_last_metrics(state) | self.tracker.metrics.to_dict(), ) # pylint: disable=unused-argument def on_epoch_begin( self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs, ): """Handle epoch start.""" self.current_epoch += 1 self.tracker.start_epoch(self.current_epoch) # pylint: disable=unused-argument def on_epoch_end( self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs, ): """Handle epoch end.""" self.tracker.end_epoch(self.current_epoch) # pylint: disable=unused-argument def on_step_end( self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs, ): """Handle step end.""" step = state.global_step self.tracker.update_step(step) # Check if we should report metrics should_report = ( step % self.report_interval_steps == 0 or step == 1 # Always report first step or step - self.last_report_step >= self.report_interval_steps ) if should_report: current_time = time.time() if self.last_report_time is not None: time_since_last_report = current_time - self.last_report_time else: time_since_last_report = current_time - self.start_time steps_since_last_report = step - 
self.last_report_step # Only report if enough time has passed if ( step == 1 or time_since_last_report >= TIME_SINCE_LAST or steps_since_last_report >= self.report_interval_steps ): # Calculate steps per second for this interval if time_since_last_report > 0 and steps_since_last_report > 0: steps_per_second = steps_since_last_report / time_since_last_report else: steps_per_second = 0 # Update memory metrics self.tracker.update_memory_metrics() # Prepare metrics to report metrics = self._extract_last_metrics(state) | { "step": step, "epoch": self.current_epoch, "progress": state.epoch, # Fractional epoch progress "steps_per_second": steps_per_second, "elapsed_time": current_time - self.start_time, "time_since_last_report": time_since_last_report, } # Add memory metrics memory_metrics = self.tracker.get_memory_metrics() metrics.update({"memory": memory_metrics}) # Send telemetry self.telemetry_manager.send_event( event_type="train-progress", properties=metrics ) # Update last report time and step self.last_report_time = current_time self.last_report_step = step def _extract_last_metrics(self, state: TrainerState) -> dict: """Extract last loss, learning_rate, grad_norm, and token metrics from log history.""" if not state.log_history: return { "loss": 0, "ppl": 0, "learning_rate": 0, "grad_norm": 0, "tokens/total": 0, "tokens/trainable": 0, "tokens/train_per_sec_per_gpu": 0, } last_log = state.log_history[-1] return { "loss": last_log.get("loss", 0), "ppl": last_log.get("ppl", 0), "learning_rate": last_log.get("learning_rate", 0), "grad_norm": last_log.get("grad_norm", 0), "tokens/total": last_log.get("tokens/total", 0), "tokens/trainable": last_log.get("tokens/trainable", 0), "tokens/train_per_sec_per_gpu": last_log.get( "tokens/train_per_sec_per_gpu", 0 ), } ================================================ FILE: src/axolotl/telemetry/errors.py ================================================ """Telemetry utilities for exception and traceback information.""" import 
def sanitize_stack_trace(stack_trace: str) -> str:
    """
    Remove personal information from stack trace messages while keeping Python package
    codepaths.

    Only the first ``File "<path>"`` reference on each line is rewritten;
    every other part of the trace is returned unchanged. A path is shortened
    by the first rule that applies:

    1. It contains ``site-packages/<pkg>`` or ``dist-packages/<pkg>``: keep
       everything from that directory name onward.
    2. It contains ``axolotl``: keep everything from the last occurrence of
       ``axolotl`` onward.
    3. Otherwise keep only the file name.

    Args:
        stack_trace: The original stack trace string.

    Returns:
        A sanitized version of the stack trace with Python package paths preserved.
    """
    # Regular expression to find file paths in the stack trace
    path_pattern = re.compile(r'(?:File ")(.*?)(?:")')

    # Matches path segments like "site-packages/package_name" or
    # "dist-packages/package_name". A separate pattern for
    # "lib/pythonX.Y/site-packages/..." layouts is unnecessary: any such path
    # already matches this one, so a dedicated branch could never be reached.
    site_packages_pattern = re.compile(
        r"(?:site-packages|dist-packages)[/\\]([\w\-\.]+)"
    )

    def shorten(full_path: str) -> str:
        """Return the privacy-preserving replacement for one file path."""
        if site_packages_pattern.search(full_path):
            # Keep from 'site-packages' (or else 'dist-packages') onward.
            idx = full_path.find("site-packages")
            if idx == -1:
                idx = full_path.find("dist-packages")
            return full_path[idx:]
        if "axolotl" in full_path:
            return full_path[full_path.rfind("axolotl") :]
        # No recognizable package root: keep just the filename (this may be
        # the empty string, which simply removes the path).
        return os.path.basename(full_path)

    sanitized_lines = []
    for line in stack_trace.split("\n"):
        path_match = path_pattern.search(line)
        if path_match:
            full_path = path_match.group(1)
            line = line.replace(full_path, shorten(full_path))
        sanitized_lines.append(line)

    return "\n".join(sanitized_lines)
global ERROR_HANDLED # pylint: disable=global-statement if not ERROR_HANDLED: ERROR_HANDLED = True # Get function module path module = getmodule(func) module_path = ( f"{module.__name__}.{func.__name__}" if module else func.__name__ ) # Get stack trace stack_trace = "".join( traceback.format_exception( type(exception), exception, exception.__traceback__ ) ) stack_trace = sanitize_stack_trace(stack_trace) # Send error telemetry telemetry_manager.send_event( event_type=f"{module_path}-error", properties={ "exception": str(exception), "stack_trace": stack_trace, }, ) LOG.error( f"Error captured in telemetry. Run ID: {telemetry_manager.run_id}" ) raise return wrapper ================================================ FILE: src/axolotl/telemetry/manager.py ================================================ """Telemetry manager and associated utilities.""" import atexit import importlib import logging import os import platform import uuid from pathlib import Path from typing import Any import posthog import psutil import torch import yaml LOG = logging.getLogger(__name__) POSTHOG_HOST = "https://app.posthog.com" POSTHOG_WRITE_KEY = "phc_1kUR0o04oJKKTTeSsIz2Mfm5mpiVsQEf2WOlzljMD7y" WHITELIST_PATH = str(Path(__file__).parent / "whitelist.yaml") # NOTE: Need to keep these up to date with any config schema changes FIELDS_TO_REDACT = { "base_model", "tokenizer_config", "base_model_config", "pretraining_dataset", # NOTE: this field may be a string or a dictionary "resume_from_checkpoint", "hub_model_id", } PREFIXES_TO_REDACT = {"wandb_", "comet_", "mlflow_", "gradio_", "trackio_", "swanlab_"} PATH_INDICATORS = {"path", "dir", "data_files"} # pylint: disable=duplicate-code RELEVANT_PACKAGES = { "torch", "transformers", "trl", "datasets", "peft", "bitsandbytes", "accelerate", "optimum", "deepspeed", "ray", "axolotl", "triton", "mamba-ssm", "flash-attn", "xformers", "autoawq", "tokenizers", "sentencepiece", "torchao", "lm_eval", } def is_main_process() -> bool: """ Check whether 
def _check_telemetry_enabled(self) -> bool:
    """
    Check if telemetry is enabled based on environment variables. We also check
    whether this is the main process (for the distributed setting and to avoid
    sending duplicate PostHog events per GPU).

    Note: This is enabled by default on an opt-out basis. Set
    `AXOLOTL_DO_NOT_TRACK=1` or `DO_NOT_TRACK=1` (the values `1` and `true`
    are recognised, case-insensitively) to disable telemetry. Each variable
    is honoured on its own: `DO_NOT_TRACK=1` disables telemetry even when
    `AXOLOTL_DO_NOT_TRACK` is unset. For more details, see
    https://axolotl-ai-cloud.github.io/axolotl/docs/telemetry.html.

    Returns:
        Boolean denoting whether telemetry is enabled or not.
    """
    # Only rank 0 will send telemetry
    if not is_main_process():
        return False

    # Opt-out model: telemetry stays on unless one of the variables is set
    # to an explicit "do not track" value. Unset, "0", "false" or any
    # unrecognised value leaves it enabled.
    opt_out_values = ("1", "true")
    for env_var in ("AXOLOTL_DO_NOT_TRACK", "DO_NOT_TRACK"):
        value = os.getenv(env_var)
        if value is not None and value.lower() in opt_out_values:
            return False

    return True
parts = value.split("/") if len(parts) < 2: return False org = parts[0] whitelisted = org.lower() in self.whitelist["organizations"] return whitelisted def _init_posthog(self): """Initialize PostHog client""" posthog.api_key = POSTHOG_WRITE_KEY posthog.project_api_key = POSTHOG_WRITE_KEY posthog.host = POSTHOG_HOST def _redact_paths(self, properties: dict[str, Any]) -> dict[str, Any]: """ Redact properties to remove any paths, so as to avoid inadvertently collecting private or personally identifiable information (PII). We also remove information related to Wandb, MLflow, etc. configuration. Args: properties: Dictionary of properties to redact. Returns: Properties dictionary with redaction applied. """ if not properties: return {} def redact_value(value: Any, key: str = "") -> Any: """Recursively sanitize values, redacting those with path-like keys""" if isinstance(key, str) and isinstance(value, str): # Other redaction special cases if ( key in FIELDS_TO_REDACT or any(prefix in key for prefix in PREFIXES_TO_REDACT) or any(indicator in key.lower() for indicator in PATH_INDICATORS) ): # Fields with whitelisted orgs don't need to be redacted if not self._is_whitelisted(value): return "[REDACTED]" # Handle nested values if isinstance(value, dict): return {k: redact_value(v, k) for k, v in value.items()} if isinstance(value, list): return [redact_value(item) for item in value] return value # Create new dict with redacted values redacted = {k: redact_value(v, k) for k, v in properties.items()} return redacted def _get_system_info(self) -> dict[str, Any]: """Collect system information for various hardware accelerators""" gpu_info = [] accelerator_type = "none" # NVIDIA GPUs if torch.cuda.is_available(): accelerator_type = "cuda" for i in range(torch.cuda.device_count()): gpu_info.append( { "name": torch.cuda.get_device_name(i), "memory": torch.cuda.get_device_properties(i).total_memory, } ) # AMD GPUs elif hasattr(torch, "hip") and torch.hip.is_available(): 
accelerator_type = "hip" for i in range(torch.hip.device_count()): gpu_info.append( { "name": torch.hip.get_device_name(i), "memory": ( torch.hip.get_device_properties(i).total_memory if hasattr(torch.hip, "get_device_properties") else None ), } ) # Apple Silicon elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): accelerator_type = "mps" gpu_info.append( { "name": "Apple Silicon", # NOTE: this is memory allocated to this process, not total memory "memory": torch.mps.driver_allocated_memory(), } ) # Intel GPUs elif hasattr(torch, "xpu") and torch.xpu.is_available(): accelerator_type = "xpu" for i in range(torch.xpu.device_count()): memory = None if hasattr(torch.xpu, "get_device_properties"): memory = torch.xpu.get_device_properties(i).total_memory gpu_info.append( { "name": torch.xpu.get_device_name(i), "memory": memory, } ) # NPUs elif hasattr(torch, "npu") and torch.npu.is_available(): accelerator_type = "npu" for i in range(torch.npu.device_count()): memory = None if hasattr(torch.npu, "get_device_properties"): memory = torch.npu.get_device_properties(i).total_memory gpu_info.append( { "name": torch.npu.get_device_name(i), "memory": memory, } ) # Get relevant package versions installed_packages = {} for package in RELEVANT_PACKAGES: try: version = importlib.metadata.version(package) installed_packages[f"{package}_version"] = version except importlib.metadata.PackageNotFoundError: pass return { "os": platform.system(), "python_version": platform.python_version(), "cpu_count": psutil.cpu_count(), "memory_total": psutil.virtual_memory().total, "accelerator_type": accelerator_type, "accelerator_count": len(gpu_info), "accelerator_info": gpu_info, **installed_packages, } def send_event(self, event_type: str, properties: dict[str, Any] | None = None): """Send a telemetry event""" if not self.enabled: return if properties is None: properties = {} # Sanitize properties to remove PII properties = self._redact_paths(properties) # Wrap PostHog 
errors in try / except to not raise errors during Axolotl usage try: # Send event via PostHog posthog.capture( distinct_id=self.run_id, event=event_type, properties=properties, disable_geoip=True, ) except Exception as e: # pylint: disable=broad-exception-caught LOG.warning(f"Failed to send telemetry event: {e}") # Additionally, send system info telemetry when loading config. # NOTE: Is this the best place for this? if event_type == "config-loaded": self.send_system_info() def send_system_info(self): """Helper method for sending system info""" if self.system_info is not None: self.send_event(event_type="system-info", properties=self.system_info) def shutdown(self): """Ensure all queued events are processed before shutdown""" if self.enabled: posthog.shutdown() ================================================ FILE: src/axolotl/telemetry/runtime_metrics.py ================================================ """Telemetry utilities for runtime and memory metrics.""" import logging import time from dataclasses import dataclass, field from typing import Any import psutil import torch from axolotl.telemetry.manager import TelemetryManager LOG = logging.getLogger(__name__) @dataclass class RuntimeMetrics: """Container for runtime metrics to be tracked throughout training.""" # Timing metrics start_time: float epoch_start_times: dict[int, float] = field(init=False) epoch_end_times: dict[int, float] = field(init=False) # Memory metrics peak_cpu_memory: int = 0 peak_gpu_memory: dict[int, int] = field(init=False) # Progress metrics total_steps: int = 0 current_epoch: int = 0 current_step: int = 0 def __post_init__(self): """Initialize empty metric mappings.""" self.epoch_start_times = {} self.epoch_end_times = {} self.peak_gpu_memory = {} @property def elapsed_time(self) -> float: """Calculate total elapsed time in seconds.""" return time.time() - self.start_time def epoch_time(self, epoch: int) -> float | None: """Calculate time taken for a specific epoch in seconds.""" if epoch 
in self.epoch_start_times and epoch in self.epoch_end_times: return self.epoch_end_times[epoch] - self.epoch_start_times[epoch] return None def average_epoch_time(self) -> float | None: """Calculate average time per epoch in seconds.""" completed_epochs = [ epoch for epoch in self.epoch_start_times if epoch in self.epoch_end_times ] if not completed_epochs: return None total_time = 0.0 for epoch in completed_epochs: epoch_time = self.epoch_time(epoch) if epoch_time is not None: # Check to avoid mypy warning total_time += epoch_time return total_time / len(completed_epochs) def steps_per_second(self) -> float | None: """Calculate average steps per second across all training.""" if self.total_steps == 0 or self.elapsed_time == 0: return None return self.total_steps / self.elapsed_time def to_dict(self) -> dict[str, Any]: """Convert metrics to a dictionary for telemetry reporting.""" metrics = { "total_time_seconds": self.elapsed_time, "total_steps": self.total_steps, "steps_per_second": self.steps_per_second(), "epochs_completed": len( [ epoch for epoch in self.epoch_start_times if epoch in self.epoch_end_times ] ), "peak_cpu_memory_bytes": self.peak_cpu_memory, } # Add per-epoch timing if available epoch_times: dict[str, float] = {} for epoch in sorted(self.epoch_end_times.keys()): time_taken = self.epoch_time(epoch) if time_taken is not None: epoch_times[f"epoch_{epoch}_seconds"] = time_taken if epoch_times: metrics["epoch_times"] = epoch_times # type: ignore metrics["average_epoch_time_seconds"] = self.average_epoch_time() # Add GPU memory metrics if available if self.peak_gpu_memory: gpu_metrics: dict[str, int] = {} for gpu_id, memory in self.peak_gpu_memory.items(): gpu_metrics[f"gpu_{gpu_id}_peak_memory_bytes"] = memory metrics["gpu_memory"] = gpu_metrics # type: ignore return metrics class RuntimeMetricsTracker: """Tracker for runtime metrics during training.""" update_interval = 100 def __init__(self): """Initialize the runtime metrics tracker.""" 
self.metrics = RuntimeMetrics(start_time=time.time()) self.telemetry_manager = TelemetryManager.get_instance() self._process = psutil.Process() def start_epoch(self, epoch: int): """Record the start of a new epoch.""" self.metrics.current_epoch = epoch self.metrics.epoch_start_times[epoch] = time.time() self.update_memory_metrics() def end_epoch(self, epoch: int): """Record the end of an epoch.""" self.metrics.epoch_end_times[epoch] = time.time() def update_step(self, step: int): """Update the current step count.""" self.metrics.current_step = step self.metrics.total_steps += 1 # Periodically update memory metrics if step % self.update_interval == 0: self.update_memory_metrics() def _get_allocated_memory(self) -> dict[int, int]: """ Helper function for getting accelerator-agnostic allocated memory. Returns: A dictionary mapping device IDs to allocated memory in bytes """ memory_used: dict[int, int] = {} # NVIDIA GPUs if torch.cuda.is_available(): for i in range(torch.cuda.device_count()): memory_used[i] = torch.cuda.memory_allocated(i) # AMD GPUs elif hasattr(torch, "hip") and torch.hip.is_available(): for i in range(torch.hip.device_count()): if hasattr(torch.hip, "memory_allocated"): memory_used[i] = torch.hip.memory_allocated(i) # Apple Silicon elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): # MPS doesn't have per-device memory stats since there's only one device if hasattr(torch.mps, "current_allocated_memory"): memory_used[0] = torch.mps.current_allocated_memory() # Intel GPUs elif hasattr(torch, "xpu") and torch.xpu.is_available(): for i in range(torch.xpu.device_count()): if hasattr(torch.xpu, "memory_allocated"): memory_used[i] = torch.xpu.memory_allocated(i) # NPUs elif hasattr(torch, "npu") and torch.npu.is_available(): for i in range(torch.npu.device_count()): if hasattr(torch.npu, "memory_allocated"): memory_used[i] = torch.npu.memory_allocated(i) return memory_used def update_memory_metrics(self): """Update peak memory usage 
metrics.""" # CPU memory cpu_memory = self._process.memory_info().rss self.metrics.peak_cpu_memory = max(self.metrics.peak_cpu_memory, cpu_memory) # GPU memory (if available) memory_used = self._get_allocated_memory() for i, memory in memory_used.items(): self.metrics.peak_gpu_memory[i] = max( self.metrics.peak_gpu_memory.get(i, 0), memory ) def get_memory_metrics(self) -> dict[str, Any]: """Get the current memory metrics as a dictionary.""" memory_metrics = { "cpu_memory_bytes": self._process.memory_info().rss, "peak_cpu_memory_bytes": self.metrics.peak_cpu_memory, } # GPU memory (if available) memory_used = self._get_allocated_memory() for i, memory in memory_used.items(): memory_metrics[f"gpu_{i}_memory_bytes"] = memory memory_metrics[f"gpu_{i}_peak_memory_bytes"] = ( self.metrics.peak_gpu_memory.get(i, 0) ) return memory_metrics ================================================ FILE: src/axolotl/telemetry/whitelist.yaml ================================================ organizations: - "axolotl-ai-co" - "meta-llama" - "huggingface" - "nvidia" - "facebook" - "google" - "microsoft" - "deepseek-ai" - "HuggingFaceTB" - "mistralai" - "Qwen" - "unsloth" - "NousResearch" - "allenai" - "amd" - "tiiuae" - "tencent" - "zai-org" - "openai" - "ibm-granite" - "arcee-ai" - "swiss-ai" - "CohereForAI" - "deepcogito" - "THUDM" - "ai21labs" - "LiquidAI" - "canopylabs" - "state-spaces" - "mistral-community" - "llava-hf" - "ByteDance-Seed" - "ACE-Step" - "openbmb" - "MiniMaxAI" - "stepfun-ai" - "internlm" - "katanemo" - "XiaomiMiMo" ================================================ FILE: src/axolotl/train.py ================================================ """Prepare and train a model on a dataset. 
def setup_model_and_tokenizer(
    cfg: DictDefault,
) -> tuple[
    PreTrainedModel, PreTrainedTokenizer, PeftConfig | None, ProcessorMixin | None
]:
    """
    Load the tokenizer, the processor (multimodal only) and the model described by `cfg`.

    Besides loading, this forces sampling on the model's generation config, reports
    the model (and PEFT) configuration to telemetry, and applies parameter freezing
    when `cfg.unfrozen_parameters` is set.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.

    Returns:
        Tuple containing model, tokenizer, `peft_config` (if LoRA / QLoRA, else
        `None`), and processor (if multimodal, else `None`).
    """
    # Tokenizer
    LOG.debug(
        f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
    )
    tokenizer = load_tokenizer(cfg)

    # Processor, only needed for multimodal models
    processor = load_processor(cfg, tokenizer) if cfg.is_multimodal else None

    # Model
    LOG.debug("Loading model")
    model, peft_config = ModelLoader(cfg, tokenizer, processor=processor).load()
    if model.generation_config is not None:
        model.generation_config.do_sample = True

    # Telemetry payload: the model config plus an exact and a rounded parameter count
    try:
        param_count = model.num_parameters()
    except Exception:  # pylint: disable=broad-exception-caught
        param_count = sum(p.numel() for p in model.parameters())

    telemetry_props = model.config.to_dict()
    telemetry_props["num_parameters"] = param_count
    # below 2B round to the nearest 100M, otherwise round to the nearest 1B
    if param_count < 2e9:
        telemetry_props["num_parameters_est"] = f"{round(param_count / 1e8) * 100}M"
    else:
        telemetry_props["num_parameters_est"] = f"{round(param_count / 1e9)}B"

    TELEMETRY_MANAGER.send_event(event_type="model-load", properties=telemetry_props)
    if peft_config:
        TELEMETRY_MANAGER.send_event(
            event_type="peft-config-load", properties=peft_config.to_dict()
        )

    # Apply freezing if specified
    unfrozen = cfg.unfrozen_parameters
    if unfrozen:
        freeze_layers_except(model, unfrozen)
        touches_embeddings = any(
            embed in param
            for param in unfrozen
            for embed in ["lm_head", "embed_tokens"]
        )
        if touches_embeddings:
            model.enable_input_require_grads()

    return model, tokenizer, peft_config, processor
Args: cfg: Dictionary mapping `axolotl` config keys to values. tokenizer: The tokenizer to use for the reference model. Returns: Reference model if needed for RL training, `None` otherwise. """ model_ref = None if cfg.rl and cfg.rl != RLType.ORPO: if cfg.adapter and not cfg.rl_adapter_ref_model: # use built-in trl autounwrap LOG.debug("Passing model_ref: None to RL trainer") model_ref = None # explicit setting to None else: reference_model: bool = True if cfg.rl == RLType.GRPO and cfg.trl.beta == 0: reference_model = False # load the model again for model_ref/baseline model_loader = ModelLoader(cfg, tokenizer, reference_model=reference_model) model_ref, _ = model_loader.load() return model_ref def setup_signal_handler(cfg: DictDefault, model: PreTrainedModel): """ Set up signal handler for graceful termination. Args: cfg: Dictionary mapping `axolotl` config keys to values. model: The model to save on termination """ # ray workers don't have access to this signal if cfg.local_rank == 0 and not cfg.use_ray: def terminate_handler(_, __, model_weakref): if model_weakref() is not None: _model = model_weakref() _model.save_pretrained(cfg.output_dir) cleanup_distributed() sys.exit(0) _model_weakref = weakref.ref(model) signal.signal( signal.SIGINT, lambda signum, frame: terminate_handler(signum, frame, _model_weakref), ) def execute_training( cfg: DictDefault, trainer: Any, resume_from_checkpoint: str | None ): """ Execute the training process with appropriate SDP kernel configurations. Args: cfg: Dictionary mapping `axolotl` config keys to values. trainer: The configured trainer object. resume_from_checkpoint: Path to checkpoint to resume from, if applicable. 
""" with ExitStack() as stack: # Define the context managers to use if cfg.flash_optimum: stack.enter_context( torch.backends.cuda.sdp_kernel( enable_flash=True, enable_math=True, enable_mem_efficient=True, ) ) if cfg.context_parallel_size > 1: models = [trainer.model] if hasattr(trainer, "ref_model") and trainer.ref_model: models.append(trainer.ref_model) stack.enter_context( SequenceParallelContextManager( models=models, context_parallel_size=cfg.context_parallel_size, gradient_accumulation_steps=cfg.gradient_accumulation_steps, ring_attn_func=cfg.ring_attn_func, heads_k_stride=cfg.heads_k_stride, gather_outputs=cfg.rl is RLType.GRPO, device_mesh=trainer.accelerator.torch_device_mesh, ) ) # TODO: disabling for now as not compatible with FSDP2 + torchao low bit optimizers # if cfg.bf16: # torch.set_default_dtype(torch.bfloat16) LOG.info("Starting trainer...") trainer.train(resume_from_checkpoint=resume_from_checkpoint) PLUGIN_MANAGER.post_train(cfg, trainer.model) def save_trained_model( cfg: DictDefault, trainer: Any, model: PreTrainedModel, ): """ Save the trained model according to configuration and training setup. Args: cfg: Dictionary mapping `axolotl` config keys to values. trainer: The trainer object. model: The trained model to save. """ LOG.info(f"Training completed! Saving trained model to {cfg.output_dir}.") # Post training module hooks for name, module in model.named_modules(): if hasattr(module, "_post_training"): module._post_training(model, name) # handle QAT if cfg.qat: from axolotl.utils.quantization import convert_qat_model convert_qat_model( model, quantize_embedding=cfg.qat.quantize_embedding, ) LOG.info( "QAT usage note: please ensure you quantize your model fine-tuned using QAT by running `axolotl quantize`" " with the same config which you used for training." 
) # Handle ReLoRA early return case if cfg.relora: if cfg.adapter == "lora" and not (cfg.load_in_4bit or cfg.load_in_8bit): model = model.merge_and_unload() else: # final model weights have already been saved by `ReLoRACallback.on_train_end` return if trainer.is_fsdp_enabled or cfg.fsdp_config: if cfg.fsdp_config or cfg.fsdp: if cfg.fsdp_config.final_state_dict_type: state_dict_type = cfg.fsdp_config.final_state_dict_type else: state_dict_type = cfg.fsdp_config.state_dict_type trainer.accelerator.state.fsdp_plugin.set_state_dict_type(state_dict_type) trainer.save_model(cfg.output_dir) # only handles FULL_STATE_DICT if state_dict_type == "SHARDED_STATE_DICT": LOG.info( "The final model was saved with a sharded state dict. Please ensure you merge " "the sharded weights with `merge-sharded-fsdp-weights`." ) checkpoint_dir = determine_last_checkpoint(cfg, update=False) if ( not (Path(cfg.output_dir) / "model.safetensors.index.json").exists() and checkpoint_dir ): # import here to prevent circular import from axolotl.cli.merge_sharded_fsdp_weights import merge_fsdp_weights fsdp_dir = Path(checkpoint_dir) / "pytorch_model_fsdp_0" merged_path = str(Path(cfg.output_dir) / "merged") merge_fsdp_weights( checkpoint_dir=str(fsdp_dir), output_path=merged_path, ) trainer.accelerator.wait_for_everyone() if trainer.accelerator.is_main_process: # move all files in merged_path to cfg.output_dir for merged_file in Path(merged_path).iterdir(): if (Path(cfg.output_dir) / merged_file.name).exists(): (Path(cfg.output_dir) / merged_file.name).unlink() shutil.move(str(merged_file), cfg.output_dir) shutil.rmtree(merged_path) # remove what should be an empty dir # TODO(wing):see https://github.com/huggingface/transformers/pull/40207 # cleanup the FSDP prefix in the model config.json if trainer.accelerator.is_main_process: with open( Path(cfg.output_dir) / "config.json", "r", encoding="utf-8" ) as config_file_io: # read the model config as an OrderedDict config = json.load(config_file_io, 
object_pairs_hook=OrderedDict) config["architectures"] = [ name.lstrip("FSDP") for name in config["architectures"] ] # write the updated model config back with open( os.path.join(cfg.output_dir, "config.json"), "w", encoding="utf-8" ) as config_file_io: json.dump(config, config_file_io, indent=2) elif cfg.deepspeed and is_deepspeed_zero3_enabled(): # Copied over from: https://github.com/huggingface/accelerate/blob/5ae611118057232f441055f7ef9ba0b0f2b8d533/docs/source/usage_guides/deepspeed.md#saving-and-loading trainer.accelerator.wait_for_everyone() trainer.save_model(cfg.output_dir) # the trainer saved a model.safetensors file in the output directory, # but it is most likely a proxy model and if so, should be deleted maybe_proxy = os.path.exists(os.path.join(cfg.output_dir, "model.safetensors")) maybe_sharded = os.path.exists( os.path.join(cfg.output_dir, "model.safetensors.index.json") ) if maybe_proxy and maybe_sharded: LOG.info(f"Deleting {os.path.join(cfg.output_dir, 'model.safetensors')}") LOG.info("This is a proxy model and should be deleted") try: os.remove(os.path.join(cfg.output_dir, "model.safetensors")) except FileNotFoundError: pass elif cfg.local_rank == 0: if cfg.rl and cfg.adapter and not cfg.rl_adapter_ref_model: trainer.model.save_pretrained(cfg.output_dir) model.save_pretrained(cfg.output_dir) if hasattr(cfg, "llmcompressor") and cfg.llmcompressor: # TODO: add integration support so this can be implemented completely within the plugin from axolotl.integrations.llm_compressor.utils import save_compressed_model save_compressed_model( model=model, output_dir=cfg.output_dir, trainer=trainer, save_compressed=cfg.llmcompressor.save_compressed, ) LOG.info(f"Model successfully saved to {cfg.output_dir}") def create_model_card(cfg: DictDefault, trainer: Trainer): """ Create a model card for the trained model if needed. Args: cfg: Dictionary mapping `axolotl` config keys to values. trainer: The trainer object with model card creation capabilities. 
""" if not cfg.hub_model_id: # Guard since create_model_card may fail if dataset_tags is empty list try: model_card_kwarg = { "model_name": cfg.output_dir.lstrip("./") .encode("utf-8") .decode("utf-8") } # We check if we're using a TRL trainer; if so, `dataset_tags` is not consumed. rl = cfg.rl is not None or cfg.reward_model or cfg.process_reward_model if cfg.datasets is not None and not rl: dataset_tags = [ d["path"] for d in cfg.datasets if not Path(d["path"]).is_dir() ] dataset_tags = [d for d in dataset_tags if not d.startswith("https://")] if dataset_tags: model_card_kwarg["dataset_tags"] = dataset_tags trainer.create_model_card(**model_card_kwarg) except (AttributeError, UnicodeDecodeError, OfflineModeIsEnabled): pass elif cfg.hub_model_id: # Defensively push to the hub to ensure the model card is updated trainer.push_to_hub() def save_initial_configs( cfg: DictDefault, tokenizer: PreTrainedTokenizer, model: PreTrainedModel, peft_config: PeftConfig | None, processor: ProcessorMixin | None, ): """ Save initial configurations before training. Args: cfg: Dictionary mapping `axolotl` config keys to values. tokenizer: The tokenizer to save. model: The model to save configuration for. peft_config: The PEFT configuration to save if applicable. 
""" # Create output_dir if it doesn't already exist output_dir = Path(cfg.output_dir) if not output_dir.is_dir(): os.makedirs(cfg.output_dir, exist_ok=True) # Pre-save adapter config so it's available to inspect if peft_config: LOG.info(f"Pre-saving adapter config to {cfg.output_dir}...") peft_config.save_pretrained(cfg.output_dir) # Pre-save the tokenizer and model configs LOG.info(f"Pre-saving tokenizer to {cfg.output_dir}...") tokenizer.save_pretrained( str(Path(cfg.output_dir)), save_jinja_files=cfg.tokenizer_save_jinja_files ) if hasattr(model, "config"): LOG.info(f"Pre-saving model config to {cfg.output_dir}...") model.config.save_pretrained(str(output_dir)) if processor: LOG.info(f"Pre-saving processor to {cfg.output_dir}...") processor.save_pretrained(str(output_dir)) def setup_model_card(cfg: DictDefault): """ Set up the Axolotl badge and add the Axolotl config to the model card if available. Args: cfg: Dictionary mapping `axolotl` config keys to values. """ badge_markdown = """[Built with Axolotl](https://github.com/axolotl-ai-cloud/axolotl)""" transformers.modelcard.AUTOGENERATED_TRAINER_COMMENT += f"\n{badge_markdown}" if cfg.axolotl_config_path: raw_axolotl_cfg = Path(cfg.axolotl_config_path) version = importlib.metadata.version("axolotl") if raw_axolotl_cfg.is_file(): transformers.modelcard.AUTOGENERATED_TRAINER_COMMENT += f"\n
See axolotl config\n\naxolotl version: `{version}`\n```yaml\n{raw_axolotl_cfg.read_text(encoding='utf-8')}\n```\n\n

def handle_untrained_tokens_fix(
    cfg: DictDefault,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    train_dataset: Dataset,
):
    """
    Run the untrained-token fix when `cfg.fix_untrained_tokens` is set.

    Optional keyword arguments are only forwarded when the installed
    `fix_untrained_tokens` accepts them. Local rank 0 saves the model to
    `cfg.output_dir` afterwards.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        model: The model to apply fixes to.
        tokenizer: The tokenizer for token identification.
        train_dataset: The training dataset to use.
    """
    requested = cfg.fix_untrained_tokens
    if not requested:
        return

    is_ds_zero3: bool = os.environ.get("ACCELERATE_DEEPSPEED_ZERO_STAGE") == "3"

    # Only pass the keyword arguments this version of the function understands
    accepted = inspect.signature(fix_untrained_tokens).parameters
    fix_kwargs: Dict[str, Any] = {}
    # An explicit list of token ids is forwarded as `token_ids_to_fix`
    if "token_ids_to_fix" in accepted and isinstance(requested, list):
        fix_kwargs["token_ids_to_fix"] = requested
    if "is_ds_zero3" in accepted:
        fix_kwargs["is_ds_zero3"] = is_ds_zero3

    fix_untrained_tokens(model, tokenizer, train_dataset, **fix_kwargs)

    if cfg.local_rank == 0:
        model.save_pretrained(str(Path(cfg.output_dir)))
@send_errors
def train(
    cfg: DictDefault, dataset_meta: TrainDatasetMeta
) -> tuple[PeftModel | PreTrainedModel, PreTrainedTokenizer, Trainer]:
    """
    Train a model on the given dataset.

    Args:
        cfg: The configuration dictionary with training parameters
        dataset_meta: Object with training, validation datasets and metadata

    Returns:
        Tuple of (model, tokenizer, trainer) after training
    """
    # Model, tokenizer, (causal or RLHF) trainer, etc.
    trainer, model, tokenizer, peft_config, processor = setup_model_and_trainer(
        cfg, dataset_meta
    )

    # Untrained-token fix, if configured
    handle_untrained_tokens_fix(cfg, model, tokenizer, dataset_meta.train_dataset)

    # Pre-training bookkeeping
    save_initial_configs(cfg, tokenizer, model, peft_config, processor)
    setup_signal_handler(cfg, model)
    setup_model_card(cfg)

    # Training itself, resuming from the last checkpoint when there is one
    last_checkpoint = determine_last_checkpoint(cfg)
    execute_training(cfg, trainer, last_checkpoint)

    # clear cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Persist the results
    save_trained_model(cfg, trainer, model)
    output_dir = str(Path(cfg.output_dir))
    tokenizer.save_pretrained(
        output_dir, save_jinja_files=cfg.tokenizer_save_jinja_files
    )
    create_model_card(cfg, trainer)

    # Cleanup
    if not cfg.use_ray:
        cleanup_distributed()

    # NOTE(review): execute_training() already calls PLUGIN_MANAGER.post_train with
    # trainer.model; confirm this second invocation is intended.
    PLUGIN_MANAGER.post_train(cfg, model)

    return model, tokenizer, trainer
""" torch_version = torch.__version__ version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version) if not version_match: raise ValueError("Invalid version format") major, minor, patch = version_match.groups() major, minor = int(major), int(minor) patch = int(patch) if patch is not None else 0 # Default patch to 0 if not present return major, minor, patch def set_pytorch_cuda_alloc_conf(): """Set up CUDA allocation config""" torch_version = torch.__version__.split(".") torch_major, torch_minor = int(torch_version[0]), int(torch_version[1]) config_value = "expandable_segments:True,roundup_power2_divisions:16" if ( torch_major == 2 and torch_minor >= 9 and os.getenv("PYTORCH_ALLOC_CONF") is None ): os.environ["PYTORCH_ALLOC_CONF"] = config_value elif ( torch_major == 2 and torch_minor >= 2 and os.getenv("PYTORCH_CUDA_ALLOC_CONF") is None ): os.environ["PYTORCH_CUDA_ALLOC_CONF"] = config_value def set_misc_env(): if os.getenv("XFORMERS_IGNORE_FLASH_VERSION_CHECK") is None: os.environ["XFORMERS_IGNORE_FLASH_VERSION_CHECK"] = "1" def get_not_null(value, default=None): """ return the value if it's not None, otherwise return the default value """ return value if value is not None else default ================================================ FILE: src/axolotl/utils/bench.py ================================================ """Benchmarking and measurement utilities""" import functools import logging import torch from transformers.utils.import_utils import is_torch_npu_available from axolotl.utils.distributed import get_device_type try: from pynvml import ( NVMLError, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlInit, ) except ImportError: NVMLError = None nvmlDeviceGetHandleByIndex = None nvmlDeviceGetMemoryInfo = None nvmlInit = None def check_cuda_device(default_value): """ wraps a function and returns the default value instead of running the wrapped function if cuda isn't available or the device is auto :param default_value: :return: """ def 
@check_cuda_device((0.0, 0.0, 0.0))
def gpu_memory_usage_all(device=0):
    """
    Report peak CUDA memory figures for `device` in GiB and reset the peak counters.

    :param device: CUDA device to query
    :return: tuple of (peak active, peak allocated, peak reserved) memory in GiB
    """
    gib = 1024.0**3
    # Query the same device as the calls below; without an argument `memory_stats`
    # reports on the current device, which need not be `device`.
    active = torch.cuda.memory_stats(device).get("active_bytes.all.peak", 0) / gib
    allocated = torch.cuda.max_memory_allocated(device) / gib
    reserved = torch.cuda.max_memory_reserved(device) / gib
    # Start a fresh measurement window for the next call
    torch.cuda.reset_peak_memory_stats(device)
    return active, allocated, reserved
class LossWatchDogCallback(TrainerCallback):
    """Stops training once the logged loss stays above a threshold for too many steps."""

    def __init__(self, cfg):
        self.cfg = cfg
        # number of consecutive checks in which the loss exceeded the threshold
        self.violations = 0
        self.threshold = cfg.loss_watchdog_threshold
        # falls back to 3 when no (or a falsy) patience is configured
        self.patience = cfg.loss_watchdog_patience or 3

    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **_kwargs,
    ) -> TrainerControl:
        history = state.log_history
        # Nothing to judge until the latest log entry carries a loss
        if len(history) == 0 or "loss" not in history[-1]:
            return control

        if history[-1]["loss"] > self.threshold:
            self.violations += 1
            if self.violations >= self.patience:
                LOG.warning(
                    "Loss is too high, stopping training (loss_watchdog_threshold)"
                )
                control.should_training_stop = True
        else:
            # A single acceptable loss resets the streak
            self.violations = 0

        return control
"test": "zero_shot_mmlu_test.json", }, ) # bench_dataset = bench_dataset.remove_columns("subject") # MMLU Five-shot (Eval/Test only) elif trainer.args.bench_dataset in ["mmlu", "mmlu-fs"]: bench_dataset = load_dataset( "openaccess-ai-collective/mmlu-evals", data_files={ "eval": "five_shot_mmlu_val.json", "test": "five_shot_mmlu_test.json", }, ) # bench_dataset = bench_dataset.remove_columns('subject') elif "/" in trainer.args.bench_dataset: bench_ds = trainer.args.bench_dataset bench_ds_name = "/".join(bench_ds.split("/", 2)[:2]) bench_ds_data_file = "/".join(bench_ds.split("/", 2)[2:]) bench_dataset = load_dataset( bench_ds_name, data_files={ "eval": bench_ds_data_file, }, ) bench_dataset["eval"] = bench_dataset["eval"].map(transform_bench_subject) else: raise ValueError( f"unhandled value `{trainer.args.bench_dataset}` for bench_dataset training args" ) bench_dataset = bench_dataset[trainer.args.bench_split] if trainer.args.max_bench_samples is not None: bench_dataset = bench_dataset.select(range(trainer.args.max_bench_samples)) def tokenize_evals(example): source = f"{tokenizer.bos_token}{example['input']}" target = f"{example['output']}{tokenizer.eos_token}" tokenized_source = tokenizer( source, max_length=2048, truncation=True, add_special_tokens=False, ) tokenized_target = tokenizer( target, max_length=2048, truncation=True, add_special_tokens=False, ) input_ids = tokenized_source["input_ids"] + tokenized_target["input_ids"] labels = [IGNORE_INDEX] * len(tokenized_source["input_ids"]) + tokenized_target[ "input_ids" ] return { "input_ids": input_ids, "labels": labels, "subject": example["subject"], } with zero_first(is_main_process()): bench_dataset = bench_dataset.map(tokenize_evals) bench_dataset = bench_dataset.filter(lambda x: x["labels"][-2] in abcd_idx) class BenchEvalCallback(TrainerCallback): """ TrainerCallback that runs the MMLU evals """ def on_evaluate( self, args: AxolotlTrainingArguments, state: TrainerState, control: TrainerControl, metrics: 
Dict[str, float], **kwargs, ): data_loader = trainer.get_bench_dataloader( bench_dataset.remove_columns(["input", "subject", "output", "name"]) ) trainer.model.eval() preds, refs = [], [] loss_bench = 0 for batch in tqdm(data_loader, total=len(data_loader)): (loss, logits, labels) = trainer.prediction_step( trainer.model, batch, prediction_loss_only=False, ) # There are two tokens, the output, and eos token. for i, logit in enumerate(logits): label_non_zero_id = (batch["labels"][i] != IGNORE_INDEX).nonzero()[ 0 ][0] logit_abcd = logit[label_non_zero_id - 1][abcd_idx] preds.append(torch.argmax(logit_abcd).item()) labels = labels[labels != IGNORE_INDEX].view(-1, 2)[:, 0] refs += [ abcd_idx.index(label) if label in abcd_idx else -1 for label in labels.tolist() ] loss_bench += loss.item() # Extract results by subject. bench_name = bench_dataset["name"] bench_names: dict = {s: {"refs": [], "preds": []} for s in set(bench_name)} for s, p, r in zip(bench_name, preds, refs, strict=False): bench_names[s]["preds"].append(p) bench_names[s]["refs"].append(r) barrier() local_bench_names = bench_names gathered_bench_names: List[Dict] = [{} for _ in range(get_world_size())] # Gather results from all GPUs to GPU 0 loss_bench_ranks = gather_scalar_from_all_ranks( lambda: loss_bench, get_world_size() ) len_data_loader_ranks = gather_scalar_from_all_ranks( lambda: len(data_loader), get_world_size() ) results = {} if is_distributed() and not is_main_process(): dist.gather_object(local_bench_names, dst=0) else: if is_distributed(): dist.gather_object(local_bench_names, gathered_bench_names, dst=0) else: gathered_bench_names = [local_bench_names] bench_loss = sum(loss_bench_ranks) / sum(len_data_loader_ranks) results = {f"{bench_split}_bench_loss": bench_loss} # Combine results from all GPUs combined_bench_names: Dict[str, Dict[str, List]] = {} for bench_name in gathered_bench_names: for name, data in bench_name.items(): if name not in combined_bench_names: combined_bench_names[name] = 
def causal_lm_bench_eval_callback_factory(trainer: Trainer, tokenizer):
    """Build a callback class that scores generated completions at evaluation time.

    The returned class closes over ``trainer`` and ``tokenizer``. On every
    ``on_evaluate`` it regenerates completions for the eval dataloader with
    ``generate`` and logs one ``eval_<metric>`` value per configured metric
    through ``trainer.log``.

    NOTE(review): the statement nesting below was reconstructed from a
    whitespace-collapsed listing; confirm it against the repository file.
    """

    class CausalLMBenchEvalCallback(TrainerCallback):
        """Callback to log prediction values during each evaluation"""

        def __init__(self, cfg):
            self.cfg = cfg
            self.logged = False
            self.metrics = self.__maybe_load_metrics()

        def __maybe_load_metrics(self):
            """Instantiate every metric named in ``cfg.eval_causal_lm_metrics``.

            ``"perplexity"`` maps to the local ``Perplexity`` class; any other name
            is loaded with ``evaluate.load``. A metric that fails to load is logged
            as a warning and left out of the returned dict.
            """
            metrics = {}
            for metric in self.cfg.eval_causal_lm_metrics:
                if metric == "perplexity":
                    max_seq_len = self.cfg.eval_max_new_tokens
                    metrics[metric] = Perplexity(
                        tokenizer=tokenizer,
                        max_seq_len=max_seq_len,
                    )
                else:
                    try:
                        metrics[metric] = evaluate.load(metric)
                    except Exception as exc:
                        LOG.warning(f"{metric}: {exc.args}")
            return metrics

        def on_evaluate(
            self,
            args: AxolotlTrainingArguments,
            state: TrainerState,
            control: TrainerControl,
            train_dataloader,
            eval_dataloader,
            **kwargs,
        ):
            """Generate predictions for ``eval_dataloader`` and log metric scores.

            Returns ``control`` unchanged.
            """
            trainer.model_wrapped.eval()

            device = torch.device(
                self.cfg.device
            )  # Use this instead of trainer.model_wrapped.device as it may return cpu if fsdp offloaded

            # Greedy decoding (do_sample=False); dict output so the generated
            # ids can be read from predictions["sequences"] below.
            generation_config = GenerationConfig(
                max_new_tokens=self.cfg.eval_max_new_tokens,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=False,
                use_cache=True,
                return_dict_in_generate=True,
                output_attentions=False,
                output_hidden_states=False,
                output_scores=False,
            )

            def find_ranges(lst):
                """Split ``lst`` into inclusive ``(start, end)`` index ranges.

                A new range starts at every index > 0 whose value is 0; the last
                range always ends at ``len(lst) - 1``.
                """
                ranges = []
                start = 0
                for i in range(1, len(lst)):
                    if lst[i] == 0:
                        ranges.append((start, i - 1))
                        start = i
                end = len(lst) - 1
                ranges.append((start, end))
                return ranges

            def compute(metric: evaluate.Metric, **kwargs):
                # safely compute a metric and return the score if the format is correct
                metric_score = None
                try:
                    # Only pass the kwargs that are in the metric's feature list
                    metric_kwargs = {
                        k: kwargs[k] for k in metric._feature_names() if k in kwargs
                    }

                    if isinstance(metric, Perplexity):
                        metric_kwargs["model"] = trainer.model_wrapped

                    metric_score = metric.compute(**metric_kwargs)
                    return (
                        metric_score["score"]
                        if "score" in metric_score
                        else metric_score["mean_score"]
                    )
                except Exception:
                    traceback.print_exc()
                    LOG.debug(
                        f"Failed to compute metric {metric.name} with kwargs {kwargs.keys()}"
                    )
                # Reached only after an exception: None if compute() itself failed,
                # otherwise the raw result that had neither "score" nor "mean_score".
                return metric_score

            def evaluate_preds(sources, predictions, references):
                """Score predictions with every loaded metric, keyed ``eval_<name>``."""
                scores = {}

                for metric_name, metric in self.metrics.items():
                    score = compute(
                        metric,
                        references=references,
                        predictions=predictions,
                        sources=sources,
                    )
                    if score is None:
                        # Retry with each reference wrapped in its own list and
                        # without `sources`.
                        score = compute(
                            metric,
                            references=[[r] for r in references],
                            predictions=predictions,
                        )
                    scores["eval_" + metric_name] = score
                return scores

            def predict_with_generate():
                """Return ``(prompts, predictions, references)`` as lists of decoded text."""
                eval_src, eval_pred, eval_ref = [], [], []

                with unwrap_model_for_generation(
                    trainer.model_wrapped, trainer.accelerator
                ) as unwrapped_model:
                    for batch in tqdm(eval_dataloader, disable=not is_main_process()):
                        batch_labels = batch["labels"].to(device)
                        batch_input_ids = batch["input_ids"].to(device)

                        if "position_ids" in batch:
                            batch_pos_ids = batch["position_ids"].tolist()
                        else:
                            batch_pos_ids = [None] * len(batch["input_ids"])

                        prompt_token_ids_list = []
                        completion_token_ids_list = []

                        for input_ids_all, labels_all, pos_ids in zip(
                            batch_input_ids,
                            batch_labels,
                            batch_pos_ids,
                            strict=False,
                        ):
                            # Without position ids the whole row is one range;
                            # otherwise each position-id reset to 0 starts a new one.
                            if pos_ids is None:
                                pos_ranges = [(0, len(input_ids_all) - 1)]
                            else:
                                pos_ranges = find_ranges(pos_ids)

                            for pos_range in pos_ranges:
                                start, end = pos_range
                                # Single-token ranges are skipped.
                                if start == end:
                                    continue

                                input_ids = input_ids_all[start : end + 1]
                                labels = labels_all[start : end + 1]

                                tokens_without_loss = labels == IGNORE_INDEX
                                tokens_with_loss = labels != IGNORE_INDEX
                                tokens_exclude_padding = (
                                    input_ids != tokenizer.pad_token_id
                                )
                                # Prompt = positions masked out of the loss that are not padding.
                                prompt_token_includes = (
                                    tokens_without_loss & tokens_exclude_padding
                                )

                                prompt_token_ids = input_ids[prompt_token_includes]
                                prompt_token_ids_list.append(prompt_token_ids)

                                # Reference completion = positions that carry a label.
                                completion_token_ids = input_ids[tokens_with_loss]
                                completion_token_ids_list.append(completion_token_ids)

                        prompt_texts = tokenizer.batch_decode(
                            prompt_token_ids_list, skip_special_tokens=True
                        )
                        completion_texts = tokenizer.batch_decode(
                            completion_token_ids_list, skip_special_tokens=True
                        )

                        with torch.no_grad():
                            prompt_encoding = tokenizer(
                                prompt_texts, padding=True, return_tensors="pt"
                            ).to(device)

                            predictions = unwrapped_model.generate(
                                **prompt_encoding, generation_config=generation_config
                            )

                            del prompt_encoding

                        prediction_all_tokens = predictions["sequences"].cpu().tolist()
                        prediction_without_prompt_tokens_list = []
                        # NOTE(review): the prompt is stripped using the length of the
                        # original (pre-decode, un-padded) prompt ids, while generation
                        # ran on re-tokenized, padded text; presumably this relies on
                        # the tokenizer's padding side and a stable round-trip - confirm.
                        for prompt_token_ids, prediction_tokens in zip(
                            prompt_token_ids_list, prediction_all_tokens, strict=False
                        ):
                            prediction_without_prompt_tokens = prediction_tokens[
                                len(prompt_token_ids) :
                            ]
                            prediction_without_prompt_tokens_list.append(
                                prediction_without_prompt_tokens
                            )

                        predicted_texts = tokenizer.batch_decode(
                            prediction_without_prompt_tokens_list,
                            skip_special_tokens=True,
                        )

                        eval_src.extend(prompt_texts)
                        eval_pred.extend(predicted_texts)
                        eval_ref.extend(completion_texts)

                return eval_src, eval_pred, eval_ref

            eval_preds = predict_with_generate()
            trainer.log(evaluate_preds(*eval_preds))

            return control

    return CausalLMBenchEvalCallback
trainer.prediction_step( trainer.model, batch, prediction_loss_only=False, ) prompt_token_ids_list = [] pred_step_token_ids_list = [] completion_token_ids_list = [] for input_ids_all, labels_all, pos_ids, logits in zip( batch_input_ids, batch_labels, batch_pos_ids, batch_logits, strict=False, ): if pos_ids is None: pos_ranges = [(0, len(input_ids_all) - 1)] else: pos_ranges = find_ranges(pos_ids) for pos_range in pos_ranges: start, end = pos_range if start == end: continue input_ids = input_ids_all[start : end + 1] labels = labels_all[start : end + 1] tokens_without_loss = labels == IGNORE_INDEX tokens_with_loss = labels != IGNORE_INDEX tokens_exclude_padding = input_ids != tokenizer.pad_token_id prompt_token_includes = ( tokens_without_loss & tokens_exclude_padding ) prompt_token_ids = input_ids[prompt_token_includes] prompt_token_ids_list.append(prompt_token_ids) completion_token_ids = input_ids[tokens_with_loss] completion_token_ids_list.append(completion_token_ids) pred_step_token_ids = logits_to_tokens( logits[start : end + 1] )[tokens_with_loss] pred_step_token_ids_list.append(pred_step_token_ids) prompt_texts = tokenizer.batch_decode( prompt_token_ids_list, skip_special_tokens=True ) completion_texts = tokenizer.batch_decode( completion_token_ids_list, skip_special_tokens=True ) pred_step_texts = tokenizer.batch_decode( pred_step_token_ids_list, skip_special_tokens=True ) with torch.no_grad(): prompt_encoding = tokenizer( prompt_texts, padding=True, return_tensors="pt" ).to(self.cfg.device) predictions = trainer.model.generate( **prompt_encoding, generation_config=generation_config ) prediction_all_tokens = predictions["sequences"].cpu().tolist() prediction_without_prompt_tokens_list = [] for prompt_token_ids, prediction_tokens in zip( prompt_token_ids_list, prediction_all_tokens, strict=False ): prediction_without_prompt_tokens = prediction_tokens[ len(prompt_token_ids) : ] prediction_without_prompt_tokens_list.append( prediction_without_prompt_tokens ) 
class SaveAxolotlConfigtoWandBCallback(TrainerCallback):
    """Callback to save axolotl config to wandb

    At the start of training, on the world-zero process only, up to three
    files are attached to the active WandB run, each as an artifact and as a
    run file:

    * the Axolotl YAML config,
    * the ``chat_template_jinja`` entry of that config (a path or inline text),
    * the DeepSpeed config, when ``args.deepspeed`` is set.

    Each upload is best-effort: the listed errors are logged as warnings and
    training continues.
    """

    def __init__(self, axolotl_config_path):
        self.axolotl_config_path = axolotl_config_path

    def on_train_begin(
        self,
        args: AxolotlTrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Upload the config files to WandB and return ``control`` unchanged."""
        if state.is_world_process_zero:
            try:
                # sync config to top level in run, cannot delete file right away because wandb schedules it to be synced even w/policy = 'now', so let OS delete it later.
                with NamedTemporaryFile(
                    mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
                ) as temp_file:
                    copyfile(self.axolotl_config_path, temp_file.name)
                    artifact = wandb.Artifact(
                        f"config-{wandb.run.id}", type="axolotl-config"
                    )
                    artifact.add_file(temp_file.name)
                    wandb.log_artifact(artifact)
                    wandb.save(temp_file.name)
                    LOG.info(
                        "The Axolotl config has been saved to the WandB run under files."
                    )
            except (FileNotFoundError, ConnectionError) as err:
                LOG.warning(f"Error while saving Axolotl config to WandB: {err}")

            try:
                with open(self.axolotl_config_path, "r", encoding="utf-8") as f:
                    cfg = yaml.safe_load(f) or {}

                chat_tpl = cfg.get("chat_template_jinja")
                if chat_tpl:
                    # delete=False for the same reason as the config above: wandb
                    # syncs the saved file asynchronously, so it must still exist
                    # after this `with` block exits.
                    with NamedTemporaryFile(
                        mode="w", delete=False, suffix=".jinja", prefix="chat_template_"
                    ) as temp_ct_file:
                        if (
                            isinstance(chat_tpl, str)
                            and os.path.exists(chat_tpl)
                            and os.path.isfile(chat_tpl)
                        ):
                            # Value is a path to a template file.
                            copyfile(chat_tpl, temp_ct_file.name)
                        else:
                            # Value is the template text itself.
                            temp_ct_file.write(str(chat_tpl))
                            temp_ct_file.flush()

                        artifact = wandb.Artifact(
                            f"chat-template-{wandb.run.id}", type="jinja-template"
                        )
                        artifact.add_file(temp_ct_file.name)
                        wandb.log_artifact(artifact)
                        wandb.save(temp_ct_file.name)
                        LOG.info(
                            "The chat_template_jinja has been saved to the WandB run under files."
                        )
            except (FileNotFoundError, ConnectionError, yaml.YAMLError) as err:
                LOG.warning(f"Error while saving chat_template_jinja to WandB: {err}")

            if args.deepspeed:
                try:
                    # sync config to top level in run, cannot delete file right away because wandb schedules it to be synced even w/policy = 'now', so let OS delete it later.
                    with NamedTemporaryFile(
                        mode="w",
                        delete=False,
                        suffix=".json",
                        prefix="deepspeed_config_",
                    ) as temp_file:
                        skip_upload = False
                        if isinstance(args.deepspeed, dict):
                            json.dump(args.deepspeed, temp_file, indent=4)
                            # The file is read back by name below while this
                            # handle is still open; flush so the JSON is on disk
                            # and not sitting in the write buffer.
                            temp_file.flush()
                        elif isinstance(args.deepspeed, str) and os.path.exists(
                            args.deepspeed
                        ):
                            copyfile(args.deepspeed, temp_file.name)
                        else:
                            skip_upload = True
                        if not skip_upload:
                            artifact = wandb.Artifact(
                                f"deepspeed-config-{wandb.run.id}",
                                type="deepspeed-config",
                            )
                            artifact.add_file(temp_file.name)
                            wandb.log_artifact(artifact)
                            wandb.save(temp_file.name)
                            LOG.info(
                                "The DeepSpeed config has been saved to the WandB run under files."
                            )
                except (FileNotFoundError, ConnectionError) as err:
                    LOG.warning(f"Error while saving DeepSpeed config to WandB: {err}")
        return control
class SaveAxolotlConfigtoCometCallback(TrainerCallback):
    """Attach the Axolotl config file to a Comet experiment when training begins."""

    def __init__(self, axolotl_config_path):
        self.axolotl_config_path = axolotl_config_path

    def on_train_begin(
        self,
        args: "AxolotlTrainingArguments",
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Log the config as a Comet asset on the main process; return ``control``."""
        # Only the main process talks to Comet.
        if not is_main_process():
            return control

        try:
            experiment = comet_ml.start(source="axolotl")
            experiment.log_other("Created from", "axolotl")
            experiment.log_asset(
                self.axolotl_config_path,
                file_name="axolotl-config",
            )
            LOG.info(
                "The Axolotl config has been saved to the Comet Experiment under assets."
            )
        except (FileNotFoundError, ConnectionError) as err:
            # Best-effort upload: report the failure and keep training.
            LOG.warning(f"Error while saving Axolotl config to Comet: {err}")
        return control
To trigger checkpoint save:\n" f" • File: touch {cfg.output_dir}/{self.trigger_filename}\n" f" • Check interval: every {self.check_interval} steps", ) def on_step_end( self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **_kwargs, ) -> TrainerControl: """ Check for checkpoint triggers at the end of each step. ONLY rank 0 checks the file, then all ranks synchronize. """ if not self.enabled: return control trigger_detected = False if state.global_step % self.check_interval == 0: if is_main_process(): trigger_path = Path(args.output_dir) / self.trigger_filename if trigger_path.exists(): trigger_detected = True try: trigger_path.unlink() # Delete the trigger file LOG.info( f"Dynamic checkpoint triggered via file '{self.trigger_filename}' " f"at step {state.global_step}", ) except OSError as exc: LOG.warning( f"Failed to delete trigger file: {exc}", ) if self.should_save_checkpoint: trigger_detected = True self.should_save_checkpoint = False # Reset flag if is_distributed(): import torch import torch.distributed as dist device = getattr( args, "device", torch.device("cuda" if torch.cuda.is_available() else "cpu"), ) trigger_tensor = torch.tensor( 1 if trigger_detected else 0, dtype=torch.long, device=device, ) dist.broadcast(trigger_tensor, src=0) trigger_detected = bool(trigger_tensor.item()) barrier() if trigger_detected: control.should_save = True LOG.info( f"Saving dynamic checkpoint at step {state.global_step}", ) return control ================================================ FILE: src/axolotl/utils/callbacks/generation.py ================================================ """Callback for generating samples during SFT/Pretrain training.""" from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState from transformers.training_args import TrainingArguments from axolotl.utils.generation.sft import generate_samples from axolotl.utils.logging import get_logger LOG = get_logger(__name__) class 
SFTGenerationCallback(TrainerCallback): """Callback for generating samples during SFT/Pretrain training.""" def __init__(self, trainer): self.trainer = trainer def on_evaluate( self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs, ): """Generate samples at specified intervals.""" cfg = self.trainer.axolotl_cfg if not getattr(cfg, "generate_samples", False): return dataloader = None try: if getattr(self.trainer, "eval_dataset", None) is not None: dataloader = self.trainer.get_eval_dataloader() LOG.info( f"Using eval dataloader for generation at step {state.global_step}" ) except Exception as e: LOG.warning(f"Could not get eval dataloader: {e}") dataloader = None if dataloader is None: dataloader = self.trainer.get_train_dataloader() LOG.info( f"Using train dataloader for generation at step {state.global_step}" ) samples = generate_samples( model=self.trainer.model, tokenizer=self.trainer.processing_class, dataloader=dataloader, num_generation_samples=getattr(cfg, "num_generation_samples", 3), max_new_tokens=getattr(cfg, "generation_max_new_tokens", 50), temperature=getattr(cfg, "generation_temperature", 0.7), top_p=getattr(cfg, "generation_top_p", None), top_k=getattr(cfg, "generation_top_k", None), do_sample=getattr(cfg, "generation_do_sample", True), prompt_ratio=getattr(cfg, "generation_prompt_ratio", 0.5), ) self._log_samples(samples, state.global_step) def _log_samples(self, samples: list, step: int): """Log generated samples to console and W&B.""" from axolotl.utils.generation.sft import format_generation_for_logging for i, sample in enumerate(samples): console_text, wandb_text = format_generation_for_logging(sample, i, step) LOG.info(console_text) try: import wandb if wandb.run is not None: wandb.log( { f"samples/sample_{i + 1}": wandb.Html( f"
{wandb_text}
" ) }, step=step, ) except (ImportError, Exception): pass ================================================ FILE: src/axolotl/utils/callbacks/lisa.py ================================================ """ module for LISA Adapted from https://github.com/OptimalScale/LMFlow/pull/701 for HF transformers & Axolotl Arxiv: https://arxiv.org/abs/2403.17919 License: Apache 2.0 """ from functools import reduce from typing import TYPE_CHECKING import numpy as np from transformers import TrainerCallback from axolotl.utils.logging import get_logger if TYPE_CHECKING: from axolotl.core.trainers import AxolotlTrainer LOG = get_logger(__name__) def lisa_callback_factory(trainer: "AxolotlTrainer"): class LISACallback(TrainerCallback): """trainer callback for lisa layer switching""" def __init__( self, n_layers, step_interval, trainer, layers_attribute="model.layers" ): super().__init__() self.n_layers = n_layers self.step_interval = step_interval self.layers_attribute = layers_attribute self.trainer = trainer reduce(getattr, self.layers_attribute.split("."), self.trainer.model) self.total_layers = len( reduce(getattr, self.layers_attribute.split("."), self.trainer.model) ) self.active_layers_indices = [] layers = reduce( getattr, self.layers_attribute.split("."), self.trainer.model ) LOG.info( f"LISA will activate {self.n_layers}/{len(layers)} layers ({self.n_layers * 100 / len(layers)}%) every {self.step_interval} steps" ) def freeze_all_layers(self): layers = reduce( getattr, self.layers_attribute.split("."), self.trainer.model ) for layer in layers: for param in layer.parameters(): param.requires_grad = False def on_step_begin(self, args, state, control, **kwargs): # Check if it's time to switch active layers, including at step 0 if state.global_step % self.step_interval == 0 or state.global_step == 1: self.switch_active_layers() def switch_active_layers(self): # First, disable gradients for all layers self.freeze_all_layers() # Randomly select n_layers to activate layers = reduce( 
def should_log_artifacts() -> bool:
    """Report whether ``HF_MLFLOW_LOG_ARTIFACTS`` holds a truthy value.

    The variable is upper-cased and compared against ``TRUE``, ``1`` and
    ``YES``; when it is unset the default ``"FALSE"`` is used.
    """
    raw_flag = os.getenv("HF_MLFLOW_LOG_ARTIFACTS", "FALSE")
    normalized = raw_flag.upper()
    return normalized in ("TRUE", "1", "YES")
def get_causal_lm_model_cls_prefix(model_type: str) -> Tuple[str, str]:
    """Return ``(class_prefix, causal_lm_class_name)`` for ``model_type``.

    When ``model_type`` is a key of ``MODEL_FOR_CAUSAL_LM_MAPPING_NAMES`` the
    mapped class name is returned as-is, and the prefix is that name with every
    occurrence of the known head suffixes removed. Otherwise the prefix is the
    underscore-separated parts of ``model_type`` capitalized and joined, and
    the class name is that prefix followed by ``ForCausalLM``.
    """
    if model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
        # Unknown model type: derive a CamelCase name from the snake_case type.
        derived_prefix = "".join(
            piece.capitalize() for piece in model_type.split("_")
        )
        return derived_prefix, f"{derived_prefix}ForCausalLM"

    class_name = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[model_type]
    prefix = class_name
    head_suffixes = (
        "ForCausalLM",
        "ForConditionalGeneration",
        "LMHeadModel",
        "GenerationDecoder",
    )
    for head_suffix in head_suffixes:
        prefix = prefix.replace(head_suffix, "")
    return prefix, class_name
class OpenTelemetryMetricsCallback(TrainerCallback):
    """
    TrainerCallback that exports training metrics to OpenTelemetry/Prometheus.

    This callback automatically tracks key training metrics including:
    - Training loss
    - Evaluation loss
    - Learning rate
    - Epoch progress
    - Global step count
    - Gradient norm

    Metrics are exposed via HTTP endpoint for Prometheus scraping.
    """

    def __init__(self, cfg):
        """
        Set up the meter provider and the fixed set of training instruments.

        Args:
            cfg: Config object; ``otel_metrics_host`` (default ``"localhost"``)
                and ``otel_metrics_port`` (default ``8000``) are read from it.

        If the OpenTelemetry imports failed, or initialization raises, the
        callback stays constructed but ``metrics_enabled`` is False and every
        hook becomes a no-op.
        """
        if not OPENTELEMETRY_AVAILABLE:
            LOG.warning("OpenTelemetry not available, metrics will not be collected")
            self.metrics_enabled = False
            return

        self.cfg = cfg
        self.metrics_host = getattr(cfg, "otel_metrics_host", "localhost")
        self.metrics_port = getattr(cfg, "otel_metrics_port", 8000)
        self.metrics_enabled = True
        self.server_started = False
        self.metrics_lock = threading.Lock()
        # Gauges created on demand for extra `eval_*` metrics, keyed by
        # instrument name, so each instrument is only registered once.
        self._eval_gauges: Dict[str, object] = {}

        try:
            # Create Prometheus metrics reader
            prometheus_reader = PrometheusMetricReader()

            # Create meter provider with Prometheus exporter
            provider = SDKMeterProvider(metric_readers=[prometheus_reader])
            set_meter_provider(provider)

            # Get meter for creating metrics
            self.meter = metrics.get_meter("axolotl.training")

            # Create metrics
            self._create_metrics()

        except Exception as e:
            LOG.warning(f"Failed to initialize OpenTelemetry metrics: {e}")
            self.metrics_enabled = False

    def _create_metrics(self):
        """Create all metrics that will be tracked"""
        self.train_loss_gauge = self.meter.create_gauge(
            name="axolotl_train_loss",
            description="Current training loss",
            unit="1",
        )

        self.eval_loss_gauge = self.meter.create_gauge(
            name="axolotl_eval_loss",
            description="Current evaluation loss",
            unit="1",
        )

        self.learning_rate_gauge = self.meter.create_gauge(
            name="axolotl_learning_rate",
            description="Current learning rate",
            unit="1",
        )

        self.epoch_gauge = self.meter.create_gauge(
            name="axolotl_epoch",
            description="Current training epoch",
            unit="1",
        )

        self.global_step_counter = self.meter.create_counter(
            name="axolotl_global_steps",
            description="Total training steps completed",
            unit="1",
        )

        self.grad_norm_gauge = self.meter.create_gauge(
            name="axolotl_gradient_norm",
            description="Gradient norm",
            unit="1",
        )

        self.memory_usage_gauge = self.meter.create_gauge(
            name="axolotl_memory_usage",
            description="Current memory usage in MB",
            unit="MB",
        )

    def _start_metrics_server(self):
        """Start the HTTP server for metrics exposure"""
        if self.server_started:
            return

        try:
            start_http_server(self.metrics_port, addr=self.metrics_host)
            self.server_started = True
            LOG.info(
                f"OpenTelemetry metrics server started on http://{self.metrics_host}:{self.metrics_port}/metrics"
            )
        except Exception as e:
            # Best effort: training continues even if the endpoint cannot start.
            LOG.error(f"Failed to start OpenTelemetry metrics server: {e}")

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Called at the beginning of training"""
        if not self.metrics_enabled:
            return

        self._start_metrics_server()
        LOG.info("OpenTelemetry metrics collection started")

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs: Optional[Dict[str, float]] = None,
        **kwargs,
    ):
        """Called when logging occurs"""
        if not self.metrics_enabled or not logs:
            return

        if "loss" in logs:
            self.train_loss_gauge.set(logs["loss"])

        if "eval_loss" in logs:
            self.eval_loss_gauge.set(logs["eval_loss"])

        if "learning_rate" in logs:
            self.learning_rate_gauge.set(logs["learning_rate"])

        if "epoch" in logs:
            self.epoch_gauge.set(logs["epoch"])

        if "grad_norm" in logs:
            self.grad_norm_gauge.set(logs["grad_norm"])

        if "memory_usage" in logs:
            self.memory_usage_gauge.set(logs["memory_usage"])

    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Called at the end of each training step"""
        if not self.metrics_enabled:
            return

        # Update step counter and epoch
        self.global_step_counter.add(1)
        if state.epoch is not None:
            self.epoch_gauge.set(state.epoch)

    def on_evaluate(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        metrics: Optional[Dict[str, float]] = None,
        **kwargs,
    ):
        """
        Called after evaluation.

        Records ``eval_loss`` on its dedicated gauge and every other numeric
        ``eval_*`` metric on a gauge named ``axolotl_<key>``. Those extra
        gauges are created the first time a key is seen and reused afterwards.
        """
        if not self.metrics_enabled or not metrics:
            return

        if "eval_loss" in metrics:
            self.eval_loss_gauge.set(metrics["eval_loss"])

        # Record any other eval metrics as gauges
        for key, value in metrics.items():
            if key == "eval_loss":
                # Already recorded above on the pre-registered gauge.
                continue
            if not key.startswith("eval_") or not isinstance(value, (int, float)):
                continue

            gauge_name = f"axolotl_{key}"
            try:
                gauge = self._eval_gauges.get(gauge_name)
                if gauge is None:
                    gauge = self.meter.create_gauge(
                        name=gauge_name,
                        description=f"Evaluation metric: {key}",
                        unit="1",
                    )
                    self._eval_gauges[gauge_name] = gauge
                gauge.set(value)
            except Exception as e:
                LOG.warning(f"Failed to create/update metric {gauge_name}: {e}")

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Called at the end of training"""
        if not self.metrics_enabled:
            return

        LOG.info("Training completed. OpenTelemetry metrics collection finished.")
        LOG.info(
            f"Metrics are still available at http://{self.metrics_host}:{self.metrics_port}/metrics"
        )
def compute(
    self,
    model: PreTrainedModel,
    references: Optional[List[str]] = None,
) -> Dict[str, float]:
    """
    Compute perplexity in a fixed length sliding window across the sequence.

    The references are tokenized as one padded batch. Windows of at most
    ``self.max_seq_len`` tokens are taken every ``self.stride`` tokens; in
    each window only the tokens not scored by the previous window keep their
    labels. Padding positions are excluded from the loss and the attention
    mask is forwarded to the model, so padded batches do not skew the score.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``loss``.
        references: Texts to score. Required despite the ``None`` default.

    Returns:
        ``{"score": perplexity}`` where perplexity is ``exp`` of the mean of
        the per-window losses.

    Raises:
        AssertionError: If ``references`` is None.
    """
    assert references is not None, "Missing parameter: references"

    model.eval()

    references_tokenized = self.tokenizer(
        references, return_tensors="pt", padding=True, truncation=True
    )
    input_ids: Tensor = references_tokenized["input_ids"]  # type: ignore
    input_ids = input_ids.to(model.device)

    # The mask tells real tokens apart from padding; tolerate tokenizers that
    # do not return one by falling back to the unmasked behaviour.
    attention_mask = references_tokenized.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(model.device)

    sequence_length = input_ids.size(1)

    losses = []
    prev_end_loc = 0
    for begin_loc in tqdm(
        range(0, sequence_length, self.stride), disable=not is_main_process()
    ):
        end_loc = min(begin_loc + self.max_seq_len, sequence_length)
        trg_len = end_loc - prev_end_loc
        input_ids_slice = input_ids[:, begin_loc:end_loc]
        labels_slice = input_ids_slice.clone()
        # Tokens before the last `trg_len` positions were scored already.
        labels_slice[:, :-trg_len] = -100

        model_kwargs = {"input_ids": input_ids_slice, "labels": labels_slice}
        if attention_mask is not None:
            attention_mask_slice = attention_mask[:, begin_loc:end_loc]
            # Padding must not count towards the loss.
            labels_slice[attention_mask_slice == 0] = -100
            model_kwargs["attention_mask"] = attention_mask_slice

        with torch.no_grad():
            outputs: CausalLMOutput = model(**model_kwargs)

        losses.append(outputs.loss)

        prev_end_loc = end_loc
        if end_loc == sequence_length:
            break

    perplexity = torch.exp(torch.stack(losses).mean()).item()

    return {
        "score": perplexity,
    }
def on_step_end(
    self,
    args: TrainingArguments,
    state: TrainerState,
    control: TrainerControl,
    **kwargs,
):
    """
    Dump profiling artifacts once the last profiled step has finished.

    When ``state.global_step`` equals ``self.profiler_steps_end`` the CUDA
    memory snapshot is pickled to ``snapshot.pickle`` in ``args.output_dir``
    and memory-history recording is switched off. If a torch profiler is
    active it is exited and its Chrome trace is written to
    ``profiler_trace.json`` in the same directory.
    """
    if state.global_step != self.profiler_steps_end:
        return

    output_dir = Path(args.output_dir)

    memory_snapshot = torch.cuda.memory._snapshot()
    with open(output_dir / "snapshot.pickle", "wb") as snapshot_file:
        dump(memory_snapshot, snapshot_file)

    # tell CUDA to stop recording memory allocations now
    torch.cuda.memory._record_memory_history(enabled=None)

    # Stop and export torch.profiler trace
    active_profiler = self._profiler
    if active_profiler is None:
        return
    active_profiler.__exit__(None, None, None)
    active_profiler.export_chrome_trace(str(output_dir / "profiler_trace.json"))
    self._profiler = None
def toggle_fake_quant(mod: nn.Module, enable: bool):
    """
    Toggle fake quantization for any fake quantized linear or embedding layers in the model.

    Modules of any other type are left untouched, so this can be passed to
    ``nn.Module.apply``. A quantizer attribute that is ``None`` is skipped
    instead of raising ``AttributeError``.

    Args:
        mod: The module to toggle fake quantization for.
        enable: Whether to enable or disable fake quantization.
    """
    if not isinstance(mod, (FakeQuantizedLinear, FakeQuantizedEmbedding)):
        return

    # The activation quantizer is only looked up on linear layers.
    if (
        isinstance(mod, FakeQuantizedLinear)
        and mod.activation_fake_quantizer is not None
    ):
        mod.activation_fake_quantizer.enabled = enable

    if mod.weight_fake_quantizer is not None:
        mod.weight_fake_quantizer.enabled = enable
def on_step_begin(self, args, state, control, model, **kwargs):
    """
    Keep fake quantization off until ``cfg.fake_quant_after_n_steps`` is reached.

    Does nothing when ``fake_quant_after_n_steps`` is None. Otherwise fake
    quantization is disabled while ``state.global_step`` is below the
    threshold and enabled from the threshold onwards. The desired state is
    derived from the current step rather than from exact step matches, so a
    threshold of 0 enables immediately and a run that starts at a non-zero
    step still ends up in the right state. The model is only walked when
    the desired state changes.
    """
    enable_after = self.cfg.fake_quant_after_n_steps
    if enable_after is None:
        return

    should_enable = state.global_step >= enable_after
    # `_fake_quant_enabled` is unset until the first toggle performed here.
    if getattr(self, "_fake_quant_enabled", None) == should_enable:
        return

    if should_enable:
        LOG.info(f"Enabling fake quantization at step {state.global_step}")
    else:
        LOG.info(f"Disabling fake quantization at step {state.global_step}")
    model.apply(partial(toggle_fake_quant, enable=should_enable))
    self._fake_quant_enabled = should_enable
def setup(self):
    """
    Import SwanLab on first use and mark the callback ready.

    A no-op once ``self._initialized`` is True. Otherwise the ``swanlab``
    module is imported and stored on ``self.swanlab``; the callback is only
    flagged as initialized when ``swanlab.get_run()`` returns a run. A
    missing run is logged as a warning and an ImportError as an error; in
    both cases ``self._initialized`` stays False.
    """
    if self._initialized:
        return

    try:
        import swanlab as swanlab_module

        self.swanlab = swanlab_module

        # Check if SwanLab run is initialized
        has_active_run = swanlab_module.get_run() is not None
        if has_active_run:
            self._initialized = True
            LOG.info("CustomSwanLabCallback initialized successfully")
        else:
            LOG.warning("SwanLab run is not initialized")
    except ImportError:
        LOG.error("SwanLab is not installed")
class SaveAxolotlConfigtoSwanLabCallback(TrainerCallback):
    """Callback to save axolotl config to SwanLab"""

    def __init__(self, axolotl_config_path):
        """
        Args:
            axolotl_config_path: Path of the Axolotl config file to upload.
        """
        self.axolotl_config_path = axolotl_config_path

    @staticmethod
    def _deepspeed_config_text(deepspeed):
        """Return the DeepSpeed config as text, or None if it cannot be resolved.

        A dict is serialized as JSON with a 4-space indent; a string is
        treated as a path and read when it exists. Anything else yields None.
        """
        if isinstance(deepspeed, dict):
            return json.dumps(deepspeed, indent=4)
        if isinstance(deepspeed, str) and os.path.exists(deepspeed):
            with open(deepspeed, "r", encoding="utf-8") as ds_config_file:
                return ds_config_file.read()
        return None

    def on_train_begin(
        self,
        args: AxolotlTrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """
        Log the Axolotl config, and the DeepSpeed config if set, to SwanLab.

        Only runs on the world-zero process. The config contents are read
        directly and logged as ``swanlab.Text`` entries, so no temporary
        files are created. If no SwanLab run is active a warning is logged
        and nothing is uploaded. Missing files, connection errors and a
        missing ``swanlab`` package are logged, not raised.

        Returns:
            The unchanged ``control`` object.
        """
        if not state.is_world_process_zero:
            return control

        try:
            import swanlab

            # Check if SwanLab is initialized
            if swanlab.get_run() is None:
                LOG.warning(
                    "SwanLab run is not initialized. Please initialize SwanLab before training."
                )
                return control

            # Log Axolotl config as text
            with open(
                self.axolotl_config_path, "r", encoding="utf-8"
            ) as config_file:
                config_text = config_file.read()
            swanlab.log(
                {
                    "axolotl_config": swanlab.Text(
                        config_text, caption="Axolotl Config"
                    )
                }
            )
            LOG.info(
                "The Axolotl config has been saved to the SwanLab run under logs."
            )
        except ImportError:
            LOG.warning(
                "SwanLab is not installed. Install it with: pip install swanlab"
            )
        except (FileNotFoundError, ConnectionError) as err:
            LOG.warning(f"Error while saving Axolotl config to SwanLab: {err}")

        # Log DeepSpeed config if available
        if args.deepspeed:
            try:
                import swanlab

                deepspeed_text = self._deepspeed_config_text(args.deepspeed)
                if deepspeed_text is not None:
                    swanlab.log(
                        {
                            "deepspeed_config": swanlab.Text(
                                deepspeed_text,
                                caption="DeepSpeed Config",
                            )
                        }
                    )
                    LOG.info(
                        "The DeepSpeed config has been saved to the SwanLab run under logs."
                    )
            except (FileNotFoundError, ConnectionError) as err:
                LOG.warning(
                    f"Error while saving DeepSpeed config to SwanLab: {err}"
                )
            except ImportError:
                # The missing package was already reported above.
                pass

        return control
def on_train_begin(
    self,
    args: TrainingArguments,
    state: TrainerState,
    control: TrainerControl,
    **kwargs,
):  # pylint: disable=unused-argument
    """
    Reload saved token counters when training resumes from a checkpoint.

    Nothing happens unless ``self.resume_from_checkpoint`` is a string and
    the tokens state file exists inside that directory. In that case
    ``state.tokens`` is replaced with tensors built from the saved "total"
    and "trainable" values, each defaulting to 0 when absent from the file.
    """
    checkpoint_dir = self.resume_from_checkpoint
    if not isinstance(checkpoint_dir, str):
        return

    saved_state_file = os.path.join(checkpoint_dir, TOKENS_STATE_FILE)
    if not os.path.isfile(saved_state_file):
        return

    with open(saved_state_file, "r", encoding="utf-8") as handle:
        saved_counts = json.load(handle)

    state.tokens = {
        counter: torch.tensor(saved_counts.get(counter, 0))
        for counter in ("total", "trainable")
    }
    LOG.info(f"Restored total_tokens: {state.tokens['total']}")
def on_log(
    self,
    args: TrainingArguments,
    state: TrainerState,
    control: TrainerControl,
    logs=None,
    **kwargs,
):  # pylint: disable=unused-argument
    """
    Publish the latest tokens-per-second figure and reset the running counters.

    When ``state.last_tokens_per_second`` exists its value is written to
    ``logs["tokens/train_per_sec_per_gpu"]`` (skipped when ``logs`` is None,
    which is the parameter's default) and the tensor is zeroed in place.
    The per-step ``trainable_tokens`` counter in ``state.tokens`` is then
    replaced with zeros, if present.
    """
    # after logging, clear the running metrics
    if hasattr(state, "last_tokens_per_second"):
        if logs is not None:
            logs["tokens/train_per_sec_per_gpu"] = (
                state.last_tokens_per_second.item()
            )
        state.last_tokens_per_second.zero_()

    tokens = getattr(state, "tokens", None)
    # Clear per-step tokens after logging
    if tokens and "trainable_tokens" in tokens:
        tokens["trainable_tokens"] = torch.zeros_like(tokens["trainable_tokens"])
def get_chat_template(
    user_choice: str,
    jinja_template: str | None = None,
    tokenizer: Optional["PreTrainedTokenizerBase"] = None,
) -> str:
    """
    Resolve a chat template string from the user's choice.

    The choice is handled in this order: the explicit jinja choice, the
    tokenizer's own template, the "tokenizer default with fallback" prefix,
    and finally a lookup among the registered templates.

    Args:
        user_choice (str): The user's choice of template.
        jinja_template (str, optional): The jinja template string or path to a
            jinja template file; a path to an existing file is read and its
            contents returned. Defaults to None.
        tokenizer (PreTrainedTokenizerBase, optional): The tokenizer. Defaults to None.

    Returns:
        str: The chosen template string.

    Raises:
        ValueError: If a required `jinja_template` or `tokenizer` is missing,
            if the tokenizer has no chat template when one is required, or if
            the choice is not found among the registered templates.
    """
    if user_choice == _JINJA_TEMPLATE_CHOICE:
        if not jinja_template:
            raise ValueError(
                f"`jinja_template` cannot be None when `chat_template` choice is {_JINJA_TEMPLATE_CHOICE}"
            )
        # A value that is not an existing file is used as the template itself.
        if not os.path.isfile(jinja_template):
            return jinja_template
        with open(jinja_template, "r", encoding="utf-8") as template_file:
            return template_file.read()

    if user_choice == _DEFAULT_TEMPLATE_CHOICE:
        if not tokenizer:
            raise ValueError(
                f"`tokenizer` cannot be None when chat_template choice is {_DEFAULT_TEMPLATE_CHOICE}"
            )
        if tokenizer.chat_template:
            return tokenizer.chat_template  # type: ignore
        raise ValueError(
            f"`chat_template choice is {_DEFAULT_TEMPLATE_CHOICE} but tokenizer's chat_template is null. "
            "Please add a chat_template in tokenizer config"
        )

    template_name = user_choice
    if template_name.startswith(_DEFAULT_FALLBACK_CHATML_TEMPLATE_CHOICE_PREFIX):
        if not tokenizer:
            raise ValueError(
                f"`tokenizer` cannot be None when chat_template choice starts with {_DEFAULT_FALLBACK_CHATML_TEMPLATE_CHOICE_PREFIX}"
            )
        if tokenizer.chat_template:
            return tokenizer.chat_template  # type: ignore

        # Whatever follows the prefix names the registered fallback template.
        prefix_length = len(_DEFAULT_FALLBACK_CHATML_TEMPLATE_CHOICE_PREFIX)
        template_name = template_name[prefix_length:]
        LOG.warning(
            f"No chat template found on tokenizer, falling back to {template_name}. It is recommended to set --train_on_inputs to True for the model to learn this chat template."
        )

    if template_name not in _CHAT_TEMPLATES:
        raise ValueError(f"Template '{template_name}' not found.")
    return _CHAT_TEMPLATES[template_name]
""" if template_name in _CHAT_TEMPLATES: raise ValueError(f"Template '{template_name}' already exists.") _CHAT_TEMPLATES[template_name] = chat_template ================================================ FILE: src/axolotl/utils/chat_templates/templates/alpaca.jinja ================================================ {{ bos_token }}{% for message in messages %}{% if message['role'] == 'system' and loop.first %}{{ message['content'] }}{% elif message['role'] == 'user' %}{{ '### Instruction: ' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '### Response: ' + message['content'] + eos_token }}{% endif %}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' ### Response: ' }}{% endif %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/aya.jinja ================================================ {{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Aya, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/chatml.jinja ================================================ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + ' ' + message['content'] + '<|im_end|>' + ' '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant ' }}{% endif %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/cohere.jinja ================================================ {{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/command_a.jinja ================================================ {{ bos_token }}{% if documents %} {% set tools = [] %} {%- macro document_turn(documents) -%} {# format documents into chat turn #} <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|>I will look through the document to address the users needs.<|END_THINKING|><|START_ACTION|>[ {"tool_call_id": "0", "tool_name": "direct-injected-document", "parameters": {}} ]<|END_ACTION|><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[ { "tool_call_id": "0", "results": { {% for doc in documents %} "{{ loop.index0 }}": {{doc|tojson}}{% if not loop.last %}, {% endif %} {% endfor %} }, "is_error": null } ]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|>{%- endmacro %} {%- macro tool_call_id_to_int(messages, tool_call_id) %} {%- set counter = namespace(value=0) %} {%- set tool_call_id_seen = namespace(value=false) %} {%- for msg in messages %} {%- if msg.tool_calls %} {%- for tool_call in msg.tool_calls %} {%- if tool_call.id == tool_call_id and not tool_call_id_seen.value -%} {{ 
counter.value }} {%- set tool_call_id_seen.value = true %} {%- endif %} {%- set counter.value = counter.value + 1 %} {%- endfor %} {%- endif %} {%- endfor %} {%- endmacro %} {%- macro format_tool_message(messages, tool_msg) -%} {# format tool message #} { "tool_call_id": "{{ tool_call_id_to_int(messages, tool_msg.tool_call_id) }}", "results": { "0": {{ tool_msg.content|tojson }} }, "is_error": null } {%- endmacro -%} {%- if messages and messages[0]['role']|lower == 'system' %}{%- set developer_preamble = messages[0]['content'] %}{% endif %} {%- set tool_idx = namespace(value=0) %} {%- set tool_ids_seen = namespace(value=[]) %} {%- set sent_documents = namespace(value=false) %} <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># System Preamble You are in contextual safety mode. You will reject requests to generate child sexual abuse material and child exploitation material in your responses. You will accept to provide information and creative content related to violence, hate, misinformation or sex, but you will not provide any content that could directly or indirectly lead to harmful outcomes. Your information cutoff date is June 2024. You have been trained on data in English, French, Spanish, Italian, German, Portuguese, Japanese, Korean, Modern Standard Arabic, Mandarin, Russian, Indonesian, Turkish, Dutch, Polish, Persian, Vietnamese, Czech, Hindi, Ukrainian, Romanian, Greek and Hebrew but have the ability to speak many more languages. {% if tools or documents %} You have been trained to have advanced reasoning and tool-use capabilities and you should make best use of these skills to serve user's requests. ## Tool Use Think about how you can make best use of the provided tools to help with the task and come up with a high level plan that you will execute first. 0. Start by writing <|START_THINKING|> followed by a detailed step by step plan of how you will solve the problem. For each step explain your thinking fully and give details of required tool calls (if needed). 
Unless specified otherwise, you write your plan in natural language. When you finish, close it out with <|END_THINKING|>. You can optionally choose to skip this step when the user request is so straightforward to address that only a trivial plan would be needed. NOTE: You MUST skip this step when you are directly responding to the user's request without using any tools. Then carry out your plan by repeatedly executing the following steps. 1. Action: write <|START_ACTION|> followed by a list of JSON-formatted tool calls, with each one containing "tool_name" and "parameters" fields. When there are multiple tool calls which are completely independent of each other (i.e. they can be executed in parallel), you should list them out all together in one step. When you finish, close it out with <|END_ACTION|>. 2. Observation: you will then receive results of those tool calls in JSON format in the very next turn, wrapped around by <|START_TOOL_RESULT|> and <|END_TOOL_RESULT|>. Carefully observe those results and think about what to do next. Note that these results will be provided to you in a separate turn. NEVER hallucinate results. Every tool call produces a list of results (when a tool call produces no result or a single result, it'll still get wrapped inside a list). Each result is clearly linked to its originating tool call via its "tool_call_id". 3. Reflection: start the next turn by writing <|START_THINKING|> followed by what you've figured out so far, any changes you need to make to your plan, and what you will do next. When you finish, close it out with <|END_THINKING|>. You can optionally choose to skip this step when everything is going according to plan and no special pieces of information or reasoning chains need to be recorded. NOTE: You MUST skip this step when you are done with tool-use actions and are ready to respond to the user. 
You can repeat the above 3 steps multiple times (could be 0 times too if no suitable tool calls are available or needed), until you decide it's time to finally respond to the user. 4. Response: then break out of the loop and write <|START_RESPONSE|> followed by a piece of text which serves as a response to the user's last request. Use all previous tool calls and results to help you when formulating your response. When you finish, close it out with <|END_RESPONSE|>. {% if enable_citations %} ## Grounding Importantly, note that "Reflection" and "Response" above can be grounded. Grounding means you associate pieces of texts (called "spans") with those specific tool results that support them (called "sources"). And you use a pair of tags "" and "" to indicate when a span can be grounded onto a list of sources, listing them out in the closing tag. Sources from the same tool call are grouped together and listed as "{tool_call_id}:[{list of result indices}]", before they are joined together by ",". E.g., "span" means that "span" is supported by result 1 and 2 from "tool_call_id=0" as well as result 0 from "tool_call_id=1". {% endif %} ## Available Tools Here is the list of tools that you have available to you. You can ONLY use the tools listed here. When a tool is not listed below, it is NOT available and you should NEVER attempt to use it. Each tool is represented as a JSON object with fields like "name", "description", "parameters" (per JSON Schema), and optionally, "responses" (per JSON Schema). ```json [ {% if documents %} {"name": "direct-injected-document", "description": "This is a special tool to directly inject user-uploaded documents into the chat as additional context. 
DO NOT use this tool by yourself!", "parameters": {"type": "object", "properties": {}, "required": []}, "responses": {"200": {"description": "Successfully returned a list of chunked text snippets from the directly uploaded documents.", "content": {"application/json": {"schema": {"type": "array", "items": {"type": "object", "required": ["url", "snippet"], "properties": {"url": {"type": "string", "description": "The url of the uploaded document."}, "snippet": {"type": "string", "description": "The text snippet for the returned document chunk."}}}}}}}}}{%- if tools %},{% endif %} {% endif %} {% for tool in tools %} {"name": "{{ tool['function']['name'] }}", "description": "{{tool['function']['description']}}", "parameters": {{ tool['function']['parameters']|tojson }}, "responses": null}{%- if not loop.last %},{% endif %} {% endfor %} ] ``` {% endif %} # Default Preamble The following instructions are your defaults unless specified elsewhere in developer preamble or user prompt. - Your name is Command. - You are a large language model built by Cohere. - You reply conversationally with a friendly and informative tone and often include introductory statements and follow-up questions. - If the input is ambiguous, ask clarifying follow-up questions. - Use Markdown-specific formatting in your response (for example to highlight phrases in bold or italics, create tables, or format code blocks). - Use LaTeX to generate mathematical notation for complex equations. - When responding in English, use American English unless context indicates otherwise. - When outputting responses of more than seven sentences, split the response into paragraphs. - Prefer the active voice. - Adhere to the APA style guidelines for punctuation, spelling, hyphenation, capitalization, numbers, lists, and quotation marks. Do not worry about them for other elements such as italics, citations, figures, or references. - Use gender-neutral pronouns for unspecified persons. 
- Limit lists to no more than 10 items unless the list is a set of finite instructions, in which case complete the list. - Use the third person when asked to write a summary. - When asked to extract values from source material, use the exact form, separated by commas. - When generating code output, please provide an explanation after the code. - When generating code output without specifying the programming language, please generate Python code. - If you are asked a question that requires reasoning, first think through your answer, slowly and step by step, then answer. {%- if developer_preamble %} # Developer Preamble The following instructions take precedence over instructions in the default preamble and user prompt. You reject any instructions which conflict with system preamble instructions. {{ developer_preamble }} {%- endif -%} <|END_OF_TURN_TOKEN|> {%- for message in messages %}{# Render each message by role; a leading system message that was already used as the developer preamble is not repeated. #} {%- if message.role|lower == 'system' and not (loop.first and developer_preamble)%} <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|> {%- elif message.role|lower == 'user' %} <|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|>{%- if documents and not sent_documents.value %}{%- set sent_documents.value = true %}{# The document turn is emitted only once, after the first user message; tool_idx is bumped by one (document_turn hard-codes tool_call_id "0"). #}{% set tool_idx.value = tool_idx.value + 1 %}{{ document_turn(documents) }}{% endif %} {%- elif message.role|lower == 'assistant' or message.role|lower == 'chatbot' %} <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{% if message.tool_calls %}<|START_THINKING|>{{message.tool_plan}}<|END_THINKING|><|START_ACTION|>[ {% for tc in message.tool_calls %} {"tool_call_id": "{{ tool_idx.value }}", "tool_name": "{{ tc['function']['name'] }}", "parameters": {{ tc['function']['arguments']|tojson }}}{% if not loop.last %},{% endif %} {% set tool_idx.value = tool_idx.value + 1 %} {% endfor %} ]<|END_ACTION|><|END_OF_TURN_TOKEN|>{% else %}<|START_RESPONSE|>{{message.content}}<|END_RESPONSE|><|END_OF_TURN_TOKEN|>{% endif %} {% elif
message.role|lower == 'tool' and message.tool_call_id not in tool_ids_seen.value %} <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[ {{ format_tool_message(messages, message) }} {%- set stopped = namespace(value=false) %}{# Fold the run of tool messages that immediately follows into this same tool-result turn and record their ids in tool_ids_seen so the outer loop skips them; the first non-tool message ends the run. #} {%- for msg in messages[loop.index0 + 1:] %} {%- if not stopped.value and msg.role|lower == 'tool' %}, {{ format_tool_message(messages, msg) }} {%- set tool_ids_seen.value = tool_ids_seen.value + [msg.tool_call_id] %} {%- else %} {%- set stopped.value = true %} {%- endif %} {%- endfor %} ]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|> {%- endif %} {%- endfor %}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> {%- else -%}{# No documents: plain chat rendering with a safety_mode-selectable preamble and no tool-use sections. #} {%- if messages and messages[0]['role']|lower == 'system' %}{%- set developer_preamble = messages[0]['content'] %}{% endif %} <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># System Preamble {% if safety_mode|upper == 'STRICT' -%} You are in strict safety mode. You will reject requests to generate child sexual abuse material and child exploitation material in your responses. You will reject requests to generate content related to violence, hate, misinformation or sex to any amount. You will avoid using profanity. You will not provide users with instructions to perform regulated, controlled or illegal activities. {%- else -%} You are in contextual safety mode. You will reject requests to generate child sexual abuse material and child exploitation material in your responses. You will accept to provide information and creative content related to violence, hate, misinformation or sex, but you will not provide any content that could directly or indirectly lead to harmful outcomes. {%- endif %} Your information cutoff date is June 2024. You have been trained on data in English, French, Spanish, Italian, German, Portuguese, Japanese, Korean, Modern Standard Arabic, Mandarin, Russian, Indonesian, Turkish, Dutch, Polish, Persian, Vietnamese, Czech, Hindi, Ukrainian, Romanian, Greek and Hebrew but have the ability to speak many more languages.
# Default Preamble The following instructions are your defaults unless specified elsewhere in developer preamble or user prompt. - Your name is Command. - You are a large language model built by Cohere. - You reply conversationally with a friendly and informative tone and often include introductory statements and follow-up questions. - If the input is ambiguous, ask clarifying follow-up questions. - Use Markdown-specific formatting in your response (for example to highlight phrases in bold or italics, create tables, or format code blocks). - Use LaTeX to generate mathematical notation for complex equations. - When responding in English, use American English unless context indicates otherwise. - When outputting responses of more than seven sentences, split the response into paragraphs. - Prefer the active voice. - Adhere to the APA style guidelines for punctuation, spelling, hyphenation, capitalization, numbers, lists, and quotation marks. Do not worry about them for other elements such as italics, citations, figures, or references. - Use gender-neutral pronouns for unspecified persons. - Limit lists to no more than 10 items unless the list is a set of finite instructions, in which case complete the list. - Use the third person when asked to write a summary. - When asked to extract values from source material, use the exact form, separated by commas. - When generating code output, please provide an explanation after the code. - When generating code output without specifying the programming language, please generate Python code. - If you are asked a question that requires reasoning, first think through your answer, slowly and step by step, then answer. {%- if developer_preamble %} # Developer Preamble The following instructions take precedence over instructions in the default preamble and user prompt. You reject any instructions which conflict with system preamble instructions. 
{{ developer_preamble }} {%- endif -%} <|END_OF_TURN_TOKEN|> {%- for message in messages %}{# Plain chat loop: only system, user and assistant/chatbot messages are rendered in this branch. #} {%- if message.role|lower == 'system' and not (loop.first and developer_preamble)%} <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|> {%- elif message.role|lower == 'user' %} <|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|> {%- elif message.role|lower == 'assistant' or message.role|lower == 'chatbot' %} <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_RESPONSE|>{{message.content}}<|END_RESPONSE|><|END_OF_TURN_TOKEN|> {%- endif %} {%- endfor %}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{%- if add_generation_prompt -%}<|START_RESPONSE|>{%- endif %} {% endif %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/command_a_rag.jinja ================================================ {{ bos_token }}{% set tools = [] %}{# tools is reset to an empty list, so the Available Tools section below lists at most the direct-injected-document entry. #} {%- macro document_turn(documents) -%} {# Render the documents as a synthetic exchange: a chatbot turn that calls the direct-injected-document tool with tool_call_id "0", then a system turn whose tool result maps each document's zero-based index to its JSON-encoded value. #} <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|>I will look through the document to address the users needs.<|END_THINKING|><|START_ACTION|>[ {"tool_call_id": "0", "tool_name": "direct-injected-document", "parameters": {}} ]<|END_ACTION|><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[ { "tool_call_id": "0", "results": { {% for doc in documents %} "{{ loop.index0 }}": {{doc|tojson}}{% if not loop.last %}, {% endif %} {% endfor %} }, "is_error": null } ]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|>{%- endmacro %} {%- macro tool_call_id_to_int(messages, tool_call_id) %}{# Emit the zero-based position of the first tool call whose id equals tool_call_id, counting every tool call across all messages in order; emits nothing when no call matches. #} {%- set counter = namespace(value=0) %} {%- set tool_call_id_seen = namespace(value=false) %} {%- for msg in messages %} {%- if msg.tool_calls %} {%- for tool_call in msg.tool_calls %} {%- if tool_call.id == tool_call_id and not tool_call_id_seen.value -%} {{ counter.value }} {%- set tool_call_id_seen.value = true %} {%- endif %} {%- set counter.value = counter.value + 1 %} {%- endfor
%} {%- endif %} {%- endfor %} {%- endmacro %} {%- macro format_tool_message(messages, tool_msg) -%} {# Render one tool message as a JSON result object: tool_call_id is the integer emitted by tool_call_id_to_int and the message content is JSON-encoded under result key "0". #} { "tool_call_id": "{{ tool_call_id_to_int(messages, tool_msg.tool_call_id) }}", "results": { "0": {{ tool_msg.content|tojson }} }, "is_error": null } {%- endmacro -%} {%- if messages and messages[0]['role']|lower == 'system' %}{%- set developer_preamble = messages[0]['content'] %}{% endif %}{# tool_idx numbers the tool calls rendered in assistant turns, tool_ids_seen holds ids of tool messages already folded into an earlier tool-result turn, and sent_documents records whether the document turn has been emitted. #} {%- set tool_idx = namespace(value=0) %} {%- set tool_ids_seen = namespace(value=[]) %} {%- set sent_documents = namespace(value=false) %} <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># System Preamble You are in contextual safety mode. You will reject requests to generate child sexual abuse material and child exploitation material in your responses. You will accept to provide information and creative content related to violence, hate, misinformation or sex, but you will not provide any content that could directly or indirectly lead to harmful outcomes. Your information cutoff date is June 2024. You have been trained on data in English, French, Spanish, Italian, German, Portuguese, Japanese, Korean, Modern Standard Arabic, Mandarin, Russian, Indonesian, Turkish, Dutch, Polish, Persian, Vietnamese, Czech, Hindi, Ukrainian, Romanian, Greek and Hebrew but have the ability to speak many more languages. {% if tools or documents %} You have been trained to have advanced reasoning and tool-use capabilities and you should make best use of these skills to serve user's requests. ## Tool Use Think about how you can make best use of the provided tools to help with the task and come up with a high level plan that you will execute first. 0. Start by writing <|START_THINKING|> followed by a detailed step by step plan of how you will solve the problem. For each step explain your thinking fully and give details of required tool calls (if needed). Unless specified otherwise, you write your plan in natural language. When you finish, close it out with <|END_THINKING|>.
You can optionally choose to skip this step when the user request is so straightforward to address that only a trivial plan would be needed. NOTE: You MUST skip this step when you are directly responding to the user's request without using any tools. Then carry out your plan by repeatedly executing the following steps. 1. Action: write <|START_ACTION|> followed by a list of JSON-formatted tool calls, with each one containing "tool_name" and "parameters" fields. When there are multiple tool calls which are completely independent of each other (i.e. they can be executed in parallel), you should list them out all together in one step. When you finish, close it out with <|END_ACTION|>. 2. Observation: you will then receive results of those tool calls in JSON format in the very next turn, wrapped around by <|START_TOOL_RESULT|> and <|END_TOOL_RESULT|>. Carefully observe those results and think about what to do next. Note that these results will be provided to you in a separate turn. NEVER hallucinate results. Every tool call produces a list of results (when a tool call produces no result or a single result, it'll still get wrapped inside a list). Each result is clearly linked to its originating tool call via its "tool_call_id". 3. Reflection: start the next turn by writing <|START_THINKING|> followed by what you've figured out so far, any changes you need to make to your plan, and what you will do next. When you finish, close it out with <|END_THINKING|>. You can optionally choose to skip this step when everything is going according to plan and no special pieces of information or reasoning chains need to be recorded. NOTE: You MUST skip this step when you are done with tool-use actions and are ready to respond to the user. You can repeat the above 3 steps multiple times (could be 0 times too if no suitable tool calls are available or needed), until you decide it's time to finally respond to the user. 4. 
Response: then break out of the loop and write <|START_RESPONSE|> followed by a piece of text which serves as a response to the user's last request. Use all previous tool calls and results to help you when formulating your response. When you finish, close it out with <|END_RESPONSE|>. {% if enable_citations %} ## Grounding Importantly, note that "Reflection" and "Response" above can be grounded. Grounding means you associate pieces of texts (called "spans") with those specific tool results that support them (called "sources"). And you use a pair of tags "" and "" to indicate when a span can be grounded onto a list of sources, listing them out in the closing tag. Sources from the same tool call are grouped together and listed as "{tool_call_id}:[{list of result indices}]", before they are joined together by ",". E.g., "span" means that "span" is supported by result 1 and 2 from "tool_call_id=0" as well as result 0 from "tool_call_id=1". {% endif %} ## Available Tools Here is the list of tools that you have available to you. You can ONLY use the tools listed here. When a tool is not listed below, it is NOT available and you should NEVER attempt to use it. Each tool is represented as a JSON object with fields like "name", "description", "parameters" (per JSON Schema), and optionally, "responses" (per JSON Schema). ```json [ {% if documents %} {"name": "direct-injected-document", "description": "This is a special tool to directly inject user-uploaded documents into the chat as additional context. 
DO NOT use this tool by yourself!", "parameters": {"type": "object", "properties": {}, "required": []}, "responses": {"200": {"description": "Successfully returned a list of chunked text snippets from the directly uploaded documents.", "content": {"application/json": {"schema": {"type": "array", "items": {"type": "object", "required": ["url", "snippet"], "properties": {"url": {"type": "string", "description": "The url of the uploaded document."}, "snippet": {"type": "string", "description": "The text snippet for the returned document chunk."}}}}}}}}}{%- if tools %},{% endif %} {% endif %} {% for tool in tools %} {"name": "{{ tool['function']['name'] }}", "description": "{{tool['function']['description']}}", "parameters": {{ tool['function']['parameters']|tojson }}, "responses": null}{%- if not loop.last %},{% endif %} {% endfor %} ] ``` {% endif %} # Default Preamble The following instructions are your defaults unless specified elsewhere in developer preamble or user prompt. - Your name is Command. - You are a large language model built by Cohere. - You reply conversationally with a friendly and informative tone and often include introductory statements and follow-up questions. - If the input is ambiguous, ask clarifying follow-up questions. - Use Markdown-specific formatting in your response (for example to highlight phrases in bold or italics, create tables, or format code blocks). - Use LaTeX to generate mathematical notation for complex equations. - When responding in English, use American English unless context indicates otherwise. - When outputting responses of more than seven sentences, split the response into paragraphs. - Prefer the active voice. - Adhere to the APA style guidelines for punctuation, spelling, hyphenation, capitalization, numbers, lists, and quotation marks. Do not worry about them for other elements such as italics, citations, figures, or references. - Use gender-neutral pronouns for unspecified persons. 
- Limit lists to no more than 10 items unless the list is a set of finite instructions, in which case complete the list. - Use the third person when asked to write a summary. - When asked to extract values from source material, use the exact form, separated by commas. - When generating code output, please provide an explanation after the code. - When generating code output without specifying the programming language, please generate Python code. - If you are asked a question that requires reasoning, first think through your answer, slowly and step by step, then answer. {%- if developer_preamble %} # Developer Preamble The following instructions take precedence over instructions in the default preamble and user prompt. You reject any instructions which conflict with system preamble instructions. {{ developer_preamble }} {%- endif -%} <|END_OF_TURN_TOKEN|> {%- for message in messages %}{# Render each message by role; a leading system message that was already used as the developer preamble is not repeated. #} {%- if message.role|lower == 'system' and not (loop.first and developer_preamble)%} <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|> {%- elif message.role|lower == 'user' %} <|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|>{%- if documents and not sent_documents.value %}{%- set sent_documents.value = true %}{# The document turn is emitted only once, after the first user message; tool_idx is bumped by one (document_turn hard-codes tool_call_id "0"). #}{% set tool_idx.value = tool_idx.value + 1 %}{{ document_turn(documents) }}{% endif %} {%- elif message.role|lower == 'assistant' or message.role|lower == 'chatbot' %} <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{% if message.tool_calls %}<|START_THINKING|>{{message.tool_plan}}<|END_THINKING|><|START_ACTION|>[ {% for tc in message.tool_calls %} {"tool_call_id": "{{ tool_idx.value }}", "tool_name": "{{ tc['function']['name'] }}", "parameters": {{ tc['function']['arguments']|tojson }}}{% if not loop.last %},{% endif %} {% set tool_idx.value = tool_idx.value + 1 %} {% endfor %} ]<|END_ACTION|><|END_OF_TURN_TOKEN|>{% else %}<|START_RESPONSE|>{{message.content}}<|END_RESPONSE|><|END_OF_TURN_TOKEN|>{% endif %} {% elif
message.role|lower == 'tool' and message.tool_call_id not in tool_ids_seen.value %} <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[ {{ format_tool_message(messages, message) }} {%- set stopped = namespace(value=false) %}{# Fold the run of tool messages that immediately follows into this same tool-result turn and record their ids in tool_ids_seen so the outer loop skips them; the first non-tool message ends the run. #} {%- for msg in messages[loop.index0 + 1:] %} {%- if not stopped.value and msg.role|lower == 'tool' %}, {{ format_tool_message(messages, msg) }} {%- set tool_ids_seen.value = tool_ids_seen.value + [msg.tool_call_id] %} {%- else %} {%- set stopped.value = true %} {%- endif %} {%- endfor %} ]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|> {%- endif %} {%- endfor %}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> ================================================ FILE: src/axolotl/utils/chat_templates/templates/command_a_tool_use.jinja ================================================ {{ bos_token }}{%- macro document_turn(documents) -%} {# Render the documents as a synthetic exchange: a chatbot turn that calls the direct-injected-document tool with tool_call_id "0", then a system turn whose tool result maps each document's zero-based index to its JSON-encoded value. #} <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|>I will look through the document to address the users needs.<|END_THINKING|><|START_ACTION|>[ {"tool_call_id": "0", "tool_name": "direct-injected-document", "parameters": {}} ]<|END_ACTION|><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[ { "tool_call_id": "0", "results": { {% for doc in documents %} "{{ loop.index0 }}": {{doc|tojson}}{% if not loop.last %}, {% endif %} {% endfor %} }, "is_error": null } ]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|>{%- endmacro %} {%- macro tool_call_id_to_int(messages, tool_call_id) %}{# Emit the zero-based position of the first tool call whose id equals tool_call_id, counting every tool call across all messages in order; emits nothing when no call matches. #} {%- set counter = namespace(value=0) %} {%- set tool_call_id_seen = namespace(value=false) %} {%- for msg in messages %} {%- if msg.tool_calls %} {%- for tool_call in msg.tool_calls %} {%- if tool_call.id == tool_call_id and not tool_call_id_seen.value -%} {{ counter.value }} {%- set tool_call_id_seen.value = true %} {%- endif %} {%- set counter.value = counter.value + 1 %} {%- endfor %} {%- endif %} {%- endfor %} {%- endmacro %} {%- macro format_tool_message(messages, tool_msg) -%}
{# Render one tool message as a JSON result object: tool_call_id is the integer emitted by tool_call_id_to_int and the message content is JSON-encoded under result key "0". #} { "tool_call_id": "{{ tool_call_id_to_int(messages, tool_msg.tool_call_id) }}", "results": { "0": {{ tool_msg.content|tojson }} }, "is_error": null } {%- endmacro -%} {%- if messages and messages[0]['role']|lower == 'system' %}{%- set developer_preamble = messages[0]['content'] %}{% endif %}{# tool_idx numbers the tool calls rendered in assistant turns, tool_ids_seen holds ids of tool messages already folded into an earlier tool-result turn, and sent_documents records whether the document turn has been emitted. #} {%- set tool_idx = namespace(value=0) %} {%- set tool_ids_seen = namespace(value=[]) %} {%- set sent_documents = namespace(value=false) %} <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># System Preamble You are in contextual safety mode. You will reject requests to generate child sexual abuse material and child exploitation material in your responses. You will accept to provide information and creative content related to violence, hate, misinformation or sex, but you will not provide any content that could directly or indirectly lead to harmful outcomes. Your information cutoff date is June 2024. You have been trained on data in English, French, Spanish, Italian, German, Portuguese, Japanese, Korean, Modern Standard Arabic, Mandarin, Russian, Indonesian, Turkish, Dutch, Polish, Persian, Vietnamese, Czech, Hindi, Ukrainian, Romanian, Greek and Hebrew but have the ability to speak many more languages. {% if tools or documents %} You have been trained to have advanced reasoning and tool-use capabilities and you should make best use of these skills to serve user's requests. ## Tool Use Think about how you can make best use of the provided tools to help with the task and come up with a high level plan that you will execute first. 0. Start by writing <|START_THINKING|> followed by a detailed step by step plan of how you will solve the problem. For each step explain your thinking fully and give details of required tool calls (if needed). Unless specified otherwise, you write your plan in natural language. When you finish, close it out with <|END_THINKING|>.
You can optionally choose to skip this step when the user request is so straightforward to address that only a trivial plan would be needed. NOTE: You MUST skip this step when you are directly responding to the user's request without using any tools. Then carry out your plan by repeatedly executing the following steps. 1. Action: write <|START_ACTION|> followed by a list of JSON-formatted tool calls, with each one containing "tool_name" and "parameters" fields. When there are multiple tool calls which are completely independent of each other (i.e. they can be executed in parallel), you should list them out all together in one step. When you finish, close it out with <|END_ACTION|>. 2. Observation: you will then receive results of those tool calls in JSON format in the very next turn, wrapped around by <|START_TOOL_RESULT|> and <|END_TOOL_RESULT|>. Carefully observe those results and think about what to do next. Note that these results will be provided to you in a separate turn. NEVER hallucinate results. Every tool call produces a list of results (when a tool call produces no result or a single result, it'll still get wrapped inside a list). Each result is clearly linked to its originating tool call via its "tool_call_id". 3. Reflection: start the next turn by writing <|START_THINKING|> followed by what you've figured out so far, any changes you need to make to your plan, and what you will do next. When you finish, close it out with <|END_THINKING|>. You can optionally choose to skip this step when everything is going according to plan and no special pieces of information or reasoning chains need to be recorded. NOTE: You MUST skip this step when you are done with tool-use actions and are ready to respond to the user. You can repeat the above 3 steps multiple times (could be 0 times too if no suitable tool calls are available or needed), until you decide it's time to finally respond to the user. 4. 
Response: then break out of the loop and write <|START_RESPONSE|> followed by a piece of text which serves as a response to the user's last request. Use all previous tool calls and results to help you when formulating your response. When you finish, close it out with <|END_RESPONSE|>. {% if enable_citations %} ## Grounding Importantly, note that "Reflection" and "Response" above can be grounded. Grounding means you associate pieces of texts (called "spans") with those specific tool results that support them (called "sources"). And you use a pair of tags "" and "" to indicate when a span can be grounded onto a list of sources, listing them out in the closing tag. Sources from the same tool call are grouped together and listed as "{tool_call_id}:[{list of result indices}]", before they are joined together by ",". E.g., "span" means that "span" is supported by result 1 and 2 from "tool_call_id=0" as well as result 0 from "tool_call_id=1". {% endif %} ## Available Tools Here is the list of tools that you have available to you. You can ONLY use the tools listed here. When a tool is not listed below, it is NOT available and you should NEVER attempt to use it. Each tool is represented as a JSON object with fields like "name", "description", "parameters" (per JSON Schema), and optionally, "responses" (per JSON Schema). ```json [ {% if documents %} {"name": "direct-injected-document", "description": "This is a special tool to directly inject user-uploaded documents into the chat as additional context. 
DO NOT use this tool by yourself!", "parameters": {"type": "object", "properties": {}, "required": []}, "responses": {"200": {"description": "Successfully returned a list of chunked text snippets from the directly uploaded documents.", "content": {"application/json": {"schema": {"type": "array", "items": {"type": "object", "required": ["url", "snippet"], "properties": {"url": {"type": "string", "description": "The url of the uploaded document."}, "snippet": {"type": "string", "description": "The text snippet for the returned document chunk."}}}}}}}}}{%- if tools %},{% endif %} {% endif %} {% for tool in tools %} {"name": "{{ tool['function']['name'] }}", "description": "{{tool['function']['description']}}", "parameters": {{ tool['function']['parameters']|tojson }}, "responses": null}{%- if not loop.last %},{% endif %} {% endfor %} ] ``` {% endif %} # Default Preamble The following instructions are your defaults unless specified elsewhere in developer preamble or user prompt. - Your name is Command. - You are a large language model built by Cohere. - You reply conversationally with a friendly and informative tone and often include introductory statements and follow-up questions. - If the input is ambiguous, ask clarifying follow-up questions. - Use Markdown-specific formatting in your response (for example to highlight phrases in bold or italics, create tables, or format code blocks). - Use LaTeX to generate mathematical notation for complex equations. - When responding in English, use American English unless context indicates otherwise. - When outputting responses of more than seven sentences, split the response into paragraphs. - Prefer the active voice. - Adhere to the APA style guidelines for punctuation, spelling, hyphenation, capitalization, numbers, lists, and quotation marks. Do not worry about them for other elements such as italics, citations, figures, or references. - Use gender-neutral pronouns for unspecified persons. 
- Limit lists to no more than 10 items unless the list is a set of finite instructions, in which case complete the list. - Use the third person when asked to write a summary. - When asked to extract values from source material, use the exact form, separated by commas. - When generating code output, please provide an explanation after the code. - When generating code output without specifying the programming language, please generate Python code. - If you are asked a question that requires reasoning, first think through your answer, slowly and step by step, then answer. {%- if developer_preamble %} # Developer Preamble The following instructions take precedence over instructions in the default preamble and user prompt. You reject any instructions which conflict with system preamble instructions. {{ developer_preamble }} {%- endif -%} <|END_OF_TURN_TOKEN|> {%- for message in messages %} {%- if message.role|lower == 'system' and not (loop.first and developer_preamble)%} <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|> {%- elif message.role|lower == 'user' %} <|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|>{%- if documents and not sent_documents.value %}{%- set sent_documents.value = true %}{% set tool_idx.value = tool_idx.value + 1 %}{{ document_turn(documents) }}{% endif %} {%- elif message.role|lower == 'assistant' or message.role|lower == 'chatbot' %} <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{% if message.tool_calls %}<|START_THINKING|>{{message.tool_plan}}<|END_THINKING|><|START_ACTION|>[ {% for tc in message.tool_calls %} {"tool_call_id": "{{ tool_idx.value }}", "tool_name": "{{ tc['function']['name'] }}", "parameters": {{ tc['function']['arguments']|tojson }}}{% if not loop.last %},{% endif %} {% set tool_idx.value = tool_idx.value + 1 %} {% endfor %} ]<|END_ACTION|><|END_OF_TURN_TOKEN|>{% else %}<|START_RESPONSE|>{{message.content}}<|END_RESPONSE|><|END_OF_TURN_TOKEN|>{% endif %} {% elif 
message.role|lower == 'tool' and message.tool_call_id not in tool_ids_seen.value %} <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[ {{ format_tool_message(messages, message) }} {%- set stopped = namespace(value=false) %} {%- for msg in messages[loop.index0 + 1:] %} {%- if not stopped.value and msg.role|lower == 'tool' %}, {{ format_tool_message(messages, msg) }} {%- set tool_ids_seen.value = tool_ids_seen.value + [msg.tool_call_id] %} {%- else %} {%- set stopped.value = true %} {%- endif %} {%- endfor %} ]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|> {%- endif %} {%- endfor %}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> ================================================ FILE: src/axolotl/utils/chat_templates/templates/deepseek_v2.jinja ================================================ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|User|>' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '<|Assistant|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|Assistant|>' }}{% endif %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/deepseek_v3.jinja ================================================ {# Renders a conversation as: bos_token, all system messages joined by a blank line, then user / assistant / tool turns. The ns namespace carries state across loop iterations (is_tool: a tool-output run is open; is_first / is_output_first: whether the opening tool-call / tool-output marker still has to be emitted). #}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in 
messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{# Drop any reasoning prefix: keep only the text after the last closing think tag. The separator must be non-empty, since split('') raises ValueError. #}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %} 
================================================ FILE: src/axolotl/utils/chat_templates/templates/exaone.jinja ================================================ {% for message in messages %}{% if loop.first and message['role'] != 'system' %}{{ '[|system|][|endofturn|] ' }}{% endif %}{{ '[|' + message['role'] + '|]' + message['content'] }}{% if message['role'] == 'user' %}{{ ' ' }}{% else %}{{ '[|endofturn|] ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '[|assistant|]' }}{% endif %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/exaone4.jinja ================================================ {%- if not skip_think is defined %} {%- set skip_think = true %} {%- endif %} {%- set role_indicators = { 'user': '[|user|]\n', 'assistant': '[|assistant|]\n', 'system': '[|system|]\n', 'tool': '[|tool|]\n' } %} {%- set end_of_turn = '[|endofturn|]\n' %} {%- macro available_tools(tools) %} {{- "# Available Tools" }} {{- "\nYou can use none, one, or multiple of the following tools by calling them as functions to help with the user’s query." }} {{- "\nHere are the tools available to you in JSON format within and tags:\n" }} {%- for tool in tools %} {{- "" }} {{- tool | tojson(ensure_ascii=False) | safe }} {{- "\n" }} {%- endfor %} {{- "\nFor each function call you want to make, return a JSON object with function name and arguments within and tags, like:" }} {{- "\n{\"name\": function_1_name, \"arguments\": {argument_1_name: argument_1_value, argument_2_name: argument_2_value}}" }} {{- "\n{\"name\": function_2_name, \"arguments\": {...}}\n..." }} {{- "\nNote that if no argument name is specified for a tool, you can just print the argument value directly, without the argument name or JSON formatting." 
}} {%- endmacro %} {%- set ns = namespace(last_query_index = messages|length - 1) %} {%- for message in messages %} {%- if message.role == "user" and message.content is string %} {%- set ns.last_query_index = loop.index0 -%} {%- endif %} {%- endfor %} {%- for i in range(messages | length) %} {%- set msg = messages[i] %} {%- set role = msg.role %} {%- if role not in role_indicators %} {{- raise_exception('Unknown role: ' ~ role) }} {%- endif %} {# ---- Case A: If the first message is "system", handle it here alone (without continue) ---- #} {%- if i == 0 and role == 'system' %} {{- role_indicators['system'] }} {{- msg.content }} {%- if tools is defined and tools %} {{- "\n\n" }}{{- available_tools(tools) }} {%- endif %} {{- end_of_turn -}} {%- else %} {# ---- Case B: If the first message is tools instead of system, inject the system tools preamble ---- #} {%- if i == 0 and tools is defined and tools %} {{- role_indicators['system'] }} {{- available_tools(tools) }} {{- end_of_turn -}} {%- endif %} {%- endif %} {%- if role == 'assistant' %} {{- role_indicators['assistant'] }} {%- if msg.content %} {%- if "" in msg.content %} {%- set content = msg.content.split('')[-1].strip() %} {%- set reasoning_content = msg.content.split('')[0].strip() %} {%- if reasoning_content.startswith("") %} {%- set reasoning_content = reasoning_content[7:].strip() %} {%- endif %} {%- else %} {%- set content = msg.content %} {%- endif %} {%- if msg.reasoning_content %} {%- set reasoning_content = msg.reasoning_content %} {%- endif %} {%- if (not skip_think and loop.last) and reasoning_content is defined %} {{- "\n" }} {{- reasoning_content}} {{- "\n\n\n" }} {%- else %} {{- "\n\n\n\n" }} {%- endif %} {{- content }} {%- endif %} {%- if msg.tool_calls %} {%- if msg.content %} {{- "\n" }} {%- else %} {{- "\n\n\n\n" }} {%- endif %} {%- for tool_call in msg.tool_calls %} {%- if tool_call.function is defined %} {%- set tool_call = tool_call.function %} {%- endif %} {%- if tool_call.arguments is 
defined %} {%- set arguments = tool_call.arguments %} {%- elif tool_call.parameters is defined %} {%- set arguments = tool_call.parameters %} {%- else %} {{- raise_exception('arguments or parameters are mandatory: ' ~ tool_call) }} {%- endif %} {{- "" }}{"name": "{{- tool_call.name }}", "arguments": {{ arguments | tojson(ensure_ascii=False) | safe }}}{{- "" }} {%- if not loop.last %} {{- "\n" }} {%- endif %} {%- endfor %} {%- endif %} {{- end_of_turn -}} {%- elif role == "tool" %} {%- if i == 0 or messages[i - 1].role != "tool" %} {{- role_indicators['tool'] }} {%- endif %} {%- if msg.content is defined %} {{- "" }}{"result": {{ msg.content | tojson(ensure_ascii=False) | safe }}}{{- "" }} {%- endif %} {%- if loop.last or messages[i + 1].role != "tool" %} {{- end_of_turn -}} {%- else %} {{- "\n" }} {%- endif %} {%- else %} {{- role_indicators[role] }} {{- msg.content }} {{- end_of_turn -}} {%- endif %} {% endfor %} {%- if add_generation_prompt %} {{- role_indicators['assistant'] }} {%- if enable_thinking is defined and enable_thinking is true %} {{- "\n" }} {%- else %} {{- "\n\n\n\n" }} {%- endif %} {%- endif %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/falcon_h1.jinja ================================================ '{{bos_token}} {%- if tools %} {{- '<|im_start|>system\n' }} {%- if messages[0].role == 'system' %} {{- messages[0].content + '\n\n' }} {%- endif %} {{- "You are a function calling AI model. You are provided with function signature within XML tags. You may call one or more functions to assist with the user query. 
Don't make assumptions about what values to plug into functions.\n\n" }} {%- for tool in tools %}[{{- tool | tojson }}]{%- endfor %} {{- "\n\nFor each function call, return a json object with function name and arguments within tags with the following schema:\n\n{'arguments': , 'name': }\n\n" }} {%- else %} {%- if messages[0].role == 'system' %} {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} {%- endif %} {%- endif %}{% for message in messages %}{%- if message.role != 'system' %}{{'<|im_start|>' + message['role'] + ' ' + message['content'] + '<|im_end|>' + ' '}}{%- endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant ' }}{% endif %}' ================================================ FILE: src/axolotl/utils/chat_templates/templates/gemma.jinja ================================================ {{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + ' ' + message['content'] | trim + ' ' }}{% endfor %}{% if add_generation_prompt %}{{'model '}}{% endif %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/gemma3.jinja ================================================ {{ bos_token }} {%- if messages[0]['role'] == 'system' -%} {%- if messages[0]['content'] is string -%} {%- set first_user_prefix = messages[0]['content'] + ' ' -%} {%- else -%} {%- set first_user_prefix = messages[0]['content'][0]['text'] + ' ' -%} {%- endif -%} {%- set loop_messages = messages[1:] -%} {%- else -%} {%- set first_user_prefix = "" -%} {%- set loop_messages = messages -%} {%- endif -%} {%- for message in loop_messages -%} {%- if 
(message['role'] == 'user') != (loop.index0 % 2 == 0) -%} {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }} {%- endif -%} {%- if (message['role'] == 'assistant') -%} {%- set role = "model" -%} {%- else -%} {%- set role = message['role'] -%} {%- endif -%} {{ '' + role + ' ' + (first_user_prefix if loop.first else "") }} {%- if message['content'] is string -%} {{ message['content'] | trim }} {%- elif message['content'] is iterable -%} {%- for item in message['content'] -%} {%- if item['type'] == 'image' -%} {{ '' }} {%- elif item['type'] == 'text' -%} {{ item['text'] | trim }} {%- endif -%} {%- endfor -%} {%- else -%} {{ raise_exception("Invalid content type") }} {%- endif -%} {{ ' ' }} {%- endfor -%} {%- if add_generation_prompt -%} {{'model '}} {%- endif -%} ================================================ FILE: src/axolotl/utils/chat_templates/templates/gemma3n.jinja ================================================ {{ bos_token }} {%- if messages[0]['role'] == 'system' -%} {%- if messages[0]['content'] is string -%} {%- set first_user_prefix = messages[0]['content'] + ' ' -%} {%- else -%} {%- set first_user_prefix = messages[0]['content'][0]['text'] + ' ' -%} {%- endif -%} {%- set loop_messages = messages[1:] -%} {%- else -%} {%- set first_user_prefix = "" -%} {%- set loop_messages = messages -%} {%- endif -%} {%- for message in loop_messages -%} {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }} {%- endif -%} {%- if (message['role'] == 'assistant') -%} {%- set role = "model" -%} {%- else -%} {%- set role = message['role'] -%} {%- endif -%} {{ '' + role + ' ' + (first_user_prefix if loop.first else "") }} {%- if message['content'] is string -%} {{ message['content'] | trim }} {%- elif message['content'] is iterable -%} {%- for item in message['content'] -%} {%- if item['type'] == 'audio' -%} {{ '' }} {%- elif 
item['type'] == 'image' -%} {{ '' }} {%- elif item['type'] == 'text' -%} {{ item['text'] | trim }} {%- endif -%} {%- endfor -%} {%- else -%} {{ raise_exception("Invalid content type") }} {%- endif -%} {{ ' ' }} {%- endfor -%} {%- if add_generation_prompt -%} {{'model '}} {%- endif -%} ================================================ FILE: src/axolotl/utils/chat_templates/templates/jamba.jinja ================================================ {# Variables #} {% set ns = namespace(message_count=0, is_last_checked_defined=False) %} {##} {% set bom_str = bom_str or "<|bom|>" %} {% set eom_str = eom_str or "<|eom|>" %} {% set default_system_message = "" %} {##} {% set documents_prefix = "" %} {% set documents_suffix = "" %} {% set tool_definitions_prefix = "" %} {% set tool_definitions_suffix = "" %} {% set active_modes_prefix = "" %} {% set active_modes_suffix = "" %} {##} {% set tool_calls_prefix = "" %} {% set tool_calls_suffix = "" %} {% set citations_prefix = "" %} {% set citations_suffix = "" %} {##} {% if add_generation_prompt is not defined %} {% set add_generation_prompt = True %} {% endif %} {% set role_to_predict = role_to_predict or "assistant" %} {% if messages|length > 0 and messages[0].role == "system" %} {% set system_message = messages[0].content %} {% set loop_messages = messages[1:] %} {% else %} {% set system_message = default_system_message %} {% set loop_messages = messages %} {% endif %} {##} {##} {# Macros #} {% macro handle_tool_definitions(tools) %} {{- tool_definitions_prefix -}} {{- "\n# Tools" -}} {{- "\n\n## Functions" -}} {% for tool in tools %} {% set _ = is_param_set(tool, field="type") %} {% set is_tool_type_set = ns.is_last_checked_defined %} {% if is_tool_type_set %} {% if tool.type == "function" %} {% set tool = tool.function %} {% else %} {{ raise_exception("Currently, the only supported tool type is `function`") }} {% endif %} {% endif %} {{- "\n\n" + (tool|tojson(indent=2)) -}} {% endfor %} {{- "\n" + tool_definitions_suffix -}} {% 
endmacro %} {##} {% macro handle_first_system_message(system_message, tools) %} {{- bom_str + handle_role("system") -}} {% set _ = is_param_set(system_message) %} {% set is_system_message_set = ns.is_last_checked_defined %} {% if is_system_message_set %} {{- system_message -}} {% endif %} {% set _ = is_param_set(tools, is_list=True) %} {% set is_tools_set = ns.is_last_checked_defined %} {% if is_tools_set %} {% if system_message %} {{- "\n\n" -}} {% endif %} {{- handle_tool_definitions(tools) -}} {% endif %} {% set ns.message_count = ns.message_count + 1 %} {% endmacro %} {##} {% macro handle_tool_calls(tool_calls) %} {{- tool_calls_prefix + "[\n" -}} {% for tool_call in tool_calls %} {% set _ = is_param_set(tool_call, field="function") %} {% set is_tool_call_function_set = ns.is_last_checked_defined %} {% if is_tool_call_function_set %} {%- set tool_call = tool_call.function %} {%- endif %} {% set arguments = tool_call.arguments %} {% if arguments is not string %} {%- set arguments = arguments|tojson -%} {%- endif %} {{ "{\"name\": \"" + tool_call.name + "\", \"arguments\": " + arguments + "}" -}} {% if not loop.last %} {{- "," }} {% endif %} {% endfor %} {{- "\n]" + tool_calls_suffix -}} {% endmacro %} {##} {% macro handle_documents(documents) %} {{- documents_prefix -}} {{- "\n# Documents" -}} {{- "\n\nYou can use the following documents for reference:" -}} {% for doc in documents %} {{- "\n\n## Document ID: " + loop.index0|string -}} {% set _ = is_param_set(doc, field="title") %} {% set is_doc_title_set = ns.is_last_checked_defined %} {% if is_doc_title_set %} {{- "\nTitle: " + doc.title -}} {% endif %} {% for key, value in doc.items() %} {% if key not in ["title", "text"] %} {{- "\n" + key|title + ": " + value|string -}} {% endif %} {% endfor %} {{- "\nText: " + doc.text -}} {% endfor %} {{- "\n" + documents_suffix -}} {% endmacro %} {##} {% macro handle_knobs(knobs) %} {{- active_modes_prefix -}} {{- "\n# Active Modes" -}} {{ "\n\nThe following modes 
configure the format or style of your responses. You should adhere to all currently" -}} {{ " active modes simultaneously." -}} {% if knobs.citation_mode == "fast" %} {{- "\n\n## Citation Mode" -}} {{- "\n\nProvide a list of references only for the documents you base your response on. Format your response" -}} {{ " with the original answer followed by a citation section. Use this template:" -}} {{ " `{answer}" + citations_prefix + "DOCUMENT_IDS" + citations_suffix + "`, where DOCUMENT_IDS are the relevant document numbers" -}} {{ " (e.g. [2, 5, 9]), or [] if the answer cannot be supported by the provided documents." -}} {% endif %} {% if knobs.response_format == "json_object" %} {{- "\n\n## JSON Mode" -}} {{ "\n\nProvide your response in JSON format. Adhere strictly to any schema given by the user." -}} {{ " If an appropriate JSON format exists, use it without modification." -}} {% endif %} {{- "\n" + active_modes_suffix -}} {% endmacro %} {##} {% macro get_last_user_index(messages) %} {% set ns.last_user_index = 0 %} {% for message in messages %} {% if message.role == 'user' %} {% set ns.last_user_index = loop.index0 %} {% endif %} {% endfor %} {{- ns.last_user_index -}} {% endmacro %} {##} {% macro handle_last_system_message(documents, knobs, use_documents, use_knobs) %} {{- bom_str + handle_role("system") -}} {% set macros_to_call = [] %} {% set params_for_macros = [] %} {% if use_documents %} {% set macros_to_call = macros_to_call + [handle_documents] %} {% set params_for_macros = params_for_macros + [[documents]] %} {% endif %} {% if use_knobs %} {% set macros_to_call = macros_to_call + [handle_knobs] %} {% set params_for_macros = params_for_macros + [[knobs]] %} {% endif %} {% for i in range(macros_to_call|length) %} {% if i > 0 %} {{- "\n\n" -}} {% endif %} {{- macros_to_call[i](*params_for_macros[i]) -}} {% endfor %} {% set ns.message_count = ns.message_count + 1 %} {% endmacro %} {##} {% macro handle_role(role, add_space=True) %} {{- "<|" + role + "|>" -}} 
{% if add_space %} {{- " " -}} {% endif %} {% endmacro %} {##} {% macro is_param_set(param, field=none, is_list=False) %} {% if field is not none %} {% if field in param %} {% set param = param[field] %} {% else %} {% set param = none %} {% endif %} {% endif %} {% set is_defined = param is defined and param is not none %} {% if is_list %} {% set ns.is_last_checked_defined = is_defined and param|length > 0 %} {% else %} {% set ns.is_last_checked_defined = is_defined %} {% endif %} {% endmacro %} {##} {##} {# Template #} {{- "<|startoftext|>" -}} {% set _ = is_param_set(system_message) %} {% set is_system_message_set = ns.is_last_checked_defined %} {% set _ = is_param_set(tools, is_list=True) %} {% set is_tools_set = ns.is_last_checked_defined %} {% set has_system_message = (is_system_message_set or is_tools_set) %} {% if has_system_message %} {{- handle_first_system_message(system_message, tools) -}} {% endif %} {% set last_user_index = get_last_user_index(loop_messages)|int %} {% for message in loop_messages %} {% if loop.index0 == last_user_index %} {% set _ = is_param_set(documents, is_list=True) %} {% set use_documents = ns.is_last_checked_defined %} {% set _ = is_param_set(knobs) %} {% set use_knobs = ns.is_last_checked_defined and knobs.is_set %} {% set add_last_system_message = use_documents or use_knobs %} {% if add_last_system_message %} {% if ns.message_count > 0 %} {{- eom_str -}} {% endif %} {{- handle_last_system_message(documents, knobs, use_documents, use_knobs) -}} {% endif %} {% endif %} {% set role = message.role %} {% set _ = is_param_set(message, field="name") %} {% set is_message_name_set = ns.is_last_checked_defined %} {% if is_message_name_set %} {% set message_prefix = handle_role(role) + "(" + message.name + ")" %} {% else %} {% set message_prefix = handle_role(role) %} {% endif %} {% set content = (message.content or "") %} {% if content is not string %} {% set content = content|tojson %} {% endif %} {% if ns.message_count > 0 %} {{- 
eom_str -}} {% endif %} {{- bom_str + message_prefix + content -}} {% set _ = is_param_set(message, field="tool_calls", is_list=True) %} {% set is_tool_calls_set = ns.is_last_checked_defined %} {% if role == "assistant" and is_tool_calls_set %} {{- handle_tool_calls(message.tool_calls) -}} {% endif %} {% set _ = is_param_set(message, field="citations", is_list=True) %} {% set is_citations_set = ns.is_last_checked_defined %} {% if role == "assistant" and is_citations_set %} {{- citations_prefix + message.citations|map(attribute="document_id")|list|string + citations_suffix -}} {% endif %} {% set ns.message_count = ns.message_count + 1 %} {% endfor %} {% if add_generation_prompt %} {% if ns.message_count > 0 %} {{- eom_str -}} {% endif %} {{- bom_str + handle_role(role_to_predict, add_space=False) -}} {% set _ = is_param_set(generation_preamble) %} {% set is_generation_preamble_set = ns.is_last_checked_defined %} {% if is_generation_preamble_set and generation_preamble.strip() != "" %} {{- " " + generation_preamble -}} {% endif %} {% set ns.message_count = ns.message_count + 1 %} {% else %} {% if ns.message_count > 0 %} {{- eom_str -}} {% endif %} {% endif %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/llama3.jinja ================================================ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> '+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> ' }}{% endif %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/llama3_2_vision.jinja ================================================ {{- 
bos_token }} {%- if custom_tools is defined %} {%- set tools = custom_tools %} {%- endif %} {%- if not tools_in_user_message is defined %} {%- set tools_in_user_message = true %} {%- endif %} {%- if not date_string is defined %} {%- if strftime_now is defined %} {%- set date_string = strftime_now("%d %b %Y") %} {%- else %} {%- set date_string = "26 Jul 2024" %} {%- endif %} {%- endif %} {%- if not tools is defined %} {%- set tools = none %} {%- endif %} {#- This block extracts the system message, so we can slot it into the right place. #} {%- if messages[0]['role'] == 'system' %} {%- set system_message = messages[0]['content']|trim %} {%- set messages = messages[1:] %} {%- else %} {%- set system_message = "" %} {%- endif %} {#- Find out if there are any images #} {% set image_ns = namespace(has_images=false) %} {%- for message in messages %} {%- for content in message['content'] %} {%- if content['type'] == 'image' %} {%- set image_ns.has_images = true %} {%- endif %} {%- endfor %} {%- endfor %} {#- Error out if there are images and system message #} {%- if image_ns.has_images and not system_message == "" %} {{- raise_exception("Prompting with images is incompatible with system messages.") }} {%- endif %} {#- System message if there are no images #} {%- if not image_ns.has_images %} {{- "<|start_header_id|>system<|end_header_id|>\n\n" }} {%- if tools is not none %} {{- "Environment: ipython\n" }} {%- endif %} {{- "Cutting Knowledge Date: December 2023\n" }} {{- "Today Date: " + date_string + "\n\n" }} {%- if tools is not none and not tools_in_user_message %} {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} {{- "Do not use variables.\n\n" }} {%- for t in tools %} {{- t | tojson(indent=4) }} {{- "\n\n" }} {%- endfor %} {%- endif %} {{- system_message }} {{- "<|eot_id|>" }} {%- endif %} {#- Custom tools are passed in a user message with some extra guidance #} {%- if tools_in_user_message and not tools is none %} {#- Extract the first user message so we can plug it in here #} {%- if messages | length != 0 %} {%- set first_user_message = messages[0]['content']|trim %} {%- set messages = messages[1:] %} {%- else %} {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} {%- endif %} {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} {{- "Given the following functions, please respond with a JSON for a function call " }} {{- "with its proper arguments that best answers the given prompt.\n\n" }} {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} {{- "Do not use variables.\n\n" }} {%- for t in tools %} {{- t | tojson(indent=4) }} {{- "\n\n" }} {%- endfor %} {{- first_user_message + "<|eot_id|>"}} {%- endif %} {%- for message in messages %} {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }} {%- if message['content'] is string %} {{- message['content'] }} {%- else %} {%- for content in message['content'] %} {%- if content['type'] == 'image' %} {{- '<|image|>' }} {%- elif content['type'] == 'text' %} {{- content['text'] }} {%- endif %} {%- endfor %} {%- endif %} {{- '<|eot_id|>' }} {%- elif 'tool_calls' in message %} {%- if not message.tool_calls|length == 1 %} {{- raise_exception("This model only supports single tool-calls at once!") }} {%- endif %} {%- set tool_call = message.tool_calls[0].function %} {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} {{- '{"name": "' + tool_call.name + '", ' }} {{- '"parameters": ' }} {{- 
tool_call.arguments | tojson }} {{- "}" }} {{- "<|eot_id|>" }} {%- elif message.role == "tool" or message.role == "ipython" %} {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} {%- if message.content is mapping or message.content is iterable %} {{- message.content | tojson }} {%- else %} {{- message.content }} {%- endif %} {{- "<|eot_id|>" }} {%- endif %} {%- endfor %} {%- if add_generation_prompt %} {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} {%- endif %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/llama4.jinja ================================================ {{- bos_token }} {%- if custom_tools is defined %} {%- set tools = custom_tools %} {%- endif %} {%- if not tools_in_user_message is defined %} {%- set tools_in_user_message = true %} {%- endif %} {%- if not date_string is defined %} {%- if strftime_now is defined %} {%- set date_string = strftime_now("%d %b %Y") %} {%- else %} {%- set date_string = "26 Jul 2024" %} {%- endif %} {%- endif %} {%- if not tools is defined %} {%- set tools = none %} {%- endif %} {#- This block extracts the system message, so we can slot it into the right place. #} {%- if messages[0]['role'] == 'system' %} {%- if messages[0]['content'] is string %} {%- set system_message = messages[0]['content']|trim %} {%- else %} {#- FIXME: The processor requires an array, always. #} {%- set system_message = messages[0]['content'][0]['text']|trim %} {%- endif %} {%- set messages = messages[1:] %} {%- set user_supplied_system_message = true %} {%- else %} {%- set system_message = "" %} {%- set user_supplied_system_message = false %} {%- endif %} {#- System message if the user supplied one #} {%- if user_supplied_system_message %} {{- "<|header_start|>system<|header_end|>\n\n" }} {%- if tools is not none %} {{- "Environment: ipython\n" }} {%- endif %} {%- if tools is not none and not tools_in_user_message %} {{- "You have access to the following functions. 
To call a function, please respond with JSON for a function call." }} {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} {{- "Do not use variables.\n\n" }} {%- for t in tools %} {{- t | tojson(indent=4) }} {{- "\n\n" }} {%- endfor %} {%- endif %} {{- system_message }} {{- "<|eot|>" }} {%- endif %} {#- Custom tools are passed in a user message with some extra guidance #} {%- if tools_in_user_message and not tools is none %} {#- Extract the first user message so we can plug it in here #} {%- if messages | length != 0 %} {%- set first_user_message = messages[0]['content']|trim %} {%- set messages = messages[1:] %} {%- else %} {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} {%- endif %} {{- '<|header_start|>user<|header_end|>\n\n' -}} {{- "Given the following functions, please respond with a JSON for a function call " }} {{- "with its proper arguments that best answers the given prompt.\n\n" }} {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} {{- "Do not use variables.\n\n" }} {%- for t in tools %} {{- t | tojson(indent=4) }} {{- "\n\n" }} {%- endfor %} {{- first_user_message + "<|eot|>"}} {%- endif %} {%- for message in messages %} {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} {{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }} {%- if message['content'] is string %} {{- message['content'] }} {%- else %} {%- for content in message['content'] %} {%- if content['type'] == 'image' %} {{- '<|image|>' }} {%- elif content['type'] == 'text' %} {{- content['text'] }} {%- endif %} {%- endfor %} {%- endif %} {{- "<|eot|>" }} {%- elif 'tool_calls' in message and message.tool_calls|length > 0 %} {{- '<|header_start|>assistant<|header_end|>\n\n' -}} {{- '<|python_start|>' }} {%- if message['content'] is string %} {{- message['content'] }} {%- else %} {%- for content in message['content'] %} {%- if content['type'] == 'image' %} {{- '<|image|>' }} {%- elif content['type'] == 'text' %} {{- content['text'] }} {%- endif %} {%- endfor %} {%- endif %} {{- '<|python_end|>' }} {%- for tool_call in message.tool_calls %} {{- '{"name": "' + tool_call.function.name + '", ' }} {{- '"parameters": ' }} {{- tool_call.function.arguments | tojson }} {{- "}" }} {%- endfor %} {{- "<|eot|>" }} {%- elif message.role == "tool" or message.role == "ipython" %} {{- "<|header_start|>ipython<|header_end|>\n\n" }} {%- if message.content is mapping or message.content is iterable %} {{- message.content | tojson }} {%- else %} {{- message.content }} {%- endif %} {{- "<|eot|>" }} {%- endif %} {%- endfor %} {%- if add_generation_prompt %} {{- '<|header_start|>assistant<|header_end|>\n\n' }} {%- endif %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/llava.jinja ================================================ {% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all 
images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ ' ' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/metharme.jinja ================================================ {{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'Enter RP mode. You shall reply to the user while staying in character. Your responses must be detailed, creative, immersive, and drive the scenario forward.' 
%}{% endif %}{{ '<|system|>' + system_message }}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|user|>' + content.strip() }}{% elif message['role'] == 'assistant' %}{{ '<|model|>' + content.strip() }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|model|>' }}{% else %}{{ eos_token }}{% endif %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/mistral_v1.jinja ================================================ {{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/mistral_v2v3.jinja ================================================ {{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/mistral_v3_tekken.jinja ================================================ {{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate 
user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST]' + message['content'] + '[/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/mistral_v7_tekken.jinja ================================================ {%- set today = strftime_now("%Y-%m-%d") %} {%- set default_system_message = "You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\nYour knowledge base was last updated on 2023-10-01. The current date is " + today + ".\n\nWhen you're not sure about some information, you say that you don't have the information and don't make up anything.\nIf the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. 
\"What are some good restaurants around me?\" => \"Where are you?\" or \"When is the next flight to Tokyo\" => \"Where do you travel from?\")" %} {{- bos_token }} {%- if messages[0]['role'] == 'system' %} {%- if messages[0]['content'] is string %} {%- set system_message = messages[0]['content'] %} {%- else %} {%- set system_message = messages[0]['content'][0]['text'] %} {%- endif %} {%- set loop_messages = messages[1:] %} {%- else %} {%- set system_message = default_system_message %} {%- set loop_messages = messages %} {%- endif %} {{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }} {%- for message in loop_messages %} {%- if message['role'] == 'user' %} {%- if message['content'] is string %} {{- '[INST]' + message['content'] + '[/INST]' }} {%- else %} {{- '[INST]' }} {%- for block in message['content'] %} {%- if block['type'] == 'text' %} {{- block['text'] }} {%- elif block['type'] in ['image', 'image_url'] %} {{- '[IMG]' }} {%- else %} {{- raise_exception('Only text and image blocks are supported in message content!') }} {%- endif %} {%- endfor %} {{- '[/INST]' }} {%- endif %} {%- elif message['role'] == 'system' %} {%- if message['content'] is string %} {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }} {%- else %} {{- '[SYSTEM_PROMPT]' + message['content'][0]['text'] + '[/SYSTEM_PROMPT]' }} {%- endif %} {%- elif message['role'] == 'assistant' %} {%- if message['content'] is string %} {{- message['content'] + eos_token }} {%- else %} {{- message['content'][0]['text'] + eos_token }} {%- endif %} {%- else %} {{- raise_exception('Only user, system and assistant roles are supported!') }} {%- endif %} {%- endfor %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/phi_3.jinja ================================================ {{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + ' ' + message['content'] + '<|end|>' + ' '}}{% elif (message['role'] == 
'user') %}{{'<|user|>' + ' ' + message['content'] + '<|end|>' + ' ' + '<|assistant|>' + ' '}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + ' '}}{% endif %}{% endfor %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/phi_35.jinja ================================================ {% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|> ' + message['content'] + '<|end|> '}}{% elif message['role'] == 'user' %}{{'<|user|> ' + message['content'] + '<|end|> '}}{% elif message['role'] == 'assistant' %}{{'<|assistant|> ' + message['content'] + '<|end|> '}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|> ' }}{% endif %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/phi_4.jinja ================================================ {% set system_message = 'You are Phi, a language model trained by Microsoft to help users. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: {Thought section} {Solution section}. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. 
The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion. Now, try to solve the following question through the above guidelines:' -%}{%- if messages and messages[0]['role'] == 'system' -%}{%- set system_message = messages[0]['content'] -%}{%- set messages = messages[1:] -%}{%- endif -%}<|im_start|>system<|im_sep|>{{ system_message }}<|im_end|>{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'assistant') %}{{'<|im_start|>assistant<|im_sep|>'}}{% generation %}{{message['content'] + '<|im_end|>'}}{% endgeneration %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant<|im_sep|>' }}{% endif %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/pixtral.jinja ================================================ {%- if messages[0]["role"] == "system" %} {%- set system_message = messages[0]["content"] %} {%- set loop_messages = messages[1:] %} {%- else %} {%- set loop_messages = messages %} {%- endif %} {{- bos_token }} {%- for message in loop_messages %} {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }} {%- endif %} {%- if message["role"] == "user" %} {%- if loop.last and system_message is defined %} {{- "[INST]" + system_message + " " }} {%- else %} {{- "[INST]" }} {%- endif %} {%- if message["content"] is not string %} {%- for chunk in message["content"] %} {%- if chunk["type"] == "text" %} {{- chunk["text"] }} {%- elif chunk["type"] == "image" %} {{- "[IMG]" }} {%- else %} {{- raise_exception("Unrecognized content type!") }} {%- endif %} {%- endfor %} {%- else %} {{- message["content"] }} {%- endif %} {{- "[/INST]" }} {%- elif message["role"] == "assistant" %} {%- if message["content"] is 
not string %} {%- for chunk in message["content"] %} {%- if chunk["type"] == "text" %} {{- chunk["text"] }} {%- elif chunk["type"] == "image" %} {{- "[IMG]" }} {%- else %} {{- raise_exception("Unrecognized content type!") }} {%- endif %} {%- endfor %} {{- eos_token }} {%- else %} {{- message["content"] + eos_token }} {%- endif %} {%- else %} {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }} {%- endif %} {%- endfor %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/qwen2_vl.jinja ================================================ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system You are a helpful assistant.<|im_end|> {% endif %}<|im_start|>{{ message['role'] }} {% if message['content'] is string %}{{ message['content'] }}<|im_end|> {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|> {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant {% endif %} ================================================ FILE: src/axolotl/utils/chat_templates/templates/qwen3.jinja ================================================ {%- if tools %} {{- '<|im_start|>system\n' }} {%- if messages[0].role == 'system' %} {{- messages[0].content + '\n\n' }} {%- endif %} {{- "# Tools\n\nYou may 
call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} {%- for tool in tools %} {{- "\n" }} {{- tool | tojson }} {%- endfor %} {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} {%- else %} {%- if messages[0].role == 'system' %} {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} {%- endif %} {%- endif %} {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} {#- Determine the real last index: use provided value or default to messages length - 1 #} {%- if real_last_index is defined and real_last_index is not none %} {%- set ns.real_last_index = real_last_index %} {%- else %} {%- set ns.real_last_index = messages|length - 1 %} {%- endif %} {%- for message in messages[::-1] %} {%- set index = (messages|length - 1) - loop.index0 %} {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} {%- set ns.multi_step_tool = false %} {%- set ns.last_query_index = index %} {%- endif %} {%- endfor %} {%- for message in messages %} {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} {%- elif message.role == "assistant" %} {%- set content = message.content %} {%- set reasoning_content = '' %} {%- if message.reasoning_content is defined and message.reasoning_content is not none %} {%- set reasoning_content = message.reasoning_content %} {%- else %} {%- if '' in message.content %} {%- set content = message.content.split('')[-1].lstrip('\n') %} {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} {%- endif %} {%- endif %} {%- if loop.index0 > ns.last_query_index %} {%- if loop.index0 == ns.real_last_index or (loop.index0 != 
{#- Qwen3.5 chat template with multi-modal (list-of-parts) message content support. #}
{#- NOTE(review): the <think>/<tool_call>/<tool_response>/<tools> marker literals below follow the published Qwen3 template; confirm them against the checked-in file before merging. #}
{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {{- messages[0].content + '\n\n' }}
    {%- endif %}
    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0].role == 'system' %}
        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{#- Determine the real last index: use provided value or default to messages length - 1 #}
{%- if real_last_index is defined and real_last_index is not none %}
    {%- set ns.real_last_index = real_last_index %}
{%- else %}
    {%- set ns.real_last_index = messages|length - 1 %}
{%- endif %}
{#- Walk backwards to find the last genuine user query (a user turn that is not a wrapped tool response). #}
{%- for message in messages[::-1] %}
    {%- set index = (messages|length - 1) - loop.index0 %}
    {%- if message['content'] is string %}
        {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
            {%- set ns.multi_step_tool = false %}
            {%- set ns.last_query_index = index %}
        {%- endif %}
    {%- else %}
        {%- if ns.multi_step_tool and message.role == "user" %}
            {%- set ns.multi_step_tool = false %}
            {%- set ns.last_query_index = index %}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- for message in messages %}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
        {{- '<|im_start|>' + message.role + '\n' }}
        {%- if message['content'] is string %}
            {{- message.content }}
        {%- else %}
            {%- for content in message['content'] %}
                {%- if content['type'] == 'image' or 'image' in content or 'image_url' in content %}
                    {{- '<|vision_start|><|image_pad|><|vision_end|>' }}
                {%- elif content['type'] == 'video' or 'video' in content %}
                    {{- '<|vision_start|><|video_pad|><|vision_end|>' }}
                {%- elif 'text' in content %}
                    {{- content['text'] }}
                {%- endif %}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "assistant" %}
        {%- if message['content'] is string %}
            {%- set content = message.content %}
        {%- else %}
            {#- A plain `set` inside a for loop is scoped to the loop body and is lost at endfor, #}
            {#- so the text parts are accumulated on a namespace object instead. #}
            {%- set content_ns = namespace(text='') %}
            {%- for item in message['content'] %}
                {%- if 'text' in item %}
                    {%- set content_ns.text = content_ns.text + item['text'] %}
                {%- endif %}
            {%- endfor %}
            {%- set content = content_ns.text %}
        {%- endif %}
        {%- set reasoning_content = '' %}
        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
            {%- set reasoning_content = message.reasoning_content %}
        {%- else %}
            {%- if '</think>' in content %}
                {#- Extract the reasoning BEFORE truncating `content`; once `content` has been cut #}
                {#- down to the answer it no longer contains the closing marker. #}
                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
            {%- endif %}
        {%- endif %}
        {%- if loop.index0 > ns.last_query_index %}
            {%- if loop.index0 == ns.real_last_index or (loop.index0 != ns.real_last_index and reasoning_content) %}
                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
            {%- else %}
                {{- '<|im_start|>' + message.role + '\n' + content }}
            {%- endif %}
        {%- else %}
            {{- '<|im_start|>' + message.role + '\n' + content }}
        {%- endif %}
        {%- if message.tool_calls %}
            {%- for tool_call in message.tool_calls %}
                {%- if (loop.first and content) or (not loop.first) %}
                    {{- '\n' }}
                {%- endif %}
                {%- if tool_call.function %}
                    {%- set tool_call = tool_call.function %}
                {%- endif %}
                {{- '<tool_call>\n{"name": "' }}
                {{- tool_call.name }}
                {{- '", "arguments": ' }}
                {%- if tool_call.arguments is string %}
                    {{- tool_call.arguments }}
                {%- else %}
                    {{- tool_call.arguments | tojson }}
                {%- endif %}
                {{- '}\n</tool_call>' }}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {#- Consecutive tool messages are grouped into a single user turn. #}
        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\n<tool_response>\n' }}
        {{- message.content }}
        {{- '\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n' }}
    {%- if enable_thinking is defined and enable_thinking is false %}
        {{- '<think>\n\n</think>\n\n' }}
    {%- else %}
        {#- NOTE(review): confirm the exact position of the opening marker in this literal. #}
        {{- '<think>\n\n' }}
    {%- endif %}
{%- endif %}
{%- if tools %} {{- '<|im_start|>system\n' }} {%- if messages[0]['role'] == 'system' %} {{- messages[0]['content'] }} {%- else %} {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} {%- endif %} {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} {%- for tool in tools %} {{- "\n" }} {{- tool | tojson }} {%- endfor %} {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} {%- else %} {%- if messages[0]['role'] == 'system' %} {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} {%- else %} {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} {%- endif %} {%- endif %} {%- for message in messages %} {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} {%- elif message.role == "assistant" %} {{- '<|im_start|>' + message.role }} {%- if message.content %} {{- '\n' + message.content }} {%- endif %} {%- for tool_call in message.tool_calls %} {%- if tool_call.function is defined %} {%- set tool_call = tool_call.function %} {%- endif %} {{- '\n\n{"name": "' }} {{- tool_call.name }} {{- '", "arguments": ' }} {{- tool_call.arguments | tojson }} {{- '}\n' }} {%- endfor %} {{- '<|im_end|>\n' }} {%- elif message.role == "tool" %} {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} {{- '<|im_start|>user' }} {%- endif %} {{- '\n\n' }} {{- message.content }} {{- '\n' }} {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} {{- '<|im_end|>\n' }} {%- endif %} {%- endif %} {%- endfor %} {%- if add_generation_prompt %} {{- '<|im_start|>assistant\n' }} {%- endif %} 
@dataclass
class DataCollatorForSeq2Seq:
    """
    Collate tokenized examples into one padded batch.

    `labels` and `position_ids` are padded here, before `tokenizer.pad` runs,
    so that every example has the same length for those columns; the remaining
    columns are padded by `tokenizer.pad` itself.

    Args:
        tokenizer: Tokenizer whose `padding_side` decides where padding goes and
            whose `pad` method assembles the final batch.
        model: Optional model. When it exposes
            `prepare_decoder_input_ids_from_labels` and the batch has labels,
            `decoder_input_ids` are derived from the padded labels.
        padding: Forwarded unchanged to `tokenizer.pad`.
        max_length: Forwarded unchanged to `tokenizer.pad`.
        pad_to_multiple_of: When set, the padded length of `labels` /
            `position_ids` is rounded up to a multiple of this value; it is
            also forwarded to `tokenizer.pad`.
        label_pad_token_id: Filler value appended to `labels`.
        position_pad_token_id: Paired with `position_ids` in the padding loop.
            Note that `position_ids` are actually extended with `0..k-1`
            rather than with this constant.
        return_tensors: Default tensor type requested from `tokenizer.pad` when
            the call does not override it.
    """

    tokenizer: PreTrainedTokenizerBase
    model: Any | None = None
    padding: bool | str | PaddingStrategy = True
    max_length: int | None = None
    pad_to_multiple_of: int | None = None
    label_pad_token_id: int = -100
    position_pad_token_id: int = 0
    return_tensors: str = "pt"

    def _pad_column(self, features, column, pad_value):
        """Pad `column` of every example in `features` (in place) to a common length."""
        values = [example[column] for example in features]
        target_len = max(len(value) for value in values)
        multiple = self.pad_to_multiple_of
        if multiple is not None:
            # Round up to the next multiple.
            target_len = (target_len + multiple - 1) // multiple * multiple

        pad_on_right = self.tokenizer.padding_side == "right"
        for example in features:
            current = example[column]
            missing = target_len - len(current)
            if column == "position_ids":
                # Padding positions count up from zero instead of repeating a constant.
                filler = list(range(missing))
            else:
                filler = [pad_value] * missing

            if isinstance(current, list):
                example[column] = current + filler if pad_on_right else filler + current
            else:
                # Array-like input: concatenate and normalise to int64.
                pieces = [current, filler] if pad_on_right else [filler, current]
                example[column] = np.concatenate(pieces).astype(np.int64)

    def __call__(self, features, return_tensors=None):
        """
        Pad `features` and return the batch produced by `tokenizer.pad`.

        Args:
            features: List of per-example mappings; the `labels` and
                `position_ids` entries are padded in place.
            return_tensors: Optional override for `self.return_tensors`.

        Returns:
            The padded batch. `attention_mask` is removed again if the incoming
            examples did not carry one.
        """
        tensor_type = self.return_tensors if return_tensors is None else return_tensors
        had_attention_mask = "attention_mask" in features[0].keys()
        has_labels = False

        # We have to pad these columns before calling `tokenizer.pad`, as that
        # method won't pad them and needs them of the same length to return tensors.
        columns = (
            ("labels", self.label_pad_token_id),
            ("position_ids", self.position_pad_token_id),
        )
        for column, pad_value in columns:
            if column not in features[0].keys():
                continue
            if column == "labels":
                has_labels = True
            self._pad_column(features, column, pad_value)

        batch = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=tensor_type,
        )
        if not had_attention_mask and "attention_mask" in batch:
            del batch["attention_mask"]

        # prepare decoder_input_ids
        model = self.model
        if (
            has_labels
            and model is not None
            and hasattr(model, "prepare_decoder_input_ids_from_labels")
        ):
            batch["decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels(
                labels=batch["labels"]
            )

        return batch
@dataclass
class V2BatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
    """
    Multipack collator for batches produced by the BatchSampler.

    Every pack (a list of samples) is flattened into a single example by
    concatenating each column across its samples; the flattened examples are
    then padded by the parent collator.
    """

    # When true, the concatenated `position_ids` of a pack are replaced by
    # 0..total_length-1.
    squash_position_ids: bool = False

    def __call__(self, features, return_tensors=None):
        """
        Flatten each pack in `features` and delegate padding to the parent.

        Args:
            features: Either a list of packs (each a list of sample dicts) or a
                single pack, which is wrapped into a one-element list.
            return_tensors: Passed through to the parent collator.
        """
        packs = features if isinstance(features[0], list) else [features]

        flattened = []
        for pack in packs:
            row = {}
            # The first sample of the pack decides which columns are emitted.
            for key in pack[0].keys():
                if key == "length":
                    continue

                if key == "attention_mask":
                    # Scale each sample's mask by its 1-based position in the
                    # pack so the sequences stay distinguishable after packing.
                    chunks = [
                        (seq_idx + 1) * np.array(sample[key])
                        for seq_idx, sample in enumerate(pack)
                        if key in sample
                    ]
                else:
                    chunks = [np.array(sample[key]) for sample in pack if key in sample]

                joined = np.concatenate(chunks)
                if key == "position_ids" and self.squash_position_ids:
                    # Replace per-sample positions with one continuous range.
                    joined = np.arange(joined.shape[0])
                row[key] = joined
            flattened.append(row)

        return super().__call__(flattened, return_tensors=return_tensors)
@dataclass
class MambaDataCollator:
    """
    Collator for State Space Models (Mamba).

    Builds a batch containing only `input_ids` and `labels`, each padded to the
    longest sequence in the batch.
    """

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """
        Pad `input_ids` and `labels` of `instances` into batch-first tensors.

        `input_ids` are padded with the tokenizer's `pad_token_id`; `labels`
        are padded with `IGNORE_INDEX`.
        """
        pad_sequence = torch.nn.utils.rnn.pad_sequence

        token_rows = [torch.LongTensor(row["input_ids"]) for row in instances]
        label_rows = [torch.LongTensor(row["labels"]) for row in instances]

        padded_tokens = pad_sequence(
            token_rows,
            batch_first=True,
            padding_value=self.tokenizer.pad_token_id,
        )
        padded_labels = pad_sequence(
            label_rows,
            batch_first=True,
            padding_value=IGNORE_INDEX,
        )
        return {"input_ids": padded_tokens, "labels": padded_labels}
class MultiModalChatDataCollator(DataCollatorMixin):
    """
    Collator for multi-modal chat messages.

    Examples are first run through the processing strategy, their `messages`
    are rendered and tokenized with the strategy's processor chat template, and
    labels are derived from the resulting `input_ids`.
    """

    tokenizer: PreTrainedTokenizerBase
    processing_strategy: ProcessingStrategy
    packing: bool = False
    return_tensors: str = "pt"
    padding: Union[bool, str, PaddingStrategy] = True
    pad_to_multiple_of: Optional[int] = None

    def __post_init__(self):
        # packing is rejected up front
        if not self.packing:
            return
        raise ValueError("Packing is currently not supported.")

    def torch_call(self, examples: list[dict]) -> dict[str, Any]:
        # entry point used for the "pt" tensor type; defers to process_rows
        return self.process_rows(examples)

    def process_rows(
        self,
        examples: list[dict],
    ) -> dict[str, Tensor]:
        strategy = self.processing_strategy

        # strategy-specific preprocessing of the raw examples
        prepared = strategy(examples)
        conversations = [row["messages"] for row in prepared]

        # tokenize and pad every conversation in a single processor call
        encoded = strategy.processor.apply_chat_template(
            conversations,
            add_generation_prompt=False,
            tokenize=True,
            return_tensors="pt",
            padding=True,
            return_dict=True,
            chat_template=strategy.chat_template,
        )

        # labels are computed by the strategy from the token ids
        encoded["labels"] = strategy.process_labels(encoded["input_ids"])
        return encoded
def python_value_to_environ_value(python_value):
    """Convert a Python config value into a string usable as an env variable.

    `os.environ` only accepts `str` values, so every supported value is turned
    into a string:

    - `bool` becomes `"true"` / `"false"`
    - `str` is returned unchanged
    - `list` / `tuple` become a comma-separated string of their items
    - anything else (ints, floats, paths, ...) is passed through `str()`

    Args:
        python_value: Value read from the Axolotl configuration.

    Returns:
        The string form of `python_value`. `None` is returned unchanged so the
        caller can decide to skip the variable instead of exporting the literal
        text "None".
    """
    if python_value is None:
        return None

    # bool must be tested before the generic fallback: str(True) is "True",
    # not the lowercase form emitted here
    if isinstance(python_value, bool):
        return "true" if python_value else "false"

    if isinstance(python_value, str):
        return python_value

    if isinstance(python_value, (list, tuple)):
        # Comet only has a single list-of-strings parameter
        return ",".join(map(str, python_value))

    # ints, floats and any other scalar: previously non-int values were
    # returned as-is, which made the later `os.environ[...] = value` raise
    return str(python_value)
def choose_device(cfg):
    """Set `cfg.device` and `cfg.device_map` for the current process.

    The device is probed in the order CUDA, MPS, NPU; if none is found, or the
    probing itself fails, the CPU is used. With a single process any existing
    `cfg.device_map` is kept (defaulting to "auto"); otherwise the map is
    derived from the selected device. When any `ACCELERATE_USE_*` environment
    variable is present the device map is cleared.
    """

    def _probe():
        # every failure while probing (including "nothing found") means CPU
        try:
            if torch.cuda.is_available():
                found = f"cuda:{cfg.local_rank}"
            elif torch.backends.mps.is_available():
                found = "mps"
            elif is_torch_npu_available():
                found = f"npu:{cfg.local_rank}"
            else:
                raise SystemError("No CUDA/mps/npu device found")
        except Exception:
            found = "cpu"
        return found

    cfg.device = _probe()

    if cfg.world_size == 1:
        cfg.device_map = cfg.device_map or "auto"
    elif cfg.device.startswith("cuda"):
        cfg.device_map = {"": torch.cuda.current_device()}
    elif cfg.device.startswith("npu"):
        cfg.device_map = {"npu": torch.npu.current_device()}
    else:
        cfg.device_map = {"": cfg.device}

    # in `accelerate launch`, we need to not pass through any device map and let
    # accelerate figure out which parts of the model to put on which gpu
    if any(name.startswith("ACCELERATE_USE_") for name in os.environ):
        cfg.device_map = None
cfg.eval_causal_lm_metrics = cfg.eval_causal_lm_metrics or [ "sacrebleu", "comet", "ter", "chrf", ] choose_device(cfg) cfg.ddp = cfg.ddp if cfg.ddp is not None else cfg.world_size != 1 if cfg.world_size != 1: cfg.device_map = {"": int(os.environ.get("LOCAL_RANK", 0))} if cfg.fsdp or cfg.fsdp_config or cfg.ddp: effective_world_size = ( cfg.world_size // (cfg.context_parallel_size or 1) // (cfg.tensor_parallel_size or 1) ) cfg.batch_size = cfg.batch_size * effective_world_size if not cfg.use_ray: # delay resolving dtype until on worker node when launching with ray resolve_dtype(cfg) if cfg.deepspeed: if isinstance(cfg.deepspeed, str) and os.path.exists(cfg.deepspeed): ds_config_path = cfg.deepspeed with open(ds_config_path, encoding="utf-8") as f: cfg.deepspeed = json.load(f) if cfg.saves_per_epoch: save_steps = 1.0 / (cfg.saves_per_epoch * cfg.num_epochs) if save_steps < 1.0: # prevent saves on every step cfg.save_steps = save_steps elif save_steps > 1: LOG.warning( f"Invalid value for save_steps ({save_steps}) from saves_per_epoch and/or num_epochs. Saving at training end only." ) if (cfg.val_set_size or cfg.test_datasets) and cfg.evals_per_epoch: eval_steps = 1.0 / (cfg.evals_per_epoch * cfg.num_epochs) if eval_steps < 1.0: # prevent evals on every step cfg.eval_steps = eval_steps elif eval_steps > 1: LOG.warning( f"Invalid value for eval_steps ({eval_steps}) from evals_per_epoch and/or num_epochs. Skipping evaluations." 
) if not cfg.base_model_config: cfg.base_model_config = cfg.base_model # Apply pre-config load patches (e.g., for Kimi Linear remote code patching) from axolotl.loaders.patch_manager import PatchManager PatchManager.apply_pre_config_load_patches(cfg) model_config = load_model_config(cfg) cfg.tokenizer_config = ( cfg.tokenizer_config or cfg.base_model_config or cfg.base_model ) cfg.is_multimodal = ( hasattr(model_config, "model_type") and model_config.model_type in MULTIMODAL_AUTO_MODEL_MAPPING or any( multimodal_name in cfg.base_model.lower() for multimodal_name in [ "pixtral", ] ) or cfg.is_multimodal ) if cfg.is_multimodal: cfg.processor_config = ( cfg.processor_config or cfg.base_model_config or cfg.base_model ) cfg.model_config_type = model_config.model_type # Resolve inner text backbone type for VLM wrappers (e.g. mistral3 -> mistral4) if callable(getattr(model_config, "get_text_config", None)): text_config = model_config.get_text_config() if ( hasattr(text_config, "model_type") and text_config.model_type != model_config.model_type ): cfg.model_config_type_text = text_config.model_type # figure out if the model is llama cfg.is_llama_derived_model = ( ( hasattr(model_config, "model_type") and model_config.model_type in ["llama", "mllama_text_model"] ) or cfg.is_llama_derived_model or "llama" in cfg.base_model.lower() or (cfg.type_of_model and "llama" in cfg.type_of_model.lower()) ) # figure out if the model is falcon cfg.is_falcon_derived_model = ( ( hasattr(model_config, "model_type") and model_config.model_type in [ "falcon", "RefinedWebModel", "RefinedWeb", ] ) or cfg.is_falcon_derived_model or "falcon" in cfg.base_model.lower() or (cfg.type_of_model and "rwforcausallm" in cfg.type_of_model.lower()) ) cfg.is_mistral_derived_model = ( ( hasattr(model_config, "model_type") and model_config.model_type in [ "mistral", ] ) or cfg.is_mistral_derived_model or "mistral" in cfg.base_model.lower().split("/")[-1] or (cfg.type_of_model and "mistral" in 
def normalize_cfg_datasets(cfg):
    """
    Propagate the top-level chat template to chat-template datasets.

    Datasets whose type is `chat_template` or `orpo.chat_template` and that do
    not define a `chat_template` themselves receive `cfg.chat_template` and
    `cfg.chat_template_jinja`. Nothing happens when the config has no chat
    template or no datasets.
    """
    if not (cfg.chat_template and cfg.datasets):
        return

    for idx, ds_cfg in enumerate(cfg.datasets):
        is_chat_template_type = ds_cfg.type in ["orpo.chat_template", "chat_template"]
        # leave other dataset types and explicitly configured templates alone
        if not is_chat_template_type or ds_cfg.chat_template:
            continue

        LOG.info(
            f"updating dataset {ds_cfg.path} with `chat_template: {cfg.chat_template}` to match your chat_template"
        )
        cfg.datasets[idx].chat_template = cfg.chat_template
        cfg.datasets[idx].chat_template_jinja = cfg.chat_template_jinja
def prepare_plugins(cfg):
    """
    Register every plugin named in `cfg["plugins"]` with the plugin manager.

    Does nothing when the config has no (or an empty) `plugins` entry.
    """
    if not cfg.get("plugins"):
        return

    manager = PluginManager.get_instance()
    for name in cfg["plugins"]:
        manager.register(name)
def apply_sequence_parallelism(
    batch: dict[str, torch.Tensor],
    local_rank: int,
    local_world_size: int,
    gradient_accumulation_steps: int,
    ring_attn_func: RingAttnFunc,
) -> tuple[dict[str, torch.Tensor], int, int]:
    """
    Apply sequence parallelism slicing to a batch.

    The batch is modified in place (and also returned): position ids are
    ensured, sequence-shaped tensors are right-padded if needed and then split
    along dimension 1 so that this rank keeps only its own chunk.

    Special handling is implemented for integer logits_to_keep, which indicates
    to only keep the last N tokens in the sequence during generation.

    Args:
        batch: Batch dictionary (e.g., input_ids, attention_mask, etc.). Must
            contain a 2-D `input_ids` tensor; `labels` is required whenever
            `num_items_in_batch` is present.
        local_rank: Local rank in the sequence parallel group.
        local_world_size: World size of the sequence parallel group.
        gradient_accumulation_steps: Number of steps to accumulate gradients over.
        ring_attn_func: Which ring attention function to use. Currently unused,
            but related to above TODO.

    Returns:
        tuple of:
            - Batch dictionary with sliced tensors.
            - The original sequence length before padding.
            - The number of padding tokens added.
    """
    batch_size, original_seq_len = batch["input_ids"].shape

    # Update ring attention params if needed
    if batch.get("position_ids") is not None and batch_size == 1:
        update_ring_attn_params(position_ids=batch["position_ids"])
    else:
        # Either no position_ids were supplied or batch_size != 1: (re)create
        # them as 0..seq_len-1 for every row. Note that this overwrites
        # caller-provided position_ids when batch_size > 1.
        batch["position_ids"] = torch.arange(
            0,
            original_seq_len,
            dtype=torch.long,
            device=batch["input_ids"].device,
        ).expand(batch["input_ids"].size(0), -1)

    if "logits_to_keep" in batch and isinstance(batch["logits_to_keep"], int):
        logits_to_keep = batch["logits_to_keep"]

        # Calculate which positions in the full sequence contain the last N tokens
        start_position = max(0, original_seq_len - logits_to_keep)
        # Chunk bounds are derived from the *unpadded* length
        chunk_size = original_seq_len // local_world_size
        rank_start = local_rank * chunk_size
        rank_end = rank_start + chunk_size

        # Create a boolean mask tensor for this rank's chunk
        mask = torch.zeros(
            chunk_size,
            dtype=torch.bool,
            device=batch["input_ids"].device,
        )

        if rank_end > start_position:
            # Calculate how many of the last N tokens fall within this rank's range
            tokens_in_rank = min(rank_end, original_seq_len) - max(
                rank_start, start_position
            )

            # Calculate where these tokens start in the local chunk
            local_start_idx = max(0, start_position - rank_start)

            # Set the appropriate positions in the mask to True
            mask[local_start_idx : local_start_idx + tokens_in_rank] = True

        # Replace the integer with the boolean mask (1-D, already rank-local)
        batch["logits_to_keep"] = mask

    # Add padding to make sequence length divisible by min(local_world_size, 64)
    total_seq_len = original_seq_len
    pad_len = 0
    divisor = min(local_world_size, 64)
    if total_seq_len % divisor != 0:
        pad_len = divisor - (total_seq_len % divisor)

        # Apply padding to all relevant tensors
        for key in batch:
            if (
                isinstance(batch[key], torch.Tensor)
                and batch[key].dim() > 1
                and batch[key].size(1) == total_seq_len
            ):
                # Labels are padded with -100, every other tensor with 0
                pad_value = -100 if key == "labels" else 0
                padding = torch.full(
                    (batch[key].size(0), pad_len, *batch[key].shape[2:]),
                    pad_value,
                    dtype=batch[key].dtype,
                    device=batch[key].device,
                )

                # Concatenate padding to the right side of the tensor
                batch[key] = torch.cat([batch[key], padding], dim=1)
            if key == "logits_to_keep":
                # NOTE(review): exactly one element is appended here regardless
                # of pad_len, and the value is assumed to be a tensor — confirm
                # this is intended.
                padding = torch.ones(
                    1,
                    dtype=batch[key].dtype,
                    device=batch[key].device,
                )

                # Concatenate padding to the right side of the tensor
                batch[key] = torch.cat([batch[key], padding], dim=0)

        # Update the total sequence length after padding
        total_seq_len = batch["input_ids"].size(1)

    # Slice batch for sequence parallel
    for key in batch:
        # Non-tensors and tensors with fewer than two dimensions are left
        # untouched (this includes the 1-D logits_to_keep mask built above)
        if not isinstance(batch[key], torch.Tensor) or batch[key].dim() <= 1:
            continue

        # Split in sequential fashion and grab this rank's chunk
        if batch[key].size(1) == total_seq_len:
            batch[key] = (
                batch[key].chunk(local_world_size, dim=1)[local_rank].contiguous()
            )
        elif key == "logits_to_keep":
            # Only reached for a logits_to_keep tensor with more than one
            # dimension whose dim-1 size differs from the sequence length
            batch[key] = (
                batch[key].chunk(local_world_size, dim=0)[local_rank].contiguous()
            )

    # Handle num_items_in_batch
    if "num_items_in_batch" in batch:
        # Approximation; this needed since num_items_in_batch may be counted across
        # all samples in a gradient accumulated batch, not on a per-step basis.
        local_valid_tokens = (batch["labels"] != -100).sum()

        # All-reduce across sequence parallel ranks to get global token count
        cp_group = get_ring_attn_group()
        global_valid_tokens = local_valid_tokens.clone()
        # we use AVG instead of SUM as using sum seems to scale down the loss by over-accounting the number of tokens
        dist.all_reduce(global_valid_tokens, op=dist.ReduceOp.AVG, group=cp_group)
        global_valid_tokens = int(global_valid_tokens.item())

        batch["num_items_in_batch"] = (
            global_valid_tokens * gradient_accumulation_steps
        )

    return batch, original_seq_len, pad_len
def __init__(
    self,
    models: list[nn.Module],
    context_parallel_size: int,
    gradient_accumulation_steps: int,
    ring_attn_func: RingAttnFunc,
    heads_k_stride: int | None,
    gather_outputs: bool,
    device_mesh: DeviceMesh | None = None,
):
    """Store the configuration and set up ring attention for the given models.

    Registers ring attention from `device_mesh`, reads this process's rank and
    world size from the ring attention process group, and prepares a partially
    applied `apply_sequence_parallelism` bound to those values. No forward
    hooks are registered here; that happens on `__enter__`.
    """
    self.models = models
    self.context_parallel_size = context_parallel_size
    self.gradient_accumulation_steps = gradient_accumulation_steps
    self.ring_attn_func = ring_attn_func
    self.heads_k_stride = heads_k_stride
    self.gather_outputs = gather_outputs
    self.device_mesh = device_mesh

    # Must run before the group lookup below, which reads the ring attention
    # group back via get_ring_attn_group()
    self._register_ring_attn()

    # Set distributed info for local rank
    self.process_group = get_ring_attn_group()
    self.local_rank = dist.get_rank(self.process_group)
    self.local_world_size = dist.get_world_size(self.process_group)

    # Will store hook handles for removal
    self.hook_handles: list[RemovableHandle] = []

    # Store original sequence length and padding information
    # (overwritten by the forward pre-hook on every forward pass)
    self.original_seq_len = 0
    self.pad_len = 0

    # Track local valid token count for eval loss correction across CP ranks
    self._local_valid_tokens: torch.Tensor | None = None

    # Create a partially applied version of the apply_sequence_parallelism function
    self.apply_sequence_parallelism = functools.partial(
        apply_sequence_parallelism,
        local_rank=self.local_rank,
        local_world_size=self.local_world_size,
        gradient_accumulation_steps=self.gradient_accumulation_steps,
        ring_attn_func=self.ring_attn_func,
    )
sequence_parallel_pre_hook(_, args, kwargs): # Get parameter names from the model's forward function forward_params = list( inspect.signature(self.models[0].forward).parameters.keys() ) updated_kwargs = kwargs.copy() for i, arg in enumerate(args): if i < len(forward_params): updated_kwargs[forward_params[i]] = arg # Any excess positional arguments are kept as-is remaining_args = args[len(forward_params) :] # Apply sequence parallelism to updated kwargs updated_kwargs, self.original_seq_len, self.pad_len = ( self.apply_sequence_parallelism(updated_kwargs) ) # Track local valid tokens for eval loss correction if "labels" in updated_kwargs and not self.models[0].training: self._local_valid_tokens = ( (updated_kwargs["labels"] != -100).sum().float() ) # Strip num_items_in_batch during eval so the model uses # reduction='mean', allowing the post-hook weighted all-reduce # formula (loss * local_valid) to correctly recover the loss sum updated_kwargs.pop("num_items_in_batch", None) else: self._local_valid_tokens = None return remaining_args, updated_kwargs # Forward post-hook to gather outputs def sequence_parallel_post_hook(_, __, output: ModelOutput) -> ModelOutput: # Gather the sharded outputs output = self._gather_outputs(output) # Remove padding if it was added if self.pad_len > 0: for key, value in output.items(): if isinstance(value, torch.Tensor) and value.dim() > 1: if value.size(1) == self.original_seq_len + self.pad_len: # Slice to remove padding output[key] = value[:, : self.original_seq_len].contiguous() return output # Post-hook to correct eval loss via weighted all-reduce across CP ranks def eval_loss_correction_post_hook(_, __, output: ModelOutput) -> ModelOutput: if self._local_valid_tokens is None: return output if not hasattr(output, "loss") or output.loss is None: return output local_valid = self._local_valid_tokens.to(output.loss.device) loss = output.loss.detach().clone() # Handle rank with zero valid tokens (loss is NaN) if local_valid.item() == 0: 
class AllGatherWithGrad(torch.autograd.Function):
    """All-gather along the sequence dimension that keeps gradients flowing."""

    @staticmethod
    def forward(
        ctx: torch.autograd.function.FunctionCtx,
        input_tensor: torch.Tensor,
        group: dist.ProcessGroup,
    ) -> torch.Tensor:
        """
        Gather `input_tensor` from every rank and concatenate on dimension 1.

        Args:
            ctx: `torch.autograd` function context; receives `group`, `rank`
                and `seq_lens` for use in the backward pass.
            input_tensor: This rank's tensor with a sequence dimension at
                index 1.
            group: `torch.distributed` process group to gather across.

        Returns:
            The tensors of all ranks, in rank order, concatenated along the
            sequence dimension.
        """
        ctx.group = group
        ctx.rank = dist.get_rank(group)
        n_ranks = dist.get_world_size(group)

        # ranks may hold different shapes, so exchange the shapes first
        own_shape = torch.tensor(list(input_tensor.shape), device=input_tensor.device)
        shape_buffers = [torch.zeros_like(own_shape) for _ in range(n_ranks)]
        dist.all_gather(shape_buffers, own_shape, group=group)

        # per-rank sequence lengths are needed to slice gradients in backward
        ctx.seq_lens = [int(shape[1].item()) for shape in shape_buffers]

        # receive buffers sized to each rank's reported shape
        receive_buffers = []
        for shape in shape_buffers:
            receive_buffers.append(
                torch.zeros(
                    tuple(shape.tolist()),
                    dtype=input_tensor.dtype,
                    device=input_tensor.device,
                )
            )
        dist.all_gather(receive_buffers, input_tensor, group=group)

        return torch.cat(receive_buffers, dim=1)

    @staticmethod
    def backward(
        ctx: torch.autograd.function.FunctionCtx, grad_output: torch.Tensor
    ) -> tuple[torch.Tensor, None]:
        """
        Return the part of `grad_output` that belongs to this rank's input.

        Args:
            ctx: `torch.autograd` function context populated by `forward`.
            grad_output: Gradient with respect to the concatenated output.

        Returns:
            Tuple of the gradient slice for this rank's input tensor and `None`
            for the process group argument, which takes no gradient.
        """
        # this rank's chunk starts after the chunks of all lower ranks
        begin = sum(ctx.seq_lens[: ctx.rank])
        end = begin + ctx.seq_lens[ctx.rank]
        return grad_output[:, begin:end].contiguous(), None
def cleanup(self):
    """Decrement the process counter; the last process removes the flag files."""
    try:
        with FileLock(str(self.lock_file_path)):
            raw_count = self.counter_path.read_text().strip()
            remaining = (int(raw_count) if raw_count else 0) - 1

            if remaining > 0:
                # other processes are still active: just record the new count
                self.counter_path.write_text(str(remaining))
            else:
                # last process out removes both the ready flag and the counter
                self.ready_flag_path.unlink(missing_ok=True)
                self.counter_path.unlink(missing_ok=True)
    except FileNotFoundError:
        # a required file was already removed by another process; nothing to do
        pass
@retry_on_request_exceptions(max_retries=3, delay=5)
def prepare_preference_datasets(
    cfg: DictDefault, tokenizer: PreTrainedTokenizer
) -> tuple[Dataset, Dataset | None]:
    """Load and prepare the preference datasets used for RL training.

    The train split is always loaded. The eval split comes from
    `cfg.test_datasets` when set, otherwise it is carved out of the train split
    when `cfg.val_set_size` is set. Loading goes through `FileLockLoader`, and
    exact deduplication is applied afterwards when
    `cfg.dataset_exact_deduplication` is enabled.

    Args:
        cfg: Configuration object containing dataset and training settings.
        tokenizer: Tokenizer to use for processing text.

    Returns:
        Tuple of (train_dataset, eval_dataset). eval_dataset may be None if no
        evaluation dataset is configured.
    """

    def _build_splits():
        train_split = _load_or_create_dataset_split(cfg, tokenizer, split="train")

        if cfg.test_datasets:
            # dedicated evaluation datasets take precedence over a val split
            eval_split = _load_or_create_dataset_split(cfg, tokenizer, split="test")
            return train_split, eval_split

        if cfg.val_set_size:
            # carve the validation set out of the training data
            train_split, eval_split = create_train_validation_split(
                train_split, cfg, cfg.val_set_size
            )
            return train_split, eval_split

        return train_split, None

    # file locking coordinates the preparation across multiple ranks
    lock_loader = FileLockLoader(cfg)
    try:
        train_dataset, eval_dataset = lock_loader.load(_build_splits)
    finally:
        lock_loader.cleanup()

    if not cfg.dataset_exact_deduplication:
        return train_dataset, eval_dataset

    train_dataset, eval_dataset = deduplicate_and_log_datasets(
        dataset=train_dataset, other_dataset=eval_dataset
    )
    return train_dataset, eval_dataset
rl: Reinforcement learning type. tokenizer: Tokenizer for length calculation. sequence_len: Maximum allowed sequence length. Returns: True if sample should be kept, False if it should be dropped. Raises: ValueError: If required keys are missing or RL type is unknown. """ if rl in {RLType.DPO, RLType.IPO, RLType.ORPO, RLType.SIMPO}: if not ( sample.get("prompt") and sample.get("chosen") and sample.get("rejected") ): raise ValueError( "Prompt, chosen and rejected keys are required for DPO/ORPO datasets" ) prompt = sample["prompt"] chosen = sample["chosen"] rejected = sample["rejected"] len_prompt = len(tokenizer(prompt, add_special_tokens=False)["input_ids"]) len_chosen = len(tokenizer(chosen, add_special_tokens=False)["input_ids"]) len_rejected = len(tokenizer(rejected, add_special_tokens=False)["input_ids"]) return (len_prompt + len_chosen) <= sequence_len and ( len_prompt + len_rejected ) <= sequence_len if rl is RLType.KTO: if not (sample.get("prompt") and sample.get("completion")): raise ValueError("Prompt and completion keys are required for KTO datasets") prompt = sample["prompt"] completion = sample["completion"] len_prompt = len(tokenizer(prompt, add_special_tokens=False)["input_ids"]) len_completion = len( tokenizer(completion, add_special_tokens=False)["input_ids"] ) return (len_prompt + len_completion) <= sequence_len if rl in {RLType.GRPO, RLType.GDPO}: return True raise ValueError("Unknown RL type") def _load_split(cfg: DictDefault, split: Literal["train", "test"]) -> Dataset: """Load and process dataset split for RL training. Args: cfg: Configuration object containing dataset settings. split: Dataset split to load ("train" or "test"). Returns: Combined and processed dataset for the specified split. 
""" datasets_configs = cfg.datasets if split == "train" else cfg.test_datasets split_datasets: list[Dataset | DatasetDict] = [] for dataset_config in datasets_with_name_generator(datasets_configs): dataset: Dataset | DatasetDict = load_dataset_with_config( dataset_config, cfg.hf_use_auth_token, streaming=False ) split_datasets.append(dataset) tokenizer = load_tokenizer(cfg) for i, dataset in enumerate(split_datasets): _type = datasets_configs[i]["type"] if _type: if isinstance(_type, DictDefault): _type = "user_defined.default" if cfg.rl is RLType.ORPO: ds_transform_fn = load_orpo(_type, cfg, dataset_idx=i) elif cfg.rl is RLType.KTO: ds_transform_fn = load_kto(_type, cfg, dataset_idx=i) else: ds_transform_fn = load_dpo(_type, cfg, dataset_idx=i) map_kwargs: dict[str, Any] = {} if isinstance(ds_transform_fn, tuple): ds_transform_fn, map_kwargs = ds_transform_fn split_datasets[i] = _map_dataset( cfg, dataset, ds_transform_fn, tokenizer, **map_kwargs ) else: # If no `type` is provided, assume the dataset is already in the expected format with # "prompt", "chosen", and "rejected" already preprocessed split_datasets[i] = dataset if not cfg.skip_prepare_dataset: drop_long = partial( _drop_long_sequences, rl=cfg.rl, tokenizer=tokenizer, sequence_len=cfg.sequence_len, ) prior_len = len(split_datasets[i]) split_datasets[i] = split_datasets[i].filter( drop_long, num_proc=cfg.dataset_num_proc, load_from_cache_file=not cfg.is_preprocess, desc="Dropping Long Sequences", ) dropped = prior_len - len(split_datasets[i]) if dropped: LOG.warning(f"Dropped {dropped} long samples from dataset index {i}") # Merge datasets dataset = merge_datasets(split_datasets, cfg) if not cfg.skip_prepare_dataset: # Deduplicate before saving so the saved dataset is already de-duplicated if cfg.dataset_exact_deduplication: dataset, _ = deduplicate_and_log_datasets(dataset=dataset) # Save preprocessed dataset dataset_hash = generate_dataset_hash_from_config( cfg, datasets_configs, tokenizer.name_or_path 
) save_preprocessed_dataset(cfg, dataset, dataset_hash, split) return dataset def _load_or_create_dataset_split( cfg: DictDefault, tokenizer: PreTrainedTokenizer, split: Literal["train", "test"] ) -> Dataset: """Load preprocessed dataset or create new one for given split. Args: cfg: Configuration object. tokenizer: Tokenizer to use for processing text. split: Dataset split to load. Returns: Tuple of (dataset, is_preprocessed). """ # Select correct dataset configuration based on split datasets_config = cfg.datasets if split == "train" else cfg.test_datasets # Generate dataset hash for caching dataset_hash = generate_dataset_hash_from_config( cfg, datasets_config, tokenizer.name_or_path ) # Try loading from hub if push_dataset_to_hub is configured dataset = None if cfg.push_dataset_to_hub: dataset = try_load_from_hub(cfg, dataset_hash, split) # Attempt to load preprocessed dataset if dataset is None: dataset = load_preprocessed_dataset(cfg, dataset_hash) # Otherwise, load it if dataset is None: dataset = _load_split(cfg, split=split) return dataset ================================================ FILE: src/axolotl/utils/data/sft.py ================================================ """Data handling specific to SFT.""" import functools import os import tempfile from typing import Literal from datasets import ( Dataset, DatasetDict, IterableDataset, IterableDatasetDict, load_dataset, ) from transformers import PreTrainedTokenizer, ProcessorMixin from axolotl.prompters import Prompter from axolotl.utils.data.lock import FileLockLoader from axolotl.utils.data.shared import ( create_train_validation_split, datasets_with_name_generator, generate_dataset_hash_from_config, load_dataset_with_config, load_preprocessed_dataset, merge_datasets, save_preprocessed_dataset, try_load_from_hub, ) from axolotl.utils.data.streaming import wrap_streaming_dataset from axolotl.utils.data.utils import ( deduplicate_and_log_datasets, handle_long_seq_in_dataset, retry_on_request_exceptions, ) 
from axolotl.utils.data.wrappers import get_dataset_wrapper from axolotl.utils.dict import DictDefault from axolotl.utils.distributed import is_local_main_process from axolotl.utils.logging import get_logger from axolotl.utils.trainer import ( calculate_total_num_steps, process_datasets_for_packing, ) LOG = get_logger(__name__) @retry_on_request_exceptions(max_retries=3, delay=5) def prepare_datasets( cfg: DictDefault, tokenizer: PreTrainedTokenizer, processor: ProcessorMixin | None = None, ) -> tuple[IterableDataset | Dataset, Dataset | None, int, list[Prompter | None]]: """Prepare training and evaluation datasets based on configuration. Args: cfg: Dictionary mapping `axolotl` config keys to values. tokenizer: Tokenizer to use for processing text. processor: Optional processor for multimodal datasets. Returns: Tuple of (train_dataset, eval_dataset, total_steps, prompters). """ if cfg.streaming or cfg.pretraining_dataset: return _prepare_streaming_dataset(cfg, tokenizer, processor) return _prepare_standard_dataset(cfg, tokenizer, processor) def _prepare_standard_dataset( cfg: DictDefault, tokenizer: PreTrainedTokenizer, processor: ProcessorMixin | None, ) -> tuple[Dataset, Dataset | None, int, list[Prompter | None]]: """Prepare standard (non-pretraining) datasets.""" def _load_datasets(): # Always load training dataset train_dataset, eval_dataset, prompters = _load_and_prepare_datasets( tokenizer, cfg, split="train", processor=processor, ) # Overwrite eval_dataset if test data exists if cfg.test_datasets: _, eval_dataset, _ = _load_and_prepare_datasets( tokenizer, cfg, split="test", processor=processor, ) return train_dataset, eval_dataset, prompters # Prepare datasets (with file locking logic for multiple ranks) loader = FileLockLoader(cfg) try: train_dataset, eval_dataset, prompters = loader.load(_load_datasets) finally: loader.cleanup() if os.environ.get("AXOLOTL_IS_PREPROCESS") == "1": return train_dataset, eval_dataset, -1, prompters # Validate sample packing 
configuration for evaluation if eval_dataset and cfg.sample_packing and cfg.eval_sample_packing is not False: total_eval_steps = calculate_total_num_steps(cfg, eval_dataset, update=False) if total_eval_steps == 0: raise ValueError( "eval dataset split is too small for sample_packing. " "You should set `eval_sample_packing: False` in your config." ) # Calculate total number of training steps if cfg.max_steps: total_num_steps = min( calculate_total_num_steps(cfg, train_dataset), cfg.max_steps ) else: total_num_steps = calculate_total_num_steps(cfg, train_dataset) LOG.info(f"Maximum number of steps set at {total_num_steps}") return train_dataset, eval_dataset, total_num_steps, prompters def _prepare_streaming_dataset( cfg: DictDefault, tokenizer: PreTrainedTokenizer, processor: ProcessorMixin | None, ) -> tuple[IterableDataset, Dataset | None, int, list[Prompter | None]]: """ Prepare dataset for streaming mode. Note: Streaming datasets are loaded incrementally from the source. """ if cfg.pretraining_dataset: dataset_config = _extract_pretraining_config(cfg) train_dataset = _load_streaming_dataset(dataset_config, cfg, tokenizer) elif cfg.sample_packing: # TODO(djsaunde): Implement for multiple datasets dataset_config = DictDefault(cfg.datasets[0]) # Ensure we have a split set - default to 'train' if not specified if not hasattr(dataset_config, "split") or not dataset_config.split: dataset_config.split = "train" train_dataset = _load_streaming_dataset(dataset_config, cfg, tokenizer) else: # Use legacy loading function for non-packed streaming datasets train_dataset, eval_dataset, prompters = _load_and_prepare_datasets( tokenizer, cfg, split="train", processor=processor, streaming=True, ) # Return early for non-packed streaming datasets total_num_steps = cfg.max_steps if cfg.max_steps else -1 return train_dataset, eval_dataset, total_num_steps, prompters # Load evaluation dataset if specified eval_dataset = None if cfg.test_datasets: _, eval_dataset, _ = 
_load_and_prepare_datasets( tokenizer, cfg, split="test", processor=processor, streaming=False, ) # For streaming, we return max_steps directly from config or -1 if not set total_num_steps = cfg.max_steps if cfg.max_steps else -1 return train_dataset, eval_dataset, total_num_steps, [] def _extract_pretraining_config(cfg: DictDefault) -> DictDefault: """Extract pretraining configuration from the main config.""" if isinstance(cfg.pretraining_dataset, list) and isinstance( cfg.pretraining_dataset[0], dict ): config = cfg.pretraining_dataset[0] return DictDefault( { "path": config["path"], "name": config["name"], "skip": config["skip"], "split": config.get("split", "train"), "data_files": config.get("data_files"), "type": config.get("type", "pretrain"), } ) # Simple string path case return DictDefault( { "path": cfg.pretraining_dataset, "name": None, "skip": 0, "split": "train", "data_files": None, "type": "pretrain", } ) def _load_streaming_dataset( pretraining_config: DictDefault, cfg: DictDefault, tokenizer: PreTrainedTokenizer ) -> IterableDataset: """Load and prepare a streaming dataset for pretraining.""" # Create dataset wrapper partial function dataset_wrapper_partial = functools.partial( get_dataset_wrapper, dataset_config=pretraining_config, tokenizer=tokenizer, cfg=cfg, dataset_base_type=pretraining_config["type"], ) # Load the actual dataset if ( cfg.accelerator_config and cfg.accelerator_config.dispatch_batches and not is_local_main_process() ): iter_dataset = _create_placeholder_dataset() else: iter_dataset = load_dataset( pretraining_config["path"], streaming=True, split=pretraining_config["split"], name=pretraining_config["name"], data_files=pretraining_config["data_files"], ) # Apply skip if specified if pretraining_config["skip"]: LOG.info(f"Skipping {pretraining_config['skip']} samples from the dataset") iter_dataset = iter_dataset.skip(pretraining_config["skip"]) # Wrap the dataset for pretraining train_dataset = wrap_streaming_dataset( 
iter_dataset, tokenizer, cfg, dataset_wrapper_partial, ) # Format for PyTorch return train_dataset.with_format("torch") def _create_placeholder_dataset() -> IterableDataset: """Create a minimal placeholder dataset for non-main processes.""" with tempfile.NamedTemporaryFile(mode="w+", delete=False) as f: f.write("text\n") f.write("lorem ipsum dolor sit amet\n") f.seek(0) return load_dataset("csv", data_files=f.name, split="train", streaming=True) def _load_tokenized_prepared_datasets( tokenizer: PreTrainedTokenizer, cfg: DictDefault, split: Literal["train", "test"] = "train", processor: ProcessorMixin | None = None, streaming: bool = False, ) -> tuple[Dataset | DatasetDict, list[Prompter | None]]: """Load or create tokenized and prepared datasets for training or testing. Args: tokenizer: Tokenizer for processing text. cfg: Configuration object. split: Dataset split to load ('train' or 'test'). processor: Optional processor for multimodal datasets. streaming: Whether to use iterable preprocessing. Returns: Tuple of (dataset, prompters list). 
""" # Select correct dataset configuration based on split datasets_configs = cfg.datasets if split == "train" else cfg.test_datasets # Generate dataset hash for caching dataset_hash = generate_dataset_hash_from_config( cfg, datasets_configs, tokenizer.name_or_path ) # Try loading from hub if push_dataset_to_hub is configured dataset = None if cfg.push_dataset_to_hub: dataset = try_load_from_hub(cfg, dataset_hash, split) # If not found on hub, try loading from disk if dataset is None: dataset = load_preprocessed_dataset(cfg, dataset_hash) # If not found on disk or skipping prepared dataset, load and process raw datasets prompters: list[Prompter | None] = [] if dataset is None: dataset, prompters = _load_raw_datasets( cfg, datasets_configs, tokenizer, split, processor, streaming, ) return dataset, prompters def _load_raw_datasets( cfg: DictDefault, datasets_configs: list, tokenizer: PreTrainedTokenizer, split: str, processor: ProcessorMixin | None = None, streaming: bool = False, ) -> tuple[Dataset, list[Prompter | None]]: """Load, process, merge, and save raw datasets.""" LOG.info("Loading raw datasets...", main_process_only=False) if not cfg.is_preprocess and not cfg.skip_prepare_dataset: LOG.warning( "Processing datasets during training can lead to VRAM instability. Please " "pre-process your dataset using `axolotl preprocess path/to/config.yml`." 
) # Load and process individual datasets datasets = [] prompters = [] for dataset_config in datasets_with_name_generator(datasets_configs): dataset_wrapper, dataset_prompter = _load_and_process_single_dataset( dataset_config=dataset_config, cfg=cfg, tokenizer=tokenizer, split=split, seed=cfg.seed, processor=processor, streaming=streaming, ) datasets.append(dataset_wrapper) prompters.append(dataset_prompter) # Merge datasets dataset = merge_datasets(datasets, cfg) if not cfg.skip_prepare_dataset and not streaming: if split == "test" and cfg.eval_sequence_len: dataset = handle_long_seq_in_dataset(dataset, cfg.eval_sequence_len, cfg) else: dataset = handle_long_seq_in_dataset(dataset, cfg.sequence_len, cfg) if (split == "train" and cfg.sample_packing) or ( split == "test" and cfg.eval_sample_packing ): dataset, _ = process_datasets_for_packing(cfg, dataset, None) # Deduplicate before saving so the saved dataset is already de-duplicated if cfg.dataset_exact_deduplication: dataset, _ = deduplicate_and_log_datasets(dataset=dataset) # Save the prepared dataset dataset_hash = generate_dataset_hash_from_config( cfg, datasets_configs, tokenizer.name_or_path ) save_preprocessed_dataset(cfg, dataset, dataset_hash, split) return dataset, prompters def _load_and_process_single_dataset( dataset_config: DictDefault, cfg: DictDefault, tokenizer: PreTrainedTokenizer, split: str, seed: int, processor: ProcessorMixin | None = None, streaming: bool = False, ) -> tuple[Dataset | IterableDataset, Prompter | None]: """Load and process a single dataset based on the passed config.""" # Load the dataset dataset = load_dataset_with_config( dataset_config, cfg.hf_use_auth_token, streaming=streaming ) # Parse dataset type d_base_type, d_prompt_style = _parse_dataset_type(dataset_config.type) # Select the appropriate split if isinstance(dataset, (DatasetDict, IterableDatasetDict)): if dataset_config.split and dataset_config.split in dataset: dataset = dataset[dataset_config.split] elif split in 
dataset: dataset = dataset[split] else: raise ValueError( f"no {split} split found for dataset {dataset_config.path}, you may " "specify a split with 'split: ...'" ) # Apply sharding if configured if dataset_config.shards: shards_idx = dataset_config.get("shards_idx", 0) dataset = dataset.shuffle(seed=seed).shard( num_shards=dataset_config.shards, index=shards_idx ) # Apply dataset wrapper dataset_wrapper, dataset_prompter = get_dataset_wrapper( dataset_config=dataset_config, tokenizer=tokenizer, cfg=cfg, dataset_base_type=d_base_type, dataset=dataset, dataset_prompt_style=d_prompt_style, processor=processor, ) return dataset_wrapper, dataset_prompter def _parse_dataset_type(d_type: str) -> tuple[str | None, str | None]: """Parse the dataset type string into base type and prompt style.""" if not isinstance(d_type, str): return None, None d_type_split = d_type.split(":") d_base_type = d_type_split[0] d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None return d_base_type, d_prompt_style def _handle_train_dataset_split( dataset: Dataset, cfg: DictDefault ) -> tuple[Dataset, Dataset | None]: """Handle processing for train split, including validation set creation.""" val_set_size = ( int(cfg.val_set_size) if cfg.val_set_size > 1 else float(cfg.val_set_size) ) if val_set_size: # Create train/validation split train_dataset, eval_dataset = create_train_validation_split( dataset, cfg, val_set_size ) return train_dataset, eval_dataset # No validation split - deduplication already applied during preprocessing return dataset, None def _apply_dataset_sharding(dataset: Dataset, cfg: DictDefault) -> Dataset: """Apply dataset sharding if configured. Args: dataset: Dataset to shard. cfg: Configuration object containing shard settings. Returns: Sharded dataset or original dataset if no sharding configured. 
""" if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None: LOG.info( f"Using index #{cfg.dataset_shard_idx} of {cfg.dataset_shard_num} shards" ) dataset = dataset.shard( num_shards=cfg.dataset_shard_num, index=cfg.dataset_shard_idx, ) return dataset def _load_and_prepare_datasets( tokenizer: PreTrainedTokenizer, cfg: DictDefault, split: Literal["train", "test"] = "train", processor: ProcessorMixin | None = None, streaming: bool = False, ) -> tuple[Dataset | None, Dataset | None, list[Prompter | None]]: """Load and prepare datasets with optional validation split and sharding. Args: tokenizer: Tokenizer for processing text. cfg: Configuration object. split: Dataset split to load ('train' or 'test'). processor: Optional processor for multimodal datasets. streaming: Whether to use iterable preprocessing. Returns: Tuple of (train_dataset, eval_dataset, prompters). """ # Load the base dataset dataset, prompters = _load_tokenized_prepared_datasets( tokenizer, cfg, split=split, processor=processor, streaming=streaming, ) # Apply dataset sharding if configured using shared function dataset = _apply_dataset_sharding(dataset, cfg) # Apply deduplication and create train / validation splits based on the split type if split == "train": train_dataset, eval_dataset = _handle_train_dataset_split(dataset, cfg) else: # Deduplication already applied during preprocessing train_dataset, eval_dataset = None, dataset return train_dataset, eval_dataset, prompters ================================================ FILE: src/axolotl/utils/data/shared.py ================================================ """Dataset loading shared utils.""" from __future__ import annotations import functools import os from pathlib import Path from typing import TYPE_CHECKING, Any, Generator from datasets import ( Dataset, DatasetDict, IterableDataset, IterableDatasetDict, concatenate_datasets, load_dataset, load_from_disk, ) from huggingface_hub import hf_hub_download, snapshot_download from 
huggingface_hub.errors import ( HFValidationError, RepositoryNotFoundError, RevisionNotFoundError, ) from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH from axolotl.utils.data.utils import deduplicate_and_log_datasets, md5 from axolotl.utils.datasets import get_default_process_count from axolotl.utils.dict import DictDefault from axolotl.utils.logging import get_logger if TYPE_CHECKING: from adlfs import AzureBlobFileSystem from gcsfs import GCSFileSystem from ocifs import OCIFileSystem from s3fs import S3FileSystem LOG = get_logger(__name__) EXTENSIONS_TO_DATASET_TYPES = { ".parquet": "parquet", ".arrow": "arrow", ".csv": "csv", ".txt": "text", } def get_dataset_type(dataset_config: DictDefault) -> str: """Get the dataset type from the path if it's not specified.""" if dataset_config.ds_type: return dataset_config.ds_type for extension, dataset_type in EXTENSIONS_TO_DATASET_TYPES.items(): if extension in dataset_config.path: return dataset_type return "json" def datasets_with_name_generator( dataset_configs: list[DictDefault], ) -> Generator[DictDefault, None, None]: """Yields expanded dataset configurations based on multiple names or preprocessing shards. When a dataset config has a list of names, it yields separate configs for each name. When a dataset config specifies preprocessing shards, it yields configs for each shard. Args: dataset_configs: List of dataset configuration objects. Yields: Individual dataset configurations, expanded as needed for names or shards. 
""" for config in dataset_configs: if config.name and isinstance(config.name, list): for name in config.name: yield DictDefault({**config, "name": name}) elif config.preprocess_shards and not config.shards: for shard_idx in range(config.preprocess_shards): yield DictDefault( { **config, "shards": config.preprocess_shards, "shards_idx": shard_idx, } ) else: yield config def load_dataset_with_config( dataset_config: DictDefault, use_auth_token: bool, streaming=False ) -> Dataset | IterableDataset: """Load a dataset from a config. Handles datasets that are stored locally, in the HuggingFace Hub, in a remote filesystem (S3, GCS, Azure, OCI), a URL, or `data_files`. Args: dataset_config: Single dataset config. use_auth_token: Whether to use HF auth token. streaming: Whether to stream the dataset. Returns: Loaded dataset. """ # Set up common kwargs for dataset loading load_dataset_kwargs = { "split": dataset_config.split if dataset_config.split else None, "name": dataset_config.name, "streaming": streaming, "trust_remote_code": dataset_config.trust_remote_code, } # First check if it's a local path if Path(dataset_config.path).exists(): return _load_from_local_path(dataset_config, load_dataset_kwargs) # Check if it's a HuggingFace dataset is_hub_dataset = _check_if_hub_dataset(dataset_config, use_auth_token) # Check if it's a cloud storage path and get appropriate filesystem remote_fs, storage_options = _get_remote_filesystem(dataset_config.path) is_cloud_dataset = False if remote_fs: try: is_cloud_dataset = remote_fs.exists(dataset_config.path) except (FileNotFoundError, ConnectionError): pass # Load from appropriate source if is_hub_dataset: return _load_from_hub(dataset_config, use_auth_token, load_dataset_kwargs) if is_cloud_dataset: return _load_from_cloud( dataset_config, remote_fs, storage_options, load_dataset_kwargs ) if dataset_config.path.startswith("https://"): return _load_from_url(dataset_config, load_dataset_kwargs) if dataset_config.data_files: return 
_load_from_data_files(dataset_config, load_dataset_kwargs) raise ValueError( f"The dataset could not be loaded. This could be due to a misconfigured dataset path " f"({dataset_config.path}). Try double-check your path / name / data_files. " f"This is not caused by the dataset type." ) def _check_if_hub_dataset(dataset_config: DictDefault, use_auth_token: bool) -> bool: """Check if a dataset exists on the HuggingFace Hub.""" try: snapshot_download( repo_id=dataset_config.path, repo_type="dataset", token=use_auth_token, revision=dataset_config.revision, ignore_patterns=["*"], ) return True except ( RepositoryNotFoundError, RevisionNotFoundError, FileNotFoundError, ConnectionError, HFValidationError, ValueError, ): return False def _get_remote_filesystem( path: str, ) -> tuple[ S3FileSystem | GCSFileSystem | AzureBlobFileSystem | OCIFileSystem | None, dict ]: """Get the appropriate filesystem for a remote path.""" if path.startswith("s3://"): try: import s3fs storage_options = {"anon": False} return s3fs.S3FileSystem(**storage_options), storage_options except ImportError as exc: raise ImportError("s3:// paths require s3fs to be installed") from exc elif path.startswith(("gs://", "gcs://")): try: import gcsfs storage_options = {"token": None} # type: ignore # nosec B105 return gcsfs.GCSFileSystem(**storage_options), storage_options except ImportError as exc: raise ImportError( "gs:// or gcs:// paths require gcsfs to be installed" ) from exc elif path.startswith(("adl://", "abfs://", "az://")): try: import adlfs storage_options = {"anon": False} return adlfs.AzureBlobFileSystem(**storage_options), storage_options except ImportError as exc: raise ImportError( "adl:// or abfs:// paths require adlfs to be installed" ) from exc elif path.startswith("oci://"): try: import ocifs storage_options = {} return ocifs.OCIFileSystem(**storage_options), storage_options except ImportError as exc: raise ImportError("oci:// paths require ocifs to be installed") from exc return None, {} 
def _load_from_local_path(
    dataset_config: DictDefault, load_dataset_kwargs: dict
) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
    """Load a dataset from a local path.

    A directory is loaded with `load_dataset` when `data_files` is configured,
    otherwise with `load_from_disk`, falling back to `load_dataset` on the
    directory itself if `load_from_disk` raises `FileNotFoundError`. A single
    file is loaded with `load_dataset` using the type reported by
    `get_dataset_type`.

    Args:
        dataset_config: Dataset configuration; `path` and `data_files` are read.
        load_dataset_kwargs: Extra keyword arguments forwarded to
            `load_dataset`. This dict is never modified.

    Returns:
        The loaded dataset object.

    Raises:
        ValueError: If the path is neither a directory nor a regular file.
    """
    local_path = Path(dataset_config.path)

    if local_path.is_dir():
        if dataset_config.data_files:
            dataset_type = get_dataset_type(dataset_config)
            return load_dataset(
                dataset_type,
                data_files=dataset_config.data_files,
                **load_dataset_kwargs,
            )
        try:
            return load_from_disk(dataset_config.path)
        except FileNotFoundError:
            return load_dataset(dataset_config.path, **load_dataset_kwargs)

    if local_path.is_file():
        dataset_type = get_dataset_type(dataset_config)
        # Work on a copy so the forced split does not leak into the caller's
        # kwargs and silently affect any later load that reuses the same dict.
        file_kwargs = dict(load_dataset_kwargs)
        # For single file datasets, HF always creates only a "train" split
        if dataset_type in ("json", "csv", "text"):
            file_kwargs["split"] = "train"
        return load_dataset(
            dataset_type,
            data_files=dataset_config.path,
            **file_kwargs,
        )

    raise ValueError(
        "Unhandled dataset load: local path exists, but is neither a directory or a file"
    )
def get_prepared_dataset_path(cfg: DictDefault, dataset_hash: str) -> Path:
    """Build the directory path used to store a prepared dataset.

    Args:
        cfg: Configuration object; `dataset_prepared_path` is used as the root
            when it is set, otherwise `DEFAULT_DATASET_PREPARED_PATH`.
        dataset_hash: Hash identifying the specific dataset configuration.

    Returns:
        The root directory joined with `dataset_hash`.
    """
    root = cfg.dataset_prepared_path
    if not root:
        root = DEFAULT_DATASET_PREPARED_PATH
    return Path(root).joinpath(dataset_hash)
def load_preprocessed_dataset(cfg: DictDefault, dataset_hash: str) -> Dataset | None:
    """Return the prepared dataset stored on disk, if it may be reused.

    The on-disk copy is used only when `cfg.dataset_prepared_path` is set, the
    prepared directory contains at least one entry, and neither
    `cfg.skip_prepare_dataset` nor `cfg.is_preprocess` is truthy.

    Args:
        cfg: Configuration object.
        dataset_hash: Hash identifying the dataset configuration.

    Returns:
        The dataset loaded from disk, or None when the conditions above are
        not all met.
    """
    cache_dir = get_prepared_dataset_path(cfg, dataset_hash)

    reusable = (
        cfg.dataset_prepared_path
        and any(cache_dir.glob("*"))
        and not (cfg.skip_prepare_dataset or cfg.is_preprocess)
    )
    if not reusable:
        LOG.info(
            f"Unable to find prepared dataset in {cache_dir}",
        )
        return None

    LOG.info(
        f"Loading prepared dataset from disk at {cache_dir}...",
    )
    return load_from_disk(str(cache_dir))
def merge_datasets(datasets: list[Dataset], cfg: DictDefault) -> Dataset:
    """Combine datasets into one, shuffling as requested by the config.

    Args:
        datasets: List of datasets to merge.
        cfg: Configuration object; reads `curriculum_sampling`,
            `shuffle_merged_datasets`, `shuffle_before_merging_datasets` and
            `seed`.

    Returns:
        The single input dataset (possibly shuffled) when only one is given,
        otherwise the concatenation of all inputs (possibly shuffled).
    """
    if len(datasets) == 1:
        (only,) = datasets
        # Leave the order untouched when curriculum sampling is enabled or
        # shuffling of the merged result is disabled.
        keep_order = cfg.curriculum_sampling or not cfg.shuffle_merged_datasets
        return only if keep_order else only.shuffle(seed=cfg.seed)

    parts = datasets
    # Optional per-dataset shuffle, so that ordering strategies can still be
    # applied at the dataset level after concatenation.
    if cfg.shuffle_before_merging_datasets:
        LOG.info("Shuffling each dataset individually before merging...")
        parts = [part.shuffle(seed=cfg.seed) for part in datasets]

    LOG.info("Merging datasets...")
    combined = concatenate_datasets(parts)

    if not cfg.shuffle_merged_datasets:
        LOG.debug("Not shuffling merged datasets.")
        return combined

    LOG.debug("Shuffling merged datasets...")
    if cfg.curriculum_sampling:
        LOG.warning(
            "Shuffling merged datasets with curriculum sampling is not recommended. "
            "This will randomize the order of samples."
        )
    return combined.shuffle(seed=cfg.seed)
0])), dim=0) # Concatenate tokens so that their lengths are less than max_tokens buffer_input_ids = torch.tensor([], dtype=torch.long) buffer_labels = torch.tensor([], dtype=torch.long) buffer_attention_mask = torch.tensor([], dtype=torch.long) for ids, labels, mask in zip(input_ids, targets, attention_mask, strict=False): if buffer_input_ids.numel() == max_tokens: new_input_ids.append(buffer_input_ids) new_labels.append(buffer_labels) new_attention_mask.append(buffer_attention_mask) buffer_input_ids = torch.tensor([], dtype=torch.long) buffer_labels = torch.tensor([], dtype=torch.long) buffer_attention_mask = torch.tensor([], dtype=torch.long) buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0) buffer_labels = torch.cat((buffer_labels, labels), dim=0) buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0) elif buffer_input_ids.numel() + ids.numel() <= max_tokens: buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0) buffer_labels = torch.cat((buffer_labels, labels), dim=0) buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0) else: buffer_input_ids = torch.cat( ( buffer_input_ids, torch.full( (max_tokens - buffer_input_ids.numel(),), tokenizer.pad_token_id, dtype=torch.long, ), ), dim=0, ) buffer_labels = torch.cat( ( buffer_labels, torch.full( (max_tokens - buffer_labels.numel(),), -100, dtype=torch.long, ), ), dim=0, ) buffer_attention_mask = torch.cat( ( buffer_attention_mask, torch.full( (max_tokens - buffer_attention_mask.numel(),), 0, dtype=torch.long, ), ), dim=0, ) new_input_ids.append(buffer_input_ids) new_labels.append(buffer_labels) new_attention_mask.append(buffer_attention_mask) buffer_input_ids = torch.tensor([], dtype=torch.long) buffer_labels = torch.tensor([], dtype=torch.long) buffer_attention_mask = torch.tensor([], dtype=torch.long) buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0) buffer_labels = torch.cat((buffer_labels, labels), dim=0) buffer_attention_mask = 
def wrap_streaming_dataset(
    dataset,
    tokenizer,
    cfg,
    ds_wrapper_fn,
):
    """Map a streaming dataset through the appropriate batched encoder.

    With `cfg.sample_packing` enabled the dataset is mapped through
    `encode_packed_streaming`; otherwise through `encode_streaming`, reading
    the text column from the first `cfg.pretraining_dataset` entry.

    Side effect: when sample packing is enabled, `cfg.micro_batch_size` is
    overwritten with 1.

    Args:
        dataset: Dataset exposing `shuffle`, `features` and `map`.
        tokenizer: Tokenizer passed to the collator / encoder.
        cfg: Configuration object (mutated as described above).
        ds_wrapper_fn: Callable forwarded to `encode_packed_streaming`.

    Returns:
        The mapped dataset with its pre-existing columns removed.
    """
    if not cfg.sample_packing:
        # NOTE: This is not reachable for SFT datasets since we use the pre-existing
        # loading function for non-packed streaming datasets. Refer to
        # _prepare_streaming_datasets in sft.py for that code path.
        first_pretraining_cfg = cfg.pretraining_dataset[0]
        text_column = getattr(first_pretraining_cfg, "text_column", "text") or "text"
        encode = functools.partial(
            encode_streaming,
            tokenizer=tokenizer,
            max_tokens=cfg.sequence_len,
            text_column=text_column,
            concatenate=cfg.pretraining_sample_concatenation is True,
        )
    else:
        # For SFT (non-pretraining) datasets, always use multipack_attn=True to
        # ensure attention isolation between packed sequences
        use_multipack_attn = (
            cfg.pretrain_multipack_attn if cfg.pretraining_dataset else True
        )

        packing_collator = PretrainingBatchSamplerDataCollatorForSeq2Seq(
            tokenizer,
            return_tensors="pt",
            padding=True,
            pad_to_multiple_of=cfg.sequence_len,
            multipack_attn=use_multipack_attn,
        )
        encode = functools.partial(
            encode_packed_streaming,
            packing_collator,
            ds_wrapper_fn,
            max_seq_length=cfg.sequence_len,
            batch_size=cfg.micro_batch_size,
            multipack_attn=use_multipack_attn,
            bin_size=cfg.sample_packing_bin_size,
        )

        # Set this to 1 so downstream data_loader doesn't try to increase the
        # batch size again
        cfg.micro_batch_size = 1

    if cfg.shuffle_merged_datasets:
        dataset = dataset.shuffle(
            seed=cfg.seed, buffer_size=cfg.streaming_multipack_buffer_size
        )
    else:
        LOG.debug("NOT shuffling merged pretraining datasets")

    # Drop every pre-existing column after mapping, since the encoded/tokenized
    # columns end up with a different length. `features` is empty during
    # streaming/pretraining, in which case the names come from the first row.
    if dataset.features is not None:
        remove_columns = list(dataset.features.keys())
    else:
        remove_columns = []
        for first_row in dataset:
            remove_columns = list(first_row.keys())
            break

    return dataset.map(
        encode,
        batched=True,
        batch_size=cfg.streaming_multipack_buffer_size,
        remove_columns=remove_columns,
    )
class RetryStrategy(Enum):
    """Enum for retry strategies."""

    CONSTANT = 1
    LINEAR = 2
    EXPONENTIAL = 3


def retry_on_request_exceptions(
    max_retries=3, delay=1, retry_strategy: RetryStrategy = RetryStrategy.LINEAR
) -> Callable:
    """Decorator that retries function calls on specific request exceptions.

    Retries are triggered by `requests` read timeouts, connection errors and
    HTTP errors, and by `huggingface_hub` HTTP errors. Any other exception
    propagates immediately.

    Args:
        max_retries: Maximum number of attempts. Values below 1 are treated as
            1, so the wrapped function is always called at least once.
        delay: Base delay between attempts in seconds.
        retry_strategy: How the delay grows: constant (`delay`), linear
            (`delay * attempt_number`) or exponential (`delay * 2**attempt`).

    Returns:
        Decorated function with retry logic.

    Raises:
        The last caught exception once all attempts are exhausted.
    """
    # Without this clamp, max_retries <= 0 would skip the loop entirely and the
    # wrapper would return None without ever calling the wrapped function.
    attempts = max(1, max_retries)

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(attempts):
                try:
                    return func(*args, **kwargs)
                except (
                    requests.exceptions.ReadTimeout,
                    requests.exceptions.ConnectionError,
                    requests.exceptions.HTTPError,
                    huggingface_hub.errors.HfHubHTTPError,
                ):
                    if attempt >= attempts - 1:
                        # Out of attempts: re-raise with the original traceback.
                        raise
                    if retry_strategy == RetryStrategy.EXPONENTIAL:
                        step_delay = delay * 2**attempt
                    elif retry_strategy == RetryStrategy.LINEAR:
                        step_delay = delay * (attempt + 1)
                    else:
                        step_delay = delay  # Use constant delay.
                    time.sleep(step_delay)

        return wrapper

    return decorator
def truncate_long_seq(sample, sequence_len=2048):
    """
    Cut every over-long row of a batch down to `sequence_len` tokens.

    Rows whose `input_ids` are longer than `sequence_len` are sliced, together
    with the matching rows of `attention_mask`, `labels` and `position_ids`
    when those columns are present. The batch is edited in place and the same
    object is returned.
    """
    companion_columns = ("attention_mask", "labels", "position_ids")

    token_rows = sample["input_ids"]  # Batched (a list of lists)
    for row, tokens in enumerate(token_rows):
        if len(tokens) <= sequence_len:
            continue
        sample["input_ids"][row] = tokens[:sequence_len]
        for column in companion_columns:
            if column in sample:
                sample[column][row] = sample[column][row][:sequence_len]
    return sample
def _truncate_long_sequences(
    dataset: Dataset, max_len: int, map_kwargs: dict
) -> Dataset:
    """Map `truncate_long_seq` over the dataset so no row exceeds `max_len`.

    A progress description is only passed to `map` when `map_kwargs` is
    non-empty.
    """
    extra_kwargs = (
        {"desc": f"Truncating Sequences (target_len={max_len})"} if map_kwargs else {}
    )
    truncate_batch = functools.partial(truncate_long_seq, sequence_len=max_len)

    truncated = dataset.map(
        truncate_batch,
        batched=True,
        **map_kwargs,
        **extra_kwargs,
    )
    LOG.info(f"Truncated long sequences to max length {max_len}")
    return truncated
def handle_long_seq_in_dataset(
    dataset: Dataset, sequence_len: int, cfg: DictDefault
) -> Dataset:
    """Apply the configured excess-length strategy to a tokenized dataset.

    Args:
        dataset: Dataset to process.
        sequence_len: Maximum allowed sequence length.
        cfg: Dictionary mapping `axolotl` config keys to values; reads
            `excess_length_strategy` and `min_sample_len`.

    Returns:
        The dataset unchanged when `_should_skip_processing` says so; otherwise
        the dataset processed according to `excess_length_strategy`
        (case-insensitive, defaulting to 'drop'):
            'truncate' filters out too-short rows, then truncates long rows to
                `sequence_len`
            'raise' runs the length filter with `raise_on_drop` enabled
            anything else drops rows outside `[min_sample_len, sequence_len]`
    """
    # Early return for special cases
    if _should_skip_processing(dataset):
        return dataset

    strategy = (cfg.excess_length_strategy or "drop").lower()

    _log_dataset_stats(dataset)

    op_kwargs = _build_filter_kwargs(dataset, cfg)

    if strategy != "truncate":
        kept, _ = _drop_outside_range(
            dataset,
            sequence_len,
            cfg.min_sample_len,
            strategy == "raise",
            op_kwargs,
        )
        return kept

    kept, _ = _filter_short_sequences(dataset, cfg.min_sample_len, op_kwargs)
    return _truncate_long_sequences(kept, sequence_len, op_kwargs)
def get_dataset_wrapper(
    dataset_config: DictDefault,
    tokenizer: PreTrainedTokenizer,
    cfg: DictDefault,
    dataset_base_type: str | None,
    dataset: Dataset | IterableDataset,
    dataset_prompt_style: str | None = None,
    processor: ProcessorMixin | None = None,
) -> tuple[Dataset | IterableDataset, Prompter | None]:
    """Select and apply the wrapper for a dataset based on its configuration.

    Checks run in a fixed order and the first match wins: already tokenized,
    custom (dict) type, `cfg.skip_prepare_dataset`, Bradley-Terry, stepwise
    supervised, a strategy returned by `load`, a `DATASET_HANDLERS` entry for
    the base type, and finally the unknown-strategy error.

    Args:
        dataset_config: Configuration for the dataset.
        tokenizer: Tokenizer to use for processing text.
        cfg: Global configuration object.
        dataset_base_type: The base type of the dataset.
        dataset: The actual dataset object.
        dataset_prompt_style: Optional prompt style specification.
        processor: Optional processor forwarded to `load`.

    Returns:
        tuple of (dataset_wrapper, dataset_prompter).

    Raises:
        ValueError: Via `handle_unknown_dataset_strategy` when nothing matches.
    """
    # Keyword arguments shared by every wrapping path
    wrap_kwargs: dict[str, Any] = {
        "process_count": cfg.dataset_num_proc,
        "keep_in_memory": cfg.dataset_keep_in_memory is True,
    }

    LOG.info(
        f"Loading dataset: {dataset_config['path']} with base_type: "
        f"{dataset_base_type} and prompt_style: {dataset_prompt_style}"
    )

    # Nothing to do for a dataset that already carries tokenized columns
    if _is_dataset_already_tokenized(dataset):
        return dataset, UnsupportedPrompter()

    # A dict-valued type is a user-defined dataset format
    if isinstance(dataset_config.type, DictDefault):
        return _handle_custom_dataset_type(
            dataset_config, tokenizer, cfg, dataset, wrap_kwargs
        )

    if cfg.skip_prepare_dataset:
        return dataset, None

    # Types routed by prefix: Bradley-Terry first, then stepwise supervised
    prefix_handlers = (
        ("bradley_terry", _handle_bradley_terry_dataset),
        ("stepwise_supervised", _handle_stepwise_supervised_dataset),
    )
    for prefix, prefix_handler in prefix_handlers:
        if dataset_config.type.startswith(prefix):
            return prefix_handler(dataset_config, tokenizer, cfg, dataset, wrap_kwargs)

    # Prompt tokenizer / dataset wrapper strategy from the registry
    strategy = load(
        dataset_config.type, tokenizer, cfg, dataset_config, processor=processor
    )
    if strategy:
        return _handle_loaded_strategy(strategy, dataset, wrap_kwargs)

    # Built-in dataset types with dedicated handlers
    if dataset_base_type in DATASET_HANDLERS:
        return DATASET_HANDLERS[dataset_base_type](
            dataset_prompt_style, tokenizer, cfg, dataset, wrap_kwargs
        )

    # Nothing matched
    handle_unknown_dataset_strategy(dataset_config)
def _handle_bradley_terry_dataset(
    dataset_config: DictDefault,
    tokenizer: PreTrainedTokenizer,
    cfg: DictDefault,
    dataset: Dataset | IterableDataset,
    dataset_kwargs: dict[str, Any],
) -> tuple[Dataset | IterableDataset, Prompter | None]:
    """Handle a Bradley-Terry dataset.

    The strategy name is the part of `dataset_config.type` after the first
    ".", and is resolved with `bradley_terry_load`.

    Args:
        dataset_config: Dataset configuration; `type` must look like
            "<prefix>.<strategy>".
        tokenizer: Tokenizer passed to the strategy loader.
        cfg: Global configuration object.
        dataset: Dataset to wrap.
        dataset_kwargs: Keyword arguments forwarded to
            `wrap_dataset_for_tokenized_prompt`.

    Returns:
        tuple of (wrapped dataset, `UnsupportedPrompter` instance).

    Raises:
        ValueError: Via `handle_unknown_dataset_strategy` when the type has no
            ".<strategy>" part or no strategy could be loaded.
    """
    _, separator, bt_type = dataset_config.type.partition(".")
    if not separator:
        # A type without a "." has no strategy to load. Indexing the result of
        # split() here used to raise a bare IndexError; report it through the
        # same error path as every other unknown strategy instead.
        handle_unknown_dataset_strategy(dataset_config)

    dataset_strategy = bradley_terry_load(bt_type, tokenizer, cfg, dataset_config)
    if not dataset_strategy:
        handle_unknown_dataset_strategy(dataset_config)

    dataset_prompter = UnsupportedPrompter()
    dataset_wrapper = wrap_dataset_for_tokenized_prompt(
        dataset_strategy,
        dataset,
        **dataset_kwargs,
    )
    return dataset_wrapper, dataset_prompter
def _handle_alpaca_dataset(
    dataset_prompt_style: str | None,
    tokenizer: PreTrainedTokenizer,
    cfg: DictDefault,
    dataset: Dataset | IterableDataset,
    dataset_kwargs: dict[str, Any],
) -> tuple[Dataset | IterableDataset, Prompter]:
    """
    Wrap an Alpaca dataset with the Alpaca prompter and tokenizing strategy.

    The prompter is built from `dataset_prompt_style`; the strategy is
    configured from `cfg.train_on_inputs` and `cfg.sequence_len`.

    Returns:
        The wrapped dataset together with the `AlpacaPrompter` used for it.
    """
    prompter = AlpacaPrompter(dataset_prompt_style)
    strategy = AlpacaPromptTokenizingStrategy(
        prompter,
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
    wrapped = wrap_dataset_for_tokenized_prompt(strategy, dataset, **dataset_kwargs)
    return wrapped, prompter
def _handle_jeopardy_dataset(
    dataset_prompt_style: str | None,
    tokenizer: PreTrainedTokenizer,
    cfg: DictDefault,
    dataset: Dataset | IterableDataset,
    dataset_kwargs: dict[str, Any],
) -> tuple[Dataset | IterableDataset, Prompter]:
    """
    Wrap a Jeopardy dataset with the Jeopardy prompter and tokenizing strategy.

    The prompter is built from `dataset_prompt_style`; the strategy is
    configured from `cfg.train_on_inputs` and `cfg.sequence_len`.

    Returns:
        The wrapped dataset together with the `JeopardyPrompter` used for it.
    """
    prompter = JeopardyPrompter(dataset_prompt_style)
    strategy = JeopardyPromptTokenizingStrategy(
        prompter,
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
    wrapped = wrap_dataset_for_tokenized_prompt(strategy, dataset, **dataset_kwargs)
    return wrapped, prompter
def get_default_process_count():
    """
    Return the default number of processes to use for dataset work.

    The first non-empty environment variable wins, in this order:
    `AXOLOTL_DATASET_NUM_PROC`, `AXOLOTL_DATASET_PROCESSES` (deprecated, logs
    a warning), `RUNPOD_CPU_COUNT`. When none is set, the value of
    `os.cpu_count()` is returned as-is.
    """
    env = os.environ

    num_proc = env.get("AXOLOTL_DATASET_NUM_PROC")
    if num_proc:
        return int(num_proc)

    legacy_processes = env.get("AXOLOTL_DATASET_PROCESSES")
    if legacy_processes:
        LOG.warning(
            "AXOLOTL_DATASET_PROCESSES and `dataset_processes` are deprecated and will be "
            "removed in a future version. Please use `dataset_num_proc` instead."
        )
        return int(legacy_processes)

    runpod_cpus = env.get("RUNPOD_CPU_COUNT")
    if runpod_cpus:
        return int(runpod_cpus)

    return os.cpu_count()
def get_device_type() -> torch.device:
    """
    Return the accelerator device type available to this process.

    Backends are probed in the order CUDA, MPS, NPU; the first one reported
    as available is returned. Falls back to the CPU device when none is.
    """
    if is_torch_cuda_available():
        return torch.device("cuda")
    if is_torch_mps_available():
        return torch.device("mps")
    if is_torch_npu_available():
        return torch.device("npu")
    return torch.device("cpu")
def is_main_process() -> bool:
    """
    Tell whether the current process is the main process.

    While the module-level distributed state has not been created yet, the
    decision is based solely on the `LOCAL_RANK` environment variable
    (missing means rank "0"). Once a state exists, a non-distributed run is
    always the main process; otherwise only global rank 0 is.

    Returns:
        `True` if the current process is the main process, `False` otherwise.
    """
    if get_distributed_state() is None:
        # No state yet: fall back to the launcher-provided local rank.
        local_rank = os.environ.get("LOCAL_RANK", "0")
        return local_rank == "0"

    return (not is_distributed()) or dist.get_rank() == 0
def broadcast_dict(vals: dict) -> dict:
    """
    Broadcast a dictionary from the main process to every other rank.

    When not running distributed, `vals` is returned unchanged. Otherwise the
    main process pickles `vals` and the payload is sent with two broadcasts
    from rank 0: first its length, then the bytes themselves. Every rank must
    call this function so the collectives line up.

    Args:
        vals: The dictionary to send. Only the main process's value is used;
            on other ranks it is replaced by the received one.

    Returns:
        The main process's dictionary, on every rank.
    """
    if not is_distributed():
        return vals

    cur_device = get_device_type()
    if is_main_process():
        # Serialize once and ship the raw bytes as a uint8 tensor.
        data_byte = pickle.dumps(vals)
        data_tensor = torch.ByteTensor(list(data_byte)).to(cur_device)
        data_size = torch.IntTensor([len(data_byte)]).to(cur_device)
    else:
        # Placeholders; the real buffer is allocated once the size is known.
        data_tensor = torch.empty([1024], dtype=torch.uint8, device=cur_device)
        data_size = torch.IntTensor([0]).to(cur_device)

    # First collective: tell every rank how many bytes to expect.
    dist.broadcast(data_size, 0)
    if not is_main_process():
        # Allocate a receive buffer of exactly the announced size.
        data_tensor = data_tensor.new_empty([data_size.item()])

    # Second collective: the pickled payload itself.
    dist.broadcast(data_tensor, 0)

    if not is_main_process():
        data_list = data_tensor.cpu().tolist()
        data_byte = bytes(data_list[: data_size.item()])
        # NOTE(review): the payload is whatever rank 0 broadcast; presumably
        # a trusted peer of the same job — confirm before reusing elsewhere,
        # since unpickling untrusted data is unsafe.
        vals = pickle.loads(data_byte)  # nosec

    return vals
def gather_from_all_ranks(fn, world_size=1):
    """
    Evaluate `fn` on every rank and collect the results on the main process.

    Each rank converts its value to a float tensor on its current device and
    takes part in a `dist.gather` towards rank 0. On the main process the
    gathered tensors are converted back to Python numbers: a value equal to
    its integer truncation becomes an `int`, anything else a `float`.

    Args:
    - fn (callable): Computes this rank's value. It is called on every rank.
    - world_size (int, optional): Number of receive buffers allocated on the
      main process. Defaults to 1.

    Returns:
    - A list with one value per rank on the main process, otherwise None.
    """
    local_value = fn()
    local_tensor = torch.tensor(
        local_value, device=f"{get_device_type()}:{get_current_device()}"
    ).float()

    # Only the destination rank allocates receive buffers.
    receive_buffers = (
        [torch.zeros_like(local_tensor) for _ in range(world_size)]
        if is_main_process()
        else None
    )
    dist.gather(local_tensor, gather_list=receive_buffers, dst=0)

    if not is_main_process():
        return None

    # Restore ints where the float round-trip lost nothing.
    return [
        int(item.item()) if item == item.int() else float(item.item())
        for item in receive_buffers
    ]
def build_parallelism_config(cfg):
    """
    Build a `ParallelismConfig` and its device mesh from the training config.

    The parallel sizes are derived by `_get_parallel_config_kwargs` from the
    world size and the `tensor_parallel_size`, `context_parallel_size`,
    `dp_shard_size` and `dp_replicate_size` config fields; FSDP counts as
    enabled when either `cfg.fsdp` or `cfg.fsdp_config` is truthy.

    Returns:
        `(parallelism_config, device_mesh)`, or `(None, None)` when no
        parallelism settings were derived. The mesh is built for "cuda".
    """
    world_size = get_world_size()
    uses_fsdp = bool(cfg.fsdp or cfg.fsdp_config)
    pc_kwargs = _get_parallel_config_kwargs(
        world_size,
        cfg.tensor_parallel_size,
        cfg.context_parallel_size,
        cfg.dp_shard_size,
        cfg.dp_replicate_size,
        uses_fsdp,
    )

    if not pc_kwargs:
        return None, None

    parallelism_config = ParallelismConfig(**pc_kwargs)
    return parallelism_config, parallelism_config.build_device_mesh("cuda")
def check_cuda_p2p_support() -> bool:
    """
    Check whether this rank's GPU can access a neighbouring GPU peer-to-peer.

    The layout is read from the `WORLD_SIZE`, `LOCAL_RANK` and
    `NODE_WORLD_SIZE` (default 8) environment variables. The peer probed is
    device 1 for the first rank of a node block and the block's first device
    for every other rank.

    Returns:
        The result of `torch.cuda.can_device_access_peer` for that pair.
        `True` is returned without probing when only one process is running,
        when any of the environment variables is not a valid integer, when
        `NODE_WORLD_SIZE` is not positive, or when torch raises an
        `AssertionError` for the device indices.
    """
    try:
        world_size = int(os.environ.get("WORLD_SIZE", "1"))
        local_rank = int(os.environ.get("LOCAL_RANK", "0"))
        # Parsed alongside the others so a malformed value is tolerated the
        # same way instead of escaping as an unhandled ValueError.
        node_world_size = int(os.environ.get("NODE_WORLD_SIZE", "8"))
    except ValueError:
        return True

    if world_size <= 1:
        return True

    if node_world_size < 1:
        # A non-positive node size cannot be used for the rank arithmetic
        # below (zero would divide by zero); assume p2p is fine.
        return True

    node_first_rank = (local_rank // node_world_size) * node_world_size
    is_first_in_node = (local_rank % node_world_size) == 0
    local_other_rank = node_first_rank + (1 if is_first_in_node else 0)

    try:
        return torch.cuda.can_device_access_peer(local_rank, local_other_rank)
    except AssertionError as exc:
        # some sort of logic error in indexing processes, assume p2p is fine for now
        LOG.warning(exc)
        return True
""" if isinstance(regex_patterns, str): regex_patterns = [regex_patterns] patterns = [LayerNamePattern(pattern) for pattern in regex_patterns] # Unfreeze layers that match the regex patterns for name, param in model.named_parameters(): param.requires_grad = False unfrozen_ranges = [] for pattern in patterns: if not pattern.match(name): continue param.requires_grad = True if pattern.range is not None: unfrozen_ranges.append(pattern.range) merged_unfrozen_ranges = _merge_ranges(unfrozen_ranges, len(param)) if param.requires_grad and is_main_process(): unfrozen_ranges = ( f" with ranges {merged_unfrozen_ranges}" if merged_unfrozen_ranges else "" ) LOG.debug(f"Unfrozen {name}{unfrozen_ranges}") if not merged_unfrozen_ranges: continue # The range list we need is actually the inverted of the merged ranges ranges_to_freeze = _invert_ranges(merged_unfrozen_ranges, len(param)) param.register_hook(_create_freeze_parameters_hook(ranges_to_freeze)) if is_main_process() and all( not param.requires_grad for param in model.parameters() ): LOG.warning("All parameters are frozen. Model will not be trained.") def _invert_ranges( given_ranges: List[Tuple[int, int]], layer_size: int ) -> List[Tuple[int, int]]: """ Inverts a list of ranges to obtain the ranges not covered by the given ranges. Parameters: - given_ranges (List[Tuple[int, int]]): List of ranges to invert. Each range is represented as a tuple of start (inclusive) and end (exclusive) indices. - layer_size (int): The length of the layer. E.g., len(model.layer.weight) Returns: - List[Tuple[int, int]]: List of inverted ranges, where each range is represented as a tuple of start (inclusive) and end (exclusive) indices. 
""" if not given_ranges: return [(0, layer_size)] inverted_ranges = [] current_start = 0 for start, end in sorted(given_ranges): if start > current_start: inverted_ranges.append((current_start, start)) current_start = max(current_start, end) # Handle the case where the last given range does not reach the end of the total_size if current_start < layer_size: inverted_ranges.append((current_start, layer_size)) return inverted_ranges def _merge_ranges( given_ranges: List[Tuple[int, Union[int, None]]], layer_size: int ) -> List[Tuple[int, int]]: """ Merges overlapping ranges and sorts the given ranges. This function takes a list of ranges and merges any overlapping ranges. The ranges are represented as tuples, where the first element is the start index (inclusive) and the second element is the end index (exclusive). The end index can be None, indicating that the range extends to the end of the sequence. Parameters: - given_ranges (List[Tuple[int, int | None]]): List of ranges to merge. - layer_size (int): The length of the layer. E.g., len(model.layer.weight) Returns: - List[Tuple[int, int]]: List of merged ranges, as start (inclusive) and end (exclusive) indices. 
class LayerNamePattern:
    """
    Represents a regex pattern for layer names, potentially including a parameter index range.

    A pattern is a regex optionally followed by an index suffix: `[i]`,
    `[start:]`, `[:end]` or `[start:end]`. The suffix is stripped before the
    regex is compiled and is exposed as `range`.
    """

    def __init__(self, pattern: str):
        """
        Initializes a new instance of the LayerNamePattern class.

        Parameters:
        - pattern (str): The regex pattern for layer names, potentially including a parameter index range.

        Raises:
        - ValueError: If the range suffix has an end that is not greater than its start.
        """
        self.raw_pattern = pattern
        name_pattern, self.range = self._parse_pattern(pattern)
        # Treat "." as a literal separator between layer names; a "." that is
        # directly followed by "+" is left alone and keeps its regex meaning.
        self.name_regex = re.compile(re.sub(r"\.(?!\+)", "\\.", name_pattern))

    def match(self, name: str) -> bool:
        """
        Checks if the given layer name matches the regex pattern.

        Matching uses `re.match`, so the pattern is anchored at the start of
        `name` but not at its end unless the pattern itself ends with "$".

        Parameters:
        - name (str): The layer name to check.

        Returns:
        - bool: True if the layer name matches the pattern, False otherwise.
        """
        return self.name_regex.match(name) is not None

    def _parse_pattern(
        self, pattern: str
    ) -> Tuple[str, Union[Tuple[int, Union[int, None]], None]]:
        """
        Extracts the range pattern from the given pattern.

        Parameters:
        - pattern (str): The pattern to extract the range from.

        Returns:
        - Tuple[str, Tuple[int, int | None] | None]: A tuple containing the regex pattern to match the layer name
          without the range pattern and the range of layer indices to match, if specified. A single index `[i]`
          becomes `(i, i + 1)`; an omitted start becomes 0 and an omitted end becomes None.

        Raises:
        - ValueError: If both bounds resolve to numbers and start >= end.
        """
        match = re.match(r"^(.+)\[([0-9]*)(?::([0-9]*))?\]$", pattern)
        if not match:
            return pattern, None

        base_pattern, start_part, end_part = match.groups()

        # "[i]": no colon group at all and a numeric index
        if end_part is None and start_part.isdecimal():
            index = int(start_part)
            return base_pattern, (index, index + 1)

        # [:end] or [start:] or [start:end]
        start = int(start_part) if start_part else 0
        end = int(end_part) if end_part else None

        if end is not None and start >= end:
            # The two sentences are separate literals; the second one needs
            # its own leading space or they run together in the message.
            raise ValueError(
                f"Invalid range in layer name pattern: {pattern}."
                " End of range must be greater than start."
            )
        return base_pattern, (start, end)
Args: model: The model to generate from tokenizer: The tokenizer to use for encoding/decoding dataloader: Dataloader to sample prompts from num_generation_samples: Number of samples to generate max_new_tokens: Maximum new tokens to generate temperature: Sampling temperature (0.0 = greedy) top_p: Nucleus sampling parameter top_k: Top-k sampling parameter do_sample: Whether to use sampling vs greedy decoding prompt_ratio: Ratio of sequence to use as prompt (0.0-1.0) Returns: List of dicts with 'prompt', 'generated', and 'full_text' keys """ unwrapped_model = extract_model_from_parallel(model) training = unwrapped_model.training unwrapped_model.eval() device = next(unwrapped_model.parameters()).device generations = [] try: with torch.no_grad(): samples_collected = 0 for batch in dataloader: if samples_collected >= num_generation_samples: break input_ids = batch["input_ids"].to(device) attention_mask = batch.get("attention_mask") if attention_mask is not None: attention_mask = attention_mask.to(device) batch_size = input_ids.shape[0] indices = torch.randperm(batch_size)[ : num_generation_samples - samples_collected ] for idx in indices: if samples_collected >= num_generation_samples: break sequence = input_ids[idx] if attention_mask is not None: seq_len = attention_mask[idx].sum().item() else: seq_len = sequence.shape[0] if seq_len < 5: continue prompt_len = max(1, int(seq_len * prompt_ratio)) prompt_ids = sequence[:prompt_len].unsqueeze(0) try: generation_config = { "max_new_tokens": max_new_tokens, "do_sample": do_sample, "pad_token_id": tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id, } if do_sample: generation_config["temperature"] = temperature if top_p is not None: generation_config["top_p"] = top_p if top_k is not None: generation_config["top_k"] = top_k generated_ids = unwrapped_model.generate( prompt_ids, **generation_config ) prompt_text = tokenizer.decode( prompt_ids[0], skip_special_tokens=True ) generated_text = 
def get_cls_from_module_str(module_str: str):
    """
    Resolve a dotted path such as "package.module.Name" to the named attribute.

    Everything before the last dot is imported as a module with `importlib`;
    the final component is then looked up on that module.

    Raises:
        ValueError: If `module_str` is not a non-empty string or has no dot.
        ImportError: If the module part cannot be imported.
        AttributeError: If the module has no attribute with the final name.
    """
    is_usable = isinstance(module_str, str) and bool(module_str.strip())
    if not is_usable:
        raise ValueError("module_str must be a non-empty string")

    # Split once at the last dot: "<module path>.<attribute name>"
    module_path, dot, cls_name = module_str.rpartition(".")
    if not dot:
        raise ValueError(f"Invalid module string format: {module_str}")

    try:
        module = importlib.import_module(module_path)
        return getattr(module, cls_name)
    except ImportError as e:
        raise ImportError(f"Failed to import module '{module_path}': {e}") from e
    except AttributeError as e:
        raise AttributeError(
            f"Class '{cls_name}' not found in module '{module_path}': {e}"
        ) from e
def get_logger(name: str, log_level: str | None = None) -> MultiProcessAdapter:
    """Return a `MultiProcessAdapter` wrapping the standard logger called `name`.

    The underlying logger's level is always set to ``logging.DEBUG``.

    Args:
        name: Name passed to ``logging.getLogger``.
        log_level: Accepted for call-site compatibility; this function does
            not read it.

    Returns:
        An adapter around the named logger, created with an empty ``extra``.
    """
    base_logger = logging.getLogger(name)
    base_logger.setLevel(logging.DEBUG)
    return MultiProcessAdapter(base_logger, extra={})
state_dict = {} key_list = [key for key, _ in model.named_modules() if model.prefix not in key] for key in key_list: try: _, target, _ = _get_submodules(model, key) except AttributeError: continue with onload_layer(target): weight_key = key.replace(base_model_prefix, "") + ".weight" bias_key = key.replace(base_model_prefix, "") + ".bias" if hasattr(target, "base_layer"): target.merge(safe_merge=True, adapter_names=None) # get the state_dict of target.base_layer layer_state_dict = target.base_layer.state_dict() state_dict[weight_key] = layer_state_dict["weight"] elif isinstance(target, ModulesToSaveWrapper): # save any additional trainable modules part of `modules_to_save` new_module = target.modules_to_save[target.active_adapter] if hasattr(new_module, "base_layer"): # check if the module is itself a tuner layer new_module.merge(safe_merge=True, adapter_names=None) layer_state_dict = new_module.state_dict() state_dict[weight_key] = layer_state_dict["weight"] elif hasattr(target, "weight"): if any( skip in key for skip in [ ".original_module", ".modules_to_save", ".base_layer", ] ): continue layer_state_dict = target.state_dict() state_dict[weight_key] = layer_state_dict["weight"] if hasattr(target, "bias") and "bias" in layer_state_dict.keys(): state_dict[bias_key] = layer_state_dict["bias"] return state_dict ================================================ FILE: src/axolotl/utils/mistral/__init__.py ================================================ """Init for `axolotl.utils.mistral` module.""" from axolotl.utils.mistral.mistral3_processor import Mistral3Processor from axolotl.utils.mistral.mistral_tokenizer import HFMistralTokenizer __all__ = ["HFMistralTokenizer", "Mistral3Processor"] ================================================ FILE: src/axolotl/utils/mistral/mistral3_processor.py ================================================ """Processor for Mistral3 multimodal models with image support""" from typing import Any, Dict, Optional, Union import torch from 
class Mistral3Processor(ProcessorMixin):
    """
    Processor for Mistral3 multimodal models that handles text and images.

    Wraps HFMistralTokenizer and adds image processing capabilities.
    """

    def __init__(self, tokenizer: HFMistralTokenizer):
        super().__init__(tokenizer)

    @property
    def audio_tokenizer(self) -> None:
        """Audio tokenizer is not supported. Dummy method to satisfy HuggingFace API."""
        return None

    def _merge_kwargs(
        self, processor_kwargs_class: Any, **kwargs: Any
    ) -> Dict[str, Dict[str, Any]]:
        """Overlay caller kwargs on the defaults declared by `processor_kwargs_class`.

        Each group in ``processor_kwargs_class._defaults`` is shallow-copied so
        the class-level defaults are never mutated. ``return_tensors``,
        ``return_dict`` and ``tokenize`` are routed to ``common_kwargs``; every
        other key goes to ``text_kwargs``.
        """
        common_keys = ("return_tensors", "return_dict", "tokenize")

        merged: Dict[str, Dict[str, Any]] = {
            group: dict(values)
            for group, values in processor_kwargs_class._defaults.items()
        }

        for key, value in kwargs.items():
            group = "common_kwargs" if key in common_keys else "text_kwargs"
            merged.setdefault(group, {})[key] = value

        return merged

    def apply_chat_template(
        self,
        conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
        **kwargs: Any,
    ) -> Union[BatchFeature, str, list[str]]:
        """
        Apply chat template with image support for Mistral3.

        Delegates to the tokenizer's ``apply_chat_template`` and, when a
        tokenized dict is requested, casts ``pixel_values`` to float32 and adds
        a matching ``image_sizes`` entry before wrapping the result in a
        ``BatchFeature``.

        Raises:
            ValueError: If ``return_tensors`` is not ``"pt"``, or if the
                tokenizer output exposes neither ``items`` nor ``data``.
        """
        merged = self._merge_kwargs(Mistral3ProcessorKwargs, **kwargs)
        text_kwargs = merged["text_kwargs"]
        common_kwargs = merged["common_kwargs"]

        return_tensors = common_kwargs.pop("return_tensors", "pt")
        if return_tensors != "pt":
            raise ValueError(
                f"{self.__class__.__name__} only supports `return_tensors='pt'`."
            )
        return_dict = common_kwargs.pop("return_dict", False)
        tokenize = common_kwargs.pop("tokenize", False)

        # A list whose first element is itself a list/tuple (or exposes
        # `.content`) is treated as already batched; otherwise wrap it.
        is_batched = isinstance(conversation, (list, tuple)) and (
            isinstance(conversation[0], (list, tuple))
            or hasattr(conversation[0], "content")
        )
        conversations = conversation if is_batched else [conversation]  # type: ignore

        tokenizer_kwargs = {
            **text_kwargs,
            **common_kwargs,
            "return_tensors": return_tensors,
            "tokenize": tokenize,
            "return_dict": return_dict,
        }
        encoded = self.tokenizer.apply_chat_template(
            conversations,
            **tokenizer_kwargs,
        )

        if tokenize and return_dict:
            # The tokenizer already handles pixel_values; only image_sizes is added here.
            if hasattr(encoded, "items"):
                data: Dict[str, Any] = dict(encoded)  # type: ignore
            elif hasattr(encoded, "data"):
                data = encoded.data  # type: ignore
            else:
                raise ValueError("Unknown data type")

            if "pixel_values" in data:
                pixel_values = data["pixel_values"]

                # MistralTokenizer returns a Double, so we convert to fp32
                data["pixel_values"] = pixel_values.to(dtype=torch.float32)

                # Always batched: [B, C, H, W] -> image_sizes: [B, 2]
                # Since tensor is homogeneous, all images have same H, W
                batch_size = pixel_values.shape[0]
                data["image_sizes"] = torch.tensor(
                    [pixel_values.shape[-2:]] * batch_size
                )

            return BatchFeature(data=data, tensor_type=return_tensors)

        return encoded if is_batched else encoded[0]

    def __call__(
        self,
        text: Optional[
            Union[
                TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]
            ]
        ],
        **kwargs: Any,
    ) -> BatchFeature:
        """
        Forward text processing to the tokenizer.

        This method does not support images - use apply_chat_template instead.
        """
        merged = self._merge_kwargs(Mistral3ProcessorKwargs, **kwargs)
        text_kwargs = merged["text_kwargs"]
        common_kwargs = merged["common_kwargs"]

        encoded = self.tokenizer(text, **text_kwargs)
        tensor_type = common_kwargs.pop("return_tensors", None)
        return BatchFeature(data=encoded, tensor_type=tensor_type)
""" kwargs.pop("mode", None) mode = ValidationMode.finetuning super().__init__(**kwargs, mode=mode) self._name_or_path = name_or_path # set mode as is not set upstream self._set_mode(mode) @property def name_or_path(self) -> str: return self._name_or_path @name_or_path.setter def name_or_path(self, name_or_path: str) -> None: self._name_or_path = name_or_path @property def chat_template(self) -> str | None: """Chat template is not supported. Dummy method to satisfy HuggingFace API.""" return "[This is a dummy chat template]" @chat_template.setter def chat_template(self, chat_template: str | None) -> None: pass def _set_mode(self, mode: ValidationMode): """Set the mode of the MistralRequestValidator. Args: mode: The mode to set. Raises: RuntimeError: If the MistralRequestValidator does not have a _mode attribute. """ # Check if MistralRequestValidator has a _mode attribute. # This is a private API and may change in the future. from mistral_common.protocol.instruct.validator import MistralRequestValidator if not ( hasattr(self.tokenizer, "_chat_completion_request_validator") and isinstance( self.tokenizer._chat_completion_request_validator, MistralRequestValidator, ) and hasattr(self.tokenizer._chat_completion_request_validator, "_mode") ): raise RuntimeError( f"Unable to switch mistral tokenizer to {mode.value} mode - " "private API `_chat_completion_request_validator._mode` missing." 
@classmethod
def from_pretrained(
    cls,
    pretrained_model_name_or_path: str | os.PathLike,
    *init_inputs,
    mode: ValidationMode = ValidationMode.test,
    cache_dir: Optional[str | os.PathLike] = None,
    force_download: bool = False,
    local_files_only: bool = False,
    token: Optional[str | bool] = None,
    revision: str = "main",
    model_max_length: int = VERY_LARGE_INTEGER,
    padding_side: str = "left",
    truncation_side: str = "right",
    model_input_names: Optional[list[str]] = None,
    clean_up_tokenization_spaces: bool = False,
    **kwargs,
):
    r"""
    Patched fn to pass `name_or_path` and remove extra kwargs.

    Instantiate a `MistralCommonBackend` from a predefined tokenizer.

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            Can be either:

            - A string, the *model id* of a predefined tokenizer hosted inside a model repo on
              huggingface.co.
            - A path to an existing local tokenizer *file*, which is used as-is. Anything that
              is not an existing file is resolved through `download_tokenizer_from_hf_hub`.
        mode (`ValidationMode`, *optional*, defaults to `ValidationMode.test`):
            Validation mode for the `MistralTokenizer` tokenizer. Forwarded to the constructor.
        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded predefined tokenizer vocabulary files
            should be cached if the standard cache should not be used. `None` is forwarded
            unchanged so the downloader's default cache applies.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force the (re-)download the vocabulary files and override the
            cached versions if they exist.
        token (`str` or *bool*, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use
            the token generated when running `hf auth login` (stored in `~/.huggingface`).
        local_files_only (`bool`, *optional*, defaults to `False`):
            Whether or not to only rely on local files and not to attempt to download any files.
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit
            id, since we use a git-based system for storing models and other artifacts on
            huggingface.co, so `revision` can be any identifier allowed by git.
        model_max_length (`int`, *optional*):
            Controls the maximum length to use by one of the truncation/padding parameters.
            Defaults to `VERY_LARGE_INTEGER`.
        padding_side (`str`, *optional*, defaults to `"left"`):
            The side on which the model should have padding applied. Should be selected between
            ['right', 'left'].
        truncation_side (`str`, *optional*, defaults to `"right"`):
            The side on which the model should have truncation applied. Should be selected
            between ['right', 'left'].
        model_input_names (`List[string]`, *optional*):
            The list of inputs accepted by the forward pass of the model (like
            `"token_type_ids"` or `"attention_mask"`).
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not the model should cleanup the spaces that were added when splitting
            the input text during the tokenization process.
        kwargs (additional keyword arguments, *optional*):
            `trust_remote_code` and `tokenizer` are silently dropped, and `_from_auto` is
            tolerated. Any other keyword raises.

    Returns:
        A new instance of `cls`.

    Raises:
        ValueError: If positional `init_inputs` are given, or if unsupported keyword
            arguments remain after the ignored ones are removed.
    """
    if init_inputs:
        raise ValueError(
            "`init_inputs` are not supported by `MistralCommonBackend.from_pretrained`."
        )

    # Delete trust_remote_code as it does nothing
    kwargs.pop("trust_remote_code", None)

    # Delete tokenizer as it does nothing
    kwargs.pop("tokenizer", None)

    # Handle kwargs and AutoTokenizer case
    if kwargs and not kwargs.keys() == {"_from_auto"}:
        raise ValueError(
            f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.from_pretrained`."
        )

    if not os.path.isfile(pretrained_model_name_or_path):
        tokenizer_path = download_tokenizer_from_hf_hub(
            repo_id=str(pretrained_model_name_or_path),
            # str(None) would be the literal directory name "None"; keep None
            # so the downloader falls back to its default cache location.
            cache_dir=str(cache_dir) if cache_dir is not None else None,
            token=token,
            revision=revision,
            force_download=force_download,
            local_files_only=local_files_only,
        )
    else:
        tokenizer_path = str(pretrained_model_name_or_path)

    return cls(
        name_or_path=str(pretrained_model_name_or_path),
        tokenizer_path=tokenizer_path,
        mode=mode,
        model_max_length=model_max_length,
        padding_side=padding_side,
        truncation_side=truncation_side,
        model_input_names=model_input_names,
        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
    )
""" kwargs.pop("save_jinja_files", None) return super().save_pretrained(*args, **kwargs) ================================================ FILE: src/axolotl/utils/mlflow_.py ================================================ """Module for mlflow utilities""" import os from axolotl.utils.dict import DictDefault def setup_mlflow_env_vars(cfg: DictDefault): for key in cfg.keys(): if key.startswith("mlflow_") or key.startswith("hf_mlflow_"): value = cfg.get(key, "") if value and isinstance(value, str) and len(value) > 0: os.environ[key.upper()] = value # Enable mlflow if experiment name is present if cfg.mlflow_experiment_name and len(cfg.mlflow_experiment_name) > 0: cfg.use_mlflow = True # Enable logging hf artifacts in mlflow if value is truthy if cfg.hf_mlflow_log_artifacts is True: os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "true" ================================================ FILE: src/axolotl/utils/model_shard_quant.py ================================================ """ module to handle loading model on cpu/meta device for FSDP """ import os import time from typing import List, Optional, Type, Union import safetensors import torch from accelerate import init_empty_weights from bitsandbytes.nn import Linear4bit, Params4bit from fastcore.parallel import parallel from torch import Tensor, nn from tqdm import tqdm from transformers import AutoModelForCausalLM from transformers.quantizers import AutoHfQuantizer from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, hub def _replace_linear( model: nn.Module, linear_replacement: Type[nn.Module], quant_config: Union[dict, None] = None, skip_modules=None, **kwargs, ): """ Replace linear modules with a new Linear module. Parameters: model (`torch.nn.Module`): Input model or `torch.nn.Module` as the function is run recursively. linear_replacement (`torch.nn.Module`): The linear module that replaces the old one. Only expects standard arguments. If other arguments need to be passed, use a lambda. 
def load_and_quantize(
    module: nn.Module,
    name: str,
    value: Tensor,
    device: torch.device = None,
    dtype: torch.dtype = None,
    skip_names: Optional[List[str]] = None,
    to_cpu: bool = False,
    to_meta: bool = False,
    verbose: bool = False,
    quant_method: str = "bnb",
):
    """
    Loads `value` tensor into submodule of `module`, optionally skipping `skip_names` and
    converting to `dtype`.

    Quantizes `Params4bit` on `device` then places on "cpu" if to_cpu=True or "meta" if
    to_meta=True.

    Args:
        module: Root module that contains the target submodule.
        name: Dotted name of the parameter or buffer; the part after the last
            dot is the attribute name, the part before it the submodule path.
        value: Tensor to load.
        device: Device used for quantization and, when neither `to_cpu` nor
            `to_meta` is set, as the final placement.
        dtype: Dtype `value` is converted to.
        skip_names: Substrings; if any occurs in `name` the tensor is not loaded.
        to_cpu: Place the result on "cpu".
        to_meta: Place the result on "meta" (takes precedence over `to_cpu`).
        verbose: Print a message when a name is skipped.
        quant_method: Only "bnb" triggers parameter re-wrapping; for any other
            value `value` is assigned as passed in.

    Returns:
        None. The submodule attribute is replaced in place. If the submodule
        path cannot be resolved, a message is printed and nothing is loaded.
    """
    if not skip_names:
        skip_names = []

    def place_on_device(tensor):
        # Resolve the target into its own local name. Assigning to `device`
        # here would shadow the enclosing argument and leave it unbound when
        # neither to_meta nor to_cpu is set.
        if to_meta:
            target_device = "meta"
        elif to_cpu:
            target_device = "cpu"
        else:
            target_device = device
        return tensor.to(device=target_device, dtype=dtype)

    if any(skip_name in name for skip_name in skip_names):
        if verbose:
            print(f"Skipping {name} because it is in skip_names")
        return

    module_key, _, value_key = name.rpartition(".")
    try:
        submodule = module.get_submodule(module_key)
    except AttributeError as exc:
        print(f"Module {module_key} not found:\n{exc}")
        return

    try:
        if quant_method == "bnb":
            param = submodule.get_parameter(value_key)
            if isinstance(param, Params4bit):
                # With `sync_module_states=True`, a meta device Params4bit needs to be the same
                # shape as the quantized Params4bit with an initialized quant_state. However,
                # FSDP only syncs parameters and buffers, so the quant_state isn't copied. This
                # workaround quantizes Params4bit to initialize quant_state on all ranks, then
                # replaces Params4bit's data with a meta tensor to free memory on non-rank 0.
                value = type(param)(
                    value.to(device=device, dtype=dtype).data, **param.__dict__
                ).cuda(device)
                if to_meta:
                    value = type(param)(value.data.to("meta"), **value.__dict__)
                elif to_cpu:
                    value = type(param)(value.data.to("cpu"), **value.__dict__)
            else:
                value = type(param)(place_on_device(value).data)
    except AttributeError:
        # get_parameter raised: `value_key` is not a parameter, so it's a buffer
        value = place_on_device(value)

    setattr(submodule, value_key, value)
compute_dtype=compute_dtype, quant_type="nf4", quant_storage=quant_storage, compress_statistics=True, # bnb_4bit_use_double_quant skip_modules=[ "lm_head", "embed_out", ], ) else: # this is the more common case with HF transformers # TODO can we detect the model arch and dynamically set skip_modules model.model = _replace_linear( model.model, Linear4bit, compute_dtype=compute_dtype, quant_type="nf4", quant_storage=quant_storage, compress_statistics=True, # bnb_4bit_use_double_quant skip_modules=[ "lm_head", "embed_out", ], ) model.is_loaded_in_4bit = True # Grab the safetensors files that hold the weights try: idx = hub.cached_file(model_name, SAFE_WEIGHTS_INDEX_NAME) files, _ = hub.get_checkpoint_shard_files(model_name, idx) except OSError: try: # This means the model doesn't have a model.safetensors.index.json because it is not sharded files = [] files.append(hub.cached_file(model_name, SAFE_WEIGHTS_NAME)) except OSError as exc: # This means the model probably doesn't have a safetensors file raise exc # Load in the weights, using our custom load_and_quantize method which quantizes Params4bit on the fly # and then places each layer on CPU or meta if using low_memory to minimize GPU memory usage def load_and_quantize_parallel(name_param, model, **kwargs): name, param = name_param load_and_quantize(model, name, param, **kwargs) quant_method = "bnb" param_count = sum((p.numel() for n, p in model.named_parameters())) n_workers = ( n_loading_workers(quant_method, param_count) if loading_workers == -1 else loading_workers ) if cfg.local_rank == 0 and verbose: print(f"Using n_workers: {n_workers} for loading") start = time.time() for filename in tqdm( files, desc="Loading & Quantizing Model Shards", disable=cfg.local_rank != 0, position=0, ): weights = safetensors.torch.load_file(filename) parallel( load_and_quantize_parallel, iter(weights.items()), n_workers=n_workers, threadpool=True, model=model, dtype=quant_storage, device=cfg.local_rank, skip_names=[], 
to_cpu=(low_memory and cfg.local_rank == 0), to_meta=(low_memory and cfg.local_rank != 0), verbose=verbose, quant_method=quant_method, ) # these attributes are needed to inform transformers/peft of the quantization model.is_quantized = True model.quantization_method = "bitsandbytes" model.hf_quantizer = AutoHfQuantizer.from_config(quantization_config) if cfg.local_rank == 0 and verbose: print(f"Loaded model weights in {time.time() - start:.3f} seconds") # cleanup any extra memory usage from parallel loading torch.cuda.empty_cache() return model ================================================ FILE: src/axolotl/utils/optimizers/__init__.py ================================================ ================================================ FILE: src/axolotl/utils/optimizers/adopt.py ================================================ """ Copied from https://github.com/iShohei220/adopt ADOPT: Modified Adam Can Converge with Any β2 with the Optimal Rate (2024) Taniguchi, Shohei and Harada, Keno and Minegishi, Gouki and Oshima, Yuta and Jeong, Seong Cheol and Nagahara, Go and Iiyama, Tomoshi and Suzuki, Masahiro and Iwasawa, Yusuke and Matsuo, Yutaka """ # mypy: ignore-errors # flake8: noqa # mypy: allow-untyped-decorators # mypy: allow-untyped-defs from typing import Callable, List, Optional, Tuple, Union, cast import torch from torch import Tensor from torch.optim.optimizer import ( # DeviceDict,; _capturable_doc,; _differentiable_doc,; _foreach_doc,; _fused_doc,; _maximize_doc,; _stack_if_compiling, DeviceDict, Optimizer, ParamsT, _capturable_doc, _default_to_fused_or_foreach, _device_dtype_check_for_fused, _differentiable_doc, _disable_dynamo_if_unsupported, _foreach_doc, _fused_doc, _get_capturable_supported_devices, _get_scalar_dtype, _get_value, _maximize_doc, _stack_if_compiling, _use_grad_for_differentiable, _view_as_real, ) __all__ = ["ADOPT", "adopt"] class ADOPT(Optimizer): def __init__( self, params: ParamsT, lr: Union[float, Tensor] = 1e-3, betas: Tuple[float, 
float] = (0.9, 0.9999), eps: float = 1e-6, clip_lambda: Optional[Callable[[int], float]] = lambda step: step**0.25, weight_decay: float = 0.0, decouple: bool = False, *, foreach: Optional[bool] = None, maximize: bool = False, capturable: bool = False, differentiable: bool = False, fused: Optional[bool] = None, ): if isinstance(lr, Tensor): if foreach and not capturable: raise ValueError( "lr as a Tensor is not supported for capturable=False and foreach=True" ) if lr.numel() != 1: raise ValueError("Tensor lr must be 1-element") if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= eps: raise ValueError(f"Invalid epsilon value: {eps}") if not 0.0 <= betas[0] < 1.0: raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}") if not 0.0 <= betas[1] < 1.0: raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}") if not 0.0 <= weight_decay: raise ValueError(f"Invalid weight_decay value: {weight_decay}") self.clip_lambda = clip_lambda defaults = dict( lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, decouple=decouple, maximize=maximize, foreach=foreach, capturable=capturable, differentiable=differentiable, fused=fused, ) super().__init__(params, defaults) if fused: # TODO: support fused raise RuntimeError("`fused` is not currently supported") if differentiable: raise RuntimeError("`fused` does not support `differentiable`") self._step_supports_amp_scaling = True # TODO(crcrpar): [low prec params & their higher prec copy] # Support AMP with FP16/BF16 model params which would need # higher prec copy of params to do update math in higher prec to # alleviate the loss of information. 
if foreach: raise RuntimeError("`fused` and `foreach` cannot be `True` together.") def __setstate__(self, state): super().__setstate__(state) for group in self.param_groups: group.setdefault("maximize", False) group.setdefault("foreach", None) group.setdefault("capturable", False) group.setdefault("differentiable", False) fused = group.setdefault("fused", None) for p in group["params"]: p_state = self.state.get(p, []) if len(p_state) != 0 and not torch.is_tensor(p_state["step"]): step_val = float(p_state["step"]) p_state["step"] = ( torch.tensor( step_val, dtype=_get_scalar_dtype(is_fused=fused), device=p.device, ) if group["capturable"] or group["fused"] else torch.tensor(step_val, dtype=_get_scalar_dtype()) ) def _init_group( self, group, params_with_grad, grads, exp_avgs, exp_avg_sqs, state_steps, ): has_complex = False for p in group["params"]: if p.grad is not None: has_complex |= torch.is_complex(p) params_with_grad.append(p) if p.grad.is_sparse: raise RuntimeError("ADOPT does not support sparse gradients") grads.append(p.grad) state = self.state[p] # Lazy state initialization if len(state) == 0: if group["fused"]: _device_dtype_check_for_fused(p) # note(crcrpar): [special device hosting for step] # Deliberately host `step` on CPU if both capturable and fused are off. # This is because kernel launches are costly on CUDA and XLA. 
    def _init_group(
        self,
        group,
        params_with_grad,
        grads,
        exp_avgs,
        exp_avg_sqs,
        state_steps,
    ):
        """Collect the tensors needed to update one param group.

        For every parameter in ``group`` that has a gradient, appends the
        parameter, its gradient and its optimizer state tensors to the
        caller-supplied output lists (which are mutated in place). State is
        created lazily the first time a parameter is seen.

        Args:
            group: The param group dict being processed.
            params_with_grad: Output list receiving parameters that have a gradient.
            grads: Output list receiving the matching gradients.
            exp_avgs: Output list receiving the ``exp_avg`` state tensors.
            exp_avg_sqs: Output list receiving the ``exp_avg_sq`` state tensors.
            state_steps: Output list receiving the ``step`` counter tensors.

        Returns:
            ``True`` if any parameter with a gradient is complex.

        Raises:
            RuntimeError: On a sparse gradient, on a ``step`` that requires grad
                in differentiable mode, or on a tensor ``lr`` combined with
                ``foreach=True`` and ``capturable=False``.
        """
        has_complex = False
        for p in group["params"]:
            # Parameters without a gradient are skipped entirely.
            if p.grad is not None:
                has_complex |= torch.is_complex(p)
                params_with_grad.append(p)
                if p.grad.is_sparse:
                    raise RuntimeError("ADOPT does not support sparse gradients")
                grads.append(p.grad)

                state = self.state[p]
                # Lazy state initialization
                if len(state) == 0:
                    if group["fused"]:
                        _device_dtype_check_for_fused(p)
                    # note(crcrpar): [special device hosting for step]
                    # Deliberately host `step` on CPU if both capturable and fused are off.
                    # This is because kernel launches are costly on CUDA and XLA.
                    state["step"] = (
                        torch.zeros(
                            (),
                            dtype=_get_scalar_dtype(is_fused=group["fused"]),
                            device=p.device,
                        )
                        if group["capturable"] or group["fused"]
                        else torch.tensor(0.0, dtype=_get_scalar_dtype())
                    )
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(
                        p, memory_format=torch.preserve_format
                    )
                    # Exponential moving average of squared gradient values
                    state["exp_avg_sq"] = torch.zeros_like(
                        p, memory_format=torch.preserve_format
                    )

                exp_avgs.append(state["exp_avg"])
                exp_avg_sqs.append(state["exp_avg_sq"])

                if group["differentiable"] and state["step"].requires_grad:
                    raise RuntimeError(
                        "`requires_grad` is not supported for `step` in differentiable mode"
                    )

                # Foreach without capturable does not support a tensor lr
                if (
                    group["foreach"]
                    and torch.is_tensor(group["lr"])
                    and not group["capturable"]
                ):
                    raise RuntimeError(
                        "lr as a Tensor is not supported for capturable=False and foreach=True"
                    )

                state_steps.append(state["step"])
        return has_complex

    @_use_grad_for_differentiable
    def step(self, closure=None):
        """Perform a single optimization step.

        Each param group is processed independently: its tensors are gathered
        with ``_init_group`` and then updated by the functional ``adopt`` call.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is given.
        """
        self._cuda_graph_capture_health_check()

        loss = None
        if closure is not None:
            # The closure recomputes the loss, so gradients must be enabled for it.
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            # Fresh per-group lists; `_init_group` fills them in place.
            params_with_grad: List[Tensor] = []
            grads: List[Tensor] = []
            exp_avgs: List[Tensor] = []
            exp_avg_sqs: List[Tensor] = []
            state_steps: List[Tensor] = []
            beta1, beta2 = group["betas"]

            has_complex = self._init_group(
                group,
                params_with_grad,
                grads,
                exp_avgs,
                exp_avg_sqs,
                state_steps,
            )

            adopt(
                params_with_grad,
                grads,
                exp_avgs,
                exp_avg_sqs,
                state_steps,
                has_complex=has_complex,
                beta1=beta1,
                beta2=beta2,
                lr=group["lr"],
                # The clipping schedule is read from the optimizer, not the group.
                clip_lambda=self.clip_lambda,
                weight_decay=group["weight_decay"],
                decouple=group["decouple"],
                eps=group["eps"],
                maximize=group["maximize"],
                foreach=group["foreach"],
                capturable=group["capturable"],
                differentiable=group["differentiable"],
                fused=group["fused"],
                # Optional attributes; default to None when they are not set.
                grad_scale=getattr(self, "grad_scale", None),
                found_inf=getattr(self, "found_inf", None),
            )

        return loss
def _single_tensor_adopt(
    params: List[Tensor],
    grads: List[Tensor],
    exp_avgs: List[Tensor],
    exp_avg_sqs: List[Tensor],
    state_steps: List[Tensor],
    grad_scale: Optional[Tensor],
    found_inf: Optional[Tensor],
    *,
    has_complex: bool,
    beta1: float,
    beta2: float,
    lr: Union[float, Tensor],
    clip_lambda: Optional[Callable[[int], float]],
    weight_decay: float,
    decouple: bool,
    eps: float,
    maximize: bool,
    capturable: bool,
    differentiable: bool,
):
    """Apply the ADOPT update one parameter at a time.

    All tensors in ``params``, ``exp_avgs``, ``exp_avg_sqs`` and
    ``state_steps`` are updated in place. ``has_complex`` is accepted for
    signature parity but is not read here; complexity is checked per parameter.

    Args:
        params: Parameters to update.
        grads: Gradients, index-aligned with ``params``.
        exp_avgs: First-moment state tensors.
        exp_avg_sqs: Second-moment state tensors.
        state_steps: Per-parameter step counters.
        grad_scale: Must be ``None`` (asserted).
        found_inf: Must be ``None`` (asserted).
        has_complex: Unused in this implementation.
        beta1: Interpolation coefficient for the first moment.
        beta2: Decay coefficient for the second moment.
        lr: Learning rate.
        clip_lambda: Optional callable mapping the step to a clipping bound for
            the normalized gradient.
        weight_decay: Weight decay coefficient.
        decouple: If true, decay is applied directly to the parameter (scaled by
            ``lr``); otherwise it is added to the gradient.
        eps: Lower bound applied to ``sqrt(exp_avg_sq)``.
        maximize: If true, the gradient is negated.
        capturable: Whether step counters are kept as tensors for graph capture.
        differentiable: Whether step counters are kept as tensors for autograd.
    """
    assert grad_scale is None and found_inf is None

    if torch.jit.is_scripting():
        # this assert is due to JIT being dumb and not realizing that the ops below
        # have overloads to handle both float and Tensor lrs, so we just assert it's
        # a float since most people using JIT are using floats
        assert isinstance(lr, float)

    for i, param in enumerate(params):
        grad = grads[i] if not maximize else -grads[i]
        exp_avg = exp_avgs[i]
        exp_avg_sq = exp_avg_sqs[i]
        step_t = state_steps[i]

        # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable]
        if not torch._utils.is_compiling() and capturable:
            capturable_supported_devices = _get_capturable_supported_devices()
            assert (
                param.device.type == step_t.device.type
                and param.device.type in capturable_supported_devices
            ), (
                f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}."
            )

        # Keep the tensor when capturable/differentiable; otherwise unwrap it via `_get_value`.
        step = step_t if capturable or differentiable else _get_value(step_t)

        # Coupled (L2-style) weight decay is folded into the gradient.
        if weight_decay != 0 and not decouple:
            grad = grad.add(param, alpha=weight_decay)

        # Complex tensors are processed through their real views.
        if torch.is_complex(param):
            grad = torch.view_as_real(grad)
            if exp_avg is not None:
                exp_avg = torch.view_as_real(exp_avg)
            if exp_avg_sq is not None:
                exp_avg_sq = torch.view_as_real(exp_avg_sq)
            param = torch.view_as_real(param)

        # First step: only seed the second moment; the parameter is not updated.
        if step == 0:
            exp_avg_sq.addcmul_(grad, grad.conj())
            # update step
            step_t += 1
            continue

        # Decoupled weight decay shrinks the parameter directly.
        if weight_decay != 0 and decouple:
            param.add_(param, alpha=-lr * weight_decay)

        # Normalize with the second moment from *before* this step's gradient is added.
        denom = torch.clamp(exp_avg_sq.sqrt(), eps)
        normed_grad = grad.div(denom)
        if clip_lambda is not None:
            clip = clip_lambda(step)
            normed_grad.clamp_(-clip, clip)

        exp_avg.lerp_(normed_grad, 1 - beta1)

        param.add_(exp_avg, alpha=-lr)
        # The second moment is refreshed only after the parameter update.
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad.conj(), value=1 - beta2)

        # update step
        step_t += 1
def _multi_tensor_adopt(
    params: List[Tensor],
    grads: List[Tensor],
    exp_avgs: List[Tensor],
    exp_avg_sqs: List[Tensor],
    state_steps: List[Tensor],
    grad_scale: Optional[Tensor],
    found_inf: Optional[Tensor],
    *,
    has_complex: bool,
    beta1: float,
    beta2: float,
    lr: Union[float, Tensor],
    clip_lambda: Optional[Callable[[int], float]],
    weight_decay: float,
    decouple: bool,
    eps: float,
    maximize: bool,
    capturable: bool,
    differentiable: bool,
):
    """Apply the ADOPT update with ``torch._foreach_*`` ops.

    Tensors are grouped by device and dtype and each group is updated with
    batched foreach calls. State tensors are modified in place. The arguments
    mirror ``_single_tensor_adopt``.

    Raises:
        RuntimeError: If ``lr`` is a tensor while ``capturable`` is false.
    """
    if len(params) == 0:
        return

    if isinstance(lr, Tensor) and not capturable:
        raise RuntimeError(
            "lr as a Tensor is not supported for capturable=False and foreach=True"
        )

    # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable]
    if not torch._utils.is_compiling() and capturable:
        capturable_supported_devices = _get_capturable_supported_devices(
            supports_xla=False
        )
        assert all(
            p.device.type == step.device.type
            and p.device.type in capturable_supported_devices
            for p, step in zip(params, state_steps)
        ), (
            f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}."
        )

    assert grad_scale is None and found_inf is None

    assert not differentiable, "_foreach ops don't support autograd"

    grouped_tensors = Optimizer._group_tensors_by_device_and_dtype(
        [params, grads, exp_avgs, exp_avg_sqs, state_steps]  # type: ignore[list-item]
    )
    for (
        device_params_,
        device_grads_,
        device_exp_avgs_,
        device_exp_avg_sqs_,
        device_state_steps_,
    ), _ in grouped_tensors.values():
        device_params = cast(List[Tensor], device_params_)
        device_grads = cast(List[Tensor], device_grads_)
        device_exp_avgs = cast(List[Tensor], device_exp_avgs_)
        device_exp_avg_sqs = cast(List[Tensor], device_exp_avg_sqs_)
        device_state_steps = cast(List[Tensor], device_state_steps_)

        # Handle complex parameters
        if has_complex:
            _view_as_real(
                device_params, device_grads, device_exp_avgs, device_exp_avg_sqs
            )

        if maximize:
            device_grads = torch._foreach_neg(device_grads)  # type: ignore[assignment]

        if weight_decay != 0 and not decouple:
            # Re-use the intermediate memory (device_grads) already allocated for maximize
            if maximize:
                torch._foreach_add_(device_grads, device_params, alpha=weight_decay)
            else:
                device_grads = torch._foreach_add(  # type: ignore[assignment]
                    device_grads, device_params, alpha=weight_decay
                )

        # The first tensor's counter decides for the whole group whether this is
        # the initial step, where only the second moment is seeded.
        if device_state_steps[0] == 0:
            torch._foreach_addcmul_(device_exp_avg_sqs, device_grads, device_grads)

            # Update steps
            # If steps are on CPU, foreach will fall back to the slow path, which is a for-loop calling t.add(1) over
            # and over. 1 will then be wrapped into a Tensor over and over again, which is slower than if we just
            # wrapped it once now. The alpha is required to assure we go to the right overload.
            if not torch._utils.is_compiling() and device_state_steps[0].is_cpu:
                torch._foreach_add_(
                    device_state_steps, torch.tensor(1.0, device="cpu"), alpha=1.0
                )
            else:
                torch._foreach_add_(device_state_steps, 1)

            continue

        # Decoupled weight decay shrinks the parameters directly.
        if weight_decay != 0 and decouple:
            torch._foreach_add_(device_params, device_params, alpha=-lr * weight_decay)

        # Denominator is max(sqrt(exp_avg_sq), eps), from the pre-update second moment.
        exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs)
        torch._foreach_maximum_(exp_avg_sq_sqrt, eps)

        normed_grad = torch._foreach_div(device_grads, exp_avg_sq_sqrt)
        if clip_lambda is not None:
            # The clipping bound is computed from the first tensor's step counter.
            clip = clip_lambda(device_state_steps[0])
            torch._foreach_maximum_(normed_grad, -clip)
            torch._foreach_minimum_(normed_grad, clip)

        torch._foreach_lerp_(device_exp_avgs, normed_grad, 1 - beta1)

        torch._foreach_add_(device_params, device_exp_avgs, alpha=-lr)
        # The second moment is refreshed only after the parameter update.
        torch._foreach_mul_(device_exp_avg_sqs, beta2)
        torch._foreach_addcmul_(
            device_exp_avg_sqs, device_grads, device_grads, value=1 - beta2
        )

        # Update steps
        # If steps are on CPU, foreach will fall back to the slow path, which is a for-loop calling t.add(1) over
        # and over. 1 will then be wrapped into a Tensor over and over again, which is slower than if we just
        # wrapped it once now. The alpha is required to assure we go to the right overload.
        if not torch._utils.is_compiling() and device_state_steps[0].is_cpu:
            torch._foreach_add_(
                device_state_steps, torch.tensor(1.0, device="cpu"), alpha=1.0
            )
        else:
            torch._foreach_add_(device_state_steps, 1)
@_disable_dynamo_if_unsupported(single_tensor_fn=_single_tensor_adopt)
def adopt(
    params: List[Tensor],
    grads: List[Tensor],
    exp_avgs: List[Tensor],
    exp_avg_sqs: List[Tensor],
    state_steps: List[Tensor],
    # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627
    # setting this as kwarg for now as functional API is compiled by torch/distributed/optim
    foreach: Optional[bool] = None,
    capturable: bool = False,
    differentiable: bool = False,
    fused: Optional[bool] = None,
    grad_scale: Optional[Tensor] = None,
    found_inf: Optional[Tensor] = None,
    has_complex: bool = False,
    *,
    beta1: float,
    beta2: float,
    lr: Union[float, Tensor],
    clip_lambda: Optional[Callable[[int], float]],
    weight_decay: float,
    decouple: bool,
    eps: float,
    maximize: bool,
):
    r"""Functional API that performs ADOPT algorithm computation.

    Chooses between the foreach (``_multi_tensor_adopt``) and the per-tensor
    (``_single_tensor_adopt``) implementation and forwards every argument to
    it. The state tensors are updated in place; nothing is returned.

    Note that no fused implementation is dispatched: a truthy ``fused`` only
    affects default resolution and the TorchScript check, after which the
    foreach or per-tensor path is used.

    Raises:
        RuntimeError: If ``state_steps`` contains non-tensors (checked outside
            compilation), or if ``foreach``/``fused`` is requested while
            scripting with ``torch.jit``.
    """
    # Respect when the user inputs False/True for foreach or fused. We only want to change
    # the default when neither have been user-specified. Note that we default to foreach
    # and pass False to use_fused. This is not a mistake--we want to give the fused impl
    # bake-in time before making it the default, even if it is typically faster.
    if fused is None and foreach is None:
        _, foreach = _default_to_fused_or_foreach(
            params, differentiable, use_fused=False
        )
        # Do not flip on foreach for the unsupported case where lr is a Tensor and capturable=False.
        if foreach and isinstance(lr, Tensor) and not capturable:
            foreach = False
    if fused is None:
        fused = False
    if foreach is None:
        foreach = False

    # this check is slow during compilation, so we skip it
    # if it's strictly needed we can add this check back in dynamo
    if not torch._utils.is_compiling() and not all(
        isinstance(t, torch.Tensor) for t in state_steps
    ):
        raise RuntimeError(
            "API has changed, `state_steps` argument must contain a list of singleton tensors"
        )

    if foreach and torch.jit.is_scripting():
        raise RuntimeError("torch.jit.script not supported with foreach optimizers")
    if fused and torch.jit.is_scripting():
        raise RuntimeError("torch.jit.script not supported with fused optimizers")

    # if fused and not torch.jit.is_scripting():
    #     func = _fused_adopt
    # elif foreach and not torch.jit.is_scripting():
    if foreach and not torch.jit.is_scripting():
        func = _multi_tensor_adopt
    else:
        func = _single_tensor_adopt

    func(
        params,
        grads,
        exp_avgs,
        exp_avg_sqs,
        state_steps,
        has_complex=has_complex,
        beta1=beta1,
        beta2=beta2,
        lr=lr,
        clip_lambda=clip_lambda,
        weight_decay=weight_decay,
        decouple=decouple,
        eps=eps,
        maximize=maximize,
        capturable=capturable,
        differentiable=differentiable,
        grad_scale=grad_scale,
        found_inf=found_inf,
    )
def get_quantization_config(
    weight_dtype: TorchAOQuantDType,
    activation_dtype: TorchAOQuantDType | None = None,
    group_size: int | None = None,
) -> AOBaseConfig:
    """Build the torchao config for a weight/activation dtype pairing.

    Args:
        weight_dtype: The dtype to use for weight quantization.
        activation_dtype: The dtype to use for activation quantization, or
            ``None`` for weight-only quantization.
        group_size: Optional group size. It is forwarded to the int4 configs
            only when given, and validated (16 for NVFP4, 32 for MXFP4) for the
            block formats.

    Returns:
        The torchao config object for the requested combination.

    Raises:
        ValueError: For int8 weight-only, int4/int4 and int8/int8 pairings, for
            a wrong NVFP4/MXFP4 group size, and for any pairing that matches
            none of the supported combinations.
    """
    # Only pass `group_size` through when the caller actually provided one.
    size_kwargs = {} if group_size is None else {"group_size": group_size}

    if activation_dtype is None:
        if weight_dtype == TorchAOQuantDType.int8:
            raise ValueError("Int8WeightOnlyConfig is not supported by torchao QAT.")
        if weight_dtype == TorchAOQuantDType.int4:
            from torchao.quantization.quant_api import Int4WeightOnlyConfig

            return Int4WeightOnlyConfig(version=2, **size_kwargs)

    int4 = TorchAOQuantDType.int4
    int8 = TorchAOQuantDType.int8
    fp8 = TorchAOQuantDType.float8_e4m3fn
    pairing = (activation_dtype, weight_dtype)

    if pairing == (int4, int4):
        raise ValueError(
            "Int4DynamicActivationInt4WeightConfig is not supported by torchao QAT."
        )
    if pairing == (int8, int8):
        raise ValueError(
            "Int8DynamicActivationInt8WeightConfig is not supported by torchao QAT."
        )
    if pairing == (int8, int4):
        return Int8DynamicActivationInt4WeightConfig(**size_kwargs)
    if pairing == (fp8, fp8):
        return Float8DynamicActivationFloat8WeightConfig()
    if pairing == (fp8, int4):
        return Float8DynamicActivationInt4WeightConfig()

    # The block formats are selected by weight dtype alone.
    if weight_dtype == TorchAOQuantDType.nvfp4:
        from torchao.prototype.mx_formats import NVFP4InferenceConfig

        if group_size is not None and group_size != 16:
            raise ValueError("NVFP4 quantization must use a group_size of 16")
        return NVFP4InferenceConfig()

    if weight_dtype == TorchAOQuantDType.mxfp4:
        from torchao.prototype.qat import MXFakeQuantizeConfig

        # MXFP4 uses block_size=32 by default (vs NVFP4's 16)
        block_size = 32 if group_size is None else group_size
        if block_size != 32:
            raise ValueError(
                "MXFP4 quantization must use a block_size (group_size) of 32"
            )
        return MXFakeQuantizeConfig(dtype=torch.float4_e2m1fn_x2, block_size=block_size)

    raise ValueError(
        f"Invalid activation/weight dtype combination: {activation_dtype}/{weight_dtype}"
    )


def quantize_model(
    model,
    weight_dtype: TorchAOQuantDType,
    group_size: int | None = None,
    activation_dtype: TorchAOQuantDType | None = None,
    quantize_embedding: bool | None = None,
):
    """Quantize ``model`` in place with torchao.

    Args:
        model: The model to quantize.
        weight_dtype: The dtype to use for weight quantization.
        group_size: The group size to use for weight quantization.
        activation_dtype: The dtype to use for activation quantization.
        quantize_embedding: When truthy, ``torch.nn.Embedding`` modules are
            quantized in a second, weight-only pass.
    """
    quantize_(
        model,
        get_quantization_config(
            weight_dtype=weight_dtype,
            activation_dtype=activation_dtype,
            group_size=group_size,
        ),
    )
    if not quantize_embedding:
        return

    def _is_embedding(module, _fqn):
        return isinstance(module, torch.nn.Embedding)

    # activation fake quantization is not supported for embedding layers
    embedding_config = get_quantization_config(
        weight_dtype=weight_dtype,
        activation_dtype=None,
        group_size=group_size,
    )
    quantize_(model, embedding_config, filter_fn=_is_embedding)


def prepare_model_for_qat(
    model,
    weight_dtype: TorchAOQuantDType,
    group_size: int | None = None,
    activation_dtype: TorchAOQuantDType | None = None,
    quantize_embedding: bool = False,
):
    """Prepare ``model`` for QAT by applying a torchao ``QATConfig`` in place.

    Args:
        model: The model to prepare.
        weight_dtype: The dtype to use for weight quantization.
        group_size: The group size to use for weight quantization.
        activation_dtype: The dtype to use for activation quantization.
        quantize_embedding: When true, ``torch.nn.Embedding`` modules get a
            second, weight-only QAT pass.

    Raises:
        ValueError: If the activation/weight dtype combination is invalid.
    """

    def _as_qat_config(base, with_activations):
        # MX fake-quantize configs are passed per role instead of as a base config.
        if not isinstance(base, MXFakeQuantizeConfig):
            return QATConfig(base)
        if with_activations:
            return QATConfig(activation_config=base, weight_config=base)
        return QATConfig(weight_config=base)

    linear_base = get_quantization_config(
        weight_dtype=weight_dtype,
        activation_dtype=activation_dtype,
        group_size=group_size,
    )
    quantize_(model, _as_qat_config(linear_base, True))

    if not quantize_embedding:
        return

    # activation fake quantization is not supported for embedding layers
    embedding_base = get_quantization_config(
        weight_dtype=weight_dtype,
        activation_dtype=None,
        group_size=group_size,
    )
    quantize_(
        model,
        _as_qat_config(embedding_base, False),
        filter_fn=lambda module, _: isinstance(module, torch.nn.Embedding),
    )


def convert_qat_model(
    model,
    quantize_embedding: bool = False,
):
    """Run torchao's QAT ``convert`` step on ``model`` in place.

    Args:
        model: A model previously prepared for QAT.
        quantize_embedding: When true, the convert step is applied a second
            time restricted to ``torch.nn.Embedding`` modules.
    """
    convert_config = QATConfig(step="convert")
    quantize_(model, convert_config)
    if not quantize_embedding:
        return
    quantize_(
        model,
        convert_config,
        filter_fn=lambda module, _: isinstance(module, torch.nn.Embedding),
    )
@numba.njit
def ffd_check(sequence_lengths: np.ndarray, bin_capacity: int, num_bins: int) -> bool:
    """First-fit-decreasing bin packing algorithm check.

    Checks if sequences with the given lengths could fit in the specified number of
    bins. The input array is not modified; a sorted copy is used.

    Args:
        sequence_lengths: Array of sequence lengths.
        bin_capacity: Maximum capacity of each bin.
        num_bins: Number of bins available.

    Returns:
        `True` if all sequences can be packed, `False` otherwise.
    """
    # Sort sequence lengths in descending order for optimal packing
    sequence_lengths = np.sort(sequence_lengths)[::-1]
    # Initialize all bins with full capacity
    bins = np.full((num_bins,), bin_capacity, dtype=sequence_lengths.dtype)

    # Try to place each sequence in the first bin it fits
    for size in sequence_lengths:
        not_found = True
        for idx in range(num_bins):
            if bins[idx] >= size:
                bins[idx] -= size
                not_found = False
                break

        # If no bin could fit this sequence, packing failed
        if not_found:
            return False

    return True


@numba.njit
def pack_group(
    sequence_lengths: np.ndarray,
    group_offset: int,
    bin_capacity: int,
    max_bins: int,
    bin_size: int,
    safe_mode: bool = True,
) -> list[list[int]]:
    """Pack a group of sequences into bins with a first-fit strategy.

    Sequences are visited in the order given; this function does not sort them,
    so any "decreasing" ordering has to be applied by the caller.

    Args:
        sequence_lengths: Array of sequence lengths.
        group_offset: Offset to apply to indices when returning results.
        bin_capacity: Maximum capacity of each bin.
        max_bins: Maximum number of bins to use.
        bin_size: Maximum number of sequences per bin.
        safe_mode: If True, a sequence that would need a bin beyond `max_bins`
            is skipped instead of opening another bin.

    Returns:
        List of bins, where each bin contains indices of sequences assigned to it.
    """
    bins_remaining_space: list = []  # Tracks remaining capacity in each bin
    bins_assigned_sequences: list = []  # Tracks sequence indices assigned to each bin

    for seq_id, size in enumerate(sequence_lengths):
        global_idx = seq_id + group_offset

        # Try to place sequence in existing bins
        add_new_bin = True
        for bin_idx, _ in enumerate(bins_remaining_space):
            # A bin must have both enough free tokens and a free sequence slot.
            if (
                bins_remaining_space[bin_idx] >= size
                and len(bins_assigned_sequences[bin_idx]) < bin_size
            ):
                bins_remaining_space[bin_idx] -= size
                bins_assigned_sequences[bin_idx].append(global_idx)
                add_new_bin = False
                break

        # Create a new bin if needed and if we haven't reached the limit
        if add_new_bin:
            if len(bins_remaining_space) >= max_bins and safe_mode:
                # In safe mode, skip items that would exceed max_bins
                continue
            # A sequence longer than `bin_capacity` leaves negative remaining space here.
            bins_remaining_space.append(bin_capacity - size)
            bins_assigned_sequences.append([global_idx])

            # Safety check to avoid infinite bins
            if len(bins_remaining_space) > len(sequence_lengths):
                break

    return bins_assigned_sequences


def _process_group(
    args: tuple[np.ndarray, int, int, int, int, bool],
) -> list[list[int]]:
    """Standalone function for multiprocessing.

    Unpacks one task tuple and forwards it to `pack_group`; being a module-level
    function lets it be passed to an executor's `map`.
    """
    group_lengths, start_idx, bin_capacity, max_bins, bin_size, safe_mode = args
    return pack_group(
        group_lengths, start_idx, bin_capacity, max_bins, bin_size, safe_mode
    )
def pack_parallel(
    sequence_lengths: np.ndarray,
    bin_capacity: int,
    group_size: int,
    bin_size: int,
    num_processes: int | None = None,
    safe_mode: bool = True,
    mp_start_method: str | None = "fork",
) -> list[list[int]]:
    """Pack sequences into bins, processing fixed-size groups in parallel.

    The input is split into consecutive groups of `group_size` sequences and
    each group is packed independently, so a bin never mixes sequences from
    two groups. Results are concatenated in group order.

    Args:
        sequence_lengths: Array of sequence lengths.
        bin_capacity: Maximum capacity of each bin as total number of tokens.
        group_size: Number of sequences to process in each group.
        bin_size: Maximum number of sequences that may share one bin.
        num_processes: Number of parallel processes to use. Defaults to the
            number of full groups, capped by the CPU count and by 16. Values
            of 1 or less run sequentially in the calling process.
        safe_mode: If True, use a more conservative packing approach.
        mp_start_method: Multiprocessing start method ('fork', 'spawn', 'forkserver').
            'spawn' is often safer with Numba/PyTorch.
            Set to None to use system default.

    Returns:
        List of bins, where each bin contains indices of sequences assigned to it.
    """
    num_items = len(sequence_lengths)
    if num_processes is None:
        num_processes = max(1, min(num_items // group_size, cpu_count(), 16))

    # Create tasks for parallel processing
    tasks = []
    for start in range(0, num_items, group_size):
        group_lengths = sequence_lengths[start : start + group_size]
        max_bins = len(group_lengths)  # Allow as many bins as items in the group
        tasks.append(
            (group_lengths, start, bin_capacity, max_bins, bin_size, safe_mode)
        )

    all_bins: list[list[int]] = []

    # A worker pool only pays off with several groups and several workers;
    # otherwise pack in-process and avoid the process start-up cost.
    if num_processes <= 1 or len(tasks) <= 1:
        LOG.debug("Using single process for pack_parallel, running sequentially.")
        for task_args in tasks:
            all_bins.extend(_process_group(task_args))
        return all_bins

    mp_ctx = None
    if mp_start_method:
        try:
            mp_ctx = get_context(mp_start_method)
        except ValueError:
            LOG.warning(
                f"Failed to get multiprocessing context '{mp_start_method}'. "
                f"Falling back to default. Available: {get_context().get_all_start_methods()}"
            )
            mp_ctx = None  # Fall back to the default context

    # Never start more workers than there are groups to pack.
    with ProcessPoolExecutor(
        max_workers=min(num_processes, len(tasks)), mp_context=mp_ctx
    ) as executor:
        for group_bins in executor.map(_process_group, tasks):
            all_bins.extend(group_bins)

    return all_bins


@numba.njit
def allocate_sequentially(
    sequence_lengths: np.ndarray, rank: int, bin_capacity: int, num_ranks: int
) -> tuple[list[list[int]], int, int]:
    """Sequential allocator that preserves example order.

    Examples are appended to the current bin until the next one no longer
    fits, at which point a new bin is started. An example longer than
    `bin_capacity` still gets a bin of its own.

    Args:
        sequence_lengths: The lengths of all examples.
        rank: The current rank (for distributed training).
        bin_capacity: The capacity of each bin (maximum sequence length).
        num_ranks: Number of ranks (processes / GPUs).

    Returns:
        rank_batches: List of batches for the current rank.
        total_tokens_used: Number of actual example tokens, summed over all
            bins (not only this rank's).
        total_token_slots: Maximum theoretical number of example tokens (number of
            bins * bin capacity), likewise over all bins.
    """
    result = []
    total_used = 0

    # First, do sequential packing into bins
    all_bins = []
    current_bin = [0 for i in range(0)]  # numba hint
    remaining_capacity = bin_capacity

    for idx, size in enumerate(sequence_lengths):
        if size <= remaining_capacity:
            # Example fits in current bin
            current_bin.append(idx)
            remaining_capacity -= size
            total_used += size
        else:
            # Example doesn't fit, start a new bin
            if current_bin:  # Add non-empty bin to all_bins
                all_bins.append(current_bin)
            current_bin = [idx]
            remaining_capacity = bin_capacity - size
            total_used += size

    # Add the last bin if not empty
    if current_bin:
        all_bins.append(current_bin)

    # Assign bins to ranks - each rank gets every n-th bin
    for bin_idx in range(rank, len(all_bins), num_ranks):
        result.append(all_bins[bin_idx])

    return result, total_used, len(all_bins) * bin_capacity
total_tokens_used: Number of actual example tokens. total_token_slots: Maximum theoretical number of example tokens (number of bins * bin capacity). """ result = [] total_used = 0 # First, do sequential packing into bins all_bins = [] current_bin = [0 for i in range(0)] # numba hint remaining_capacity = bin_capacity for idx, size in enumerate(sequence_lengths): if size <= remaining_capacity: # Example fits in current bin current_bin.append(idx) remaining_capacity -= size total_used += size else: # Example doesn't fit, start a new bin if current_bin: # Add non-empty bin to all_bins all_bins.append(current_bin) current_bin = [idx] remaining_capacity = bin_capacity - size total_used += size # Add the last bin if not empty if current_bin: all_bins.append(current_bin) # Assign bins to ranks - each rank gets every n-th bin for bin_idx in range(rank, len(all_bins), num_ranks): result.append(all_bins[bin_idx]) return result, total_used, len(all_bins) * bin_capacity class MultipackBatchSampler(BatchSampler): """Batch sampler class for efficient packing of variable-length sequences This sampler packs sequences into fixed-capacity bins (batches) to maximize GPU memory utilization and training throughput by reducing padding. It supports both parallel packing (using FFD algorithm) and sequential packing (preserving original sequence order). 
""" _batches: list[list[list[int]]] | None = None _len_across_ranks: int | None = None def __init__( self, sampler: Union[Sampler[int], Iterable[int]], batch_size: int, # Number of bins per batch batch_max_len: int, # Maximum sequence length (bin capacity) lengths: np.ndarray, # Sequence lengths bin_size: int, # The max number of samples that can be packed in a single bin packing_efficiency_estimate: float = 1.0, # Initial efficiency estimate drop_last: bool = True, # Whether to drop final batches (might be incomplete) num_count_samples: int = 4, # Number of times to estimate batch count sequential: bool = False, # Whether to use sequential packing group_size: int = 100_000, # Size of groups for parallel packing num_processes: int | None = None, # Number of processes for parallel packing safe_mode: bool = True, # Conservative packing to prevent training instability mp_start_method: str = "fork", **kwargs, ): super().__init__(sampler, batch_size, drop_last) self.batch_size = batch_size self.batch_max_len = batch_max_len self.lengths = np.array(lengths, dtype=np.int32) self.packing_efficiency_estimate = packing_efficiency_estimate or 1.0 self.sequential = sequential self.group_size = group_size self.bin_size = bin_size self.num_processes = num_processes self.safe_mode = safe_mode self.mp_start_method = mp_start_method assert isinstance(self.lengths, np.ndarray) self.epoch = 0 # Efficiency statistics tracking self.total_tokens_used = 0 self.total_token_slots = 0 # The number of times to calculate batches to determine minimum packed dataset length world_size = int(os.environ.get("WORLD_SIZE", "1")) self.num_count_samples = ( 1 if world_size >= num_count_samples else num_count_samples ) if self.sequential and not isinstance(sampler, SequentialSampler): LOG.warning( "using sequential sample packing with non-sequential sampler, did you want to also enable curriculum_sampling?" 
) def set_epoch(self, epoch: int): """Set the epoch number, used for reproducible shuffling across epochs""" self.epoch = epoch self._batches = None # Invalidate batch cache def generate_batches(self, set_stats: bool = False) -> list[list[list[int]]]: """Generate packed batches for training. Args: set_stats: Whether to update efficiency statistics. Returns: List of batches, where each batch contains multiple bins, and each bin contains multiple sequence indices. """ if self._batches is not None: return self._batches # Get indices from the sampler indices = [idx for idx in self.sampler] # Get lengths of the selected sequences lengths = self.lengths[indices] # Pack sequences into bins using either sequential or parallel packing if self.sequential: bins, total_used, total_slots = allocate_sequentially( lengths, rank=0, bin_capacity=self.batch_max_len, num_ranks=1, ) # Map bin indices back to original indices bins = [[indices[b_idx] for b_idx in bin_indices] for bin_indices in bins] else: # Use parallel packing num_processes = self.num_processes or 1 all_bins = pack_parallel( lengths, bin_capacity=self.batch_max_len, group_size=self.group_size, bin_size=self.bin_size or self.batch_max_len, num_processes=min(4, num_processes) if num_processes else 4, safe_mode=self.safe_mode, mp_start_method=self.mp_start_method, ) # Map bin indices back to original indices bins = [ [indices[b_idx] for b_idx in bin_indices] for bin_indices in all_bins ] # Calculate efficiency statistics total_used = lengths.sum() total_slots = len(all_bins) * self.batch_max_len del all_bins # Group bins into batches (each batch contains batch_size bins) batches = [ bins[i : i + self.batch_size] for i in range(0, len(bins), self.batch_size) ] # Drop last batch if requested and it's incomplete if self.drop_last and len(batches[-1]) < self.batch_size: batches = batches[:-1] # Adjust total_slots if we dropped a batch if not self.sequential: total_slots -= (self.batch_size - len(batches[-1])) * 
self.batch_max_len # Update statistics if requested if set_stats: self.total_tokens_used += total_used self.total_token_slots += total_slots self._batches = batches gc.collect() return batches def __iter__(self) -> Iterator[list[list[int]]]: """Return an iterator over batches. The batches are truncated to match the minimum number of batches across all ranks to ensure distributed training balance. """ batches = self.generate_batches(set_stats=True) if self._len_across_ranks: # Truncate batches to ensure all ranks have the same number of batches batches = batches[: self._len_across_ranks] return iter(batches) def efficiency(self) -> float: """Calculate the packing efficiency (ratio of tokens used to total token slots). Higher is better - 1.0 would mean perfect packing with no wasted space. """ if self.total_token_slots == 0: self.generate_batches(set_stats=True) if self.total_token_slots == 0: return 0.0 # Return a Python float instead of potentially a numpy float return float(self.total_tokens_used / self.total_token_slots) def gather_efficiency(self) -> float: """Gather and synchronize packing efficiency estimates across all distributed ranks. Returns: A conservative efficiency estimate based on the measurements. """ def calc_sample_packing_eff_est(estimates: list[float]): LOG.debug(f"sample_packing_eff_est across ranks: {repr(estimates)}") # Use 99.7% of max observed efficiency as a safe estimate max_eff = max(float(eff) for eff in estimates) return math.floor(0.997 * max_eff) # Gather efficiency from all ranks and apply the calculation function sample_packing_actual_eff_all = reduce_and_broadcast( lambda: float(self.efficiency()), calc_sample_packing_eff_est, ) # Quantize to 0.5% intervals for stability sample_packing_eff_est = ( math.ceil(sample_packing_actual_eff_all * 200.0) / 200.0 ) return sample_packing_eff_est def gather_len_batches(self, num: int) -> int: """Gather and synchronize batch counts across all distributed ranks. 
Returns the minimum number of batches available on any rank. """ def calc_min_len(estimates: list[int]) -> int: LOG.info(f"gather_len_batches: {repr(estimates)}") return math.floor(min(estimates)) # Find minimum batch count across ranks to ensure balance min_len_batches = reduce_and_broadcast(lambda: num, calc_min_len) return min_len_batches def __len__(self) -> int: """Return the total number of batches that will be yielded by this sampler. This is calculated as the minimum number of batches available on any rank to ensure balanced distributed training. """ if self._batches is None: self._batches = self.generate_batches(set_stats=True) if self._len_across_ranks is None: # Sample multiple times to get stable estimate _sampled_lens = [] for _ in range(self.num_count_samples): self._batches = None # Reset cached batches # log timer for generating batches start_time = time.time() _sampled_lens.append(len(self.generate_batches(set_stats=False))) LOG.debug(f"generate_batches time: {time.time() - start_time}") len_batches = min(_sampled_lens) # Gather minimum across all ranks if self._len_across_ranks is None: self._len_across_ranks = self.gather_len_batches(len_batches) else: self._len_across_ranks = min( self._len_across_ranks, self.gather_len_batches(len_batches) ) return self._len_across_ranks ================================================ FILE: src/axolotl/utils/samplers/utils.py ================================================ """ helper util to calculate dataset lengths """ import numpy as np def get_dataset_lengths(dataset, from_arrow=False): if "length" in dataset.column_names: lengths = np.array(dataset["length"]) elif "position_ids" in dataset.column_names: position_ids = dataset["position_ids"] lengths = np.array([x[-1] + 1 for x in position_ids]) else: if from_arrow: input_ids = dataset.data.column("input_ids") lengths = np.vectorize(len)(np.array(input_ids, dtype=object)) else: input_ids = dataset["input_ids"] lengths = np.array([len(seq) for seq in 
class RexLR(LRScheduler):
    """
    Reflected Exponential (REX) learning rate scheduler.

    - Original implementation: https://github.com/IvanVassi/REX_LR
    - Original license: Apache 2.0
    - Based on: https://arxiv.org/abs/2107.04197

    Steps ``1..num_warmup_steps`` scale each base learning rate linearly.
    The following ``total_steps - num_warmup_steps`` steps apply the REX
    decay factor. Every other step yields ``min_lr``.

    Args:
        optimizer (torch.optim.Optimizer): The optimizer to schedule the learning rate for.
        max_lr (float): The maximum learning rate.
        min_lr (float): The minimum learning rate.
        total_steps (int): The total number of training steps.
        num_warmup_steps (int): The number of warmup steps.
        last_step (int): The index of last step.

    Raises:
        ValueError: If ``min_lr > max_lr`` or ``num_warmup_steps > total_steps``.
    """

    def __init__(
        self, optimizer, max_lr, min_lr, total_steps=0, num_warmup_steps=0, last_step=0
    ):
        if min_lr > max_lr:
            raise ValueError(
                f'Value of "min_lr" should be less than value of "max_lr". Got min_lr={min_lr} and max_lr={max_lr}'
            )
        if num_warmup_steps > total_steps:
            raise ValueError(
                f"num_warmup_steps ({num_warmup_steps}) must be less than or equal to total_steps ({total_steps})."
            )

        self.min_lr = min_lr
        self.max_lr = max_lr
        self.total_steps = total_steps
        self.num_warmup_steps = num_warmup_steps
        self.last_step = max(last_step - 1, 0)

        # Ensure each parameter group has an "initial_lr" key to avoid issues when resuming.
        for group in optimizer.param_groups:
            initial_lr = group["lr"]
            if isinstance(initial_lr, Tensor):
                initial_lr = initial_lr.clone()
            group.setdefault("initial_lr", initial_lr)

        # Pass self.last_step as last_epoch to the parent.
        super().__init__(optimizer, last_epoch=self.last_step)

    @property
    def last_step(self):
        """Alias for the ``last_epoch`` counter."""
        return self.last_epoch

    @last_step.setter
    def last_step(self, value):
        self.last_epoch = value

    def get_lr(self):
        """Return one learning rate per parameter group for ``last_step``."""
        # Warmup phase: scale each base lr linearly with the step index.
        if 1 <= self.last_step <= self.num_warmup_steps:
            return [
                base_lr * self.last_step / self.num_warmup_steps
                for base_lr in self.base_lrs
            ]

        # Post-warmup phase: adjust step relative to the end of warmup.
        step_after = self.last_step - self.num_warmup_steps
        remaining_steps = self.total_steps - self.num_warmup_steps

        # Avoid LR spiking. A negative offset means the step lies before the
        # decay phase without being a warmup step (e.g. step 0 with warmup).
        # Letting it through would wrap around in the modulo below and could
        # yield up to the full learning rate, so every negative offset is
        # clamped to min_lr, not only -1.
        if step_after < 0 or step_after >= remaining_steps or remaining_steps <= 0:
            return [self.min_lr for _ in self.base_lrs]

        # Here 0 <= step_after < remaining_steps, so the modulo is a no-op
        # kept only to mirror the reference formula.
        mod_iter = step_after % remaining_steps
        z = (remaining_steps - mod_iter) / remaining_steps
        rex_factor = self.min_lr / self.max_lr + (1.0 - self.min_lr / self.max_lr) * (
            z / (0.1 + 0.9 * z)
        )
        return [base_lr * rex_factor for base_lr in self.base_lrs]
def _get_cosine_schedule_with_quadratic_warmup_lr_lambda(
    current_step: int,
    *,
    num_warmup_steps: int,
    num_training_steps: int,
    num_cycles: float,
):
    """Return the lr multiplier for ``current_step``.

    Before ``num_warmup_steps`` the multiplier is the squared fraction of
    warmup completed. Afterwards it follows a cosine over the remaining
    steps, floored at 0.0.
    """
    in_warmup = current_step < num_warmup_steps
    if in_warmup:
        warmup_fraction = float(current_step) / float(max(1, num_warmup_steps))
        return warmup_fraction**2

    # max(1, ...) keeps the division safe when there are no decay steps.
    decay_span = float(max(1, num_training_steps - num_warmup_steps))
    progress = float(current_step - num_warmup_steps) / decay_span
    angle = math.pi * float(num_cycles) * 2.0 * progress
    cosine_value = 0.5 * (1.0 + math.cos(angle))
    return max(0.0, cosine_value)
def _get_cosine_schedule_with_min_lr_lambda(
    current_step: int,
    *,
    num_warmup_steps: int,
    num_training_steps: int,
    min_lr_ratio: float,
):
    """Return the lr multiplier for ``current_step``.

    The multiplier rises linearly from 0 during warmup, then follows a
    half-cosine from 1.0 down to ``min_lr_ratio`` at ``num_training_steps``.
    Progress is not clamped, so steps past ``num_training_steps`` continue
    along the cosine.
    """
    # Warm up
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))

    # Cosine learning rate decay
    progress = float(current_step - num_warmup_steps) / float(
        max(1, num_training_steps - num_warmup_steps)
    )
    scaling = 0.5 * (1.0 + math.cos(math.pi * progress))
    return (1 - min_lr_ratio) * scaling + min_lr_ratio


def get_cosine_schedule_with_min_lr(
    optimizer: Optimizer,
    num_warmup_steps: int,
    num_training_steps: int,
    min_lr_ratio: float = 0.0,
    last_epoch: int = -1,
):
    """
    Create a learning rate schedule which has:
        - linear warmup from 0 -> `max_lr` over `num_warmup_steps`
        - cosine learning rate annealing from `max_lr` -> `min_lr` over `num_training_steps`

    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
        min_lr_ratio (`float`, *optional*, defaults to 0.0):
            The final multiplier applied to the initial lr (`min_lr / max_lr`).
        last_epoch (`int`, *optional*, defaults to -1):
            The index of the last epoch when resuming training. Forwarded to
            `LambdaLR`, as the other schedule factories in this module do.

    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """
    lr_lambda = partial(
        _get_cosine_schedule_with_min_lr_lambda,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
        min_lr_ratio=min_lr_ratio,
    )
    return LambdaLR(optimizer, lr_lambda, last_epoch)
num_training_steps: int, constant_lr_ratio: float, min_lr_ratio: float, num_cycles: float = 0.5, last_epoch: int = -1, ): """ Implementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (https://arxiv.org/pdf/2308.04014.pdf) Create a schedule with a learning rate that decreases following the values of the cosine function between the initial lr set in the optimizer to min_lr_ratio until num_training_steps * constant_lr_ratio, after constant_rate returns constant value of min_rate , after a warmup period during which it increases linearly between 0 and the initial lr set in the optimizer. Args: optimizer ([`~torch.optim.Optimizer`]): The optimizer for which to schedule the learning rate. num_warmup_steps (`int`): The number of steps for the warmup phase. num_training_steps (`int`): The total number of training steps. constant_lr_ratio: (`float`): The ratio of num_training_steps to decrease by cosine function. min_lr_ratio: (`float): The ratio of maximum learning rate for cosine function to decay to minimum learning rate. num_cycles (`float`, *optional*, defaults to 0.5): The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine). last_epoch (`int`, *optional*, defaults to -1): The index of the last epoch when resuming training. Return: `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. 
""" lr_lambda = partial( _get_cosine_schedule_with_warmup_decay_constant_lr_lambda, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, constant_lr_ratio=constant_lr_ratio, min_lr_ratio=min_lr_ratio, num_cycles=num_cycles, ) return LambdaLR(optimizer, lr_lambda, last_epoch) class JaggedLRRestartScheduler(LRScheduler): """Wraps another scheduler to apply per-lora-restart learning rate warmups.""" def __init__( self, optimizer: Optimizer, inner_schedule: LRScheduler, jagged_restart_steps: int, jagged_restart_warmup_steps: int, jagged_restart_anneal_steps: int = 1, min_lr_scale: float = 0.001, ) -> None: self.inner_schedule = inner_schedule self.restarts_steps = jagged_restart_steps self.warmup_steps = jagged_restart_warmup_steps self.anneal_steps = jagged_restart_anneal_steps self.min_lr_scale = min_lr_scale super().__init__(optimizer, inner_schedule.last_epoch) def get_lr(self) -> float | Sequence[float]: self.inner_schedule.last_epoch = self.last_epoch original = self.inner_schedule.get_lr() step = self.last_epoch if step < self.restarts_steps - self.anneal_steps: scale = 1 else: per_restart_progress = step % self.restarts_steps if per_restart_progress < self.warmup_steps: cycle_t = min(1.0, (per_restart_progress) / self.warmup_steps) elif per_restart_progress > (self.restarts_steps - self.anneal_steps): cycle_t = min( 1.0, (self.restarts_steps - per_restart_progress) / self.anneal_steps, ) else: cycle_t = 1 scale = cycle_t * (1 - self.min_lr_scale) + self.min_lr_scale if isinstance(original, Sequence): return [lr * scale for lr in original] return original * scale ================================================ FILE: src/axolotl/utils/schemas/__init__.py ================================================ ================================================ FILE: src/axolotl/utils/schemas/config.py ================================================ """Module with Pydantic models for configuration.""" from typing import Annotated, Any, Literal from 
accelerate.utils import is_fp8_available from annotated_types import MinLen from packaging import version from pydantic import ( BaseModel, Field, StringConstraints, field_serializer, model_validator, ) from axolotl.utils.datasets import get_default_process_count from axolotl.utils.logging import get_logger from axolotl.utils.schemas.datasets import ( DatasetConfig, DPODataset, KTODataset, PretrainingDataset, SFTDataset, StepwiseSupervisedDataset, ) from axolotl.utils.schemas.deprecated import DeprecatedParameters, RemappedParameters from axolotl.utils.schemas.dynamic_checkpoint import DynamicCheckpointConfig from axolotl.utils.schemas.enums import ChatTemplate, RingAttnFunc, RLType from axolotl.utils.schemas.fsdp import FSDPConfig from axolotl.utils.schemas.integrations import ( CometConfig, GradioConfig, LISAConfig, MLFlowConfig, OpenTelemetryConfig, RayConfig, TrackioConfig, WandbConfig, ) from axolotl.utils.schemas.internal import EnvCapabilities, GPUCapabilities from axolotl.utils.schemas.model import ( ModelInputConfig, ModelOutputConfig, SpecialTokensConfig, ) from axolotl.utils.schemas.multimodal import MultiModalConfig from axolotl.utils.schemas.peft import LoraConfig, ReLoRAConfig from axolotl.utils.schemas.quantization import PTQConfig, QATConfig from axolotl.utils.schemas.training import HyperparametersConfig, JaggedLRConfig from axolotl.utils.schemas.trl import TRLConfig from axolotl.utils.schemas.validation import ValidationMixin from axolotl.utils.schemas.vllm import VllmConfig LOG = get_logger(__name__) class AxolotlInputConfig( ModelInputConfig, ModelOutputConfig, LoraConfig, ReLoRAConfig, JaggedLRConfig, HyperparametersConfig, WandbConfig, MLFlowConfig, CometConfig, TrackioConfig, OpenTelemetryConfig, LISAConfig, GradioConfig, RayConfig, MultiModalConfig, RemappedParameters, DeprecatedParameters, ValidationMixin, BaseModel, ): """Wrapper of all config options.""" model_config = {"populate_by_name": True} strict: bool | None = Field( default=False, 
json_schema_extra={"description": "Allow overwrite yml config using from cli"}, ) resume_from_checkpoint: str | None = Field( default=None, json_schema_extra={"description": "Resume from a specific checkpoint dir"}, ) auto_resume_from_checkpoints: bool | None = Field( default=None, json_schema_extra={ "description": "If resume_from_checkpoint isn't set and you simply want it to start where it left off. Be careful with this being turned on between different models." }, ) resize_token_embeddings_to_32x: bool | None = Field( default=None, json_schema_extra={ "description": "Resize the model embeddings when new tokens are added to multiples of 32. This is reported to improve training speed on some models" }, ) mean_resizing_embeddings: bool | None = False # optionally shrink the embeddings when the tokenizer vocab size is smaller shrink_embeddings: bool | None = Field( default=None, json_schema_extra={ "description": "Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink." }, ) embeddings_skip_upcast: bool | None = Field( default=None, json_schema_extra={ "description": "Don't upcast the embeddings to float32 when using PEFT. 
Useful for low-VRAM GPUs" }, ) reinit_weights: bool | None = Field( default=None, json_schema_extra={ "description": "Reinitialize model weights randomly instead of loading pretrained weights" }, ) trainer_cls: str | None = Field( default=None, json_schema_extra={ "description": "module to custom trainer class to use for training" }, ) rl: RLType | None = Field( default=None, json_schema_extra={ "description": "Use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo'" }, ) trl: TRLConfig | None = Field( default_factory=lambda: TRLConfig(), ) vllm: VllmConfig | None = Field( default_factory=lambda: VllmConfig(), ) qat: QATConfig | None = None quantization: PTQConfig | None = None reward_model: bool | None = Field( default=None, json_schema_extra={"description": "Reward modelling: `True` or `False`"}, ) dynamic_checkpoint: DynamicCheckpointConfig | None = Field( default=None, json_schema_extra={ "description": "Configuration for dynamic checkpointing (trigger by file or signal). " "Set 'enabled: true' to activate this feature." }, ) process_reward_model: bool | None = Field( default=None, json_schema_extra={ "description": "Process reward modelling: `True` or `False`" }, ) center_rewards_coefficient: float | None = Field( default=None, json_schema_extra={ "description": "Coefficient to incentivize the reward model to output mean-zero rewards (proposed by https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`." }, ) num_labels: int | None = None # Whether to use weighting in DPO trainer. # If `None`, default is `False` in the trainer. 
dpo_use_weighting: bool | None = Field( default=None, json_schema_extra={ "description": "Whether to perform weighting in DPO trainer" }, ) dpo_label_smoothing: float | None = None dpo_norm_loss: bool | None = None dpo_use_liger_kernel: bool | None = Field( default=None, json_schema_extra={"description": "Whether to use Liger kernel for DPO loss."}, ) dpo_padding_free: bool | None = None datasets: ( Annotated[ list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset], MinLen(1), ] | None ) = Field( default=None, json_schema_extra={ "description": "A list of one or more datasets to finetune the model with" }, ) test_datasets: ( Annotated[ list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset], MinLen(1), ] | None ) = Field( default=None, json_schema_extra={ "description": "A list of one or more datasets to eval the model with. You can use either test_datasets, or val_set_size, but not both." }, ) shuffle_merged_datasets: bool | None = Field( default=True, json_schema_extra={ "description": "If false, the datasets will not be shuffled and will keep their original order in `datasets`. The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true." }, ) shuffle_before_merging_datasets: bool | None = Field( default=False, json_schema_extra={ "description": "If true, each dataset in `datasets` will be shuffled before merging. This allows curriculum learning strategies to be applied at the dataset level. Default is false." 
}, ) dataset_prepared_path: str | None = Field( default=None, json_schema_extra={ "description": "Axolotl attempts to save the dataset as an arrow after packing the data together so subsequent training attempts load faster, relative path" }, ) dataset_shard_num: int | None = Field( default=None, json_schema_extra={"description": "Num shards for whole dataset"} ) dataset_shard_idx: int | None = Field( default=None, json_schema_extra={"description": "Index of shard to use for whole dataset"}, ) skip_prepare_dataset: bool | None = False num_dataset_shards_to_save: int | None = Field( default=None, json_schema_extra={ "description": "Number of shards to save the prepared dataset" }, ) pretraining_dataset: ( Annotated[list[PretrainingDataset | SFTDataset], MinLen(1)] | None ) = Field( default=None, json_schema_extra={ "description": "Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize" }, ) dataset_processes: int | None = Field( default=None, deprecated="Use `dataset_num_proc` instead. This parameter will be removed in a future version.", json_schema_extra={ "description": ( "The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()` if not set.\n" "For Runpod VMs, it will default to number of vCPUs via RUNPOD_CPU_COUNT." ) }, ) dataset_num_proc: int | None = Field( default=None, json_schema_extra={ "description": ( "The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()` if not set.\n" "For Runpod VMs, it will default to number of vCPUs via RUNPOD_CPU_COUNT." ) }, ) dataset_exact_deduplication: bool | None = Field( default=None, json_schema_extra={ "description": "Deduplicates datasets and test_datasets with identical entries" }, ) dataset_keep_in_memory: bool | None = Field( default=None, json_schema_extra={ "description": "Keep dataset in memory while preprocessing. 
Only needed if cached dataset is taking too much storage" }, ) dataloader_pin_memory: bool | None = None dataloader_num_workers: int | None = None dataloader_prefetch_factor: int | None = None dataloader_drop_last: bool | None = None accelerator_config: dict[str, Any] | None = None remove_unused_columns: bool | None = None push_dataset_to_hub: str | None = Field( default=None, json_schema_extra={ "description": "Push prepared dataset to hub - repo_org/repo_name" }, ) hf_use_auth_token: bool | None = Field( default=None, json_schema_extra={ "description": "Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets. Required to be true when used in combination with `push_dataset_to_hub`" }, ) device: Any | None = None device_map: Any | None = Field( default=None, json_schema_extra={ "description": "Passed through to transformers when loading the model when launched without accelerate. Use `sequential` when training w/ model parallelism to limit memory" }, ) world_size: int | None = None local_rank: int | None = Field( default=None, json_schema_extra={ "description": "Don't mess with this, it's here for accelerate and torchrun" }, ) ddp: bool | None = None seed: int | None = Field( default=None, json_schema_extra={"description": "Seed for reproducibility"} ) ddp_timeout: int | None = Field( default=None, json_schema_extra={"description": "Advanced DDP Arguments - timeout"}, ) ddp_bucket_cap_mb: int | None = Field( default=None, json_schema_extra={"description": "Advanced DDP Arguments - bucket cap in MB"}, ) ddp_broadcast_buffers: bool | None = Field( default=None, json_schema_extra={"description": "Advanced DDP Arguments - broadcast buffers"}, ) ddp_find_unused_parameters: bool | None = None do_causal_lm_eval: bool | None = Field( default=None, json_schema_extra={ "description": "Whether to run causal language model evaluation for metrics in `eval_causal_lm_metrics`" }, ) eval_causal_lm_metrics: list[str] | None = Field( 
default=None, json_schema_extra={ "description": "HF evaluate metrics used during evaluation. Default is ['sacrebleu', 'comet', 'ter', 'chrf', 'perplexity']" }, ) do_bench_eval: bool | None = None bench_dataset: str | None = None bench_split: str | None = None metric_for_best_model: str | None = None greater_is_better: bool | None = None loss_watchdog_threshold: float | None = Field( default=None, json_schema_extra={ "description": "High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)" }, ) loss_watchdog_patience: int | None = Field( default=None, json_schema_extra={ "description": "Number of high-loss steps in a row before the trainer aborts (default: 3)" }, ) gc_steps: int | None = Field( default=None, json_schema_extra={ "description": "Run garbage collection every `gc_steps` steps. -1 will run on epoch end and before evaluations. Default is 0 (disabled)." }, ) bf16: Literal["auto"] | bool | None = Field( default="auto", json_schema_extra={ "description": "Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection. require >=ampere" }, ) fp16: bool | None = Field( default=None, json_schema_extra={"description": "Use CUDA fp16"} ) fp8: bool | None = Field( default=None, json_schema_extra={ "description": "Enable FP8 mixed precision training using TorchAO. Best " "used in combination with torch.compile." }, ) fp8_enable_fsdp_float8_all_gather: bool | None = Field( default=None, json_schema_extra={ "description": "Enable FSDP float8 all-gather optimization for FP8 training. Can " "improve training speed by 10-15% when FSDP is enabled." 
}, ) bfloat16: bool | None = Field( default=None, json_schema_extra={ "description": "No AMP (automatic mixed precision) - require >=ampere" }, ) # for non-AMP cases float16: bool | None = Field( default=None, json_schema_extra={"description": "No AMP (automatic mixed precision)"}, ) # for non-AMP cases tf32: Literal["auto"] | bool | None = Field( default="auto", json_schema_extra={ "description": "bool to use CUDA tf32 or 'auto' for automatic detection - require >=ampere" }, ) float32: bool | None = None gradient_checkpointing: Literal["offload", "offload_disk"] | bool | None = Field( default=False, json_schema_extra={ "description": "Whether to use gradient checkpointing. Available options are: true, false, 'offload', 'offload_disk'. https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing" }, ) gradient_checkpointing_kwargs: dict[str, Any] | None = Field( default=None, json_schema_extra={ "description": "Additional kwargs to pass to the trainer for gradient checkpointing" }, ) activation_offloading: Literal["legacy", "disk"] | bool | None = Field( default=False, json_schema_extra={ "description": "Whether to offload activations. Available options are: true, false, 'legacy', 'disk'." }, ) unfrozen_parameters: list[str] | None = Field( default=None, json_schema_extra={ "description": "List of regex patterns for parameter names to keep unfrozen. " "All other parameters will be frozen via requires_grad=False. " "Note: range-based patterns (e.g. embed_tokens.weight$[:32000]) use gradient " "zeroing rather than a true freeze, so weight decay will still apply to the " "frozen portion and optimizer states are allocated for the full parameter." 
}, ) sequence_len: int = Field( default=512, json_schema_extra={ "description": "The maximum length of an input to train with, this should typically be less than 2048 as most models have a token/context limit of 2048" }, ) excess_length_strategy: Literal["drop", "truncate", "raise"] | None = Field( default=None, json_schema_extra={ "description": "What to do when a tokenized row exceeds sequence_len. 'drop' removes the row; 'truncate' slices tensors to sequence_len; 'raise' raises a ValueError. Defaults to 'drop' for backward compatibility." }, ) eval_sequence_len: int | None = Field( default=None, json_schema_extra={ "description": "The maximum length of an input for evaluation. If not specified, defaults to sequence_len" }, ) min_sample_len: int | None = None max_prompt_len: int | None = Field( default=None, json_schema_extra={"description": "maximum prompt length for RL training"}, ) sample_packing: bool | None = Field( default=None, json_schema_extra={ "description": "Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'" }, ) sample_packing_group_size: int | None = Field( default=100_000, json_schema_extra={ "description": "The number of samples packed at a time. Increasing the following values helps with packing, but usually only slightly (<%1.)" }, ) sample_packing_bin_size: int | None = Field( default=200, json_schema_extra={ "description": "The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples." }, ) sample_packing_sequentially: bool | None = Field( default=None, json_schema_extra={"description": "Whether to pack samples sequentially"}, ) sample_packing_mp_start_method: str | None = Field( default=None, json_schema_extra={ "description": "The multiprocessing start method to use for packing. 
Should be 'fork', 'spawn' or 'forkserver'" }, ) eval_sample_packing: bool | None = Field( default=None, json_schema_extra={ "description": "Set to 'false' if getting errors during eval with sample_packing on" }, ) pad_to_sequence_len: bool | None = Field( default=None, json_schema_extra={ "description": "Pad inputs so each step uses constant sized buffers. This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently. Defaults to True if `sample_packing` enabled" }, ) curriculum_sampling: bool | None = Field( default=None, json_schema_extra={ "description": "Whether to use sequential sampling for curriculum learning" }, ) multipack_real_batches: bool | None = None batch_flattening: Literal["auto"] | bool | None = Field( default=None, json_schema_extra={ "description": "Use batch flattening for speedups when not using sample_packing" }, ) # for PoSE context length extension use_pose: bool | None = None pose_split_on_token_ids: list[int] | None = None pose_max_context_len: int | None = None pose_num_chunks: int | None = None # Deprecated: Use streaming_multipack_buffer_size instead pretrain_multipack_buffer_size: int | None = Field( default=None, deprecated="Deprecated in v0.13.0, will be removed in v0.14.0. 
Use streaming_multipack_buffer_size instead", ) pretrain_multipack_attn: bool | None = Field( default=True, json_schema_extra={ "description": "whether to prevent cross attention for packed sequences during pretraining", }, ) pretraining_sample_concatenation: bool | None = Field( default=None, json_schema_extra={ "description": "whether to concatenate samples during pretraining", }, ) streaming: bool | None = Field( default=None, json_schema_extra={"description": "Use streaming mode for loading datasets"}, ) streaming_multipack_buffer_size: int | None = Field( default=10_000, json_schema_extra={ "description": "Buffer size for multipack streaming datasets" }, ) xformers_attention: bool | None = Field( default=None, json_schema_extra={ "description": "Whether to use xformers attention patch https://github.com/facebookresearch/xformers" }, ) sdp_attention: bool | None = Field( default=None, json_schema_extra={ "description": "Whether to use scaled-dot-product attention https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html" }, ) s2_attention: bool | None = Field( default=None, json_schema_extra={ "description": "Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf" }, ) flex_attention: bool | None = None flex_attn_compile_kwargs: dict[str, Any] | None = None flash_attention: bool | None = Field( default=None, json_schema_extra={ "description": "Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention" }, ) flash_attn_cross_entropy: bool | None = Field( default=None, json_schema_extra={ "description": "Whether to use flash-attention cross entropy implementation - advanced use only" }, ) flash_attn_rms_norm: bool | None = Field( default=None, json_schema_extra={ "description": "Whether to use flash-attention rms norm implementation - advanced use only" }, ) flash_attn_fuse_mlp: bool | None = Field( default=None, json_schema_extra={ "description": "Whether to fuse part of the MLP 
into a single operation" }, ) flash_optimum: bool | None = Field( default=None, json_schema_extra={"description": "Whether to use bettertransformers"}, ) sage_attention: bool | None = Field( default=None, json_schema_extra={ "description": "Whether to use SageAttention https://github.com/thu-ml/SageAttention" }, ) eager_attention: bool | None = None attn_implementation: str | None = Field( default=None, json_schema_extra={ "description": "Specify a custom attention implementation, used mostly for kernels." }, ) experts_implementation: str | None = Field( default=None, json_schema_extra={ "description": "Which experts implementation to use for MoE models," }, ) quantize_moe_experts: bool = Field( default=False, json_schema_extra={ "description": "Quantize MoE expert weights on load to reduce VRAM. " "Requires adapter (lora/qlora) with load_in_4bit or load_in_8bit. " "Requires CUDA (not compatible with ROCm or other backends). " "Note: total parameter count may be reported incorrectly when enabled " "(trainable param count is correct)." }, ) scaling_softmax: bool | None = Field( default=None, json_schema_extra={ "description": "Whether to use Scaled Softmax (SSMax) attention. Ref: https://arxiv.org/abs/2501.19399" }, ) scaling_softmax_factor: float | None = Field( default=None, json_schema_extra={ "description": "Scaling factor for SSMax attention. Default is 0.43" }, ) scaling_softmax_bias: float | None = Field( default=None, json_schema_extra={ "description": "Bias for SSMax attention. Default is 0.0. Note: The paper recommends bias=0 for better length generalization." 
}, ) unsloth_cross_entropy_loss: bool | None = None unsloth_lora_mlp: bool | None = None unsloth_lora_qkv: bool | None = None unsloth_lora_o: bool | None = None unsloth_rms_norm: bool | None = None unsloth_rope: bool | None = None lora_mlp_kernel: bool | None = Field( default=None, json_schema_extra={ "description": "Apply custom LoRA autograd functions and activation function Triton kernels for speed and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html" }, ) lora_qkv_kernel: bool | None = Field( default=None, json_schema_extra={ "description": "Apply custom LoRA autograd functions and activation function Triton kernels for speed and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html" }, ) lora_o_kernel: bool | None = Field( default=None, json_schema_extra={ "description": "Apply custom LoRA autograd functions and activation function Triton kernels for speed and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html" }, ) chunked_cross_entropy: bool | None = Field( default=None, json_schema_extra={ "description": "Whether to use chunked cross entropy loss for memory efficiency" }, ) chunked_cross_entropy_num_chunks: int | None = Field( default=None, json_schema_extra={ "description": "Number of chunks to use for chunked cross entropy loss" }, ) use_eaft: bool | None = Field( default=None, json_schema_extra={ "description": "Enable Entropy-Aware Focal Training loss (EAFT)" }, ) eaft_alpha: float | None = Field( default=1.0, json_schema_extra={ "description": "Exponent for entropy weighting in EAFT (default: 1.0)" }, ) eaft_k: int | None = Field( default=20, json_schema_extra={ "description": "Number of top logits for entropy approximation (default: 20)" }, ) tiled_mlp: bool | None = Field( default=None, json_schema_extra={ "description": "Whether to use ALST tiled mlp for memory efficient long context" }, ) tiled_mlp_num_shards: int | None = Field( default=None, json_schema_extra={ "description": "Number of shards to use 
for ALST tiled mlp. If unset, it will be set based on seqlen/hidden_size" }, ) tiled_mlp_use_original_mlp: bool | None = Field( default=True, json_schema_extra={ "description": "Whether to use original mlp for ALST tiled mlp. Otherwise uses a generic MLP based on llama." }, ) llama4_linearized_experts: bool | None = None deepspeed: str | dict[str, Any] | None = Field( default=None, json_schema_extra={ "description": "Deepspeed config path. e.g., deepspeed_configs/zero3.json" }, ) deepcompile: bool | None = Field( default=None, json_schema_extra={ "description": "Whether to use deepcompile for faster training with deepspeed" }, ) fsdp: list[str] | None = Field( default=None, json_schema_extra={"description": "FSDP configuration"}, deprecated="Configuring FSDP using `fsdp` is deprecated. Please use `fsdp_config` instead. ", ) fsdp_config: FSDPConfig | None = Field( default=None, json_schema_extra={"description": "FSDP configuration options"} ) fsdp_version: int | None = Field( default=None, json_schema_extra={"description": "FSDP version"}, ) fsdp_final_state_dict_type: ( Literal["FULL_STATE_DICT", "LOCAL_STATE_DICT", "SHARDED_STATE_DICT"] | None ) = Field( default=None, deprecated="Configuring FSDP final state dict type using `fsdp_final_state_dict_type` is deprecated. Please use `fsdp_config.final_state_dict_type` instead.", ) val_set_size: float | None = Field( default=0.0, json_schema_extra={ "description": "How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval." }, ) dp_shard_size: int | None = Field( default=None, json_schema_extra={ "description": "Number of devices to shard across. If not set, will use all available devices." 
}, ) dp_replicate_size: int | None = Field( default=None, json_schema_extra={"description": "Number of devices to replicate across."}, ) sequence_parallel_degree: int | None = Field( default=None, json_schema_extra={ "description": "Deprecated: use `context_parallel_size` instead" }, ) context_parallel_size: int | None = Field( default=None, json_schema_extra={ "description": "Set to a divisor of the number of GPUs available to split sequences into chunks of equal size. Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more details." }, ) heads_k_stride: int | None = Field( default=None, json_schema_extra={ "description": "Optional; strides across the key dimension. Larger values use more memory but should make training faster. Must evenly divide the number of KV heads in your model." }, ) ring_attn_func: RingAttnFunc | None = Field( default=None, json_schema_extra={ "description": "One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing case." }, ) tensor_parallel_size: int | None = Field( default=None, json_schema_extra={ "description": "Number of tensor parallel processes in TP group. Only supported with DeepSpeed AutoTP." }, ) special_tokens: SpecialTokensConfig | None = Field( default=None, json_schema_extra={ "description": "Add or change special tokens. If you add tokens here, you don't need to add them to the `tokens` list." 
}, ) tokens: list[str] | None = Field( default=None, json_schema_extra={"description": "Add extra tokens to the tokenizer"}, ) added_tokens_overrides: dict[int, str] | None = Field( default=None, json_schema_extra={ "description": "Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer. Only works for tokens that are not part of the base vocab (aka are added_tokens). Can be checked if they exist in tokenizer.json added_tokens." }, ) torch_compile: Literal["auto"] | bool | None = Field( default=None, json_schema_extra={ "description": "Whether to use torch.compile and which backend to use. setting to `auto` will enable torch compile when torch>=2.6.0" }, ) torch_compile_backend: str | None = Field( default=None, json_schema_extra={"description": "Backend to use for torch.compile"}, ) torch_compile_mode: Literal["default", "reduce-overhead", "max-autotune"] | None = ( None ) max_steps: int | None = Field( default=None, json_schema_extra={ "description": "Maximum number of iterations to train for. It precedes num_epochs which means that if both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps" }, ) warmup_steps: int | None = Field( default=None, json_schema_extra={ "description": "Number of warmup steps. Cannot use with warmup_ratio" }, ) warmup_ratio: float | None = Field( default=None, json_schema_extra={"description": "Warmup ratio. Cannot use with warmup_steps"}, ) eval_steps: int | float | None = Field( default=None, json_schema_extra={ "description": "Leave empty to eval at each epoch, integer for every N steps. 
float for fraction of total steps" }, ) evals_per_epoch: int | None = Field( default=None, json_schema_extra={ "description": "Number of times per epoch to run evals, mutually exclusive with eval_steps" }, ) eval_strategy: str | None = Field( default=None, json_schema_extra={ "description": "Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer from `eval_steps`" }, ) save_steps: int | float | None = Field( default=None, json_schema_extra={ "description": "Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps" }, ) saves_per_epoch: int | None = Field( default=None, json_schema_extra={ "description": "Number of times per epoch to save a checkpoint, mutually exclusive with save_steps" }, ) save_strategy: str | None = Field( default=None, json_schema_extra={ "description": "Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better result is achieved, leave empty to infer from `save_steps`" }, ) save_total_limit: int | None = Field( default=None, json_schema_extra={"description": "Checkpoints saved at a time"} ) save_first_step: bool | None = Field( default=None, json_schema_extra={ "description": "Whether to checkpoint a model after the first step of training. Defaults to False." }, ) logging_steps: int | None = Field( default=None, json_schema_extra={"description": "Logging frequency"} ) early_stopping_patience: int | None = Field( default=None, json_schema_extra={ "description": "Stop training after this many evaluation losses have increased in a row. https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback" }, ) load_best_model_at_end: bool | None = False save_only_model: bool | None = Field( default=False, json_schema_extra={ "description": "Save only the model weights, skipping the optimizer. Using this means you can't resume from checkpoints." 
}, ) use_tensorboard: bool | None = Field( default=None, json_schema_extra={"description": "Use tensorboard for logging"} ) profiler_steps: int | None = Field( default=None, json_schema_extra={ "description": "Enable the pytorch profiler to capture the first N steps of training to the output_dir. see https://pytorch.org/blog/understanding-gpu-memory-1/ for more information. Snapshots can be visualized @ https://pytorch.org/memory_viz" }, ) profiler_steps_start: int | None = Field( default=0, json_schema_extra={ "description": "Which step to start the profiler at. Useful for only capturing a few steps mid-run." }, ) include_tokens_per_second: bool | None = Field( default=None, json_schema_extra={ "description": "bool of whether to report tokens per second at the end of training. This is not supported with pre-training datasets." }, ) include_tkps: bool | None = Field( default=True, json_schema_extra={ "description": "bool of whether to report tokens per second per-gpu during training by measuring throughput of non-padding tokens." }, ) neftune_noise_alpha: float | None = Field( default=None, json_schema_extra={ "description": "NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings. Currently only supported on Llama and Mistral" }, ) orpo_alpha: float | None = Field( default=None, json_schema_extra={ "description": "Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to `beta` in `ORPOConfig` due to trl mapping." 
}, ) rpo_alpha: float | None = Field( default=None, json_schema_extra={ "description": "Weighting of NLL term in loss from RPO paper" }, ) simpo_gamma: float | None = Field( default=None, json_schema_extra={"description": "Target reward margin for the SimPO loss"}, ) cpo_alpha: float | None = Field( default=None, json_schema_extra={"description": "Weight of the BC regularizer"} ) kto_desirable_weight: float | None = Field( default=None, json_schema_extra={"description": "Factor for desirable loss term in KTO loss"}, ) kto_undesirable_weight: float | None = Field( default=None, json_schema_extra={ "description": "Factor for undesirable loss term in KTO loss" }, ) rl_beta: float | None = Field( default=None, json_schema_extra={"description": "The beta parameter for the RL training"}, ) max_memory: dict[int | Literal["cpu", "disk"], int | str] | None = Field( default=None, json_schema_extra={ "description": "Defines the max memory usage per gpu on the system. Passed through to transformers when loading the model." }, ) gpu_memory_limit: int | str | None = Field( default=None, json_schema_extra={ "description": "Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset" }, ) low_cpu_mem_usage: bool | None = Field( default=None, json_schema_extra={"description": "Whether to use low_cpu_mem_usage"}, ) chat_template: ( ChatTemplate | Annotated[str, StringConstraints(pattern="^tokenizer_default_fallback_")] ) | None = Field( default=None, json_schema_extra={ "description": "The name of the chat template to use for training, following values are supported: tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default value. alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py. 
tokenizer_default_fallback_*: where * is the name of the chat template to fallback to. E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not available in the tokenizer. jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field. The selected chat template will be saved to the tokenizer_config.json for easier inferencing" }, ) chat_template_jinja: str | None = Field( default=None, json_schema_extra={ "description": "Custom jinja template or path to jinja file for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null." }, ) chat_template_kwargs: dict[str, Any] | None = Field( default=None, json_schema_extra={ "description": "Additional kwargs to pass to the chat template. This is useful for customizing the chat template. For example, you can pass `thinking=False` to add a generation prompt to the chat template." }, ) eot_tokens: list[str] | None = Field( default=None, json_schema_extra={ "description": "Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the boundaries between conversation turns. For example: ['/INST', '
', '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is useful for templates that use multiple delimiter tokens." }, ) default_system_message: str | None = Field( default=None, json_schema_extra={ "description": "Changes the default system message. Currently only supports chatml." }, ) fix_untrained_tokens: int | list[int] | None = Field( default=None, json_schema_extra={ "description": ( "Token index or indices to adjust embedding weights to the mean of the other tokens. " "This is useful when the model has untrained embeddings." ) }, ) # INTERNALS - document for now, generally not set externally is_preprocess: bool | None = None preprocess_iterable: bool | None = None total_num_tokens: int | None = Field( default=None, json_schema_extra={"description": "Total number of tokens - internal use"}, ) total_supervised_tokens: int | None = None sample_packing_eff_est: float | None = Field( default=None, json_schema_extra={ "description": "You can set these packing optimizations AFTER starting a training at least once. The trainer will provide recommended values for these values." }, ) axolotl_config_path: str | None = None is_falcon_derived_model: bool | None = Field( default=None, json_schema_extra={ "description": "Internal use only - Used to identify which the model is based on" }, ) is_llama_derived_model: bool | None = Field( default=None, json_schema_extra={ "description": "Internal use only - Used to identify which the model is based on" }, ) is_mistral_derived_model: bool | None = Field( default=None, json_schema_extra={ "description": "Internal use only - Used to identify which the model is based on. 
@field_serializer("datasets")
def datasets_serializer(
    self, ds_configs: list[DatasetConfig] | None
) -> list[dict[str, Any]] | None:
    """Serialize the `datasets` field as a list of plain dicts.

    Each dataset config is dumped with `exclude_none=True`, so unset options
    are omitted from the output. A missing or empty list serializes to `None`.
    """
    # Guard clause: both `None` and `[]` are reported as "no datasets".
    if not ds_configs:
        return None

    dumped: list[dict[str, Any]] = []
    for entry in ds_configs:
        dumped.append(entry.model_dump(exclude_none=True))
    return dumped
class AxolotlConfigWCapabilities(AxolotlInputConfig):
    """Wrapper to validate GPU capabilities with the configured options.

    Adds the detected hardware (`capabilities`) and software environment
    (`env_capabilities`) to the input config so validators can reject, warn
    about, or fill in options depending on the machine.

    Validators declared with `mode="before"` treat `data` as a mapping and read
    `capabilities` / `env_capabilities` through `.get`; validators declared with
    `mode="after"` work on the model instance (`self`).
    """

    capabilities: GPUCapabilities
    env_capabilities: EnvCapabilities

    @model_validator(mode="after")
    def check_bf16(self):
        """Reject bf16 when the GPU lacks it; note when it is available but unused.

        Raises:
            ValueError: bf16/bfloat16 is explicitly `True`, the GPU does not
                report bf16 support, and this is neither a LoRA merge nor a
                preprocess-only run.
        """
        if self.capabilities.bf16:
            if not self.bf16 and not self.bfloat16:
                LOG.info(
                    "bf16 support detected, but not enabled for this configuration."
                )
        elif (
            not self.merge_lora
            and not self.is_preprocess
            and (self.bf16 is True or self.bfloat16 is True)
        ):
            raise ValueError(
                "bf16 requested, but AMP is not supported on this GPU. Requires Ampere series or above."
            )
        return self

    @model_validator(mode="after")
    def check_tf32(self):
        """Resolve `tf32: auto` to the value reported by the GPU capabilities."""
        if self.tf32 == "auto":
            self.tf32 = self.capabilities.tf32
        return self

    @model_validator(mode="after")
    def check_fp8(self):
        """Ensure fp8 is only requested when both GPU and libraries support it.

        Raises:
            ValueError: fp8 is requested but the GPU does not report fp8
                support, or `is_fp8_available()` is false.
        """
        if self.fp8:
            if not self.capabilities.fp8:
                raise ValueError("fp8 requested, but fp8 is not supported on this GPU")
            if not is_fp8_available():
                raise ValueError(
                    "fp8 requested, but missing one of ms-amp, transformers-engine or torchao."
                )
        return self

    @model_validator(mode="before")
    @classmethod
    def check_sample_packing_w_sdpa_bf16(cls, data):
        """Warn about sample packing + torch SDPA + bf16 on non-sm_90 GPUs."""
        # Use `.get` like the sibling validators so a missing or null
        # `capabilities` entry does not raise KeyError here.
        capabilities = data.get("capabilities") or {}
        is_sm_90: bool = capabilities.get("compute_capability") == "sm_90"
        if (
            data.get("sample_packing")
            and data.get("sdp_attention")
            and (data.get("bfloat16") or data.get("bf16"))
            and not is_sm_90
        ):
            # https://github.com/pytorch/pytorch/blob/1b03423526536b5f3d35bdfa95ccc6197556cf9b/test/test_transformers.py#L2440-L2450
            LOG.warning(
                "sample_packing & torch sdpa with bf16 is unsupported may results in 0.0 loss. "
                "This may work on H100s."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_compute_capability_w_sageattn(cls, data):
        """Reject SageAttention on compute capabilities outside the allow-list.

        The check is skipped when no capabilities are supplied.

        Raises:
            ValueError: `sage_attention` is set and the reported compute
                capability is not one of the listed `sm_*` values.
        """
        capabilities = data.get("capabilities")
        if (
            data.get("sage_attention")
            and capabilities
            and capabilities.get("compute_capability")
            not in ["sm_80", "sm_86", "sm_89", "sm_90", "sm_120"]
        ):
            raise ValueError(
                "SageAttention supports compute capability between sm_80 and sm_120. "
                "Please use a different attention implementation."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_multigpu_unsloth(cls, data):
        """Reject the unsloth LoRA patches when more than one GPU is reported.

        Raises:
            ValueError: any `unsloth_lora_*` option is enabled and
                `capabilities.n_gpu` is greater than 1.
        """
        if (
            data.get("unsloth_lora_mlp")
            or data.get("unsloth_lora_qkv")
            or data.get("unsloth_lora_o")
        ):
            capabilities = data.get("capabilities")
            if capabilities and capabilities.get("n_gpu", 0) > 1:
                raise ValueError(
                    "unsloth_lora_mlp, unsloth_lora_qkv, and unsloth_lora_o are not compatible with multi-GPU training."
                )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_multigpu_lora_kernels(cls, data):
        """Reject the LoRA kernels on multi-GPU runs that use FSDP version 1.

        Raises:
            ValueError: a `lora_*_kernel` option is enabled, more than one GPU
                is reported, and `fsdp_config` is set without
                `fsdp_version` being 2.
        """
        if (
            data.get("lora_mlp_kernel")
            or data.get("lora_qkv_kernel")
            or data.get("lora_o_kernel")
        ):
            capabilities = data.get("capabilities")
            is_fsdp = data.get("fsdp_config") is not None
            is_fsdp2 = is_fsdp and str(data.get("fsdp_version")) == "2"
            is_multi_gpu = capabilities and capabilities.get("n_gpu", 0) > 1
            if is_multi_gpu and is_fsdp and not is_fsdp2:
                raise ValueError(
                    "lora_mlp_kernel, lora_qkv_kernel, and lora_o_kernel are not compatible with FSDP1."
                )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_quantize_moe_experts(cls, data):
        """Validate the option combination required by `quantize_moe_experts`.

        Raises:
            ValueError: `lora_target_linear` is set, the adapter is not
                lora/qlora, neither 4-bit nor 8-bit loading is enabled, or the
                reported compute capability does not start with `sm_`.
        """
        if data.get("quantize_moe_experts"):
            if data.get("lora_target_linear"):
                raise ValueError(
                    "lora_target_linear is not compatible with quantize_moe_experts. "
                    "Use lora_target_parameters to target expert weights instead."
                )
            if data.get("adapter") not in ("lora", "qlora"):
                raise ValueError("quantize_moe_experts requires adapter: lora or qlora")
            if not (data.get("load_in_4bit") or data.get("load_in_8bit")):
                raise ValueError(
                    "quantize_moe_experts requires load_in_4bit or load_in_8bit"
                )
            capabilities = data.get("capabilities")
            compute_capability = (
                capabilities.get("compute_capability") if capabilities else None
            )
            # Only enforced when a compute capability string is actually reported.
            if compute_capability and not compute_capability.startswith("sm_"):
                raise ValueError(
                    "quantize_moe_experts requires CUDA (not compatible with ROCm or other backends)"
                )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_auto_enable_lora_kernels(cls, data):
        """Turn on all three LoRA kernels by default for eligible LoRA/QLoRA runs.

        Nothing is changed when any of these hold: `rl` is set, the adapter is
        not lora/qlora, a kernel option was given explicitly, an unsloth LoRA
        option is on, the adapter is `lora` with `load_in_8bit`,
        `trust_remote_code` is on, `lora_dropout` is not 0, or the run is
        multi-GPU with FSDP version 1.
        """
        # RL trainers not tested so don't enable kernels by default
        if data.get("rl"):
            return data

        # Only proceed if using LoRA or QLoRA adapter
        if data.get("adapter") not in ["lora", "qlora"]:
            return data

        # Skip if already set, using unsloth optimizations, or using 8-bit
        unsloth_fields = ["unsloth_lora_mlp", "unsloth_lora_qkv", "unsloth_lora_o"]
        kernel_fields = ["lora_mlp_kernel", "lora_qkv_kernel", "lora_o_kernel"]
        if (
            any(data.get(k) is not None for k in kernel_fields)
            or any(data.get(k) for k in unsloth_fields)
            or (data.get("adapter") == "lora" and data.get("load_in_8bit"))
        ):
            return data

        # Skip if trust_remote_code is enabled, as lora kernels are not compatible
        if data.get("trust_remote_code"):
            return data

        # Skip if dropout is not 0, as auto enabling it would just disable it during runtime patch checks
        if data.get("lora_dropout") != 0:
            return data

        # Check multi-GPU compatibility
        capabilities = data.get("capabilities")
        is_multi_gpu = capabilities and capabilities.get("n_gpu", 0) > 1
        is_fsdp = data.get("fsdp_config") is not None
        is_fsdp2 = is_fsdp and str(data.get("fsdp_version")) == "2"

        # Eligible unless this is multi-GPU FSDP1 (absorption-simplified form of
        # "single GPU, or multi-GPU without FSDP, or multi-GPU with FSDP2").
        if not is_multi_gpu or not is_fsdp or is_fsdp2:
            # All three kernel options are known to be unset here (see the
            # early return above), so they can be enabled unconditionally.
            for kernel_field in kernel_fields:
                data[kernel_field] = True

            LOG.warning(
                "Auto-enabling LoRA kernel optimizations for faster training. "
                + "Please explicitly set `lora_*_kernel` config values to `false` to disable. "
                + "See https://docs.axolotl.ai/docs/lora_optims.html for more info."
            )

        return data

    @model_validator(mode="before")
    @classmethod
    def check_adopt_torch_version(cls, data):
        """Require torch >= 2.5.1 when the optimizer name contains `adopt`.

        The version comes from `env_capabilities.torch_version`, falling back
        to the installed torch (local build suffix after `+` removed).

        Raises:
            ValueError: the resolved torch version is older than 2.5.1.
        """
        optimizer = data.get("optimizer")
        if optimizer is not None and "adopt" in optimizer:
            # `or {}` also covers an explicit null `env_capabilities` entry.
            env_capabilities = data.get("env_capabilities") or {}
            torch_version = env_capabilities.get("torch_version")

            if torch_version is None:
                import torch

                torch_version = str(torch.__version__).split("+", maxsplit=1)[0]

            if version.parse(torch_version) < version.parse("2.5.1"):
                raise ValueError(
                    "ADOPT optimizer is incompatible with torch version < 2.5.1"
                )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_flex_torch_version(cls, data):
        """Require torch >= 2.6.0 when `flex_attention` is enabled.

        Raises:
            ValueError: the resolved torch version is older than 2.6.0.
        """
        if data.get("flex_attention"):
            env_capabilities = data.get("env_capabilities") or {}
            torch_version = env_capabilities.get("torch_version")

            if torch_version is None:
                import torch

                torch_version = str(torch.__version__).split("+", maxsplit=1)[0]

            if version.parse(torch_version) < version.parse("2.6.0"):
                raise ValueError(
                    "Flex attention is not supported on torch version < 2.6.0"
                )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_torch_compile_auto(cls, data):
        """Resolve `torch_compile: auto` to a concrete boolean.

        It becomes `True` when `env_capabilities.torch_version` is reported and
        is at least 2.5.1, and `False` otherwise (including when no version is
        reported).
        """
        if data.get("torch_compile") == "auto":
            env_capabilities = data.get("env_capabilities") or {}
            torch_version = env_capabilities.get("torch_version")
            if torch_version and version.parse(torch_version) >= version.parse(
                "2.5.1"
            ):
                LOG.info("torch.compile is available, setting torch_compile to True")
                data["torch_compile"] = True
            else:
                data["torch_compile"] = False
        return data

    @model_validator(mode="before")
    @classmethod
    def check_beta_and_trl_beta_match(cls, data):
        """Ensure top-level `beta` and `trl.beta` agree when both are set.

        Raises:
            ValueError: both values are truthy and differ.
        """
        # `or {}` tolerates `trl: null` as well as a missing `trl` section.
        trl_cfg = data.get("trl") or {}
        if (
            data.get("beta")
            and trl_cfg.get("beta")
            and data["beta"] != trl_cfg["beta"]
        ):
            raise ValueError("beta and trl.beta must match or one must be removed")
        return data

    @model_validator(mode="after")
    def check_min_torch_version(self):
        """Warn when the reported torch version is older than 2.6.0."""
        if self.env_capabilities and self.env_capabilities.torch_version:
            torch_version = self.env_capabilities.torch_version
            if version.parse(torch_version) < version.parse("2.6.0"):
                LOG.warning(
                    f"torch=={torch_version} not be supported. Please upgrade to torch>=2.6.0."
                )
        return self

    @model_validator(mode="before")
    @classmethod
    def check_qat_config(cls, data):
        """Validate that QAT is not mixed with incompatible options.

        Does nothing when no `qat` section is configured.

        Raises:
            ValueError: `peft`, `load_in_8bit` or `load_in_4bit` is set
                alongside QAT, or the resolved torch version is older than
                2.6.0.
        """
        qat_cfg = data.get("qat", {})
        if not qat_cfg:
            return data

        if data.get("peft"):
            raise ValueError("QAT and PEFT cannot be used together.")

        if data.get("load_in_8bit"):
            raise ValueError("QAT and load_in_8bit cannot be used together.")

        if data.get("load_in_4bit"):
            raise ValueError("QAT and load_in_4bit cannot be used together.")

        env_capabilities = data.get("env_capabilities") or {}
        torch_version = env_capabilities.get("torch_version")

        if torch_version is None:
            import torch

            torch_version = str(torch.__version__).split("+", maxsplit=1)[0]

        if version.parse(torch_version) < version.parse("2.6.0"):
            raise ValueError("QAT is not supported on torch version < 2.6.0")

        return data

    @model_validator(mode="before")
    @classmethod
    def check_fsdp_torch_version(cls, data):
        """Require torch >= 2.7.0 when FSDP version 2 is configured.

        Raises:
            ValueError: `fsdp_config` is set with `fsdp_version` 2 and the
                resolved torch version is older than 2.7.0.
        """
        # Decide first whether the check applies, so torch is only imported
        # (as a version fallback) when FSDP2 is actually requested.
        if not (data.get("fsdp_config") and str(data.get("fsdp_version")) == "2"):
            return data

        env_capabilities = data.get("env_capabilities") or {}
        torch_version = env_capabilities.get("torch_version")

        if torch_version is None:
            import torch

            torch_version = str(torch.__version__).split("+", maxsplit=1)[0]

        if version.parse(torch_version) < version.parse("2.7.0"):
            raise ValueError("FSDP2 is not supported on torch version < 2.7.0")

        return data

    @model_validator(mode="before")
    @classmethod
    def default_dataloader_opts(cls, data):
        """Fill in dataloader defaults when none of the three options is set.

        Sets `dataloader_num_workers` to the reported GPU count (1 when
        unknown), `dataloader_pin_memory` to `True` and
        `dataloader_prefetch_factor` to 256. If any of the three is already
        provided, all are left untouched.
        """
        if (
            data.get("dataloader_num_workers") is None
            and data.get("dataloader_pin_memory") is None
            and data.get("dataloader_prefetch_factor") is None
        ):
            # Missing or null capabilities fall back to a single worker rather
            # than raising AttributeError on `None.get`.
            capabilities = data.get("capabilities") or {}
            data["dataloader_num_workers"] = capabilities.get("n_gpu", 1)
            data["dataloader_pin_memory"] = True
            data["dataloader_prefetch_factor"] = 256

        return data

    @model_validator(mode="before")
    @classmethod
    def default_dataset_num_proc(cls, data):
        """Migrate deprecated `dataset_processes` and default `dataset_num_proc`.

        When `dataset_processes` is given it is copied into `dataset_num_proc`
        unless that is already set, a warning is logged, and the deprecated key
        is removed. Otherwise an unset `dataset_num_proc` is filled from
        `get_default_process_count()`.
        """
        if data.get("dataset_processes") is not None:
            if data.get("dataset_num_proc") is None:
                data["dataset_num_proc"] = data["dataset_processes"]
                LOG.warning(
                    "dataset_processes is deprecated and will be removed in a future version. "
                    "Please use dataset_num_proc instead."
                )
            else:
                LOG.warning(
                    "Both dataset_processes and dataset_num_proc are set. "
                    "Using dataset_num_proc and ignoring dataset_processes."
                )
            del data["dataset_processes"]
        elif data.get("dataset_num_proc") is None:
            data["dataset_num_proc"] = get_default_process_count()

        return data

    @model_validator(mode="before")
    @classmethod
    def check_deduplication_with_streaming(cls, data):
        """Reject exact deduplication for streaming or pretraining datasets.

        Raises:
            NotImplementedError: `dataset_exact_deduplication` is set together
                with `streaming` or `pretraining_dataset`.
        """
        if data.get("dataset_exact_deduplication") and (
            data.get("streaming") or data.get("pretraining_dataset")
        ):
            raise NotImplementedError(
                "dataset_exact_deduplication is not available for streaming datasets. "
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_deduplication_with_skip_prepare(cls, data):
        """Reject exact deduplication when dataset preparation is skipped.

        Raises:
            ValueError: both `dataset_exact_deduplication` and
                `skip_prepare_dataset` are set.
        """
        if data.get("dataset_exact_deduplication") and data.get("skip_prepare_dataset"):
            raise ValueError(
                "dataset_exact_deduplication=True has no effect when "
                "skip_prepare_dataset=True. Deduplication runs as part of the "
                "prepare pipeline, which is skipped. Either set "
                "skip_prepare_dataset: false or disable "
                "dataset_exact_deduplication."
            )
        return data
[alpaca, gpteacher, oasst, reflection]" }, ) input_transform: str | None = None shards: int | None = Field( default=None, json_schema_extra={ "description": "split dataset into N pieces (use with shards_idx)" }, ) shards_idx: int | None = Field( default=None, json_schema_extra={"description": "the index of sharded dataset to use"}, ) preprocess_shards: int | None = Field( default=None, json_schema_extra={ "description": "process dataset in N sequential chunks for memory efficiency (exclusive with `shards`)" }, ) conversation: str | None = None # Do not make this too strict or it will break the validator to choose different dataset class chat_template: ChatTemplate | str | None = Field( default=None, json_schema_extra={ "description": "The name of the chat template to use for training, following values are supported: tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default. alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py. tokenizer_default_fallback_*: where * is the name of the chat template to fallback to if the tokenizer does not have a chat template else default to tokenizer. E.g. tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field." }, ) chat_template_jinja: str | None = Field( default=None, json_schema_extra={ "description": "Custom jinja chat template or path to jinja file. Used only if `chat_template: jinja` or empty." 
}, ) data_files: str | list[str] | None = Field( default=None, json_schema_extra={"description": "path to source data files"} ) input_format: str | None = None name: str | None = Field( default=None, json_schema_extra={"description": "name of dataset configuration to load"}, ) ds_type: str | None = Field( default=None, json_schema_extra={"description": "defines the datatype when path is a file"}, ) field: str | None = Field( default=None, json_schema_extra={ "description": "For `completion` datasets only, uses the provided field instead of `text` column" }, ) field_human: str | None = None field_model: str | None = None field_messages: str | None = Field( default=None, json_schema_extra={ "description": 'Key containing the messages (default: "messages")' }, ) field_tools: str | None = Field( default=None, json_schema_extra={ "description": 'Key containing the tools (default: "tools"). Must be a list[dict] and follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).' }, ) field_thinking: str | None = Field( default=None, json_schema_extra={ "description": 'Key containing the reasoning trace (default: "reasoning_content").' }, ) template_thinking_key: str | None = Field( default=None, json_schema_extra={ "description": "The key the chat template expects that indicates the reasoning trace." }, ) # deprecated, use message_property_mappings message_field_role: str | None = None # deprecated, use message_property_mappings message_field_content: str | None = None message_property_mappings: dict[str, str] | None = Field( default=None, json_schema_extra={ "description": "Mapping of properties from the input dataset to the chat template. (default: message_property_mappings={'role':'role', 'content':'content'}) If a property exists in the template but not in this mapping, the system will attempt to load it directly from the message using the property name as the key. 
Example: In the mapping below, 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and used as 'content' in the chat template." }, ) message_field_training: str | None = Field( default=None, json_schema_extra={ "description": "The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`." }, ) message_field_training_detail: str | None = Field( default=None, json_schema_extra={ "description": "The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn. The value of the key is a List[Dict] containing `begin_offset` (start character index in content), `end_offset` (end character index in content), and `train` (boolean whether to train)." }, ) split_thinking: bool | None = Field( default=None, json_schema_extra={ "description": "(for Qwen3 template only) Whether to split the assistant content based on a reasoning trace inside delimited tags" }, ) logprobs_field: str | None = None temperature: float | None = None roles_to_train: list[str] | None = Field( default=None, json_schema_extra={ "description": "Roles to train on. The tokens from these roles will be considered for the loss." }, ) train_on_eos: Literal["all", "turn", "last"] | None = Field( default=None, json_schema_extra={ "description": "Which EOS tokens to train on in the conversation. Possible values are: all: train on all EOS tokens, turn (default): train on the EOS token at the end of each trainable turn, last: train on the last EOS token in the conversation" }, ) roles: dict[str, list[str]] | None = Field( default=None, json_schema_extra={ "description": 'Roles mapping in the messages. The format is {target_role: [source_roles]}. All source roles will be mapped to the target role. 
@model_validator(mode="before")
@classmethod
def check_chat_template_config(cls, data):
    """Normalise the `chat_template` / `chat_template_jinja` pair before validation.

    Accepts either a raw mapping or an already-built pydantic model (which is
    dumped to a dict first) and returns the, possibly mutated, mapping.

    Rules, applied in order:
      1. `type: chat_template` with no `chat_template` falls back to
         `ChatTemplate.tokenizer_default`.
      2. `chat_template: jinja` without `chat_template_jinja` is rejected.
      3. `chat_template_jinja` with no `chat_template` implies
         `ChatTemplate.jinja`.

    Raises:
        ValueError: when the jinja template is selected but no template
            body/path was supplied.
    """
    if isinstance(data, BaseModel):
        data = data.model_dump()

    custom_jinja = data.get("chat_template_jinja")
    selected = data.get("chat_template")

    # Rule 1: default the template for chat_template-typed datasets.
    if not selected and data.get("type") == "chat_template":
        selected = ChatTemplate.tokenizer_default
        data["chat_template"] = selected

    # Rule 2: an explicit jinja selection needs the template itself.
    if selected == ChatTemplate.jinja and not custom_jinja:
        raise ValueError(
            "chat_template_jinja is required when chat_template is set to jinja"
        )

    # Rule 3: a bare custom template implies the jinja selection.
    if custom_jinja and not selected:
        data["chat_template"] = ChatTemplate.jinja

    return data
class UserDefinedKTOType(BaseModel):
    """User defined typing for KTO

    Every option is optional and defaults to None. The `field_*` options name
    keys in the dataset rows and the `*_format` options are format strings.
    """

    field_system: str | None = None
    field_prompt: str | None = None
    field_completion: str | None = None
    # Was `bool | None`, which rejects a key name such as "label" even though
    # every sibling `field_*` option is a string key. `bool` is kept in the
    # union so configs that previously validated still do.
    # NOTE(review): presumably consumed as a column name like its siblings -
    # confirm against the KTO prompt strategy that reads this config.
    field_label: str | bool | None = None
    prompt_format: str | None = None
    completion_format: str | None = None
class DeprecatedParameters(BaseModel):
    """configurations that are deprecated

    Each field is kept only so that its validator can react when a config
    still sets it. Two reactions exist:

    * removed options (`max_packed_sequence_len`, `rope_scaling`,
      `dpo_use_logits_to_keep`, `dpo_generate_during_eval`) raise
      `DeprecationWarning` as an exception;
    * superseded options (`noisy_embedding_alpha`, `dpo_beta`,
      `evaluation_strategy`, `eval_table_size`, `eval_max_new_tokens`) only
      log a warning and keep the supplied value.
    """

    max_packed_sequence_len: int | None = None
    rope_scaling: Any | None = None
    noisy_embedding_alpha: float | None = None
    dpo_beta: float | None = None
    evaluation_strategy: str | None = None
    eval_table_size: int | None = None
    eval_max_new_tokens: int | None = None
    dpo_use_logits_to_keep: bool | None = None
    dpo_generate_during_eval: bool | None = None

    @field_validator("max_packed_sequence_len")
    @classmethod
    def validate_max_packed_sequence_len(cls, max_packed_sequence_len):
        """Reject any truthy `max_packed_sequence_len`; falsy values pass through."""
        if max_packed_sequence_len:
            raise DeprecationWarning("`max_packed_sequence_len` is no longer supported")
        return max_packed_sequence_len

    @field_validator("rope_scaling")
    @classmethod
    def validate_rope_scaling(cls, rope_scaling):
        """Reject any truthy top-level `rope_scaling`; falsy values pass through."""
        if rope_scaling:
            # Fixed the duplicated word ("be be") in the user-facing message.
            raise DeprecationWarning(
                "`rope_scaling` is no longer supported, it should now be a key under `model_config`"
            )
        return rope_scaling

    @field_validator("noisy_embedding_alpha")
    @classmethod
    def validate_noisy_embedding_alpha(cls, noisy_embedding_alpha):
        """Warn when a truthy `noisy_embedding_alpha` is given; the value is returned as-is."""
        if noisy_embedding_alpha:
            LOG.warning("noisy_embedding_alpha is deprecated, use neftune_noise_alpha")
        return noisy_embedding_alpha

    @field_validator("dpo_beta")
    @classmethod
    def validate_dpo_beta(cls, dpo_beta):
        """Warn when `dpo_beta` is set (including 0); the value is returned as-is."""
        if dpo_beta is not None:
            LOG.warning("dpo_beta is deprecated, use rl_beta instead")
        return dpo_beta

    @field_validator("evaluation_strategy")
    @classmethod
    def validate_evaluation_strategy(cls, evaluation_strategy):
        """Warn when `evaluation_strategy` is set; the value is returned as-is."""
        if evaluation_strategy is not None:
            LOG.warning("evaluation_strategy is deprecated, use eval_strategy instead")
        return evaluation_strategy

    @field_validator("eval_table_size")
    @classmethod
    def validate_eval_table_size(cls, eval_table_size):
        """Warn when `eval_table_size` is set; the value is returned as-is."""
        if eval_table_size is not None:
            LOG.warning(
                "eval_table_size is deprecated and superseded by generate_samples config. "
                "Please use generate_samples: true and num_generation_samples instead. "
                "The LogPredictionCallback is replaced by the new sample generation feature."
            )
        return eval_table_size

    @field_validator("eval_max_new_tokens")
    @classmethod
    def validate_eval_max_new_tokens(cls, eval_max_new_tokens):
        """Warn when `eval_max_new_tokens` is set; the value is returned as-is."""
        if eval_max_new_tokens is not None:
            LOG.warning(
                "eval_max_new_tokens is deprecated and superseded by generate_samples config. "
                "Please use generation_max_new_tokens instead."
            )
        return eval_max_new_tokens

    @field_validator("dpo_use_logits_to_keep")
    @classmethod
    def validate_dpo_use_logits_to_keep(cls, dpo_use_logits_to_keep):
        """Reject `dpo_use_logits_to_keep` whenever it is set, even to False."""
        if dpo_use_logits_to_keep is not None:
            raise DeprecationWarning(
                "`dpo_use_logits_to_keep` is no longer supported, "
                "it has been removed in TRL >= 0.29.0"
            )
        return dpo_use_logits_to_keep

    @field_validator("dpo_generate_during_eval")
    @classmethod
    def validate_dpo_generate_during_eval(cls, dpo_generate_during_eval):
        """Reject `dpo_generate_during_eval` whenever it is set, even to False."""
        if dpo_generate_during_eval is not None:
            raise DeprecationWarning(
                "`dpo_generate_during_eval` is no longer supported, "
                "it has been removed in TRL >= 0.29.0"
            )
        return dpo_generate_during_eval
class DynamicCheckpointConfig(BaseModel):
    """Configuration for dynamic checkpoint triggering during training.

    Attributes are documented through each field's `description`; the notes
    below only cover what the descriptions do not.
    """

    enabled: bool = Field(
        default=False,
        json_schema_extra={
            "description": "Enable dynamic checkpoint triggering during training. "
            "Create a file 'axolotl_checkpoint.save' in the configured `output_dir` to trigger. "
        },
    )
    # The default previously read 10 while the description below advertises
    # 100; the code now matches the documented value. `ge=1` rejects zero and
    # negative intervals.
    check_interval: int = Field(
        default=100,
        ge=1,
        json_schema_extra={
            "description": "Check for trigger file every N steps (reduces I/O overhead). "
            "Default: 100"
        },
    )
    # An empty string means "not specified".
    # NOTE(review): the fallback to 'axolotl_checkpoint.save' is presumably
    # applied by the consumer of this config - confirm.
    trigger_file_path: str = Field(
        default="",
        json_schema_extra={
            "description": "Custom trigger filename (optional). "
            "If not specified, defaults to 'axolotl_checkpoint.save'. "
            "Specify a filename (not a full path) to override the default."
        },
    )
}, ) ================================================ FILE: src/axolotl/utils/schemas/enums.py ================================================ """Enums for Axolotl input config""" from enum import Enum import torch class TorchAOQuantDType(Enum): int4 = torch.int4 int8 = torch.int8 float8_e4m3fn = torch.float8_e4m3fn nvfp4 = "nvfp4" mxfp4 = "mxfp4" def from_string(str): if str == "int4": return TorchAOQuantDType.int4 if str == "int8": return TorchAOQuantDType.int8 if str in ["float8_e4m3fn", "fp8", "float8"]: return TorchAOQuantDType.float8_e4m3fn if str == "nvfp4": return TorchAOQuantDType.nvfp4 if str == "mxfp4": return TorchAOQuantDType.mxfp4 class RLType(str, Enum): """RL trainer type configuration subset""" DPO = "dpo" GDPO = "gdpo" GRPO = "grpo" IPO = "ipo" ORPO = "orpo" KTO = "kto" SIMPO = "simpo" class ChatTemplate(str, Enum): """Chat templates configuration subset""" alpaca = "alpaca" chatml = "chatml" mistral_v1 = "mistral_v1" mistral_v2v3 = "mistral_v2v3" mistral_v3_tekken = "mistral_v3_tekken" mistral_v7_tekken = "mistral_v7_tekken" gemma = "gemma" cohere = "cohere" llama3 = "llama3" llama3_2_vision = "llama3_2_vision" llama4 = "llama4" phi_3 = "phi_3" phi_35 = "phi_35" deepseek_v2 = "deepseek_v2" deepseek_v3 = "deepseek_v3" jamba = "jamba" jinja = "jinja" qwen_25 = "qwen_25" qwen3 = "qwen3" qwen3_5 = "qwen3_5" falcon_h1 = "falcon_h1" tokenizer_default = "tokenizer_default" exaone = "exaone" exaone4 = "exaone4" metharme = "metharme" pixtral = "pixtral" llava = "llava" qwen2_vl = "qwen2_vl" gemma3 = "gemma3" gemma3n = "gemma3n" command_a = "command_a" command_a_tool_use = "command_a_tool_use" command_a_rag = "command_a_rag" aya = "aya" class CustomSupportedOptimizers(str, Enum): """Custom supported optimizers""" optimi_adamw = "optimi_adamw" ao_adamw_4bit = "ao_adamw_4bit" ao_adamw_8bit = "ao_adamw_8bit" ao_adamw_fp8 = "ao_adamw_fp8" adopt_adamw = "adopt_adamw" came_pytorch = "came_pytorch" muon = "muon" dion = "dion" flash_adamw = "flash_adamw" 
class FSDPConfig(BaseModel):
    """
    FSDP Configuration Schema

    Every option is optional and defaults to None. Each field's meaning is
    given by its own `description`.
    """

    # Accepted under either the `fsdp_version` or the shorter `version` key.
    fsdp_version: int | None = Field(
        validation_alias=AliasChoices("fsdp_version", "version"),
        default=None,
        json_schema_extra={"description": "FSDP version"},
    )
    activation_checkpointing: bool | None = Field(
        default=None,
        description="Enable activation checkpointing to reduce memory usage during forward passes",
    )
    offload_params: bool | None = Field(
        default=None,
        description="Offload parameters to CPU to reduce GPU memory usage",
    )
    sync_module_states: bool | None = Field(
        default=None,
        description="Synchronize module states across all processes",
    )
    cpu_ram_efficient_loading: bool | None = Field(
        default=None,
        description="Enable CPU RAM efficient loading to reduce memory usage during model loading",
    )
    cpu_offload_pin_memory: bool | None = Field(
        default=None,
        description="Disabling this enables swap memory usage for resource-constrained setups when offload_params is enabled.",
    )
    use_orig_params: bool | None = Field(
        default=None,
        description="Use original parameters instead of flattened parameters",
    )
    # Restricted to the three listed state-dict kinds (or unset).
    state_dict_type: (
        Literal["FULL_STATE_DICT", "LOCAL_STATE_DICT", "SHARDED_STATE_DICT"] | None
    ) = Field(
        default=None,
        description="Type of state dict to use for saving/loading checkpoints",
    )
    # Same allowed values as `state_dict_type`.
    final_state_dict_type: (
        Literal["FULL_STATE_DICT", "LOCAL_STATE_DICT", "SHARDED_STATE_DICT"] | None
    ) = Field(
        default=None,
        description="Final state dict type to use after training completion",
    )
    # Restricted to the two listed wrap policies (or unset).
    auto_wrap_policy: Literal["TRANSFORMER_BASED_WRAP", "SIZE_BASED_WRAP"] | None = (
        Field(
            default=None,
            description="Policy for automatically wrapping modules with FSDP",
        )
    )
    transformer_layer_cls_to_wrap: str | None = Field(
        default=None,
        description="Class name of transformer layers to wrap (e.g., 'LlamaDecoderLayer')",
    )
    reshard_after_forward: bool | None = Field(
        default=None,
        description="Reshard parameters after forward pass to save memory",
    )
    # Free-form string; no value check is performed in this schema.
    mixed_precision_policy: str | None = Field(
        default=None,
        description="Mixed precision policy for FSDP (e.g., 'fp16', 'bf16')",
    )
class WandbConfig(BaseModel):
    """Wandb configuration subset

    All options default to None. A pre-validation hook copies `wandb_run_id`
    into `wandb_name` when only the ID was provided.
    """

    use_wandb: bool | None = None
    wandb_name: str | None = Field(
        default=None,
        json_schema_extra={"description": "Set the name of your wandb run"},
    )
    wandb_run_id: str | None = Field(
        default=None,
        json_schema_extra={"description": "Set the ID of your wandb run"},
    )
    wandb_mode: str | None = Field(
        default=None,
        json_schema_extra={
            "description": '"offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb'
        },
    )
    wandb_project: str | None = Field(
        default=None,
        json_schema_extra={"description": "Your wandb project name"},
    )
    wandb_entity: str | None = Field(
        default=None,
        json_schema_extra={"description": "A wandb Team name if using a Team"},
    )
    wandb_watch: str | None = None
    wandb_log_model: str | None = Field(
        default=None,
        json_schema_extra={
            "description": '"checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training'
        },
    )

    @model_validator(mode="before")
    @classmethod
    def check_wandb_run(cls, data):
        """Reuse the run ID as the run name when no name was given.

        Mutates and returns the incoming mapping. A warning is logged whenever
        the fallback is applied so users know `wandb_name` is the option that
        controls the displayed name.
        """
        run_id = data.get("wandb_run_id")
        # Nothing to do without an ID, or when a name is already present.
        if not run_id or data.get("wandb_name"):
            return data

        data["wandb_name"] = run_id
        LOG.warning(
            "wandb_run_id sets the ID of the run. If you would like to set the name, please use wandb_name instead."
        )
        return data
class GradioConfig(BaseModel):
    """Gradio configuration subset

    Every option is optional and defaults to None; no validation beyond the
    declared types is performed here.
    """

    gradio_title: str | None = None
    gradio_share: bool | None = None
    gradio_server_name: str | None = None
    gradio_server_port: int | None = None
    # NOTE(review): the two options below presumably set generation defaults
    # for the Gradio inference UI - confirm against the code that reads them.
    gradio_max_new_tokens: int | None = None
    gradio_temperature: float | None = None
class GPUCapabilities(BaseModel):
    """model to manage the gpu capabilities statically

    Defaults describe the most conservative setup: no bf16/fp8/tf32 support,
    a single GPU on a single node, and an unknown compute capability.
    """

    # Precision-support flags; all default to False.
    bf16: bool = Field(default=False)
    fp8: bool = Field(default=False)
    tf32: bool = Field(default=False)
    # Counts default to 1.
    n_gpu: int = Field(default=1)
    n_node: int = Field(default=1)
    # NOTE(review): string format is not defined here (presumably something
    # like "sm_90") - confirm against the code that fills it in.
    compute_capability: Optional[str] = Field(default=None)
configuration""" from typing import Any, Literal from pydantic import BaseModel, Field, field_validator from axolotl.utils.logging import get_logger LOG = get_logger(__name__) class ModelInputConfig(BaseModel): """Model configuration subset""" model_config = {"protected_namespaces": ()} base_model: str = Field( json_schema_extra={ "description": "This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This can also be a relative path to a model on disk" } ) base_model_config: str | None = Field( default=None, json_schema_extra={ "description": "If the base_model repo on hf hub doesn't include configuration .json files, You can set that here, or leave this empty to default to base_model" }, ) cls_model_config: str | None = Field( default=None, json_schema_extra={ "description": "transformers config class (e.g., 'LlamaConfig', 'MistralConfig'). Defaults to AutoConfig." }, ) tokenizer_config: str | None = Field( default=None, json_schema_extra={ "description": "Optional tokenizer configuration path in case you want to use a different tokenizer than the one defined in the base model" }, ) tokenizer_use_fast: bool | None = Field( default=None, json_schema_extra={ "description": "use_fast option for tokenizer loading from_pretrained, default to True" }, ) tokenizer_legacy: bool | None = Field( default=None, json_schema_extra={ "description": "Whether to use the legacy tokenizer setting, defaults to True" }, ) tokenizer_use_mistral_common: bool | None = Field( default=None, json_schema_extra={ "description": "Whether to use mistral-common tokenizer. If set to True, it will use the mistral-common tokenizer." 
}, ) tokenizer_type: str | None = Field( default=None, json_schema_extra={ "description": "Corresponding tokenizer for the model AutoTokenizer is a good choice" }, ) processor_type: str | None = Field( default=None, json_schema_extra={"description": "transformers processor class"} ) tokenizer_save_jinja_files: bool | None = Field( default=True, # match the default behavior from transformers json_schema_extra={ "description": "Whether to save jinja files for tokenizer, transformers default is True" }, ) trust_remote_code: bool | None = Field( default=None, json_schema_extra={"description": "Trust remote code for untrusted source"}, ) experimental_skip_move_to_device: bool | None = Field( default=True, json_schema_extra={ "description": "Don't move the model to the device before sharding. Set to `false` to revert to legacy behavior." }, ) use_kernels: bool | None = Field( default=None, json_schema_extra={"description": "Use custom kernels, e.g. MegaBlocks."}, ) model_quantization_config: Literal["Mxfp4Config"] | None = Field( default=None, json_schema_extra={"description": "Model loading quantization config"}, ) model_quantization_config_kwargs: dict[str, Any] | None = Field( default=None, json_schema_extra={"description": "kwargs for model quantization config"}, ) @field_validator("trust_remote_code") @classmethod def hint_trust_remote_code(cls, trust_remote_code): if trust_remote_code: LOG.warning( "`trust_remote_code` is set to true. Please make sure that you reviewed the remote code/model." 
class ModelOutputConfig(BaseModel):
    """model save configuration subset

    Covers the output directory, hub push options and the (now fixed)
    safetensors switch.
    """

    output_dir: str = Field(
        default="./model-out",
        json_schema_extra={"description": "Where to save the full-finetuned model to"},
    )
    hub_model_id: str | None = Field(
        default=None,
        json_schema_extra={"description": "push checkpoints to hub"},
    )
    hub_strategy: str | None = Field(
        default=None,
        json_schema_extra={"description": "how to push checkpoints to hub"},
    )
    hub_revision: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "branch/revision to push to on hub (default: main)"
        },
    )
    save_safetensors: bool | None = Field(
        default=True,
        json_schema_extra={
            "description": "Whether to save the model using safetensors format. Defaults to True."
        },
    )

    @field_validator("save_safetensors")
    @classmethod
    def validate_save_safetensors(cls, v):
        """Normalise `save_safetensors`, rejecting an explicit False.

        Returns True for None, passes any other non-False value through.

        Raises:
            ValueError: when the value is exactly False.
        """
        # An unset value falls back to True.
        if v is None:
            return True
        if v is not False:
            return v
        raise ValueError(
            "save_safetensors=False is not supported in Transformers V5. "
            "Transformers V5 always uses safetensors format for model serialization. "
            "This field is deprecated and will be removed in a future version."
        )
It can be an integer (resized into padded-square image) or a tuple (width, height)." "If not provided, we will attempt to load from preprocessor.size, otherwise, images won't be resized." ) }, ) image_resize_algorithm: ( Literal["bilinear", "bicubic", "lanczos"] | Resampling | None ) = Field( default=None, json_schema_extra={ "description": "The resampling algorithm to use for image resizing. Default is bilinear. Please refer to PIL.Image.Resampling for more details." }, ) @field_validator("image_resize_algorithm", mode="before") @classmethod def convert_image_resize_algorithm(cls, image_resize_algorithm): """ Convert the image resize algorithm to a PIL.Image.Resampling enum. """ if isinstance(image_resize_algorithm, str): image_resize_algorithm = image_resize_algorithm.lower() if image_resize_algorithm == "bilinear": image_resize_algorithm = Resampling.BILINEAR elif image_resize_algorithm == "bicubic": image_resize_algorithm = Resampling.BICUBIC elif image_resize_algorithm == "lanczos": image_resize_algorithm = Resampling.LANCZOS else: raise ValueError( f"Invalid image resize algorithm: {image_resize_algorithm}" ) return image_resize_algorithm ================================================ FILE: src/axolotl/utils/schemas/peft.py ================================================ """Pydantic models for PEFT-related configuration""" from typing import Any, Literal from pydantic import BaseModel, Field, field_validator, model_validator class LoftQConfig(BaseModel): """LoftQ configuration subset""" loftq_bits: int = Field( default=4, json_schema_extra={"description": "typically 4 bits"} ) # loftq_iter: int = Field(default=1, json_schema_extra={"description": "Alternating iterations for LoftQ"}) class PeftConfig(BaseModel): """peftq configuration subset""" loftq_config: LoftQConfig | None = Field( default=None, json_schema_extra={ "description": "Configuration options for loftq initialization for LoRA" }, ) class LoraConfig(BaseModel): """Peft / LoRA configuration 
subset""" load_in_8bit: bool | None = Field( default=False, json_schema_extra={ "description": "This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer" }, ) load_in_4bit: bool | None = Field( default=False, json_schema_extra={"description": "Use bitsandbytes 4 bit"} ) adapter: Literal["lora", "qlora", "llama-adapter"] | None = Field( default=None, json_schema_extra={ "description": "If you want to use 'lora', 'qlora', or 'llama-adapter', or leave blank to train all parameters in original model" }, ) lora_model_dir: str | None = Field( default=None, json_schema_extra={ "description": "If you already have a lora model trained that you want to load, put that here. This means after training, if you want to test the model, you should set this to the value of `output_dir`. Note that if you merge an adapter to the base model, a new subdirectory `merged` will be created under the `output_dir`." }, ) lora_r: int | None = None lora_alpha: int | None = None lora_fan_in_fan_out: bool | None = None lora_target_modules: str | list[str] | None = None lora_target_parameters: str | list[str] | None = None lora_target_linear: bool | None = Field( default=None, json_schema_extra={"description": "If true, will target all linear modules"}, ) lora_modules_to_save: list[str] | None = Field( default=None, json_schema_extra={ "description": "If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens. For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities." 
}, ) lora_dropout: float | None = 0.0 peft_layers_to_transform: list[int] | None = Field( default=None, json_schema_extra={ "description": "The layer indices to transform, otherwise, apply to all layers" }, ) peft_layers_pattern: list[str] | None = None peft: PeftConfig | None = None peft_use_dora: bool | None = Field( default=None, json_schema_extra={"description": "Whether to use DoRA."} ) peft_use_rslora: bool | None = Field( default=None, json_schema_extra={"description": "Whether to use RSLoRA."} ) peft_layer_replication: list[tuple[int, int]] | None = Field( default=None, json_schema_extra={"description": "List of layer indices to replicate."}, ) peft_init_lora_weights: bool | str | None = Field( default=None, json_schema_extra={ "description": "How to initialize LoRA weights. Default to True which is MS original implementation." }, ) peft_trainable_token_indices: list[int] | dict[str, list[int]] | None = Field( default=None, json_schema_extra={ "description": ( "A list of token indices to fine-tune on the `embed_tokens` layer.\n" "Otherwise, a dict mapping an embedding layer name to its trainable token indices.\n" "See https://huggingface.co/docs/peft/v0.17.0/en/developer_guides/lora#efficiently-train-tokens-alongside-lora" ) }, ) peft_ensure_weight_tying: bool | None = Field( default=None, json_schema_extra={ "description": ( "Whether to tie adapter weights for tied model weights. " "See https://github.com/huggingface/peft/issues/2864" ) }, ) peft_autocast_adapter_dtype: bool | None = Field( default=None, json_schema_extra={ "description": "Whether to upcast the LoRA adapter to fp32. This is enabled by default in PEFT." }, ) qlora_sharded_model_loading: bool | None = Field( default=False, json_schema_extra={ "description": "load qlora model in sharded format for FSDP using answer.ai technique." 
@model_validator(mode="before")
@classmethod
def validate_adapter(cls, data):
    """Reject quantized loading when no adapter is configured for training.

    Runs before field validation, on the raw config mapping.

    Args:
        data: Raw config mapping; only read through ``data.get``.

    Returns:
        The same ``data`` object, unmodified.

    Raises:
        ValueError: If ``load_in_8bit`` or ``load_in_4bit`` is truthy while
            both ``adapter`` and ``inference`` are falsy.
    """
    if (
        not data.get("adapter")
        and not data.get("inference")
        and (data.get("load_in_8bit") or data.get("load_in_4bit"))
    ):
        # The two literals are concatenated by the parser, so the first one
        # must carry the separating space between the sentences.
        raise ValueError(
            "load_in_8bit and load_in_4bit are not supported without setting an adapter for training. "
            "If you want to full finetune, please turn off load_in_8bit and load_in_4bit."
        )
    return data
def validate_ao_dtype(v: Any) -> TorchAOQuantDType | None:
    """Map a configured dtype spelling onto a ``TorchAOQuantDType`` member.

    Args:
        v: The raw config value. ``None`` is passed through untouched.

    Returns:
        ``None`` when ``v`` is ``None``; otherwise the enum member whose
        accepted spellings include ``v``.

    Raises:
        ValueError: If ``v`` equals none of the accepted spellings.
    """
    if v is None:
        return None

    # (accepted spellings, enum attribute name). The attribute is resolved
    # only for the row that matches, as with the direct attribute access
    # this table replaces.
    spellings = (
        (("int4",), "int4"),
        (("int8",), "int8"),
        (("float8_e4m3fn", "fp8", "float8"), "float8_e4m3fn"),
        (("nvfp4",), "nvfp4"),
        (("mxfp4",), "mxfp4"),
    )
    for accepted, member_name in spellings:
        if any(v == alias for alias in accepted):
            return getattr(TorchAOQuantDType, member_name)

    raise ValueError(
        f"Invalid dtype: '{v}'. Must be one of: {[e.name for e in TorchAOQuantDType] + ['fp8', 'float8']}"
    )
class LrGroup(BaseModel):
    """Custom learning rate group configuration.

    All three fields are required (none declares a default).
    """

    # Label identifying this group.
    name: str
    # Module name strings that make up the group.
    # NOTE(review): presumably matched against model module/parameter names
    # when optimizer param groups are built -- confirm against the consumer.
    modules: list[str]
    # Learning rate value associated with the group.
    lr: float
Passed to underlying transformers Trainer" }, ) train_on_inputs: bool | None = Field( default=False, json_schema_extra={ "description": "Whether to mask out or include the human's prompt from the training labels" }, ) group_by_length: bool | None = Field( default=None, json_schema_extra={ "description": "Group similarly sized data to minimize padding. May be slower to start, as it must download and sort the entire dataset. Note that training loss may have an oscillating pattern with this enabled." }, ) learning_rate: str | float embedding_lr: float | None = None embedding_lr_scale: float | None = None weight_decay: float | None = Field( default=0.0, json_schema_extra={"description": "Specify weight decay"} ) optimizer: (OptimizerNames | CustomSupportedOptimizers) | None = Field( default=OptimizerNames.ADAMW_TORCH_FUSED, json_schema_extra={"description": "Specify optimizer"}, ) optim_args: (str | dict[str, Any]) | None = Field( default=None, json_schema_extra={ "description": "Dictionary of arguments to pass to the optimizer" }, ) optim_target_modules: (list[str] | Literal["all_linear"]) | None = Field( default=None, json_schema_extra={ "description": "The target modules to optimize, i.e. the module names that you would like to train, right now this is used only for GaLore algorithm" }, ) torchdistx_path: str | None = Field( default=None, json_schema_extra={ "description": "Path to torch distx for optim 'adamw_anyprecision'" }, ) lr_scheduler: ( SchedulerType | Literal["one_cycle"] | Literal["rex"] ) | None = SchedulerType.COSINE lr_scheduler_kwargs: dict[str, Any] | None = Field( default=None, json_schema_extra={ "description": "Specify a scheduler and kwargs to use with the optimizer" }, ) lr_quadratic_warmup: bool | None = None cosine_min_lr_ratio: float | None = Field( default=None, json_schema_extra={ "description": "decay lr to some percentage of the peak lr, e.g. 
cosine_min_lr_ratio=0.1 for 10% of peak lr" }, ) cosine_constant_lr_ratio: float | None = Field( default=None, json_schema_extra={ "description": "freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step" }, ) lr_div_factor: float | None = Field( default=None, json_schema_extra={"description": "Learning rate div factor"} ) lr_groups: list[LrGroup] | None = None adam_epsilon: float | None = Field( default=None, json_schema_extra={"description": "adamw hyperparams"} ) adam_epsilon2: float | None = Field( default=None, json_schema_extra={"description": "only used for CAME Optimizer"} ) adam_beta1: float | None = Field( default=None, json_schema_extra={"description": "adamw hyperparams"} ) adam_beta2: float | None = Field( default=None, json_schema_extra={"description": "adamw hyperparams"} ) adam_beta3: float | None = Field( default=None, json_schema_extra={"description": "only used for CAME Optimizer"} ) dion_lr: float | None = Field( default=None, json_schema_extra={"description": "Dion Optimizer learning rate"} ) dion_momentum: float | None = Field( default=None, json_schema_extra={"description": "Dion Optimizer momentum"} ) dion_rank_fraction: float | None = Field( default=1.0, json_schema_extra={ "description": "Dion Optimizer: r/d fraction for low-rank approximation. Used to compute the low-rank dimension." }, ) dion_rank_multiple_of: int | None = Field( default=1, json_schema_extra={ "description": "Dion Optimizer: Round up the low-rank dimension to a multiple of this number. This may be useful to ensure even sharding." }, ) max_grad_norm: float | None = Field( default=None, json_schema_extra={"description": "Gradient clipping max norm"} ) num_epochs: float = Field(default=1.0) @field_validator("batch_size") @classmethod def hint_batch_size_set(cls, batch_size): if batch_size: LOG.warning( "%s\n%s", "batch_size is not recommended. 
class JaggedLRConfig(BaseModel):
    """Optional jagged-restart schedule settings, can be used w/ ReLoRA training.

    Every field is an optional integer that defaults to ``None``.
    """

    jagged_restart_steps: int | None = Field(
        None,
        json_schema_extra=dict(description="how often to reset for jagged restarts"),
    )
    jagged_restart_warmup_steps: int | None = Field(
        None,
        json_schema_extra=dict(
            description="how many warmup steps to take after reset for jagged restarts"
        ),
    )
    jagged_restart_anneal_steps: int | None = Field(
        None,
        json_schema_extra=dict(
            description="how many anneal steps to take before reset for jagged restarts"
        ),
    )
}, ) # GRPO specific args # Ref: https://github.com/huggingface/trl/blob/26d86757a7c7e24e397ea44f57ecce6031dfac01/trl/trainer/grpo_config.py#L23 use_vllm: bool = Field( default=False, json_schema_extra={"description": "Whether to use VLLM for RL training."}, ) vllm_mode: Literal["server", "colocate"] | None = Field( default=None, json_schema_extra={ "description": "VLLM mode to use, one of 'server' or 'colocate'" }, ) vllm_server_host: str | None = Field( default="0.0.0.0", # nosec B104 json_schema_extra={"description": "Host of the vLLM server to connect to."}, ) vllm_server_port: int | None = Field( default=8000, json_schema_extra={"description": "Port of the vLLM server to connect to."}, ) vllm_server_timeout: int | None = Field( default=None, json_schema_extra={ "description": "Total timeout (in seconds) to wait for the vLLM server to respond." }, ) vllm_guided_decoding_regex: str | None = Field( default=None, json_schema_extra={"description": "Regex for vLLM guided decoding."}, ) reward_funcs: list[str] | None = Field( default=None, json_schema_extra={ "description": "List of reward functions to load. Paths must be importable from current dir." }, ) reward_weights: list[float] | None = Field( default=None, json_schema_extra={ "description": "List of reward weights for the reward functions." }, ) num_generations: int | None = Field( default=None, json_schema_extra={"description": "Number of generations to sample."}, ) log_completions: bool | None = Field( default=False, json_schema_extra={"description": "Whether to log completions."}, ) num_completions_to_print: int | None = Field( default=None, json_schema_extra={ "description": "Number of completions to print when log_completions is True." }, ) importance_sampling_level: Literal["sequence", "token"] | None = Field( default=None, json_schema_extra={ "description": "Controls whether importance sampling ratios are computed at the `'token'` or `'sequence'` level. 
" "For GSPO, use `sequence`, default is None which corresponds to the original GRPO paper." }, ) sync_ref_model: bool | None = Field( default=False, json_schema_extra={"description": "Whether to sync the reference model."}, ) ref_model_mixup_alpha: float | None = Field( default=0.9, json_schema_extra={"description": "Mixup alpha for the reference model."}, ) ref_model_sync_steps: int | None = Field( default=64, json_schema_extra={"description": "Sync steps for the reference model."}, ) scale_rewards: bool = Field( default=True, json_schema_extra={ "description": "Whether to scale rewards by their standard deviation." }, ) temperature: float | None = Field( default=None, json_schema_extra={"description": "Sampling temperature for the GRPO policy."}, ) top_p: float | None = Field( default=None, json_schema_extra={ "description": "Top-p sampling probability for the generation policy." }, ) top_k: int | None = Field( default=None, json_schema_extra={"description": "Top-k sampling for the generation policy."}, ) min_p: float | None = Field( default=None, json_schema_extra={ "description": "Minimum probability for the generation policy." }, ) repetition_penalty: float | None = Field( default=None, json_schema_extra={ "description": "Penalty for tokens that appear in prompt and generated text." }, ) num_iterations: int | None = Field( default=None, json_schema_extra={ "description": "Number of iterations per batch (μ) for GRPO." }, ) epsilon: float | None = Field( default=None, json_schema_extra={ "description": "Epsilon value for clipping in the GRPO algorithm." }, ) epsilon_high: float | None = Field( default=None, json_schema_extra={ "description": "Upper-bound epsilon value for clipping in the GRPO algorithm." }, ) use_liger_loss: bool | None = Field( default=None, json_schema_extra={"description": "Whether to use Liger loss for GRPO."}, ) loss_type: str | None = Field( default=None, json_schema_extra={ "description": "Loss formulation to use. 
Supported values: grpo, bnpo, dr_grpo." }, ) mask_truncated_completions: bool = Field( default=False, json_schema_extra={ "description": "Whether to exclude truncated completions from loss calculation." }, ) vllm_enable_sleep_mode: bool | None = Field( default=None, json_schema_extra={ "description": "Enable sleep mode for vLLM to offload VRAM when idle" }, ) rollout_func: str | None = Field( default=None, json_schema_extra={ "description": "Path to custom rollout function. Must be importable from current dir." }, ) multi_objective_aggregation: ( Literal["sum_then_normalize", "normalize_then_sum"] | None ) = Field( default=None, json_schema_extra={ "description": "Multi-objective reward aggregation strategy. " "'sum_then_normalize' (GRPO default): weights and sums rewards first, then normalizes. " "'normalize_then_sum' (GDPO): normalizes each reward independently, then sums." }, ) # Async GRPO fields use_data_producer: bool = Field( default=False, json_schema_extra={ "description": "Use the GRPODataProducer protocol for online data generation." }, ) async_prefetch: bool = Field( default=False, json_schema_extra={ "description": "Generate rollouts in a background thread while training on the previous rollout." }, ) prefetch_depth: int | None = Field( default=None, json_schema_extra={ "description": "Number of rollouts to prefetch ahead of training." }, ) vllm_sync_interval: int | None = Field( default=None, json_schema_extra={ "description": "Sync model weights to vLLM every N optimizer steps (async mode only)." }, ) streaming_partial_batch: bool | None = Field( default=None, json_schema_extra={ "description": "Score prompt groups incrementally instead of the full batch at once." }, ) streaming_min_groups: int | None = Field( default=None, json_schema_extra={ "description": "Minimum prompt groups to score per streaming chunk." 
}, ) vllm_importance_sampling_correction: bool | None = Field( default=None, json_schema_extra={ "description": "Apply IS correction for distribution mismatch between vLLM and training model." }, ) vllm_importance_sampling_mode: ( Literal["token_truncate", "token_mask", "sequence_truncate", "sequence_mask"] | None ) = Field( default=None, json_schema_extra={ "description": "IS mode: token_truncate, token_mask, sequence_truncate, or sequence_mask." }, ) vllm_importance_sampling_cap: float | None = Field( default=None, json_schema_extra={"description": "Cap C for IS ratio clipping/masking."}, ) off_policy_mask_threshold: float | None = Field( default=None, json_schema_extra={ "description": "KL threshold for off-policy sequence masking (OPSM). None = disabled." }, ) use_bias_correction_kl: bool | None = Field( default=None, json_schema_extra={"description": "Apply IS correction to KL divergence term."}, ) reward_num_workers: int = Field( default=1, json_schema_extra={ "description": "Number of persistent subprocess workers for parallel reward computation. Each worker has its " "own main thread so signal.alarm() (used by math_verify) works correctly. Work is sharded across " "workers by prompt groups. Only used with use_data_producer=True and non-nn.Module reward functions." }, ) replay_buffer_size: int = Field( default=0, json_schema_extra={ "description": "[Experimental, disabled by default] Size of the replay buffer for storing high-signal rollout " "groups. When > 0, groups with reward variance are cached and used to replace zero-signal groups " "(where all rewards are identical). Set to 0 to disable. Only used with use_data_producer=True." }, ) replay_recompute_logps: bool = Field( default=True, json_schema_extra={ "description": "When True (default), recompute old_per_token_logps for replayed groups using the current " "training model. This fixes the importance sampling mismatch that occurs when replaying stale data. 
def handle_legacy_message_fields_logic(data: dict) -> dict:
    """
    Handle backwards compatibility between legacy message field mapping and new
    property mapping system.

    Previously, the config only supported mapping 'role' and 'content' fields via
    dedicated config options:
        - message_field_role: Mapped to the role field
        - message_field_content: Mapped to the content field

    The new system uses message_property_mappings to support arbitrary field
    mappings:
        message_property_mappings:
            role: source_role_field
            content: source_content_field
            additional_field: source_field

    Neither ``data`` nor its nested ``message_property_mappings`` mapping is
    modified; the result carries its own copy of both.

    Args:
        data: Dictionary containing configuration data

    Returns:
        Updated dictionary with message field mappings consolidated: the legacy
        keys are removed and ``message_property_mappings`` always contains
        ``role`` and ``content`` entries.

    Raises:
        ValueError: If there are conflicts between legacy and new mappings
    """
    data = data.copy()  # Shallow copy: protects the caller's top-level keys only

    # The nested mapping is written to below, so it needs its own copy as well;
    # otherwise the caller's dict would be mutated through the shared reference.
    existing = data.get("message_property_mappings")
    mappings = {} if existing is None else dict(existing)
    data["message_property_mappings"] = mappings

    # 'role' and 'content' follow identical rules; only the names differ.
    for prop in ("role", "content"):
        legacy_key = f"message_field_{prop}"

        if legacy_key not in data:
            # No legacy option given: fall back to the identity mapping unless
            # the new-style mapping already names a source field.
            if prop not in mappings:
                mappings[prop] = prop
            continue

        legacy_value = data.pop(legacy_key)
        LOG.warning(
            f"{legacy_key} is deprecated, use message_property_mappings instead. "
            f"Example: message_property_mappings: {{{prop}: {legacy_value}}}"
        )

        if prop in mappings and mappings[prop] != legacy_value:
            raise ValueError(
                f"Conflicting message {prop} fields: {legacy_key}='{legacy_value}' "
                f"conflicts with message_property_mappings.{prop}='{mappings[prop]}'"
            )

        # A falsy legacy value (None, "") means "use the default field name".
        mappings[prop] = legacy_value or prop

    return data
@model_validator(mode="before")
@classmethod
def check_push_ds_auth(cls, data):
    """Require ``hf_use_auth_token`` to be exactly ``True`` when pushing datasets.

    Args:
        data: Raw config mapping; only read through ``data.get``.

    Returns:
        The same ``data`` object, unmodified.

    Raises:
        ValueError: If ``push_dataset_to_hub`` is truthy and
            ``hf_use_auth_token`` is anything other than ``True``.
    """
    # Nothing to enforce unless a push target is configured.
    if not data.get("push_dataset_to_hub"):
        return data

    # Identity check on purpose: truthy stand-ins such as 1 do not qualify.
    if data.get("hf_use_auth_token") is True:
        return data

    raise ValueError(
        "Require cfg.hf_use_auth_token to be True for push_dataset_to_hub"
    )
class AttentionValidationMixin:
    """Validation methods related to attention mechanisms.

    Each validator runs in ``mode="before"`` on the raw config mapping, reads
    flags through ``data.get`` and returns the mapping unchanged.
    """

    @model_validator(mode="before")
    @classmethod
    def check_attention_fields(cls, data):
        """Ensure at most one of the listed attention flags is enabled.

        Raises:
            ValueError: If more than one of the listed flags is truthy.
        """
        fields = (
            "xformers_attention",
            "sdp_attention",
            # "s2_attention",  # requires both FA and this to be enabled
            "flash_attention",
            "flex_attention",
            "sage_attention",
        )
        non_empty_count = sum(1 for field in fields if data.get(field))

        if non_empty_count > 1:
            raise ValueError(f"Only one of {', '.join(fields)} must be set")
        return data

    @model_validator(mode="before")
    @classmethod
    def check_sample_packing_without_attention(cls, data):
        """Warn when sample packing is enabled with none of the attention flags.

        Only logs a warning; never raises.
        """
        backends = (
            "flash_attention",
            "sdp_attention",
            "flex_attention",
            "xformers_attention",
            "sage_attention",
        )
        if data.get("sample_packing") and not any(
            data.get(backend) for backend in backends
        ):
            LOG.warning(
                "sample_packing without flash, sdp, xformers, sage, or flex attention does not handle cross sample decontamination."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_sample_packing_with_s2attn(cls, data):
        """Reject combining ``sample_packing`` with ``s2_attention``.

        Raises:
            ValueError: If both flags are truthy.
        """
        if data.get("sample_packing") and data.get("s2_attention"):
            # Adjacent literals instead of a backslash continuation inside the
            # string, which leaked source indentation into the message.
            raise ValueError(
                "Received `sample_packing=true` and `s2_attention=true`; however, "
                "shifted-sparse attention does not currently support sample packing."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_scaling_softmax_requires_flex(cls, data):
        """Require ``flex_attention`` whenever ``scaling_softmax`` is enabled.

        Raises:
            ValueError: If ``scaling_softmax`` is truthy and ``flex_attention``
                is falsy.
        """
        if data.get("scaling_softmax") and not data.get("flex_attention"):
            raise ValueError(
                "scaling_softmax requires flex_attention: true\n"
                "Add 'flex_attention: true' to your config file.\n"
            )
        return data
@model_validator(mode="before")
@classmethod
def set_reward_model_defaults(cls, data):
    """Fill in ``num_labels`` and ``model_type`` for reward-model configs.

    For each enabled flag, ``num_labels`` is set only when it is ``None`` and
    ``model_type`` only when both ``type_of_model`` and ``model_type`` are
    falsy. ``reward_model`` is handled before ``process_reward_model``, so when
    both are enabled the first one's defaults are the ones that stick.

    Args:
        data: Raw config mapping; updated in place.

    Returns:
        The same ``data`` object.
    """
    presets = (
        ("reward_model", 1, "AutoModelForSequenceClassification"),
        ("process_reward_model", 2, "AutoModelForTokenClassification"),
    )
    for flag, label_count, auto_class in presets:
        if not data.get(flag):
            continue
        if data.get("num_labels") is None:
            data["num_labels"] = label_count
        if not data.get("type_of_model") and not data.get("model_type"):
            data["model_type"] = auto_class
    return data
) return data @model_validator(mode="before") @classmethod def check_push_save(cls, data): if data.get("hub_model_id") and ( data.get("save_strategy") not in ["steps", "epoch", None] ): LOG.warning( "hub_model_id is set without any models being saved. To save a model, set save_strategy." ) return data @model_validator(mode="before") @classmethod def check_evals(cls, data): if ( data.get("eval_strategy") and data.get("eval_steps") and data.get("eval_strategy") != "steps" ): raise ValueError( "eval_strategy and eval_steps mismatch. Please set eval_strategy to 'steps' or remove eval_steps." ) if ( data.get("val_set_size") == 0 and (data.get("eval_steps") or data.get("eval_strategy")) and not data.get("test_datasets") and data.get("eval_strategy") != "no" ): raise ValueError( "eval_steps and eval_strategy are not supported with val_set_size == 0" ) if data.get("evals_per_epoch") and data.get("eval_steps"): raise ValueError( "eval_steps and evals_per_epoch are mutually exclusive and cannot be used together." ) if ( data.get("evals_per_epoch") and data.get("eval_strategy") and data.get("eval_strategy") != "steps" ): raise ValueError( "eval_strategy must be empty or set to `steps` when used with evals_per_epoch." ) if data.get("do_bench_eval") and not ( data.get("evals_per_epoch") or data.get("eval_steps") ): raise ValueError( "do_bench_eval requires evals_per_epoch or eval_steps to be set." 
) return data @model_validator(mode="before") @classmethod def check_neftune(cls, data): if data.get("noisy_embedding_alpha") and not data.get("neftune_noise_alpha"): data["neftune_noise_alpha"] = data["noisy_embedding_alpha"] del data["noisy_embedding_alpha"] elif data.get("noisy_embedding_alpha") and data.get("neftune_noise_alpha"): raise ValueError( "noisy_embedding_alpha is deprecated, use neftune_noise_alpha; both are set, please remove the deprecated noisy_embedding_alpha setting" ) return data @model_validator(mode="before") @classmethod def check_multipack_buffer_size(cls, data): if data.get("pretrain_multipack_buffer_size") and not data.get( "streaming_multipack_buffer_size" ): LOG.warning( "`pretrain_multipack_buffer_size` is deprecated in v0.13.0, will be " "removed in v0.14.0. Use `streaming_multipack_buffer_size` instead." ) data["streaming_multipack_buffer_size"] = data[ "pretrain_multipack_buffer_size" ] del data["pretrain_multipack_buffer_size"] elif data.get("pretrain_multipack_buffer_size") and data.get( "streaming_multipack_buffer_size" ): raise ValueError( "pretrain_multipack_buffer_size is deprecated, use " "streaming_multipack_buffer_size; both are set, please remove the " "deprecated pretrain_multipack_buffer_size setting" ) return data @model_validator(mode="after") def check_fft_possible_bad_config(self): if ( not (self.bf16 or self.bfloat16) and (self.fp16 or self.float16) and not self.adapter and not self.flash_attention and self.sample_packing ): LOG.warning( "Full fine tune w/o FA2 w/ sample packing and fp16/float16 is likely to raise errors. Try LoRA." ) # ValueError: Attempting to unscale FP16 gradients. 
# OR # RuntimeError: expected mat1 and mat2 to have the same dtype, but got: float != c10::Half return self @model_validator(mode="before") @classmethod def check_fp8_config(cls, data): if data.get("fp8") and not data.get("torch_compile"): LOG.warning( "torch_compile is strongly recommended for FP8 training in order to " "see speed improvements. Please consider setting `torch_compile: " "true` in your config." ) fsdp_config = data.get("fsdp_config") or {} if data.get("fp8") and ( fsdp_config.get("activation_checkpointing", False) is True or fsdp_config.get("fsdp_activation_checkpointing", False) is True ): LOG.warning( "FP8 + FSDP2 + activation checkpointing may be slower than BF16 " "training. Please considering setting `activation_checkpointing: false` " "in your FSDP config." ) if ( data.get("fp8_enable_fsdp_float8_all_gather") and not data.get("fsdp_version", None) == 2 ): raise ValueError( "fp8_enable_fsdp_float8_all_gather requires FSDP2 (fsdp_version: 2) " "to be used." ) return data @model_validator(mode="before") @classmethod def check_use_reentrant_mismatch(cls, data): if ( data.get("unfrozen_parameters") and data.get("gradient_checkpointing_kwargs") and data.get("gradient_checkpointing_kwargs", {}).get("use_reentrant") is True ): # https://github.com/huggingface/transformers/issues/21381 raise ValueError( "`use_reentrant` must be false when used with partially frozen model." 
) return data @model_validator(mode="before") @classmethod def check_eval_strategy(cls, data): if ( data.get("evaluation_strategy") is not None and data.get("eval_strategy") is None ): LOG.info( "explicitly setting `eval_strategy` from the `evaluation_strategy`" ) data["eval_strategy"] = data.get("evaluation_strategy") return data @model_validator(mode="before") @classmethod def check_causal_lm_evals(cls, data): if data.get("do_causal_lm_eval") and data.get("eval_sample_packing"): raise ValueError( "do_causal_lm_eval is enabled, eval_sample_packing must be set to False" ) if data.get("eval_causal_lm_metrics"): if not isinstance(data.get("eval_causal_lm_metrics"), list): raise ValueError("eval_causal_lm_metrics must be a list") # only ["sacrebleu", "comet", "ter", "chrf"] supported if set(data.get("eval_causal_lm_metrics")) - SUPPORTED_METRICS: raise ValueError( f"eval_causal_lm_metrics must be one of {SUPPORTED_METRICS}" ) return data @model_validator(mode="before") @classmethod def check_tokenizer_use_mistral_common(cls, data): if data.get("tokenizer_use_mistral_common") is None: if any( "magistral" in name.lower() for name in [ data.get("base_model", ""), data.get("base_model_config", ""), data.get("tokenizer_config", ""), ] ): LOG.warning( "tokenizer_use_mistral_common auto inferred to True for Magistral models. Please set it to True explicitly if you want to use mistral-common tokenizer." ) data["tokenizer_use_mistral_common"] = True return data @field_validator("tokenizer_use_mistral_common", mode="after") @classmethod def check_mistral_common_import(cls, tokenizer_use_mistral_common): if tokenizer_use_mistral_common: import importlib.util if importlib.util.find_spec("mistral_common") is None: raise ImportError( "mistral-common is required for mistral models. Please install it with `pip install axolotl` or `pip install -e .`." 
) return tokenizer_use_mistral_common @model_validator(mode="before") @classmethod def check_mistral_common_incompatible_options(cls, data): if not data.get("tokenizer_use_mistral_common"): return data # NOTE: mistral-common tokenizer is not compatible with editing tokenizer at the moment if data.get("added_tokens_overrides"): raise ValueError( "added_tokens_overrides is not supported with mistral-common tokenizer" ) if data.get("special_tokens"): raise ValueError( "special_tokens override is not supported with mistral-common tokenizer" ) if data.get("tokens"): raise ValueError( "tokens override is not supported with mistral-common tokenizer" ) if data.get("chat_template"): raise ValueError( "Setting chat_template is not supported with mistral-common tokenizer" ) return data @model_validator(mode="before") @classmethod def pretrain_with_tps(cls, data): if data.get("pretraining_dataset") and data.get( "include_tokens_per_second", False ): # combining these would raise `TypeError: cannot pickle 'dict_keys' object` # due to trying to count the number of tokens total in the dataset raise ValueError( "pretraining_dataset and include_tokens_per_second cannot be used together." ) return data class LoRAValidationMixin: """Validation methods related to LoRA/QLoRA configuration.""" @model_validator(mode="before") @classmethod def check_lr_groups(cls, data): if data.get("lr_groups") and data.get("loraplus_lr_ratio"): raise ValueError("lr_groups and loraplus_lr_ratio cannot be used together.") return data @model_validator(mode="before") @classmethod def check_frozen(cls, data): if ( data.get("adapter") and data.get("peft_layers_to_transform") and data.get("unfrozen_parameters") ): raise ValueError( "`unfrozen_parameters` used with `peft_layers_to_transform` can have unexpected behavior." 
) return data @model_validator(mode="before") @classmethod def check_peft_layers_pattern(cls, data): if data.get("peft_layers_pattern") and not data.get("peft_layers_to_transform"): raise ValueError( "peft_layers_pattern requires peft_layers_to_transform to be set" ) return data @model_validator(mode="before") @classmethod def check_qlora_unsloth(cls, data): if ( data.get("unsloth_lora_mlp") or data.get("unsloth_lora_qkv") or data.get("unsloth_lora_o") ): if data.get("adapter") == "lora" and data.get("load_in_8bit"): raise ValueError( "unsloth_lora_mlp, unsloth_lora_qkv, and unsloth_lora_o are not compatible with 8-bit LoRA" ) return data @model_validator(mode="before") @classmethod def check_lora_axolotl_unsloth(cls, data): is_lora_kernel = any( data.get(k) for k in ["lora_mlp_kernel", "lora_qkv_kernel", "lora_o_kernel"] ) is_unsloth_lora = any( data.get(k) for k in ["unsloth_lora_mlp", "unsloth_lora_qkv", "unsloth_lora_o"] ) if is_lora_kernel and is_unsloth_lora: raise ValueError( "both lora_mlp_kernel and unsloth_lora_mlp cannot be true (similarly for lora_qkv_kernel, lora_o_kernel)" ) return data @model_validator(mode="after") def check_fused_lora(self): if self.adapter in ["lora", "qlora"] and self.flash_attn_fuse_mlp: raise ValueError("Fused modules are not supported with LoRA/QLoRA") return self @model_validator(mode="before") @classmethod def warn_qlora_zero3_w_use_reentrant(cls, data): if ( data.get("adapter") == "qlora" and data.get("gradient_checkpointing_kwargs", {}) and data.get("gradient_checkpointing_kwargs", {}).get("use_reentrant") is False and data.get("deepspeed", "") is not None and "zero3" in data.get("deepspeed", "") ): # may result in: # torch.utils.checkpoint.CheckpointError: torch.utils.checkpoint: # Recomputed values for the following tensors have different metadata # than during the forward pass. 
LOG.warning( "qlora + zero3 with use_reentrant: false may result in a CheckpointError about recomputed values" ) return data @model_validator(mode="before") @classmethod def check_lora_kernels_8bit(cls, data): if ( data.get("lora_mlp_kernel") or data.get("lora_qkv_kernel") or data.get("lora_o_kernel") ): if data.get("adapter") == "lora" and data.get("load_in_8bit"): raise ValueError( "lora_mlp_kernel, lora_qkv_kernel, and lora_o_kernel are not " "compatible with 8-bit LoRA a the moment." ) return data @model_validator(mode="before") @classmethod def check_lora_kernels_dora(cls, data): if ( data.get("lora_mlp_kernel") or data.get("lora_qkv_kernel") or data.get("lora_o_kernel") ) and data.get("peft_use_dora"): raise ValueError( "lora_mlp_kernel, lora_qkv_kernel, and lora_o_kernel are not " "compatible with DoRA at the moment." ) return data @model_validator(mode="before") @classmethod def check_lora_kernels_trust_remote_code(cls, data): if ( data.get("lora_mlp_kernel") or data.get("lora_qkv_kernel") or data.get("lora_o_kernel") ) and data.get("trust_remote_code"): raise ValueError( "lora_mlp_kernel, lora_qkv_kernel, and lora_o_kernel are not " "compatible with trust_remote_code. Please disable trust_remote_code " "or explicitly set lora_*_kernel to false." 
) return data class RLValidationMixin: """Validation methods related to RL training configuration.""" @model_validator(mode="before") @classmethod def check_sample_packing_w_rl(cls, data): if data.get("sample_packing") and data.get("rl"): raise ValueError("`sample_packing: true` does not work with RLHF training") return data @model_validator(mode="before") @classmethod def check_kto_config(cls, data): if data.get("rl") == "kto": if data.get("sample_packing") or data.get("eval_sample_packing"): raise ValueError("sample_packing is not supported with kto") if data.get("remove_unused_columns") is not False: raise ValueError("Set `remove_unused_columns: False` when using kto") return data @model_validator(mode="before") @classmethod def check_grpo_liger_sequence_parallel(cls, data): if ( data.get("rl") == "grpo" and data.get("trl", {}) and data.get("trl").get("use_liger_loss") and data.get("context_parallel_size", 1) > 1 ): raise ValueError("GRPO + SP + Liger not currently supported") return data @model_validator(mode="before") @classmethod def check_rl_config_gradient_checkpointing(cls, data): # TODO: SalmanMohammadi # Distributed RL with QLoRA + gradient checkpointing # and use_reentrant = True is broken upstream in TRL if ( data.get("rl") and data.get("gradient_checkpointing") and data.get("gradient_checkpointing_kwargs") and data.get("gradient_checkpointing_kwargs").get("use_reentrant") and data.get("load_in_4bit") and data.get("adapter") == "qlora" and data.get("capabilities") and data.get("capabilities").get("n_gpu", 1) > 1 ): raise ValueError( "The `use_reentrant: True` implementation of gradient checkpointing " "is not supported for distributed RL training with QLoRA. Please set " "`use_reentrant: False` in `gradient_checkpointing_kwargs`." 
) return data @model_validator(mode="before") @classmethod def check_gdpo(cls, data): if ( data.get("rl") == "gdpo" and data.get("trl", {}).get("multi_objective_aggregation") == "sum_then_normalize" ): raise ValueError( "`multi_objective_aggregation` value set as `sum_then_normalize` => GRPO, but GDPO was selected" ) return data class OptimizationValidationMixin: """Validation methods related to optimization and performance.""" @model_validator(mode="after") def check_adamw_optimizer_params(self): if any([self.adam_beta1, self.adam_beta2, self.adam_epsilon]) and ( not self.optimizer or "adamw" not in str(self.optimizer).lower() ): LOG.warning("adamw hyperparameters found, but no adamw optimizer set") return self @staticmethod def _resolve_fsdp_version(data): """Resolve FSDP version from top-level fsdp_version or fsdp_config.fsdp_version.""" fsdp_version = data.get("fsdp_version") if fsdp_version is None: fsdp_version = data.get("fsdp_config", {}).get("fsdp_version", 1) return fsdp_version @model_validator(mode="before") @classmethod def check_muon_deepspeed_fsdp(cls, data): if data.get("optimizer") == "muon": if data.get("deepspeed"): raise ValueError( "Muon optimizer is currently incompatible with DeepSpeed" ) if data.get("fsdp") or data.get("fsdp_config"): fsdp_version = cls._resolve_fsdp_version(data) if str(fsdp_version) != "2": raise ValueError( "Muon optimizer is only compatible with FSDP2. Set fsdp_version: 2 to use Muon with FSDP." ) return data @model_validator(mode="before") @classmethod def check_flashoptim_deepspeed_fsdp(cls, data): optimizer = data.get("optimizer") or "" if str(optimizer).startswith("flash_"): if data.get("deepspeed"): raise ValueError( f"{optimizer} optimizer is incompatible with DeepSpeed. " "Flash optimizers only support DDP and FSDP2." ) if data.get("fsdp") or data.get("fsdp_config"): fsdp_version = cls._resolve_fsdp_version(data) if str(fsdp_version) != "2": raise ValueError( f"{optimizer} optimizer is only compatible with FSDP2. 
" "Set fsdp_version: 2 to use flash optimizers with FSDP." ) return data @model_validator(mode="before") @classmethod def check_batch_flattening_fa(cls, data): if data.get("batch_flattening"): batch_flattening_auto = data.get("batch_flattening") == "auto" if not data.get("flash_attention") and not batch_flattening_auto: raise ValueError("batch_flattening requires flash attention") if data.get("sample_packing") and not batch_flattening_auto: raise ValueError("batch_flattening not compatible with sample_packing") if data.get("micro_batch_size") == 1 and not batch_flattening_auto: LOG.warning("batch_flattening has no effect with micro_batch_size == 1") if ( batch_flattening_auto and data.get("flash_attention") and not data.get("sample_packing") and data.get("micro_batch_size") > 1 ): data["batch_flattening"] = True elif batch_flattening_auto: data["batch_flattening"] = False return data @model_validator(mode="before") @classmethod def check_xentropy_patch_conflicts(cls, data): if data.get("flash_attn_cross_entropy") and data.get( "unsloth_cross_entropy_loss" ): raise ValueError( "flash_attn_cross_entropy and unsloth_cross_entropy_loss cannot be both enabled" ) return data @model_validator(mode="before") @classmethod def check_cross_entropy_conflicts(cls, data): """Check for mutual exclusivity between cross entropy patch options. Only one of the following can be enabled at a time: - cut_cross_entropy (CutCrossEntropyPlugin) - chunked_cross_entropy - liger_cross_entropy (LigerPlugin) - liger_fused_linear_cross_entropy (LigerPlugin) """ ce_options = { "cut_cross_entropy": data.get("cut_cross_entropy"), "chunked_cross_entropy": data.get("chunked_cross_entropy"), "liger_cross_entropy": data.get("liger_cross_entropy"), "liger_fused_linear_cross_entropy": data.get( "liger_fused_linear_cross_entropy" ), } enabled_options = [k for k, v in ce_options.items() if v] if len(enabled_options) > 1: raise ValueError( f"Only one cross entropy optimization can be enabled at a time. 
" f"Found {len(enabled_options)} enabled: {', '.join(enabled_options)}. " "Please disable all but one." ) return data @model_validator(mode="before") @classmethod def check_fsdp_version(cls, data): fsdp_config = data.get("fsdp_config", {}) if fsdp_config and str(data.get("fsdp_version")) != "2": LOG.info( "FSDP1 will be deprecated in an upcoming release of Axolotl." "We recommend that you use FSDP version 2 for better performance and compatibility. " "Please see this link for more details: https://docs.axolotl.ai/docs/multi-gpu.html#sec-fsdp " "For more details on migrating your config. " ) return data @model_validator(mode="before") @classmethod def check_fsdp2_cpu_offload_pin_memory(cls, data): if not (fsdp_config := data.get("fsdp_config")): return data if fsdp_config.get("cpu_offload_pin_memory") is False: if str(data.get("fsdp_version")) != "2": raise ValueError( "FSDP1 does not support disabling cpu_offload_pin_memory, please set `fsdp_version` to 2" ) if not fsdp_config.get("offload_params"): raise ValueError( "disabling cpu_offload_pin_memory requires enabling offload_params" ) return data @model_validator(mode="before") @classmethod def check_fsdp2_base_model_quant_rl(cls, data): if data.get("fsdp_version") == 2 and data.get("rl") in [ RLType.DPO, RLType.KTO, RLType.ORPO, RLType.IPO, ]: if data.get("load_in_8bit") or data.get("load_in_4bit"): raise ValueError( f"FSDP2 does not support load_in_8bit or load_in_4bit with {data.get('rl')}. Please use DeepSpeed or set `fsdp_version` to 1." ) return data @model_validator(mode="before") @classmethod def check_fsdp_config_kwargs_prefix(cls, data): if fsdp_config := data.get("fsdp_config"): should_fix = False for key, _ in fsdp_config.items(): if key.startswith("fsdp_"): should_fix = True LOG.warning_once( "Configuring FSDP fields with the `fsdp_` prefix is deprecated. " "Please omit the `fsdp_` prefix from the any fields in `fsdp_config`." 
) if should_fix: update_fsdp_config = {} for key, value in fsdp_config.items(): if key.startswith("fsdp_") and key != "fsdp_version": update_fsdp_config[key.replace("fsdp_", "")] = value else: update_fsdp_config[key] = value data["fsdp_config"] = update_fsdp_config return data @model_validator(mode="before") @classmethod def check_fsdp_version_in_fsdp_config(cls, data): fsdp_config = data.get("fsdp_config") or {} fsdp_version = data.get("fsdp_version", None) if not fsdp_version and fsdp_config and fsdp_config.get("version"): fsdp_cfg_version = fsdp_config.pop("version") data["fsdp_version"] = fsdp_cfg_version data["fsdp_config"]["fsdp_version"] = fsdp_cfg_version elif not fsdp_version and fsdp_config and fsdp_config.get("fsdp_version"): data["fsdp_version"] = fsdp_config.get("fsdp_version") if fsdp_version and fsdp_config and not fsdp_config.get("fsdp_version"): data["fsdp_config"]["fsdp_version"] = fsdp_version return data @model_validator(mode="after") def check_fsdp_offload_w_8bit_optimizer(self): if ( hasattr(self, "fsdp_config") and self.fsdp_config and self.optimizer and "8bit" in self.optimizer.value and self.fsdp_config.offload_params and str(self.fsdp_version) != "2" ): raise ValueError( f"FSDP Offload not compatible with {str(self.optimizer.value)}" ) return self @model_validator(mode="after") def check_fsdp2_w_8bit_optimizer(self): if ( hasattr(self, "fsdp_config") and self.fsdp_config and self.optimizer and "8bit" in self.optimizer.value and str(self.fsdp_version) == "2" ): if self.optimizer in ["adamw_8bit", "adamw_bnb_8bit"]: # CUDA ops errors with bnb 8bit optimizer + FSDP2 raise ValueError( f"FSDP2 not compatible with {self.optimizer.value}, use `adamw_torch_8bit` instead" ) return self @model_validator(mode="before") @classmethod def check_tensor_parallel_size_update_ds_json(cls, data): tensor_parallel_size = data.get("tensor_parallel_size") if tensor_parallel_size is not None and tensor_parallel_size > 1: if data.get("deepspeed"): with 
open(data.get("deepspeed"), "r", encoding="utf-8") as ds_fin: ds_config = json.load(ds_fin) should_save = False if "tensor_parallel" not in ds_config: ds_config["tensor_parallel"] = { "autotp_size": tensor_parallel_size } should_save = True if ( "gather_16bit_weights_on_model_save" not in ds_config["zero_optimization"] ): ds_config["zero_optimization"][ "gather_16bit_weights_on_model_save" ] = True should_save = True if should_save: temp_dir = tempfile.mkdtemp() with open( Path(temp_dir) / "autotp_ds.json", "w", encoding="utf-8" ) as ds_fout: json.dump(ds_config, ds_fout, indent=4) data["deepspeed"] = str(Path(temp_dir) / "autotp_ds.json") return data @model_validator(mode="before") @classmethod def check_deepcompile(cls, data): deepcompile = data.get("deepcompile") if deepcompile: if not data.get("deepspeed"): raise ValueError("DeepCompile is only supported with DeepSpeed") with open(data.get("deepspeed"), "r", encoding="utf-8") as ds_fin: ds_config = json.load(ds_fin) if "compile" not in ds_config: ds_config["compile"] = {"deepcompile": True} temp_dir = tempfile.mkdtemp() with open( Path(temp_dir) / "deepcompile_ds.json", "w", encoding="utf-8" ) as ds_fout: json.dump(ds_config, ds_fout, indent=4) data["deepspeed"] = str(Path(temp_dir) / "deepcompile_ds.json") return data class SystemValidationMixin: """Validation methods related to system and hardware configuration.""" @model_validator(mode="before") @classmethod def check_mem_mismatch(cls, data): if ( data.get("max_memory") is not None and data.get("gpu_memory_limit") is not None ): raise ValueError( "max_memory and gpu_memory_limit are mutually exclusive and cannot be used together." 
) return data @model_validator(mode="before") @classmethod def check_fsdp_deepspeed(cls, data): if data.get("deepspeed") and data.get("fsdp"): raise ValueError("deepspeed and fsdp cannot be used together.") return data @model_validator(mode="before") @classmethod def check_model_quantization_config_vs_bnb(cls, data): if data.get("model_quantization_config"): if data.get("load_in_8bit") or data.get("load_in_4bit"): raise ValueError( "model_quantization_config and load_in_8bit or load_in_4bit cannot be used together." ) return data @model_validator(mode="before") @classmethod def check_npu_config(cls, data): if is_torch_npu_available(): # check attention config attn_list = ["flash_attention", "sdp_attention", "s2_attention"] for attn in attn_list: if data.get(attn): raise NotImplementedError( f"{attn} is currently not supported in Ascend npu, please disable this configuration." ) # check quant config if data.get("optimizer") is not None and "bit" in data.get("optimizer"): optimizer = data.get("optimizer") raise NotImplementedError( f"{optimizer} is currently not supported in Ascend npu, choose another one please." ) quant_list = ["load_in_8bit", "load_in_4bit"] for quant in quant_list: if data.get(quant): raise NotImplementedError( f"Quantification is currently not supported in Ascend npu, please disable {quant}." 
) # check dtype config if data.get("tf32"): raise NotImplementedError( "tf32 dtype is currently not supported in Ascend npu, please disable this configuration" ) return data class ChatTemplateValidationMixin: """Validation methods related to chat template configuration.""" @model_validator(mode="before") @classmethod def check_chat_template_config(cls, data): # if chat_template is set to jinja, chat_template_jinja is required if data.get("chat_template") == ChatTemplate.jinja and not data.get( "chat_template_jinja" ): raise ValueError( "chat_template_jinja is required when chat_template is set to jinja" ) # If chat_template_jinja is set, set chat_template to jinja if data.get("chat_template_jinja") and not data.get("chat_template"): data["chat_template"] = ChatTemplate.jinja return data class PretrainingValidationMixin: """Validation methods related to pretraining configuration.""" @model_validator(mode="before") @classmethod def check_pretraining_w_max_steps(cls, data): if data.get("pretraining_dataset") and not data.get("max_steps"): raise ValueError( "max_steps must be set when using iterable pretraining_dataset, Trainer can't infer length and schedule optimizer/learning rate without it!" ) return data @model_validator(mode="before") @classmethod def check_pretraining_w_group_by_length(cls, data): if data.get("pretraining_dataset") and data.get("group_by_length"): LOG.warning( "You probably want to disable group_by_length as it will force a streamed dataset to download completely." 
) return data @model_validator(mode="before") @classmethod def check_pretraining_split_batches_accelerate(cls, data): # alternatively set ACCELERATE_SPLIT_BATCHES=False if data.get("pretraining_dataset"): accelerator_config = data.get("accelerator_config", {}) if not accelerator_config: data["accelerator_config"] = { "split_batches": False, "dispatch_batches": False, } else: if accelerator_config.get("split_batches") is None: data["accelerator_config"]["split_batches"] = False if accelerator_config.get("dispatch_batches") is None: data["accelerator_config"]["dispatch_batches"] = False return data @model_validator(mode="before") @classmethod def check_pretraining_w_val_set_size(cls, data): if data.get("pretraining_dataset") and data.get("val_set_size"): raise ValueError( "val_set_size is not supported with pretraining_dataset. " "Use test_datasets to specify evaluation datasets for pretraining." ) return data @model_validator(mode="before") @classmethod def check_streaming_w_val_set_size(cls, data): if data.get("streaming") and data.get("val_set_size"): raise ValueError( "val_set_size is not supported with streaming datasets. " "Use test_datasets to specify evaluation datasets when streaming is enabled." ) return data @model_validator(mode="before") @classmethod def check_streaming_w_max_steps(cls, data): if data.get("streaming") and not data.get("max_steps"): raise ValueError( "max_steps must be set when using streaming datasets. " "Trainer cannot infer dataset length for iterable datasets." 
) return data @model_validator(mode="before") @classmethod def check_streaming_w_multiple_datasets(cls, data): if ( data.get("streaming") and data.get("sample_packing") and data.get("datasets") and len(data.get("datasets")) > 1 ): raise NotImplementedError( "Sample packing with multiple streaming datasets is not yet supported" ) return data class ModelCompatibilityValidationMixin: """Validation methods for specific model compatibility.""" @model_validator(mode="after") def check_falcon_fsdp(self): if (self.base_model and "falcon" in self.base_model.lower()) and self.fsdp: raise ValueError("FSDP is not supported for falcon models") return self @model_validator(mode="after") def check_mpt_checkpointing(self): if ( self.base_model and "mpt" in self.base_model.lower() ) and self.gradient_checkpointing: raise ValueError("gradient_checkpointing is not supported for MPT models") return self @model_validator(mode="after") def check_gradient_checkpointing_w_offload(self): if self.gradient_checkpointing == "offload": LOG.warning( "`offload` is deprecated for gradient_checkpointing, use `activation_offloading: true` or `activation_offloading: legacy`" ) self.gradient_checkpointing = True LOG.warning( "`offload` now uses a new stream implementation; to use the previous implementation, use `activation_offloading: legacy`" ) self.activation_offloading = True if self.gradient_checkpointing == "offload_disk": LOG.warning( "`offload_disk` is deprecated for gradient_checkpointing, use `activation_offloading: disk`" ) self.gradient_checkpointing = True self.activation_offloading = "disk" return self @model_validator(mode="after") def check_activation_offloading_wo_gc(self): if self.activation_offloading and not self.gradient_checkpointing: raise ValueError("activation_offloading requires gradient_checkpointing") return self @model_validator(mode="after") def check_better_transformers(self): if self.flash_optimum is True: if self.adapter: LOG.warning( "BetterTransformers probably 
doesn't work with PEFT adapters" ) if self.fp16 or self.bf16: raise ValueError("AMP is not supported with BetterTransformer") if self.float16 is not True and self.bfloat16 is not True: LOG.warning( "You should probably set bfloat16 or float16 to true to " "load the model in float16 for BetterTransformers" ) return self @model_validator(mode="before") @classmethod def check_gptq_w_revision(cls, data): if data.get("gptq") and data.get("revision_of_model"): raise ValueError( "revision_of_model is not supported for GPTQ models. " + "Please download the model from HuggingFace Hub manually for correct branch, " + "point to its path, and remove revision_of_model from the config." ) return data @model_validator(mode="before") @classmethod def check_gpt_oss_fsdp_loading(cls, data): if data.get("model_quantization_config", "") == "Mxfp4Config": fsdp_config = data.get("fsdp_config") or {} if fsdp_config.get("cpu_ram_efficient_loading", False) is True: raise ValueError( "FSDP cpu_ram_efficient_loading is not supported for Mxfp4Config model quantization." ) return data class ComplexValidationMixin: """Complex validation methods that involve multiple systems.""" @field_validator("neftune_noise_alpha") @classmethod def validate_neftune_noise_alpha(cls, neftune_noise_alpha): if neftune_noise_alpha is not None and neftune_noise_alpha <= 0.0: raise ValueError("neftune_noise_alpha must be > 0.0") return neftune_noise_alpha @model_validator(mode="after") def check_rl_beta(self): if self.dpo_beta and not self.rl_beta: self.rl_beta = self.dpo_beta del self.dpo_beta return self @model_validator(mode="after") def check_simpo_warmup(self): if self.rl is RLType.SIMPO and self.warmup_ratio: raise ValueError( "warmup_ratio is not supported with the simpo trainer. 
Please use `warmup_steps` instead" ) return self @model_validator(mode="after") def check_relora(self): if self.relora: if not self.jagged_restart_steps: raise ValueError("jagged_restart_steps must be set to use ReLoRA") if self.adapter not in ("lora", "qlora"): raise ValueError("cfg.adapter must be lora or qlora to use ReLoRA") if self.fsdp or self.fsdp_config: raise ValueError("fsdp not supported with ReLoRA") if self.deepspeed: raise ValueError("deepspeed not supported with ReLoRA") if self.lr_scheduler == "one_cycle": raise ValueError( "ReLoRA is not compatible with the one_cycle scheduler" ) if self.flash_attn_fuse_mlp: raise ValueError("Fused modules are not supported with ReLoRA") return self @model_validator(mode="after") def check_early_stopping(self): if self.early_stopping_patience: if not self.save_steps or not self.eval_steps: raise ValueError( "`early_stopping_patience` requires save_steps and eval_steps to be set. eval_steps should evenly divide save_steps." ) if self.save_steps % self.eval_steps != 0: raise ValueError( "`early_stopping_patience` requires that eval_steps should evenly divide save_steps." 
@model_validator(mode="after")
def check_context_parallel_size(self):
    """Normalise ``context_parallel_size`` and validate its prerequisites.

    * Copies the deprecated ``sequence_parallel_degree`` into
      ``context_parallel_size`` when only the former is set (with a warning).
    * Defaults ``context_parallel_size`` to 1 when it is unset/falsy.
    * For sizes > 1, requires ``flash_attention``, requires
      ``micro_batch_size`` of 1 when ``sample_packing`` is on, and requires
      that ``ring_flash_attn`` can be imported.

    Returns:
        The validated model instance (``self``).

    Raises:
        ValueError: If flash attention is off, or sample packing is combined
            with ``micro_batch_size > 1``.
        ImportError: If an import inside the ``try`` block fails.
    """
    if self.sequence_parallel_degree and not self.context_parallel_size:
        LOG.warning(
            "`sequence_parallel_degree` is deprecated, use `context_parallel_size`"
        )
        self.context_parallel_size = self.sequence_parallel_degree

    if not self.context_parallel_size:
        self.context_parallel_size = 1
    elif self.context_parallel_size > 1:
        if not self.flash_attention:
            raise ValueError(
                "flash_attention: true must be set with context_parallel_size > 1"
            )

        if self.sample_packing and self.micro_batch_size > 1:
            raise ValueError(
                "micro_batch_size must be set to 1 when sample_packing is enabled "
                "due to a `ring-flash-attn` requirement"
            )

        try:
            import transformers.modeling_flash_attention_utils
            from transformers.utils import is_flash_attn_greater_or_equal

            # These attributes are set on the transformers module *before*
            # ring_flash_attn is imported; the import below is ordered after
            # them on purpose (see its trailing comment).
            transformers.modeling_flash_attention_utils._flash_supports_window = (
                True
            )
            sys.modules[
                "transformers.modeling_flash_attention_utils"
            ]._flash_supports_window = True
            sys.modules[
                "transformers.modeling_flash_attention_utils"
            ]._flash_supports_window_size = True
            sys.modules[
                "transformers.modeling_flash_attention_utils"
            ].is_flash_attn_greater_or_equal = is_flash_attn_greater_or_equal
            import ring_flash_attn  # noqa: F401 # Required after monkey-patching
        except ImportError as exception:
            # Fix: the first install command was missing its closing backtick.
            raise ImportError(
                "context_parallel_size > 1 but ring_flash_attn is not installed. "
                "Please install it with `pip install axolotl[ring-flash-attn]` "
                "or `pip install ring-flash-attn>=0.1.4`."
            ) from exception

        LOG.warning(
            "Sequence parallelism (SP) is enabled with "
            f"context_parallel_size={self.context_parallel_size}. "
            "Please note that logged losses may differ slightly to the non-SP "
            "losses due to transformers Trainer implementation details. "
            "Please see https://github.com/axolotl-ai-cloud/axolotl/pull/2495#issuecomment-2784022042 "
            "for more details."
        )

    return self
ModelCompatibilityValidationMixin, ComplexValidationMixin, GRPOVllmValidationMixin, ): """Full validation mixin for Axolotl configuration.""" ================================================ FILE: src/axolotl/utils/schemas/vllm.py ================================================ """ Pydantic models for VLLM configuration, used primarily for RL training with TRL + grpo """ from pydantic import BaseModel, Field class VllmConfig(BaseModel): """ Configuration for VLLM server """ device: str | None = Field( default="auto", json_schema_extra={"description": "Device to use for VLLM"}, ) tensor_parallel_size: int | None = Field( default=None, json_schema_extra={"description": "Tensor parallel size for VLLM"}, ) data_parallel_size: int | None = Field( default=None, json_schema_extra={"description": "Data parallel size for VLLM"}, ) gpu_memory_utilization: float | None = Field( default=0.9, json_schema_extra={"description": "GPU memory utilization for VLLM"}, ) dtype: str | None = Field( default="auto", json_schema_extra={"description": "Data type for VLLM"}, ) max_model_len: int | None = Field( default=None, json_schema_extra={ "description": "Maximum length of the model context for VLLM" }, ) enable_prefix_caching: bool | None = Field( default=None, json_schema_extra={"description": "Enable prefix caching for VLLM"}, ) host: str | None = Field( default="0.0.0.0", # nosec B104 json_schema_extra={"description": "Host for the vLLM server to start on"}, ) port: int | None = Field( default=8000, json_schema_extra={"description": "Port of the vLLM server to start on"}, ) enable_reasoning: bool | None = Field( default=None, json_schema_extra={"description": "Enable reasoning for VLLM"}, ) reasoning_parser: str | None = Field( default=None, json_schema_extra={"description": "Reasoning parser for VLLM"}, ) serve_module: str | None = Field( default=None, json_schema_extra={ "description": "Python module for vLLM serve script. 
class _StreamTee(io.TextIOBase):
    """Text stream wrapper that duplicates writes into the debug log file.

    Every write goes to the wrapped stream; it is also written to the
    module-level ``_file_handle`` whenever that handle is set. This class
    keeps no buffer of its own.
    """

    def __init__(self, stream: io.TextIOBase):
        self._stream = stream

    def write(self, s: str) -> int:  # type: ignore[override]
        """Write ``s`` to the wrapped stream, then to the log file if open."""
        with _lock:
            written = self._stream.write(s)
            log_file = _file_handle
            if log_file is not None:
                log_file.write(s)
        # The wrapped stream's return value is what callers get back.
        return written

    def flush(self) -> None:  # type: ignore[override]
        """Flush the wrapped stream; flush the log file on a best-effort basis."""
        with _lock:
            self._stream.flush()
            if _file_handle is None:
                return
            try:
                _file_handle.flush()
            except Exception:
                # Errors from flushing the log file are deliberately ignored.
                pass

    @property
    def encoding(self):  # type: ignore[override]
        """Encoding reported by the wrapped stream, or ``None`` if it has none."""
        return getattr(self._stream, "encoding", None)

    @property
    def errors(self):  # type: ignore[override]
        """Error mode reported by the wrapped stream, or ``None`` if it has none."""
        return getattr(self._stream, "errors", None)

    def isatty(self):  # type: ignore[override]
        """Delegate to the wrapped stream; ``False`` when it has no ``isatty``."""
        if hasattr(self._stream, "isatty"):
            return self._stream.isatty()
        return False

    def fileno(self):  # type: ignore[override]
        """Return the wrapped stream's file descriptor.

        Raises:
            OSError: If the wrapped stream has no ``fileno`` attribute.
        """
        if not hasattr(self._stream, "fileno"):
            raise OSError("Underlying stream has no fileno")
        return self._stream.fileno()
def check_dataset_labels(
    dataset,
    tokenizer,
    num_examples=5,
    text_only=False,
    rl_mode=False,
):
    """Log a colourised view of the first few examples of ``dataset``.

    Args:
        dataset: Indexable dataset; ``dataset[idx]`` must return one example.
        tokenizer: Tokenizer forwarded to the per-example checker.
        num_examples: Maximum number of leading examples to inspect.
        text_only: Forwarded to the per-example checker.
        rl_mode: When true, use ``check_rl_example_labels`` instead of
            ``check_example_labels``.
    """
    # Never index past the end: a dataset smaller than `num_examples` used to
    # raise IndexError here. Datasets without `len()` keep the old behaviour.
    try:
        limit = min(num_examples, len(dataset))
    except TypeError:
        limit = num_examples

    # the dataset is already shuffled, so let's just check the first elements
    for idx in range(limit):
        if not rl_mode:
            check_example_labels(dataset[idx], tokenizer, text_only=text_only)
        else:
            check_rl_example_labels(dataset[idx], tokenizer, text_only=text_only)
def color_token_for_rl_debug(decoded_token, encoded_token, color, text_only):
    """Return ``decoded_token`` coloured for RL debug output.

    When ``text_only`` is false, the encoded token is appended in white,
    wrapped in parentheses.
    """
    highlighted = colored(decoded_token, color)
    if text_only:
        return highlighted
    token_id_suffix = colored(f"({encoded_token})", "white")
    return f"{highlighted}{token_id_suffix}"
def setup_trackio_env_vars(cfg: DictDefault):
    """Export ``trackio_*`` config entries as environment variables.

    Every config key starting with ``trackio_`` whose value is a non-empty
    string is copied to ``os.environ`` under the upper-cased key name.
    ``cfg.use_trackio`` is set to ``True`` when ``trackio_project_name`` is
    non-empty.
    """
    for option in cfg.keys():
        if not option.startswith("trackio_"):
            continue
        setting = cfg.get(option, "")
        # A truthy `str` is necessarily non-empty, so no length check is needed.
        if setting and isinstance(setting, str):
            os.environ[option.upper()] = setting

    project_name = cfg.trackio_project_name
    if project_name and len(project_name) > 0:
        cfg.use_trackio = True
update: Whether to update the config with the determined checkpoint Returns: Path to the checkpoint to resume from, or `None` if not resuming. """ last_checkpoint = None checkpoints = sorted( ( p for p in Path(cfg.output_dir).glob("checkpoint-*") if p.name.split("-")[-1].isdigit() ), key=lambda p: int(p.name.split("-")[-1]), ) if checkpoints: last_checkpoint = str(checkpoints[-1]) if not update: LOG.info(f"Resuming from last checkpoint at {last_checkpoint}") return last_checkpoint if ( cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints and last_checkpoint is not None ): cfg.resume_from_checkpoint = last_checkpoint LOG.info( "Using auto-resume functionality to resume from checkpoint at " f"{cfg.resume_from_checkpoint}" ) return cfg.resume_from_checkpoint ================================================ FILE: src/axolotl/utils/trainer.py ================================================ """Module containing the Trainer class and related functions""" import json import math import os import random from contextlib import contextmanager from functools import partial from tempfile import NamedTemporaryFile from typing import List, Optional import numpy as np import torch import torch.cuda from datasets import IterableDataset, disable_caching, enable_caching from torch.utils.data import DataLoader, RandomSampler, SequentialSampler from transformers.utils import is_torch_bf16_gpu_available from axolotl.utils.dict import DictDefault from axolotl.utils.distributed import init_distributed_state, reduce_and_broadcast from axolotl.utils.environment import check_cuda_p2p_ib_support from axolotl.utils.logging import get_logger from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths LOG = get_logger(__name__) @torch.jit.script def weighted_cross_entropy( logits: torch.Tensor, labels: torch.Tensor, weights: torch.Tensor ): # Flatten the logits, labels, and weights tensors logits = logits.view( -1, logits.size(-1) ) # logits becomes of shape 
@contextmanager
def disable_datasets_caching():
    """Temporarily disable Hugging Face ``datasets`` caching.

    Caching is switched off for the duration of the ``with`` block. On exit it
    is switched back on only if it was enabled on entry, so a caller that had
    already disabled caching globally does not get it silently re-enabled.
    """
    # Local import: the module only imports disable_caching/enable_caching.
    from datasets import is_caching_enabled

    was_enabled = is_caching_enabled()
    disable_caching()
    try:
        yield
    finally:
        if was_enabled:
            enable_caching()
def add_pose_position_ids(
    sample,
    max_context_len=32768,
    split_on_token_ids: Optional[List[int]] = None,
    chunks: int = 2,
):
    """
    Use the PoSE technique to extend the context length by randomly skipping
    positions in the context.

    Skips are only inserted at segment boundaries: right before tokens listed
    in ``split_on_token_ids`` when given, otherwise at ``chunks`` evenly sized
    chunk boundaries. The skip budget is ``max_context_len - len(input_ids)``
    and is consumed as skips are drawn, so later segments can only use what
    remains. The first segment is never shifted.

    Args:
        sample: Mapping with an ``input_ids`` sequence; updated in place.
        max_context_len: Target context length bounding the position ids.
        split_on_token_ids: Token ids that start a new segment.
        chunks: Number of equal chunks used when no split tokens are given.

    Returns:
        ``sample`` with ``position_ids`` (a ``torch`` tensor), ``length`` and
        ``sequence_len`` (the last position id) set.
    """
    input_ids = sample["input_ids"]
    sample_len = len(input_ids)
    # A sample longer than the target context has no skip budget. The raw
    # difference would be negative (truthy) and make random.randint raise.
    max_skips = max(0, max_context_len - sample_len)

    if split_on_token_ids:
        split_tokens = set(split_on_token_ids)
        split_indices = [
            i for i, token_id in enumerate(input_ids) if token_id in split_tokens
        ]
    else:
        chunk_len = sample_len // chunks
        split_indices = [i * chunk_len for i in range(1, chunks)]
    split_indices.append(sample_len)  # make sure we go to the end of the sample

    # Drop the first split index if it's too close to the beginning, but never
    # drop the only one: that would leave no segment and empty position ids.
    if len(split_indices) > 1 and split_indices[0] < 2:
        split_indices = split_indices[1:]

    position_ids = []
    prev_index = 0
    total_skips = 0

    for split_index in split_indices:
        num_skips = (
            random.randint(0, max_skips)  # nosec B311
            if prev_index != 0 and max_skips > 0
            else 0
        )
        max_skips -= num_skips
        total_skips += num_skips

        position_ids.extend(range(prev_index + total_skips, split_index + total_skips))
        prev_index = split_index

    # Empty input yields no positions; report 0 instead of indexing [-1].
    sample["sequence_len"] = position_ids[-1] if position_ids else 0
    position_ids = torch.tensor(position_ids)

    sample["position_ids"] = position_ids
    sample["length"] = len(position_ids)
    assert len(position_ids) == len(input_ids)

    return sample
def drop_no_trainable_tokens(sample):
    """
    Drop samples if all labels are -100 (i.e., zero trainable tokens).
    Works for both single-example or batched input.

    Returns:
        ``True`` to keep the sample (a list of booleans for batched input).
        A sample whose ``labels`` is empty/falsy is kept.
    """
    labels = sample["labels"]
    if not labels:
        return True

    # Comparing a Python list to an int (`labels != -100`) yields a single
    # `True`, so the check must be done per element.
    # If the first element is an int, we assume a single example;
    # otherwise we assume a batch (a list of label lists).
    if isinstance(labels[0], int):
        return any(label != -100 for label in labels)

    return [any(label != -100 for label in row_labels) for row_labels in labels]
def process_pretraining_datasets_for_packing(
    train_dataset, sequence_len, skip_position_ids=True, drop_attention_mask=False
):
    """Filter a pretraining dataset by length and optionally prepare it for packing.

    Args:
        train_dataset: Dataset exposing ``filter``, ``map`` and ``remove_columns``.
        sequence_len: Maximum sequence length passed to
            ``filter_sequences_by_length``.
        skip_position_ids: When false, a ``position_ids`` column is added via
            ``add_position_ids``.
        drop_attention_mask: When true, the ``attention_mask`` column is removed.

    Returns:
        The processed dataset.
    """
    length_filter = partial(filter_sequences_by_length, sequence_len=sequence_len)
    dataset = train_dataset.filter(
        length_filter,
        desc="Dropping Long Sequences",
        load_from_cache_file=False,
    )

    if not skip_position_ids:
        dataset = dataset.map(
            add_position_ids,
            batched=True,
            desc="Add position_id column (Pretraining Sample Packing)",
        )

    if drop_attention_mask:
        dataset = dataset.remove_columns("attention_mask")

    return dataset
train_dataset.data.column("labels") .to_pandas() .apply(lambda x: np.sum(np.array(x) != -100)) .sum() ) LOG.debug(f"`total_supervised_tokens: {total_supervised_tokens:_}`") if update: cfg.total_supervised_tokens = total_supervised_tokens if not skip_estimates and cfg.sample_packing: # we have to drop anything longer then sequence len otherwise # flash attention with position ids fails if cfg.sample_packing_eff_est: total_num_steps = ( # match count to len est in dataloader int( math.floor( 0.99 * cfg.total_num_tokens / cfg.sample_packing_eff_est / cfg.sequence_len // cfg.batch_size ) - 1 ) * cfg.num_epochs ) LOG.debug( f"total_num_tokens: {cfg.total_num_tokens:_}, total_num_steps: {total_num_steps:_}" ) else: if cfg.flash_attention and not cfg.multipack_real_batches: sampler_batch_size = 1 batch_max_len = cfg.micro_batch_size * cfg.sequence_len else: sampler_batch_size = cfg.micro_batch_size batch_max_len = cfg.sequence_len if cfg.curriculum_sampling: sampler = SequentialSampler(train_dataset) else: sampler = RandomSampler(train_dataset) sampler = MultipackBatchSampler( sampler=sampler, lengths=get_dataset_lengths(train_dataset), batch_size=sampler_batch_size, batch_max_len=batch_max_len, group_size=cfg.sample_packing_group_size, bin_size=cfg.sample_packing_bin_size, sequential=cfg.sample_packing_sequentially, drop_last=True, num_processes=cfg.dataset_num_proc, mp_start_method=cfg.sample_packing_mp_start_method or "fork", ) data_loader = DataLoader( train_dataset.remove_columns(["length"]), batch_sampler=sampler, ) data_loader_len = max( 1, len(data_loader) * cfg.micro_batch_size // cfg.batch_size ) LOG.debug(f"data_loader_len: {data_loader_len}") # FIXME: is there a bug here somewhere? 
def setup_torch_compile_env(cfg):
    """Set ``ACCELERATE_DYNAMO_BACKEND`` when ``cfg.torch_compile`` is enabled.

    The value is the upper-cased ``cfg.torch_compile_backend``, or
    ``"INDUCTOR"`` when no backend is configured. Nothing is changed when
    ``torch_compile`` is falsy.
    """
    if not cfg.torch_compile:
        return

    backend = cfg.torch_compile_backend
    os.environ["ACCELERATE_DYNAMO_BACKEND"] = backend.upper() if backend else "INDUCTOR"
def setup_fsdp_envs(cfg):
    """Export the FSDP settings in ``cfg`` as environment variables.

    ``ACCELERATE_USE_FSDP`` is always set to ``"true"``. ``FSDP_VERSION`` is
    set when ``cfg.fsdp_version`` stringifies to ``"2"``. Each remaining
    variable is written only when its ``cfg.fsdp_config`` option is truthy,
    except ``FSDP_CPU_OFFLOAD_PIN_MEMORY``, which is written whenever the
    option is not ``None``.
    """
    env = os.environ
    env["ACCELERATE_USE_FSDP"] = "true"

    # TODO @SalmanMohammadi remove FSDP1 args in 0.12
    if str(cfg.fsdp_version) == "2":
        env["FSDP_VERSION"] = "2"

    fsdp_cfg = cfg.fsdp_config

    # Options exported as the literal "true" when enabled.
    boolean_flags = (
        ("activation_checkpointing", "FSDP_ACTIVATION_CHECKPOINTING"),
        ("offload_params", "FSDP_OFFLOAD_PARAMS"),
        ("sync_module_states", "FSDP_SYNC_MODULE_STATES"),
        ("cpu_ram_efficient_loading", "FSDP_CPU_RAM_EFFICIENT_LOADING"),
        ("use_orig_params", "FSDP_USE_ORIG_PARAMS"),
    )
    for option, env_name in boolean_flags:
        if getattr(fsdp_cfg, option):
            env[env_name] = "true"

    if fsdp_cfg.state_dict_type:
        env["FSDP_STATE_DICT_TYPE"] = fsdp_cfg.state_dict_type

    # An explicit False must be exported too, hence the `is not None` test.
    pin_memory = fsdp_cfg.cpu_offload_pin_memory
    if pin_memory is not None:
        env["FSDP_CPU_OFFLOAD_PIN_MEMORY"] = str(pin_memory).lower()

    # Options whose configured value is exported verbatim.
    passthrough_options = (
        ("auto_wrap_policy", "FSDP_AUTO_WRAP_POLICY"),
        ("transformer_layer_cls_to_wrap", "FSDP_TRANSFORMER_CLS_TO_WRAP"),
    )
    for option, env_name in passthrough_options:
        configured = getattr(fsdp_cfg, option)
        if configured:
            env[env_name] = configured

    if fsdp_cfg.reshard_after_forward:
        env["FSDP_RESHARD_AFTER_FORWARD"] = "true"
json.load(fin) if deepspeed_config: stage = deepspeed_config.get("zero_optimization", {}).get("stage", None) setup_deepspeed_env(cfg, stage=stage) setup_parallelism_envs(cfg) setup_torch_compile_env(cfg) if cfg.fp8: os.environ["ACCELERATE_MIXED_PRECISION"] = "fp8" elif (cfg.bf16 == "auto" and is_torch_bf16_gpu_available()) or cfg.bf16 is True: os.environ["ACCELERATE_MIXED_PRECISION"] = "bf16" elif cfg.fp16: os.environ["ACCELERATE_MIXED_PRECISION"] = "fp16" else: os.environ["ACCELERATE_MIXED_PRECISION"] = "no" def setup_trainer( cfg, train_dataset, eval_dataset, model, tokenizer, processor, total_num_steps, model_ref=None, peft_config=None, ): """ Helper method for instantiating and building a (causal or RLHF) trainer. Args: cfg: Axolotl config object containing training parameters. train_dataset: Dataset to use for training. eval_dataset: Dataset to use for evaluation. model: The model to train. tokenizer: Tokenizer for processing text input. processor: Processor for data preparation. total_num_steps: The total number of training steps. model_ref: Optional reference model for RLHF training. Default is None. peft_config: Optional PEFT (Parameter-Efficient Fine-Tuning) configuration. Default is None. Returns: A trainer instance (either `HFRLTrainer` or `HFCausalTrainer`) configured based on the provided parameters. 
""" from axolotl.core.builders import HFCausalTrainerBuilder, HFRLTrainerBuilder if cfg.rl: trainer_builder = HFRLTrainerBuilder(cfg, model, tokenizer, processor) trainer_builder.model_ref = model_ref trainer_builder.peft_config = peft_config else: trainer_builder = HFCausalTrainerBuilder(cfg, model, tokenizer, processor) trainer_builder.train_dataset = train_dataset trainer_builder.eval_dataset = eval_dataset return trainer_builder.build(total_num_steps) ================================================ FILE: src/axolotl/utils/wandb_.py ================================================ """Module for wandb utilities""" import os from axolotl.utils.dict import DictDefault def setup_wandb_env_vars(cfg: DictDefault): for key in cfg.keys(): if key.startswith("wandb_"): value = cfg.get(key, "") if value and isinstance(value, str) and len(value) > 0: os.environ[key.upper()] = value # Enable wandb if project name is present if cfg.wandb_project and len(cfg.wandb_project) > 0: cfg.use_wandb = True ================================================ FILE: src/setuptools_axolotl_dynamic_dependencies.py ================================================ """ dynamic requirements for axolotl """ import platform import re from importlib.metadata import PackageNotFoundError, version from setuptools.command.build_py import build_py as _build_py def parse_requirements(): _install_requires = [] _dependency_links = [] with open("./requirements.txt", encoding="utf-8") as requirements_file: lines = [r.strip() for r in requirements_file.readlines()] for line in lines: is_extras = ( "flash-attn" in line or "flash-attention" in line or "deepspeed" in line or "mamba-ssm" in line or "lion-pytorch" in line ) if line.startswith("--extra-index-url"): # Handle custom index URLs _, url = line.split() _dependency_links.append(url) elif not is_extras and line and line[0] != "#": # Handle standard packages _install_requires.append(line) try: xformers_version = [req for req in _install_requires if 
"xformers" in req][0] torchao_version = [req for req in _install_requires if "torchao" in req][0] if "Darwin" in platform.system(): # don't install xformers on MacOS _install_requires.pop(_install_requires.index(xformers_version)) else: # detect the version of torch already installed # and set it so dependencies don't clobber the torch version try: torch_version = version("torch") except PackageNotFoundError: torch_version = "2.5.1" _install_requires.append(f"torch=={torch_version}") version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version) if version_match: major, minor, patch = version_match.groups() major, minor = int(major), int(minor) patch = ( int(patch) if patch is not None else 0 ) # Default patch to 0 if not present else: raise ValueError("Invalid version format") if (major, minor) >= (2, 5): _install_requires.pop(_install_requires.index(xformers_version)) if patch == 0: _install_requires.append("xformers==0.0.28.post2") else: _install_requires.append("xformers==0.0.28.post3") elif (major, minor) >= (2, 4): if patch == 0: _install_requires.pop(_install_requires.index(xformers_version)) _install_requires.append("xformers>=0.0.27") else: _install_requires.pop(_install_requires.index(xformers_version)) _install_requires.append("xformers==0.0.28.post1") elif (major, minor) >= (2, 3): _install_requires.pop(_install_requires.index(torchao_version)) if patch == 0: _install_requires.pop(_install_requires.index(xformers_version)) _install_requires.append("xformers>=0.0.26.post1") else: _install_requires.pop(_install_requires.index(xformers_version)) _install_requires.append("xformers>=0.0.27") elif (major, minor) >= (2, 2): _install_requires.pop(_install_requires.index(torchao_version)) _install_requires.pop(_install_requires.index(xformers_version)) _install_requires.append("xformers>=0.0.25.post1") else: _install_requires.pop(_install_requires.index(torchao_version)) _install_requires.pop(_install_requires.index(xformers_version)) 
_install_requires.append("xformers>=0.0.23.post1") except PackageNotFoundError: pass return _install_requires, _dependency_links class BuildPyCommand(_build_py): """ custom build_py command to parse dynamic requirements """ def finalize_options(self): super().finalize_options() install_requires, _ = parse_requirements() self.distribution.install_requires = install_requires ================================================ FILE: styles.css ================================================ /* TYPOGRAPHY SECTION */ /* Import fonts */ @import url('https://fonts.googleapis.com/css2?family=Be+Vietnam+Pro:wght@400;500&display=swap'); @import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400&display=swap'); /* Typography hierarchy */ :root { --font-title: 'Be Vietnam Pro', sans-serif; --font-body: 'JetBrains Mono', monospace; } /* Title (h1) */ h1 { font-family: var(--font-title); font-weight: 400; font-size: 3rem; line-height: 1.1; letter-spacing: -0.05em; font-feature-settings: "ss01" on; } /* Heading (h2) */ h2 { font-family: var(--font-title); font-weight: 500; font-size: 1.5rem; line-height: 1.2; letter-spacing: -0.03em; font-feature-settings: "ss01" on; } /* Subtitle/Preamble */ h3, h4 { font-family: var(--font-body); font-weight: 400; font-size: 1.25rem; line-height: 1.5; letter-spacing: -0.02em; } /* Body text */ body { font-family: var(--font-body); font-weight: 400; font-size: 1rem; line-height: 1.5; letter-spacing: -0.02em; } /* Links */ a { font-family: var(--font-body); font-weight: 400; font-size: 0.875rem; line-height: 1; letter-spacing: -0.02em; } /* NAV BAR SECTION */ /* Navbar logo styling */ .navbar-brand img { height: 32px; margin-right: 10px; } /* COLORS SECTION */ /* Brand colors */ :root { --white: #ffffff; --greige-300: #EEEEE7; --greige-600: #CCCAC0; --black: #141310; --lime: #E3F8A8; --cyan: #A0F4EA; --purple: #C8D0F8; } /* Base styles */ body { background-color: var(--black); color: var(--greige-300); } /* Navigation */ .navbar 
{ background-color: var(--black) !important; } .navbar-dark .navbar-nav .nav-link { color: var(--greige-300); } .navbar-dark .navbar-nav .nav-link:hover { color: var(--lime); } /* Sidebar */ .sidebar-navigation { background-color: var(--black); border-right: 1px solid var(--greige-600); } .sidebar nav[role="doc-toc"] ul>li>a { color: var(--greige-300); } .sidebar nav[role="doc-toc"] ul>li>a:hover { color: var(--lime); } /* Links */ a { color: var(--lime); } a:hover { color: var(--cyan); } /* Headers */ h1, h2, h3, h4, h5, h6 { color: var(--white); } /* Code blocks */ pre { background-color: #1a1a1a !important; border: 1px solid var(--greige-600); } /* Tables */ .table { color: var(--greige-300); } /* TOC */ #toc-title { color: var(--white); } .toc-active { color: var(--lime) !important; } /* Buttons */ .btn-primary { background-color: var(--lime); color: var(--black); border: none; } .btn-primary:hover { background-color: var(--cyan); color: var(--black); } /* For inline code (single backtick) */ code { background-color: #1a1a1a !important; color: var(--lime) !important; padding: 2px 4px; border-radius: 4px; } /* For inline code that is also a link */ a code { color: var(--cyan) !important; } /* For code blocks (triple backtick) */ pre.sourceCode { background-color: #1a1a1a !important; } /* Make comments in bash/shell scripts green */ code span.co { color: #5cb85c !important; } /* Remove underlines from JSON comments and make them green */ code span.er { color: #5cb85c !important; text-decoration: none !important; } /* API Documentation Styling */ /* Improve docstring section rendering */ .level3 p { white-space: pre-line !important; } /* Format docstring sections */ .level3 p strong { display: block; margin-top: 1em; font-weight: bold; color: var(--cyan); } /* Add spacing after sections */ .level3 p:has(strong) { margin-bottom: 0.5em; } /* Format Args and Returns sections */ p:has(code) { line-height: 1.6; } /* Function signatures */ .sourceCode { margin-bottom: 
1.5em; } /* Parameter tables */ .doc-section-parameters table, .doc-section-returns table { margin-top: 1em; margin-bottom: 1.5em; } /* Make parameter and returns headers smaller */ h2.anchored[data-anchor-id="parameters"], h2.anchored[data-anchor-id="returns"], .doc-section-parameters h4, .doc-section-returns h4 { font-size: 1.25rem; margin-top: 2rem; margin-bottom: 1rem; color: var(--lime); border-bottom: 1px solid var(--lime); padding-bottom: 0.3rem; font-family: var(--font-body); font-weight: 500; letter-spacing: normal; } /* Style documentation tables */ table { width: 100%; margin-bottom: 1.5rem; border-collapse: collapse; } table th { background-color: #1a1a1a; padding: 0.5rem 1rem; border-bottom: 2px solid var(--greige-600); text-align: left; } table td { padding: 0.5rem 1rem; border-bottom: 1px solid var(--greige-600); } /* Code in table cells */ table td code { background-color: transparent !important; padding: 0; } /* Improve spacing in parameter and return tables */ .doc-section-parameters, .doc-section-returns { margin-top: 1rem; } ================================================ FILE: tests/__init__.py ================================================ ================================================ FILE: tests/cli/__init__.py ================================================ ================================================ FILE: tests/cli/conftest.py ================================================ """Shared pytest fixtures for cli module.""" import pytest from click.testing import CliRunner VALID_TEST_CONFIG = """ base_model: HuggingFaceTB/SmolLM2-135M datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca sequence_len: 2048 max_steps: 1 micro_batch_size: 1 gradient_accumulation_steps: 1 learning_rate: 1e-3 special_tokens: pad_token: <|endoftext|> """ @pytest.fixture def cli_runner(): return CliRunner() @pytest.fixture def valid_test_config(): return VALID_TEST_CONFIG @pytest.fixture def config_path(tmp_path): """Creates a temporary config file""" 
class BaseCliTest:
    """Base class for CLI command tests.

    Provides shared helper checks (prefixed with ``_test_`` so pytest does not
    collect them directly) that command-specific test classes call with the
    name of the command under test.
    """

    def _test_cli_validation(self, cli_runner, command: str):
        """Test CLI validation for a command.

        Asserts that the command exits non-zero both when the config argument
        is missing and when it points at a file that does not exist.

        Args:
            cli_runner: CLI runner fixture
            command: Command to test (train/evaluate)
        """
        # Test missing config file
        result = cli_runner.invoke(cli, [command, "--launcher", "python"])
        assert result.exit_code != 0

        # Test non-existent config file
        result = cli_runner.invoke(
            cli, [command, "nonexistent.yml", "--launcher", "python"]
        )
        assert result.exit_code != 0
        assert "Error: Invalid value for 'CONFIG'" in result.output

    def _test_basic_execution(
        self,
        cli_runner,
        tmp_path: Path,
        valid_test_config: str,
        command: str,
        train: bool = True,
    ):
        """Test basic execution with accelerate.

        Writes the config to a temporary file, invokes the command with the
        launcher call mocked out, and checks the exact launcher command line
        that would have been executed.

        Args:
            cli_runner: CLI runner fixture
            tmp_path: Temporary path fixture
            valid_test_config: Valid config fixture
            command: Command to test (train/evaluate)
            train: Whether to test training (default) or evaluation
        """
        config_path = tmp_path / "config.yml"
        config_path.write_text(valid_test_config)

        # "train" is checked against os.execvpe, every other command against
        # subprocess.run, so the mocked entry point depends on the command.
        mock_fn = "os.execvpe" if command == "train" else "subprocess.run"

        with patch(mock_fn) as mock:
            result = cli_runner.invoke(cli, [command, str(config_path)])

            assert mock.called

            expected = [
                "accelerate",
                "launch",
                "-m",
                f"axolotl.cli.{command}",
                str(config_path),
                "--debug=False",
                "--debug-text-only=False",
                "--debug-num-examples=0",
            ]
            if train:
                expected.append("--shard=False")

            if command == "train":
                # os.execvpe(file, args, ...): executable first, argv second
                assert mock.call_args.args[0] == "accelerate"
                assert mock.call_args.args[1] == expected
            else:
                # subprocess.run(cmd, check=True)
                assert mock.call_args.args[0] == expected
                assert mock.call_args.kwargs == {"check": True}
            assert result.exit_code == 0

    def _test_cli_overrides(self, tmp_path: Path, valid_test_config: str):
        """Prepare a config file for CLI argument override tests.

        Rewrites any ``output_dir: model-out`` entry in the config text to
        point inside ``tmp_path`` and writes the result to a temporary config
        file. The override assertions themselves are made by the caller.

        Args:
            tmp_path: Temporary path fixture
            valid_test_config: Valid config fixture

        Returns:
            Path to the written config file.
        """
        config_path = tmp_path / "config.yml"
        output_dir = tmp_path / "model-out"

        # NOTE(review): str.replace leaves the text unchanged when the config
        # has no "output_dir: model-out" line -- confirm the fixture config is
        # expected to contain one.
        test_config = valid_test_config.replace(
            "output_dir: model-out", f"output_dir: {output_dir}"
        )
        config_path.write_text(test_config)
        return config_path
result.exit_code == 0 mock_evaluate.assert_called_once() cfg = mock_evaluate.call_args[0][0] assert cfg.micro_batch_size == 2 assert cfg.sequence_len == 128 def test_evaluate_with_launcher_args_torchrun( self, cli_runner, tmp_path, valid_test_config ): """Test evaluate with torchrun launcher arguments""" config_path = tmp_path / "config.yml" config_path.write_text(valid_test_config) with patch("subprocess.run") as mock_subprocess: result = cli_runner.invoke( cli, [ "evaluate", str(config_path), "--launcher", "torchrun", "--", "--nproc_per_node=2", "--nnodes=1", ], catch_exceptions=False, ) assert result.exit_code == 0 mock_subprocess.assert_called_once() # Verify launcher args are passed to torchrun called_cmd = mock_subprocess.call_args.args[0] assert called_cmd[0] == "torchrun" assert "--nproc_per_node=2" in called_cmd assert "--nnodes=1" in called_cmd assert "-m" in called_cmd assert "axolotl.cli.evaluate" in called_cmd def test_evaluate_with_launcher_args_accelerate( self, cli_runner, tmp_path, valid_test_config ): """Test evaluate with accelerate launcher arguments""" config_path = tmp_path / "config.yml" config_path.write_text(valid_test_config) with patch("subprocess.run") as mock_subprocess: result = cli_runner.invoke( cli, [ "evaluate", str(config_path), "--launcher", "accelerate", "--", "--config_file=accelerate_config.yml", "--num_processes=4", ], catch_exceptions=False, ) assert result.exit_code == 0 mock_subprocess.assert_called_once() # Verify launcher args are passed to accelerate called_cmd = mock_subprocess.call_args.args[0] assert called_cmd[0] == "accelerate" assert called_cmd[1] == "launch" assert "--config_file=accelerate_config.yml" in called_cmd assert "--num_processes=4" in called_cmd assert "-m" in called_cmd assert "axolotl.cli.evaluate" in called_cmd def test_evaluate_backward_compatibility_no_launcher_args( self, cli_runner, tmp_path, valid_test_config ): """Test that existing evaluate commands work without launcher args""" config_path = 
tmp_path / "config.yml" config_path.write_text(valid_test_config) with patch("subprocess.run") as mock_subprocess: result = cli_runner.invoke( cli, [ "evaluate", str(config_path), "--launcher", "accelerate", "--micro-batch-size", "2", ], catch_exceptions=False, ) assert result.exit_code == 0 mock_subprocess.assert_called_once() # Verify no launcher args contamination called_cmd = mock_subprocess.call_args.args[0] assert called_cmd[0] == "accelerate" assert called_cmd[1] == "launch" # Should not contain any extra launcher args launcher_section = called_cmd[2 : called_cmd.index("-m")] assert ( len(launcher_section) == 0 ) # No launcher args between 'launch' and '-m' ================================================ FILE: tests/cli/test_cli_fetch.py ================================================ """pytest tests for axolotl CLI fetch command.""" from unittest.mock import patch from axolotl.cli.main import fetch def test_fetch_cli_examples(cli_runner): """Test fetch command with examples directory""" with patch("axolotl.cli.main.fetch_from_github") as mock_fetch: result = cli_runner.invoke(fetch, ["examples"]) assert result.exit_code == 0 mock_fetch.assert_called_once_with("examples/", None) def test_fetch_cli_deepspeed(cli_runner): """Test fetch command with deepspeed_configs directory""" with patch("axolotl.cli.main.fetch_from_github") as mock_fetch: result = cli_runner.invoke(fetch, ["deepspeed_configs"]) assert result.exit_code == 0 mock_fetch.assert_called_once_with("deepspeed_configs/", None) def test_fetch_cli_with_dest(cli_runner, tmp_path): """Test fetch command with custom destination""" with patch("axolotl.cli.main.fetch_from_github") as mock_fetch: custom_dir = tmp_path / "tmp_examples" result = cli_runner.invoke(fetch, ["examples", "--dest", str(custom_dir)]) assert result.exit_code == 0 mock_fetch.assert_called_once_with("examples/", str(custom_dir)) def test_fetch_cli_invalid_directory(cli_runner): """Test fetch command with invalid directory choice""" 
result = cli_runner.invoke(fetch, ["invalid"]) assert result.exit_code != 0 ================================================ FILE: tests/cli/test_cli_inference.py ================================================ """pytest tests for axolotl CLI inference command.""" from unittest.mock import patch from axolotl.cli.main import cli def test_inference_basic(cli_runner, config_path): """Test basic inference""" with patch("axolotl.cli.inference.do_inference") as mock: result = cli_runner.invoke( cli, ["inference", str(config_path), "--launcher", "python"], catch_exceptions=False, ) assert mock.called assert result.exit_code == 0 def test_inference_gradio(cli_runner, config_path): """Test basic inference (gradio path)""" with patch("axolotl.cli.inference.do_inference_gradio") as mock: result = cli_runner.invoke( cli, ["inference", str(config_path), "--launcher", "python", "--gradio"], catch_exceptions=False, ) assert mock.called assert result.exit_code == 0 def test_inference_with_launcher_args_torchrun(cli_runner, config_path): """Test inference with torchrun launcher arguments""" with patch("subprocess.run") as mock_subprocess: result = cli_runner.invoke( cli, [ "inference", str(config_path), "--launcher", "torchrun", "--", "--nproc_per_node=2", "--nnodes=1", ], catch_exceptions=False, ) assert result.exit_code == 0 mock_subprocess.assert_called_once() # Verify launcher args are passed to torchrun called_cmd = mock_subprocess.call_args.args[0] assert called_cmd[0] == "torchrun" assert "--nproc_per_node=2" in called_cmd assert "--nnodes=1" in called_cmd assert "-m" in called_cmd assert "axolotl.cli.inference" in called_cmd def test_inference_with_launcher_args_accelerate(cli_runner, config_path): """Test inference with accelerate launcher arguments""" with patch("subprocess.run") as mock_subprocess: result = cli_runner.invoke( cli, [ "inference", str(config_path), "--launcher", "accelerate", "--", "--config_file=accelerate_config.yml", "--num_processes=4", ], 
catch_exceptions=False, ) assert result.exit_code == 0 mock_subprocess.assert_called_once() # Verify launcher args are passed to accelerate called_cmd = mock_subprocess.call_args.args[0] assert called_cmd[0] == "accelerate" assert called_cmd[1] == "launch" assert "--config_file=accelerate_config.yml" in called_cmd assert "--num_processes=4" in called_cmd assert "-m" in called_cmd assert "axolotl.cli.inference" in called_cmd def test_inference_gradio_with_launcher_args(cli_runner, config_path): """Test inference with gradio and launcher arguments""" with patch("subprocess.run") as mock_subprocess: result = cli_runner.invoke( cli, [ "inference", str(config_path), "--launcher", "accelerate", "--gradio", "--", "--num_processes=2", ], catch_exceptions=False, ) assert result.exit_code == 0 mock_subprocess.assert_called_once() # Verify both gradio flag and launcher args are present called_cmd = mock_subprocess.call_args.args[0] assert called_cmd[0] == "accelerate" assert called_cmd[1] == "launch" assert "--num_processes=2" in called_cmd assert "--gradio" in called_cmd assert "-m" in called_cmd assert "axolotl.cli.inference" in called_cmd def test_inference_backward_compatibility_no_launcher_args(cli_runner, config_path): """Test that existing inference commands work without launcher args""" with patch("subprocess.run") as mock_subprocess: result = cli_runner.invoke( cli, [ "inference", str(config_path), "--launcher", "accelerate", ], catch_exceptions=False, ) assert result.exit_code == 0 mock_subprocess.assert_called_once() # Verify no launcher args contamination called_cmd = mock_subprocess.call_args.args[0] assert called_cmd[0] == "accelerate" assert called_cmd[1] == "launch" # Should not contain any extra launcher args launcher_section = called_cmd[2 : called_cmd.index("-m")] assert len(launcher_section) == 0 # No launcher args between 'launch' and '-m' ================================================ FILE: tests/cli/test_cli_interface.py 
def test_build_command():
    """Options are rendered as ``--kebab-case=value`` flags; ``None`` values are omitted."""
    launcher_prefix = ["accelerate", "launch"]
    cli_options = dict(
        learning_rate=1e-4,
        batch_size=8,
        debug=True,
        use_fp16=False,
        null_value=None,
    )
    expected_command = [
        "accelerate",
        "launch",
        "--learning-rate=0.0001",
        "--batch-size=8",
        "--debug=True",
        "--use-fp16=False",
    ]

    built_command = build_command(launcher_prefix, cli_options)

    assert built_command == expected_command
str(lora_dir), "--output-dir", str(output_dir), ], ) assert result.exit_code == 0 mock_do_cli.assert_called_once() assert mock_do_cli.call_args.kwargs["config"] == str(config_path) assert mock_do_cli.call_args.kwargs["lora_model_dir"] == str(lora_dir) assert mock_do_cli.call_args.kwargs["output_dir"] == str(output_dir) def test_merge_lora_nonexistent_config(cli_runner, tmp_path): """Test merge_lora with nonexistent config""" config_path = tmp_path / "nonexistent.yml" result = cli_runner.invoke(cli, ["merge-lora", str(config_path)]) assert result.exit_code != 0 def test_merge_lora_nonexistent_lora_dir(cli_runner, config_path, tmp_path): """Test merge_lora with nonexistent lora directory""" lora_dir = tmp_path / "nonexistent" result = cli_runner.invoke( cli, ["merge-lora", str(config_path), "--lora-model-dir", str(lora_dir)] ) assert result.exit_code != 0 ================================================ FILE: tests/cli/test_cli_merge_sharded_fsdp_weights.py ================================================ """pytest tests for axolotl CLI merge_sharded_fsdp_weights command.""" from unittest.mock import patch from axolotl.cli.main import cli def test_merge_sharded_fsdp_weights_no_accelerate(cli_runner, config_path): """Test merge_sharded_fsdp_weights command without accelerate""" with patch("axolotl.cli.merge_sharded_fsdp_weights.do_cli") as mock: result = cli_runner.invoke( cli, ["merge-sharded-fsdp-weights", str(config_path), "--launcher", "python"], ) assert mock.called assert mock.call_args.kwargs["config"] == str(config_path) assert result.exit_code == 0 def test_merge_sharded_fsdp_weights_with_launcher_args_torchrun( cli_runner, config_path ): """Test merge-sharded-fsdp-weights with torchrun launcher arguments""" with patch("subprocess.run") as mock_subprocess: result = cli_runner.invoke( cli, [ "merge-sharded-fsdp-weights", str(config_path), "--launcher", "torchrun", "--", "--nproc_per_node=2", "--nnodes=1", ], catch_exceptions=False, ) assert result.exit_code == 
0 mock_subprocess.assert_called_once() # Verify launcher args are passed to torchrun called_cmd = mock_subprocess.call_args.args[0] assert called_cmd[0] == "torchrun" assert "--nproc_per_node=2" in called_cmd assert "--nnodes=1" in called_cmd assert "-m" in called_cmd assert "axolotl.cli.merge_sharded_fsdp_weights" in called_cmd def test_merge_sharded_fsdp_weights_with_launcher_args_accelerate( cli_runner, config_path ): """Test merge-sharded-fsdp-weights with accelerate launcher arguments""" with patch("subprocess.run") as mock_subprocess: result = cli_runner.invoke( cli, [ "merge-sharded-fsdp-weights", str(config_path), "--launcher", "accelerate", "--", "--config_file=accelerate_config.yml", "--num_processes=4", ], catch_exceptions=False, ) assert result.exit_code == 0 mock_subprocess.assert_called_once() # Verify launcher args are passed to accelerate called_cmd = mock_subprocess.call_args.args[0] assert called_cmd[0] == "accelerate" assert called_cmd[1] == "launch" assert "--config_file=accelerate_config.yml" in called_cmd assert "--num_processes=4" in called_cmd assert "-m" in called_cmd assert "axolotl.cli.merge_sharded_fsdp_weights" in called_cmd def test_merge_sharded_fsdp_weights_backward_compatibility_no_launcher_args( cli_runner, config_path ): """Test that existing merge-sharded-fsdp-weights commands work without launcher args""" with patch("subprocess.run") as mock_subprocess: result = cli_runner.invoke( cli, [ "merge-sharded-fsdp-weights", str(config_path), "--launcher", "accelerate", ], catch_exceptions=False, ) assert result.exit_code == 0 mock_subprocess.assert_called_once() # Verify no launcher args contamination called_cmd = mock_subprocess.call_args.args[0] assert called_cmd[0] == "accelerate" assert called_cmd[1] == "launch" # Should not contain any extra launcher args launcher_section = called_cmd[2 : called_cmd.index("-m")] assert len(launcher_section) == 0 # No launcher args between 'launch' and '-m' 
================================================ FILE: tests/cli/test_cli_preprocess.py ================================================ """pytest tests for axolotl CLI preprocess command.""" import shutil from pathlib import Path from unittest.mock import MagicMock, patch import pytest from axolotl.cli.main import cli @pytest.fixture(autouse=True) def cleanup_last_run_prepared(): yield if Path("last_run_prepared").exists(): shutil.rmtree("last_run_prepared") def test_preprocess_config_not_found(cli_runner): """Test preprocess fails when config not found""" result = cli_runner.invoke(cli, ["preprocess", "nonexistent.yml"]) assert result.exit_code != 0 def test_preprocess_basic(cli_runner, config_path): """Test basic preprocessing with minimal config""" with patch("axolotl.cli.preprocess.do_cli") as mock_do_cli: with patch("axolotl.cli.preprocess.load_datasets") as mock_load_datasets: mock_load_datasets.return_value = MagicMock() result = cli_runner.invoke(cli, ["preprocess", str(config_path)]) assert result.exit_code == 0 mock_do_cli.assert_called_once() assert mock_do_cli.call_args.kwargs["config"] == str(config_path) assert mock_do_cli.call_args.kwargs["download"] is True def test_preprocess_without_download(cli_runner, config_path): """Test preprocessing without model download""" with patch("axolotl.cli.preprocess.do_cli") as mock_do_cli: result = cli_runner.invoke( cli, ["preprocess", str(config_path), "--no-download"] ) assert result.exit_code == 0 mock_do_cli.assert_called_once() assert mock_do_cli.call_args.kwargs["config"] == str(config_path) assert mock_do_cli.call_args.kwargs["download"] is False def test_preprocess_custom_path(cli_runner, tmp_path, valid_test_config): """Test preprocessing with custom dataset path""" config_path = tmp_path / "config.yml" custom_path = tmp_path / "custom_prepared" config_path.write_text(valid_test_config) with patch("axolotl.cli.preprocess.do_cli") as mock_do_cli: with patch("axolotl.cli.preprocess.load_datasets") as 
def test_generate_sweep_configs_no_pairs():
    """Unpaired sweep keys expand to the cartesian product of their values."""
    sweeps_config = {"micro_batch_size": [1, 2, 4], "weight_decay": [0.0, 0.1]}
    base_config = {
        "learning_rate": 0.1,
        "micro_batch_size": 1,
        "sample_packing": True,
    }

    generate_sweep_configs(base_config, sweeps_config)

    # 3 micro batch sizes x 2 weight decays
    generated = generate_sweep_configs(base_config, sweeps_config)
    assert len(generated) == 6

    expected_member = {
        "learning_rate": 0.1,
        "micro_batch_size": 2,
        "weight_decay": 0.0,
        "sample_packing": True,
    }
    matching = [
        candidate
        for candidate in generate_sweep_configs(base_config, sweeps_config)
        if expected_member == candidate
    ]
    assert matching
================================================
"""Tests for train CLI command."""

from unittest.mock import MagicMock, patch

from axolotl.cli.main import cli

from .test_cli_base import BaseCliTest


class TestTrainCommand(BaseCliTest):
    """Test cases for train command."""

    cli = cli

    def test_train_cli_validation(self, cli_runner):
        """Test CLI validation"""
        self._test_cli_validation(cli_runner, "train")

    def test_train_basic_execution(self, cli_runner, tmp_path, valid_test_config):
        """Test basic successful execution"""
        self._test_basic_execution(
            cli_runner, tmp_path, valid_test_config, "train", train=True
        )

    def test_train_basic_execution_no_accelerate(
        self, cli_runner, tmp_path, valid_test_config
    ):
        """Test basic successful execution without accelerate"""
        config_path = tmp_path / "config.yml"
        config_path.write_text(valid_test_config)

        # Both the training entry point and dataset loading are replaced by
        # mocks, so only the CLI wiring is exercised.
        with patch("axolotl.cli.train.train") as mock_train:
            mock_train.return_value = (MagicMock(), MagicMock(), MagicMock())
            with patch("axolotl.cli.train.load_datasets") as mock_load_datasets:
                mock_load_datasets.return_value = MagicMock()

                result = cli_runner.invoke(
                    cli,
                    [
                        "train",
                        str(config_path),
                        "--launcher",
                        "python",
                    ],
                    catch_exceptions=False,
                )

        assert result.exit_code == 0
        mock_train.assert_called_once()

    def test_train_cli_overrides(self, cli_runner, tmp_path, valid_test_config):
        """Test CLI arguments properly override config values"""
        config_path = self._test_cli_overrides(tmp_path, valid_test_config)

        with patch("axolotl.cli.train.train") as mock_train:
            mock_train.return_value = (MagicMock(), MagicMock(), MagicMock())
            with patch("axolotl.cli.train.load_datasets") as mock_load_datasets:
                mock_load_datasets.return_value = MagicMock()

                result = cli_runner.invoke(
                    cli,
                    [
                        "train",
                        str(config_path),
                        "--learning-rate=1e-4",
                        "--micro-batch-size=2",
                        "--launcher",
                        "python",
                    ],
                    catch_exceptions=False,
                )

        assert result.exit_code == 0
        mock_train.assert_called_once()
        # The config is read from the keyword arguments of the mocked call.
        cfg = mock_train.call_args[1]["cfg"]
        assert cfg["learning_rate"] == 1e-4
        assert cfg["micro_batch_size"] == 2

    def test_train_with_launcher_args_torchrun(
        self, cli_runner, tmp_path, valid_test_config
    ):
        """Test train with torchrun launcher arguments"""
        config_path = tmp_path / "config.yml"
        config_path.write_text(valid_test_config)

        # os.execvpe is mocked; the assertions below inspect its positional
        # arguments (args[0]: program name, args[1]: argument list).
        with patch("os.execvpe") as mock_subprocess:
            result = cli_runner.invoke(
                cli,
                [
                    "train",
                    str(config_path),
                    "--launcher",
                    "torchrun",
                    "--",
                    "--nproc_per_node=2",
                    "--nnodes=1",
                ],
                catch_exceptions=False,
            )

            assert result.exit_code == 0
            mock_subprocess.assert_called_once()

            # Verify launcher args are passed to torchrun
            called_cmd = mock_subprocess.call_args.args[1]
            assert called_cmd[0] == "torchrun"
            assert "--nproc_per_node=2" in called_cmd
            assert "--nnodes=1" in called_cmd
            assert "-m" in called_cmd
            assert "axolotl.cli.train" in called_cmd

    def test_train_with_launcher_args_accelerate(
        self, cli_runner, tmp_path, valid_test_config
    ):
        """Test train with accelerate launcher arguments"""
        config_path = tmp_path / "config.yml"
        config_path.write_text(valid_test_config)

        with patch("os.execvpe") as mock_subprocess:
            result = cli_runner.invoke(
                cli,
                [
                    "train",
                    str(config_path),
                    "--launcher",
                    "accelerate",
                    "--",
                    "--config_file=accelerate_config.yml",
                    "--num_processes=4",
                ],
                catch_exceptions=False,
            )

            assert result.exit_code == 0
            mock_subprocess.assert_called_once()

            # Verify launcher args are passed to accelerate
            assert mock_subprocess.call_args.args[0] == "accelerate"
            called_cmd = mock_subprocess.call_args.args[1]
            assert called_cmd[0] == "accelerate"
            assert called_cmd[1] == "launch"
            assert "--config_file=accelerate_config.yml" in called_cmd
            assert "--num_processes=4" in called_cmd
            assert "-m" in called_cmd
            assert "axolotl.cli.train" in called_cmd

    def test_train_backward_compatibility_no_launcher_args(
        self, cli_runner, tmp_path, valid_test_config
    ):
        """Test that existing train commands work without launcher args"""
        config_path = tmp_path / "config.yml"
        config_path.write_text(valid_test_config)

        with patch("os.execvpe") as mock_subprocess:
            result = cli_runner.invoke(
                cli,
                [
                    "train",
                    str(config_path),
                    "--launcher",
                    "accelerate",
                    "--learning-rate",
                    "1e-4",
                ],
                catch_exceptions=False,
            )

            assert result.exit_code == 0
            mock_subprocess.assert_called_once()

            # Verify no launcher args contamination
            assert mock_subprocess.call_args.args[0] == "accelerate"
            called_cmd = mock_subprocess.call_args.args[1]
            assert called_cmd[0] == "accelerate"
            assert called_cmd[1] == "launch"
            # Should not contain any extra launcher args
            launcher_section = called_cmd[2 : called_cmd.index("-m")]
            assert (
                len(launcher_section) == 0
            )  # No launcher args between 'launch' and '-m'

    def test_train_mixed_args_with_launcher_args(
        self, cli_runner, tmp_path, valid_test_config
    ):
        """Test train with both regular CLI args and launcher args"""
        config_path = tmp_path / "config.yml"
        config_path.write_text(valid_test_config)

        with patch("os.execvpe") as mock_subprocess:
            result = cli_runner.invoke(
                cli,
                [
                    "train",
                    str(config_path),
                    "--launcher",
                    "torchrun",
                    "--learning-rate",
                    "2e-4",
                    "--micro-batch-size",
                    "4",
                    "--",
                    "--nproc_per_node=8",
                ],
                catch_exceptions=False,
            )

            assert result.exit_code == 0
            mock_subprocess.assert_called_once()

            assert mock_subprocess.call_args.args[0] == "torchrun"
            called_cmd = mock_subprocess.call_args.args[1]
            # Verify launcher args
            assert "--nproc_per_node=8" in called_cmd
            # Verify axolotl args are also present; they were passed as
            # "--flag value" and are expected in "--flag=value" form here.
            assert "--learning-rate=2e-4" in called_cmd
            assert "--micro-batch-size=4" in called_cmd

    def test_train_cloud_with_launcher_args(
        self, cli_runner, tmp_path, valid_test_config
    ):
        """Test train with cloud and launcher arguments"""
        config_path = tmp_path / "config.yml"
        config_path.write_text(valid_test_config)

        cloud_path = tmp_path / "cloud.yml"
        cloud_path.write_text("provider: modal\ngpu: a100")

        with patch("axolotl.cli.cloud.do_cli_train") as mock_cloud_train:
            result = cli_runner.invoke(
                cli,
                [
                    "train",
                    str(config_path),
                    "--cloud",
                    str(cloud_path),
                    "--launcher",
                    "torchrun",
                    "--",
                    "--nproc_per_node=4",
                    "--nnodes=2",
                ],
                catch_exceptions=False,
            )

            assert result.exit_code == 0
            mock_cloud_train.assert_called_once()

            # Verify cloud training was called with launcher args
            call_kwargs = mock_cloud_train.call_args.kwargs
            assert call_kwargs["launcher"] == "torchrun"
            assert call_kwargs["launcher_args"] == ["--nproc_per_node=4", "--nnodes=2"]


================================================
FILE: tests/cli/test_cli_version.py
================================================
"""pytest tests for axolotl CLI --version"""

from axolotl.cli.main import cli


def test_print_version(cli_runner):
    """Test that version is printed when --version is used."""
    result = cli_runner.invoke(cli, ["--version"])
    assert result.exit_code == 0
    assert "axolotl, version " in result.output


================================================
FILE: tests/cli/test_nested_options.py
================================================
"""Tests for nested config option handling via CLI dot-notation."""

import click
from click.testing import CliRunner
from pydantic import BaseModel, Field

from axolotl.cli.utils.args import add_options_from_config, filter_none_kwargs


class InnerConfig(BaseModel):
    """A nested config model for testing."""

    beta: float | None = Field(
        default=None,
        description="Beta parameter.",
    )
    host: str | None = Field(
        default=None,
        description="Server host.",
    )
    use_feature: bool = Field(
        default=False,
        description="Whether to use the feature.",
    )


class OuterConfig(BaseModel):
    """A top-level config model for testing."""

    learning_rate: float | None = Field(
        default=None,
        description="Learning rate.",
    )
    inner: InnerConfig | None = Field(
        default=None,
        description="Inner config.",
    )
    name: str | None = Field(
        default=None,
        description="Model name.",
    )


class TestAddOptionsFromConfigNested:
    """Test that add_options_from_config handles nested BaseModel fields."""

    def setup_method(self):
        # A fresh click test runner for every test method.
        self.runner = CliRunner()

    def test_nested_dot_notation_options_are_registered(self):
        """Nested model fields should create --parent.child CLI options."""

        @click.command()
        @add_options_from_config(OuterConfig)
        @filter_none_kwargs
        def cmd(**kwargs):
            for k, v in sorted(kwargs.items()):
                click.echo(f"{k}={v}")

        result = self.runner.invoke(cmd, ["--inner.beta=0.5", "--inner.host=localhost"])
        assert result.exit_code == 0, result.output
        # Nested options are expected as "parent__child" keyword arguments.
        assert "inner__beta=0.5" in result.output
        assert "inner__host=localhost" in result.output

    def test_nested_bool_option(self):
        """Nested bool fields should support --parent.field/--no-parent.field."""

        @click.command()
        @add_options_from_config(OuterConfig)
        @filter_none_kwargs
        def cmd(**kwargs):
            for k, v in sorted(kwargs.items()):
                click.echo(f"{k}={v}")

        # Only the positive form of the flag is exercised here.
        result = self.runner.invoke(cmd, ["--inner.use-feature"])
        assert result.exit_code == 0, result.output
        assert "inner__use_feature=True" in result.output

    def test_flat_and_nested_options_together(self):
        """Flat and nested options should work together."""

        @click.command()
        @add_options_from_config(OuterConfig)
        @filter_none_kwargs
        def cmd(**kwargs):
            for k, v in sorted(kwargs.items()):
                click.echo(f"{k}={v}")

        result = self.runner.invoke(
            cmd, ["--learning-rate=0.001", "--inner.beta=0.1", "--name=test"]
        )
        assert result.exit_code == 0, result.output
        assert "learning_rate=0.001" in result.output
        assert "inner__beta=0.1" in result.output
        assert "name=test" in result.output

    def test_no_nested_options_passed(self):
        """When no nested options are passed, they should not appear in kwargs."""

        @click.command()
        @add_options_from_config(OuterConfig)
        @filter_none_kwargs
        def cmd(**kwargs):
            click.echo(f"keys={sorted(kwargs.keys())}")

        result = self.runner.invoke(cmd, ["--learning-rate=0.01"])
        assert result.exit_code == 0, result.output
        assert "inner__" not in result.output


class TestLoadCfgNestedKwargs:
    """Test that load_cfg correctly applies nested (double-underscore) kwargs."""

    @staticmethod
    def _apply_nested_kwargs(cfg, kwargs):
        """Helper that mirrors the nested kwargs handling from load_cfg,
        including type coercion for string CLI values.

        NOTE(review): this is a local re-implementation; load_cfg itself is
        not called here — confirm the two stay in sync.
        """
        from axolotl.cli.config import _coerce_value

        nested_kwargs: dict = {}
        flat_kwargs: dict = {}
        for key, value in kwargs.items():
            if "__" in key:
                # Split on the first "__" only: "a__b__c" -> ("a", "b__c").
                parent, child = key.split("__", 1)
                nested_kwargs.setdefault(parent, {})[child] = value
            else:
                flat_kwargs[key] = value

        cfg_keys = cfg.keys()
        for key, value in flat_kwargs.items():
            # Flat keys that are not already in cfg are ignored.
            if key in cfg_keys:
                cfg[key] = _coerce_value(value, cfg.get(key))

        for parent, children in nested_kwargs.items():
            if cfg[parent] is None:
                cfg[parent] = {}
            # Any non-dict parent (e.g. a string) is replaced by an empty dict.
            if not isinstance(cfg[parent], dict):
                cfg[parent] = {}
            for child_key, child_value in children.items():
                existing = cfg[parent].get(child_key)
                cfg[parent][child_key] = _coerce_value(child_value, existing)

        return cfg

    def test_nested_kwargs_applied_to_cfg(self, tmp_path):
        """Double-underscore kwargs should set nested config values."""
        from axolotl.utils.dict import DictDefault

        cfg = DictDefault({"trl": {"beta": 0.1}, "learning_rate": 0.01})
        # CLI passes strings, so simulate that
        kwargs = {
            "trl__beta": "0.5",
            "trl__host": "192.168.1.1",
            "learning_rate": "0.02",
        }

        cfg = self._apply_nested_kwargs(cfg, kwargs)

        assert cfg["learning_rate"] == 0.02
        assert isinstance(cfg["learning_rate"], float)
        assert cfg["trl"]["beta"] == 0.5
        assert isinstance(cfg["trl"]["beta"], float)
        assert cfg["trl"]["host"] == "192.168.1.1"

    def test_nested_kwargs_creates_parent_if_none(self):
        """If the parent key is None, nested kwargs should create the dict."""
        from axolotl.utils.dict import DictDefault

        cfg = DictDefault({"trl": None, "learning_rate": 0.01})
        cfg = self._apply_nested_kwargs(cfg, {"trl__beta": "0.5"})

        # No existing value, YAML-style inference: "0.5" -> 0.5
        assert cfg["trl"]["beta"] == 0.5
        assert isinstance(cfg["trl"]["beta"], float)

    def test_nested_kwargs_overwrites_string_parent(self):
        """If the parent key is a string, it should be replaced with a dict."""
        from axolotl.utils.dict import DictDefault

        cfg = DictDefault({"trl": "some_string", "learning_rate": 0.01})
        cfg = self._apply_nested_kwargs(cfg, {"trl__beta": "0.5"})

        assert cfg["trl"]["beta"] == 0.5


class TestCoerceValue:
    """Test YAML-style type coercion for CLI string values."""

    def test_coerce_with_existing_float(self):
        from axolotl.cli.config import _coerce_value

        assert _coerce_value("0.5", 0.1) == 0.5
        assert isinstance(_coerce_value("0.5", 0.1), float)

    def test_coerce_with_existing_int(self):
        from axolotl.cli.config import _coerce_value

        assert _coerce_value("42", 10) == 42
        assert isinstance(_coerce_value("42", 10), int)

    def test_coerce_with_existing_bool(self):
        from axolotl.cli.config import _coerce_value

        assert _coerce_value("true", False) is True
        assert _coerce_value("false", True) is False
        assert _coerce_value("1", False) is True
        assert _coerce_value("0", True) is False

    def test_coerce_yaml_inference_no_existing(self):
        """Without an existing value, use YAML-style inference."""
        from axolotl.cli.config import _coerce_value

        assert _coerce_value("true", None) is True
        assert _coerce_value("false", None) is False
        assert _coerce_value("42", None) == 42
        assert isinstance(_coerce_value("42", None), int)
        assert _coerce_value("3.14", None) == 3.14
        assert isinstance(_coerce_value("3.14", None), float)
        assert _coerce_value("null", None) is None
        assert _coerce_value("hello", None) == "hello"

    def test_coerce_non_string_passthrough(self):
        """Non-string values should pass through unchanged."""
        from axolotl.cli.config import _coerce_value

        assert _coerce_value(0.5, 0.1) == 0.5
        assert _coerce_value(True, False) is True


================================================
FILE: tests/cli/test_utils.py
================================================
"""pytest tests for axolotl CLI utils."""

import json
from unittest.mock import Mock, patch

import click
import pytest
import requests

from axolotl.cli.utils import fetch_from_github

# Sample GitHub API response
MOCK_TREE_RESPONSE = {
    "tree": [
        {"path": "examples/config1.yml", "type": "blob", "sha": "abc123"},
        {"path": "examples/config2.yml", "type": "blob", "sha": "def456"},
        {"path": "other/file.txt", "type": "blob", "sha": "xyz789"},
    ]
}


@pytest.fixture
def mock_responses():
    """Mock responses for API and file downloads"""

    def mock_get(url, timeout=None):
        response = Mock()
        # URLs on api.github.com get the tree listing as JSON text; every
        # other URL is treated as a file download.
        if "api.github.com" in url:
            response.text = json.dumps(MOCK_TREE_RESPONSE)
        else:
            response.content = b"file content"
        return response

    return mock_get


def test_fetch_from_github_new_files(tmp_path, mock_responses):
    """Test fetching new files"""
    with patch("requests.get", mock_responses):
        fetch_from_github("examples/", tmp_path)

        # Verify files were created
        assert (tmp_path / "config1.yml").exists()
        assert (tmp_path / "config2.yml").exists()
        assert not (tmp_path / "file.txt").exists()


def test_fetch_from_github_unchanged_files(tmp_path, mock_responses):
    """Test handling of unchanged files"""
    # Create existing file with matching SHA
    existing_file = tmp_path / "config1.yml"
    existing_file.write_bytes(b"file content")

    with patch("requests.get", mock_responses):
        fetch_from_github("examples/", tmp_path)

        # File should not be downloaded again
        # NOTE(review): the mocked download returns the same bytes that were
        # written above, so this assertion also holds if the file IS fetched
        # again — it does not by itself prove the download was skipped.
        assert existing_file.read_bytes() == b"file content"


def test_fetch_from_github_invalid_prefix(mock_responses):
    """Test error handling for invalid directory prefix"""
    with patch("requests.get", mock_responses):
        with pytest.raises(click.ClickException):
            fetch_from_github("nonexistent/", None)


def test_fetch_from_github_network_error():
    """Test handling of network errors"""
    with patch("requests.get", side_effect=requests.RequestException):
        with pytest.raises(requests.RequestException):
            fetch_from_github("examples/", None)


def assert_launcher_args_in_command(
    mock_subprocess_call,
    launcher: str,
    expected_launcher_args: list[str],
    command_module: str,
):
    """
    Helper function to verify launcher arguments are properly passed in subprocess calls.

    Args:
        mock_subprocess_call: The mock subprocess.run call
        launcher: Expected launcher ("accelerate", "torchrun", etc.)
        expected_launcher_args: List of expected launcher arguments
        command_module: Expected module name (e.g., "axolotl.cli.train")
    """
    assert mock_subprocess_call.called, "subprocess.run should have been called"

    # The command list is taken from the FIRST positional argument.
    # NOTE(review): the train CLI tests patch os.execvpe and read the command
    # list from args[1] — confirm which kind of mock this helper is meant for.
    called_cmd = mock_subprocess_call.call_args.args[0]

    # Verify launcher
    assert called_cmd[0] == launcher, (
        f"Expected launcher {launcher}, got {called_cmd[0]}"
    )

    # Verify launcher args are present
    for arg in expected_launcher_args:
        assert arg in called_cmd, (
            f"Expected launcher arg '{arg}' not found in command: {called_cmd}"
        )

    # Verify module is present
    assert "-m" in called_cmd, "Expected -m flag for module execution"
    assert command_module in called_cmd, (
        f"Expected module {command_module} not found in command: {called_cmd}"
    )


def assert_no_launcher_args_contamination(mock_subprocess_call, launcher: str):
    """
    Helper function to verify no unwanted launcher arguments are present.

    Launchers other than "accelerate" and "torchrun" are only checked for
    the mock having been called.

    Args:
        mock_subprocess_call: The mock subprocess.run call
        launcher: Expected launcher ("accelerate", "torchrun", etc.)
    """
    assert mock_subprocess_call.called, "subprocess.run should have been called"

    # Command list is read from the first positional argument of the call.
    called_cmd = mock_subprocess_call.call_args.args[0]

    if launcher == "accelerate":
        # For accelerate, launcher args should be between 'launch' and '-m'
        launch_idx = called_cmd.index("launch")
        m_idx = called_cmd.index("-m")
        launcher_section = called_cmd[launch_idx + 1 : m_idx]
        assert len(launcher_section) == 0, (
            f"Unexpected launcher args found: {launcher_section}"
        )
    elif launcher == "torchrun":
        # For torchrun, launcher args should be between 'torchrun' and '-m'
        torchrun_idx = called_cmd.index("torchrun")
        m_idx = called_cmd.index("-m")
        launcher_section = called_cmd[torchrun_idx + 1 : m_idx]
        assert len(launcher_section) == 0, (
            f"Unexpected launcher args found: {launcher_section}"
        )


@pytest.fixture
def common_launcher_args():
    """Fixture providing common launcher argument combinations for testing."""
    return {
        "torchrun": ["--nproc_per_node=2", "--nnodes=1"],
        "accelerate": ["--config_file=accelerate_config.yml", "--num_processes=4"],
    }


def test_add_default_rdzv_args_with_endpoint():
    """Test that default RDZV args are added when rdzv_endpoint is present."""
    from axolotl.cli.utils.train import _add_default_rdzv_args

    launcher_args = ["--nnodes=2", "--rdzv_endpoint=127.0.0.1:29400"]
    result = _add_default_rdzv_args(launcher_args)

    # Should have added rdzv_backend
    # (flag and value are checked as two separate list elements)
    assert "--rdzv_backend" in result
    assert "c10d" in result

    # Original args should still be present
    assert "--nnodes=2" in result
    assert "--rdzv_endpoint=127.0.0.1:29400" in result


def test_add_default_rdzv_args_with_existing_backend():
    """Test that existing rdzv_backend is not overridden."""
    from axolotl.cli.utils.train import _add_default_rdzv_args

    launcher_args = [
        "--nnodes=2",
        "--rdzv_endpoint=127.0.0.1:29400",
        "--rdzv_backend=static",
    ]
    result = _add_default_rdzv_args(launcher_args)

    # Should not add another rdzv_backend
    # (substring match, so both "--rdzv_backend" and "--rdzv_backend=..." count)
    backend_count = sum(1 for arg in result if "--rdzv_backend" in arg)
    assert backend_count == 1
    assert "--rdzv_backend=static" in result


def test_add_default_rdzv_args_with_existing_id():
    """Test that existing rdzv_id is not overridden."""
    from axolotl.cli.utils.train import _add_default_rdzv_args

    launcher_args = [
        "--nnodes=2",
        "--rdzv_endpoint=127.0.0.1:29400",
        "--rdzv_id=my_job_123",
    ]
    result = _add_default_rdzv_args(launcher_args)

    # Should not add another rdzv_id
    id_count = sum(1 for arg in result if "--rdzv_id" in arg)
    assert id_count == 1
    assert "--rdzv_id=my_job_123" in result

    # Should still add rdzv_backend
    assert "--rdzv_backend" in result
    assert "c10d" in result


def test_add_default_rdzv_args_without_endpoint():
    """Test that no RDZV args are added when rdzv_endpoint is not present."""
    from axolotl.cli.utils.train import _add_default_rdzv_args

    launcher_args = ["--nnodes=2", "--nproc_per_node=4"]
    result = _add_default_rdzv_args(launcher_args)

    # Should not add any rdzv args
    assert "--rdzv_backend" not in result
    assert result == launcher_args


def test_add_default_rdzv_args_with_all_existing():
    """Test that no defaults are added when all RDZV args are present."""
    from axolotl.cli.utils.train import _add_default_rdzv_args

    launcher_args = [
        "--nnodes=2",
        "--rdzv_endpoint=127.0.0.1:29400",
        "--rdzv_backend=static",
        "--rdzv_id=existing_job",
    ]
    result = _add_default_rdzv_args(launcher_args)

    # Should not add any additional args
    assert len(result) == len(launcher_args)
    assert result == launcher_args


================================================
FILE: tests/conftest.py
================================================
"""Shared pytest fixtures"""

import functools
import importlib
import logging
import os
import shutil
import sys
import tempfile
import time
from pathlib import Path
from typing import Generator

import datasets
import pytest
import requests
import torch
from huggingface_hub import snapshot_download
from huggingface_hub.errors import LocalEntryNotFoundError
from tokenizers import AddedToken
from transformers import AutoTokenizer

from axolotl.utils.dict
import DictDefault
from tests.hf_offline_utils import (
    enable_hf_offline,
    hf_offline_context,
)

logging.getLogger("filelock").setLevel(logging.CRITICAL)


def retry_on_request_exceptions(max_retries=3, delay=1):
    """Build a decorator that retries a callable on transient request errors.

    Args:
        max_retries: Total number of attempts, the first call included.
        delay: Base wait in seconds; the wait after attempt ``i`` (0-based)
            is ``2**i * delay``.

    Returns:
        A decorator. The wrapped callable returns whatever the original
        returns; once the last attempt fails, that attempt's exception
        propagates unchanged.
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except (
                    requests.exceptions.ReadTimeout,
                    requests.exceptions.ConnectionError,
                    requests.exceptions.HTTPError,
                ):
                    if attempt >= max_retries - 1:
                        # Out of attempts: a bare ``raise`` re-raises the
                        # active exception with its traceback untouched.
                        raise
                    wait = 2**attempt * delay  # in seconds
                    time.sleep(wait)

        return wrapper

    return decorator


@retry_on_request_exceptions(max_retries=3, delay=5)
def snapshot_download_w_retry(*args, **kwargs):
    """
    Download a model or dataset from HF Hub, retrying on request failures.

    The local cache is tried first, in HF offline mode, to avoid hitting
    HF Hub API rate limits. If the entry is not in the cache, offline mode
    is disabled and the snapshot is fetched from the hub.

    Args:
        *args: Positional arguments forwarded to ``snapshot_download``.
        **kwargs: Keyword arguments forwarded to ``snapshot_download``; must
            not contain ``local_files_only``, which is set here.

    Returns:
        Whatever ``snapshot_download`` returns.
    """
    with hf_offline_context(True):
        try:
            return snapshot_download(*args, local_files_only=True, **kwargs)
        except LocalEntryNotFoundError:
            # Not cached locally: fall through to the online download below.
            pass
    with hf_offline_context(False):
        return snapshot_download(*args, **kwargs)


@pytest.fixture(scope="session", autouse=True)
def download_ds_fixture_bundle():
    """Fetch the shared dataset-fixture bundle and return its directory as a Path."""
    ds_dir = snapshot_download_w_retry(
        "axolotl-ai-internal/axolotl-oss-dataset-fixtures", repo_type="dataset"
    )
    return Path(ds_dir)


@pytest.fixture(scope="session", autouse=True)
def download_smollm2_135m_model():
    """Fetch the SmolLM2-135M model snapshot."""
    snapshot_download_w_retry("HuggingFaceTB/SmolLM2-135M", repo_type="model")


@pytest.fixture(scope="session", autouse=True)
def download_smollm2_135m_instruct_model():
    """Fetch the SmolLM2-135M-Instruct model snapshot."""
    snapshot_download_w_retry("HuggingFaceTB/SmolLM2-135M-Instruct", repo_type="model")


@pytest.fixture(scope="session", autouse=True)
def download_smollm2_135m_gptq_model():
    """Fetch the GPTQ build of SmolLM2-135M-Instruct."""
    snapshot_download_w_retry("lilmeaty/SmolLM2-135M-Instruct-GPTQ", repo_type="model")
@pytest.fixture(scope="session", autouse=True) def download_qwen_2_5_half_billion_model(): # download the model snapshot_download_w_retry("Qwen/Qwen2.5-0.5B", repo_type="model") @pytest.fixture(scope="session", autouse=True) def download_qwen3_half_billion_model(): # download the model snapshot_download_w_retry("Qwen/Qwen3-0.6B", repo_type="model") @pytest.fixture(scope="session", autouse=True) def download_tatsu_lab_alpaca_dataset(): # download the dataset snapshot_download_w_retry("tatsu-lab/alpaca", repo_type="dataset") @pytest.fixture(scope="session", autouse=True) def download_mhenrichsen_alpaca_2k_dataset(): # download the dataset snapshot_download_w_retry("mhenrichsen/alpaca_2k_test", repo_type="dataset") @pytest.fixture(scope="session", autouse=True) def download_mhenrichsen_alpaca_2k_w_revision_dataset(): # download the dataset snapshot_download_w_retry( "mhenrichsen/alpaca_2k_test", repo_type="dataset", revision="d05c1cb" ) @pytest.fixture(scope="session", autouse=True) def download_mlabonne_finetome_100k_dataset(): # download the dataset snapshot_download_w_retry("mlabonne/FineTome-100k", repo_type="dataset") @pytest.fixture(scope="session", autouse=True) def download_argilla_distilabel_capybara_dpo_7k_binarized_dataset(): # download the dataset snapshot_download_w_retry( "argilla/distilabel-capybara-dpo-7k-binarized", repo_type="dataset" ) @pytest.fixture(scope="session", autouse=True) def download_argilla_distilabel_intel_orca_dpo_dataset(): # download the dataset snapshot_download_w_retry( "argilla/distilabel-intel-orca-dpo-pairs", repo_type="dataset" ) @pytest.fixture(scope="session", autouse=True) def download_argilla_ultrafeedback_binarized_preferences_cleaned_dataset(): # download the dataset snapshot_download_w_retry( "argilla/ultrafeedback-binarized-preferences-cleaned", repo_type="dataset" ) @pytest.fixture(scope="session", autouse=True) def download_argilla_ultrafeedback_binarized_preferences_cleaned_kto_dataset(): # download the dataset 
snapshot_download_w_retry( "argilla/ultrafeedback-binarized-preferences-cleaned-kto", repo_type="dataset" ) # @pytest.fixture(scope="session", autouse=True) # def download_fozzie_alpaca_dpo_dataset(): # # download the dataset # snapshot_download_w_retry( # "fozziethebeat/alpaca_messages_2k_dpo_test", repo_type="dataset" # ) # snapshot_download_w_retry( # "fozziethebeat/alpaca_messages_2k_dpo_test", # repo_type="dataset", # revision="ea82cff", # ) # @pytest.fixture(scope="session") # @disable_hf_offline # def dataset_fozzie_alpaca_dpo_dataset( # download_fozzie_alpaca_dpo_dataset, # ): # return load_dataset("fozziethebeat/alpaca_messages_2k_dpo_test", split="train") # # # @pytest.fixture(scope="session") # @disable_hf_offline # def dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff( # download_fozzie_alpaca_dpo_dataset, # ): # return load_dataset( # "fozziethebeat/alpaca_messages_2k_dpo_test", split="train", revision="ea82cff" # ) @pytest.fixture(scope="session", autouse=True) def download_arcee_ai_distilabel_intel_orca_dpo_pairs_dataset(): # download the dataset snapshot_download_w_retry( "arcee-ai/distilabel-intel-orca-dpo-pairs-binarized", repo_type="dataset" ) @pytest.fixture(scope="session", autouse=True) def download_argilla_dpo_pairs_dataset(): # download the dataset snapshot_download_w_retry( "argilla/distilabel-intel-orca-dpo-pairs", repo_type="dataset" ) @pytest.fixture(scope="session", autouse=True) def download_tiny_shakespeare_dataset(): # download the dataset snapshot_download_w_retry("winglian/tiny-shakespeare", repo_type="dataset") @pytest.fixture(scope="session", autouse=True) def download_evolkit_kd_sample_dataset(): # download the dataset snapshot_download_w_retry( "axolotl-ai-co/evolkit-logprobs-pipeline-75k-v2-sample", repo_type="dataset" ) @pytest.fixture(scope="session", autouse=True) def download_deepseek_model_fixture(): snapshot_download_w_retry("axolotl-ai-co/DeepSeek-V3-11M", repo_type="model") @pytest.fixture(scope="session", autouse=True) 
@pytest.fixture(scope="session", autouse=True)
def download_llama33_70b_model_fixture():
    """Request only the tokenizer and config files of the Llama 3.3 70B tokenizer repo.

    ``allow_patterns`` restricts the snapshot to files matching ``*token*`` plus
    ``config.json``.
    """
    tokenizer_only_patterns = ["*token*", "config.json"]
    snapshot_download_w_retry(
        "axolotl-ai-co/Llama-3.3-70B-Instruct-tokenizer",
        repo_type="model",
        allow_patterns=tokenizer_only_patterns,
    )
@pytest.fixture
def download_llama2_model_fixture():
    """Request only the tokenizer and config files of ``NousResearch/Llama-2-7b-hf``.

    Declared with a bare ``@pytest.fixture`` (default function scope, not
    autouse), so the helper runs only for tests that request this fixture
    directly or through a dependent fixture.
    """
    wanted_files = ["*token*", "config.json"]
    snapshot_download_w_retry(
        "NousResearch/Llama-2-7b-hf",
        repo_type="model",
        allow_patterns=wanted_files,
    )
@pytest.fixture
@enable_hf_offline
def tokenizer_mistral_7b_instruct(
    download_mistral_7b_model_fixture,
):
    """Load the ``casperhansen/mistral-7b-instruct-v0.1-awq`` tokenizer.

    Depends on ``download_mistral_7b_model_fixture`` because that is the
    fixture that requests the tokenizer/config files of this exact repo.
    Loading happens under ``enable_hf_offline``, so the declared dependency
    must be the one that fetched the files being loaded.

    Returns:
        The tokenizer returned by ``AutoTokenizer.from_pretrained``.
    """
    return AutoTokenizer.from_pretrained("casperhansen/mistral-7b-instruct-v0.1-awq")
@pytest.fixture
def dataset_argilla_ultrafeedback_binarized_preferences_cleaned(
    download_ds_fixture_bundle: Path,
):
    """Return the ``train`` split stored in the fixture bundle for this dataset.

    The dataset is read from disk from the
    ``argilla__ultrafeedback-binarized-preferences-cleaned`` sub-directory of
    the bundle path provided by ``download_ds_fixture_bundle``.
    """
    bundle_subdir = "argilla__ultrafeedback-binarized-preferences-cleaned"
    dataset_dict = datasets.load_from_disk(download_ds_fixture_bundle / bundle_subdir)
    return dataset_dict["train"]
@pytest.mark.skipif(
    os.getenv("AXOLOTL_IS_CI_CACHE_PRELOAD", "-1") != "1",
    reason="Not running in CI cache preload",
)
def test_load_fixtures(
    download_smollm2_135m_model,
    download_qwen_2_5_half_billion_model,
    download_tatsu_lab_alpaca_dataset,
    download_mhenrichsen_alpaca_2k_dataset,
    download_mhenrichsen_alpaca_2k_w_revision_dataset,
    download_mlabonne_finetome_100k_dataset,
    download_argilla_ultrafeedback_binarized_preferences_cleaned_dataset,
    download_argilla_ultrafeedback_binarized_preferences_cleaned_kto_dataset,
    download_argilla_distilabel_capybara_dpo_7k_binarized_dataset,
    download_arcee_ai_distilabel_intel_orca_dpo_pairs_dataset,
    download_argilla_dpo_pairs_dataset,
    download_tiny_shakespeare_dataset,
    download_deepseek_model_fixture,
    download_huggyllama_model_fixture,
    download_llama_1b_model_fixture,
    download_llama3_8b_model_fixture,
    download_llama3_8b_instruct_model_fixture,
    download_phi_35_mini_model_fixture,
    download_phi_3_medium_model_fixture,
    download_phi_4_reasoning_model_fixture,
    download_mistral_7b_model_fixture,
    download_gemma_2b_model_fixture,
    download_gemma2_9b_model_fixture,
    download_mlx_mistral_7b_model_fixture,
    download_llama2_model_fixture,
):
    """Request the listed download fixtures so that their setup code runs.

    The test body is intentionally empty: requesting the fixtures is the
    whole point. Skipped unless the ``AXOLOTL_IS_CI_CACHE_PRELOAD``
    environment variable equals ``"1"``.
    """
# Configuration for Alpaca Messages Dataset
ALPACA_MESSAGES_CONFIG_OG = {
    "path": "fozziethebeat/alpaca_messages_2k_dpo_test",
    "type": "chat_template.default",
    "chat_template": "llama3",
    "field_messages": "conversation",
    "field_chosen": "chosen",
    "field_rejected": "rejected",
    "message_field_role": "role",
    "message_field_content": "content",
    "roles": {
        "system": ["system"],
        "user": ["user"],
        "assistant": ["assistant"],
    },
}

# Revision configuration extending the original.
# The nested "roles" mapping and its lists are rebuilt rather than shared, so a
# test that mutates one config cannot silently change the other (a plain
# dict.copy() is shallow and would alias the nested objects).
ALPACA_MESSAGES_CONFIG_REVISION = {
    **ALPACA_MESSAGES_CONFIG_OG,
    "roles": {
        role: list(names) for role, names in ALPACA_MESSAGES_CONFIG_OG["roles"].items()
    },
    "revision": "ea82cff",
}

# NOTE(review): all three token values are empty strings here — confirm this is
# intended and not an artefact of how the file was extracted.
SPECIAL_TOKENS = {
    "bos_token": "",
    "eos_token": "",
    "unk_token": "",
}
name="chat_msgs") def chat_msgs_fixture(): return { "conversation": [ { "role": "system", "content": [ {"type": "text", "value": "You are a helpful assistant."}, ], }, { "role": "user", "content": [ {"type": "text", "value": "What is today's stock price of Apple?"}, ], }, { "role": "assistant", "content": [ { "type": "tool_call", "value": { "name": "get_date", "arguments": {}, }, }, { "type": "tool_call", "value": { "name": "get_stock_price", "arguments": {"symbol": "AAPL"}, }, }, ], "weight": 1, }, { "role": "tool", "content": [ { "type": "tool_response", "value": { "name": "get_date", "content": {"date": "2024-09-09"}, }, }, { "type": "tool_response", "value": { "name": "get_stock_price", "content": {"symbol": "AAPL", "price": 123.45}, }, }, ], }, { "role": "assistant", "content": [ { "type": "text", "value": "The stock price of Apple is $123.45.\n", "weight": 0, }, { "type": "text", "value": "The original query asked for today's stock price of Apple. This implies they also wanted the date included in the response.", }, { "type": "text", "value": "The stock price of Apple on September 9, 2024 is $123.45.", }, ], "weight": 1, }, ] } class TestMessagesCase: """ Test cases for the chat messages module """ def test_tool_call_stringify(self, chat_msgs): chat_msgs_as_obj = Chats(**chat_msgs) assert '{"name": "get_stock_price", "arguments": {"symbol": "AAPL"}}' == str( chat_msgs_as_obj.conversation[2].content[1].value ) def test_chatml_formatted_wrapper(self, chat_msgs): chat_msg_formatted = ChatFormattedChats(**chat_msgs, formatter=format_message) target_chatml = """<|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user What is today's stock price of Apple?<|im_end|> <|im_start|>assistant {"name": "get_date", "arguments": {}} {"name": "get_stock_price", "arguments": {"symbol": "AAPL"}} <|im_end|> <|im_start|>tool {"name": "get_date", "content": {"date": "2024-09-09"}} {"name": "get_stock_price", "content": {"symbol": "AAPL", "price": 123.45}} 
<|im_end|> <|im_start|>assistant The stock price of Apple is $123.45. The original query asked for today's stock price of Apple. This implies they also wanted the date included in the response.The stock price of Apple on September 9, 2024 is $123.45.<|im_end|>\n""" assert target_chatml == str(chat_msg_formatted) def test_chatml_formatting_tool_call(self, chat_msgs): chat_msgs_as_obj = Chats(**chat_msgs) target_chatml_turn2 = """<|im_start|>assistant\n\n{"name": "get_date", "arguments": {}}\n\n\n{"name": "get_stock_price", "arguments": {"symbol": "AAPL"}}\n\n<|im_end|>\n""" assert target_chatml_turn2 == str( format_message(chat_msgs_as_obj.conversation[2]) ) def test_train_labels(self, chatml_tokenizer, chat_msgs): chat_msg_formatted = ChatFormattedChats(**chat_msgs, formatter=format_message) tokenized = chat_msg_formatted.conversation[2].tokenized(chatml_tokenizer) # fmt: off target_labels = [ -100, -100, -100, # role 27, 14506, 13735, 397, 5018, 609, 794, 330, 456, 4257, 498, 330, 16774, 794, 4792, 534, 524, 14506, 13735, 397, 27, 14506, 13735, 397, 5018, 609, 794, 330, 456, 31641, 9217, 498, 330, 16774, 794, 5324, 19314, 794, 330, 84016, 43, 96742, 524, 14506, 13735, 397, 128256, # <|im_end|> -100 # trailing newline ] # fmt: on assert tokenized["labels"] == target_labels def test_train_labels_2(self, chatml_tokenizer, chat_msgs): # also test if indivudal contents are set not to train chat_msg_formatted = ChatFormattedChats(**chat_msgs, formatter=format_message) tokenized = chat_msg_formatted.conversation[4].tokenized(chatml_tokenizer) # fmt: off target_labels = [ -100, -100, -100, # role -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, # initial response 27, 78098, 16761, 4113, 3319, 4691, 369, 3432, 596, 5708, 3430, 315, 8325, 13, 1115, 24897, 814, 1101, 4934, 279, 2457, 5343, 304, 279, 2077, 4005, 78098, 16761, 5708, 3430, 315, 8325, 389, 6250, 220, 24, 11, 220, 2366, 19, 374, 400, 4513, 13, 1774, 13, 128256, # <|im_end|> -100, # trailing 
class TestReplayBuffer(unittest.TestCase):
    """Edge-case and basic behaviour checks for ``ReplayBuffer``."""

    @staticmethod
    def _new_buffer(max_size):
        """Import ``ReplayBuffer`` lazily and build one with the given ``max_size``."""
        from axolotl.core.trainers.grpo.replay_buffer import ReplayBuffer

        return ReplayBuffer(max_size=max_size)

    def test_add_noop_when_max_size_zero(self):
        """Adding to a buffer with ``max_size=0`` leaves it empty."""
        replay = self._new_buffer(0)
        replay.add(1.0, {"data": "test"})
        self.assertEqual(len(replay), 0)

    def test_add_noop_when_max_size_negative(self):
        """Adding to a buffer with a negative ``max_size`` leaves it empty."""
        replay = self._new_buffer(-1)
        replay.add(1.0, {"data": "test"})
        self.assertEqual(len(replay), 0)

    def test_sample_returns_none_when_max_size_zero(self):
        """Sampling from a buffer with ``max_size=0`` yields ``None``."""
        replay = self._new_buffer(0)
        self.assertIsNone(replay.sample(1))

    def test_sample_returns_none_when_empty(self):
        """Sampling from an empty buffer yields ``None``."""
        replay = self._new_buffer(5)
        self.assertIsNone(replay.sample(1))

    def test_normal_add_and_sample(self):
        """A buffer filled to capacity reports its size and returns one sample on request."""
        replay = self._new_buffer(3)
        for idx in (1, 2, 3):
            replay.add(float(idx), {"a": idx})
        self.assertEqual(len(replay), 3)

        sampled = replay.sample(1)
        self.assertIsNotNone(sampled)
        self.assertEqual(len(sampled), 1)

    def test_replaces_lowest_when_full(self):
        """Adding beyond capacity keeps the two highest scores."""
        replay = self._new_buffer(2)
        for idx in (1, 2, 3):  # the third add should replace score=1.0
            replay.add(float(idx), {"a": idx})
        self.assertEqual(len(replay), 2)

        kept_scores = sorted(entry[0] for entry in replay._heap)
        self.assertEqual(kept_scores, [2.0, 3.0])
class TestDequantizeFP8TailBlocks(unittest.TestCase):
    """Shape/dtype checks for FP8 dequantization, including non-divisible dimensions."""

    @staticmethod
    def _dequantize(weight, scale_inv):
        """Import ``dequantize_fp8`` lazily and apply it to ``weight`` / ``scale_inv``."""
        from axolotl.kernels.quantize import dequantize_fp8

        return dequantize_fp8(weight, scale_inv)

    @staticmethod
    def _fp8(tensor):
        """Cast a tensor to ``torch.float8_e4m3fn``."""
        return tensor.to(torch.float8_e4m3fn)

    def test_exact_divisible_shape(self):
        """256 rows with a (2, 1) scale: output keeps the shape and is bfloat16."""
        weight = self._fp8(torch.randn(256, 128, dtype=torch.bfloat16))
        scales = torch.ones(2, 1, dtype=torch.bfloat16)

        dequantized = self._dequantize(weight, scales)

        self.assertEqual(dequantized.shape, (256, 128))
        self.assertEqual(dequantized.dtype, torch.bfloat16)

    def test_non_divisible_rows(self):
        """131 rows with two row scales: output keeps the shape and is bfloat16."""
        # 131 is not a multiple of the 2 scale rows (130 would be), so this input
        # is meant to exercise the tail handling named in the class docstring.
        weight = self._fp8(torch.ones(131, 128, dtype=torch.bfloat16))
        scales = torch.tensor([[2.0], [3.0]], dtype=torch.bfloat16)

        dequantized = self._dequantize(weight, scales)

        self.assertEqual(dequantized.shape, (131, 128))
        self.assertEqual(dequantized.dtype, torch.bfloat16)

    def test_non_divisible_cols(self):
        """200 columns with a (1, 2) scale: output keeps the shape."""
        weight = self._fp8(torch.ones(128, 200, dtype=torch.bfloat16))
        scales = torch.ones(1, 2, dtype=torch.bfloat16)

        dequantized = self._dequantize(weight, scales)

        self.assertEqual(dequantized.shape, (128, 200))

    def test_scalar_scale(self):
        """A 0-dim scale tensor is accepted and the output keeps the shape."""
        weight = self._fp8(torch.ones(64, 64, dtype=torch.bfloat16))
        scales = torch.tensor(2.0, dtype=torch.bfloat16)

        dequantized = self._dequantize(weight, scales)

        self.assertEqual(dequantized.shape, (64, 64))
class TestValidateQuantPatchRestore(unittest.TestCase):
    """Check the save / patch / restore-in-``finally`` pattern on ``validate_quantization_for_training``.

    Both tests exercise only the pattern itself on ``transformers.trainer``;
    neither calls a trainer builder.
    """

    _PATCHED_ATTR = "validate_quantization_for_training"

    def test_patch_restored_on_success(self):
        """The original function is back in place after a guarded section that succeeds."""
        import transformers.trainer as trainer_mod

        pristine = getattr(trainer_mod, self._PATCHED_ATTR)

        setattr(trainer_mod, self._PATCHED_ATTR, lambda model: None)
        try:
            pass  # stands in for a trainer_cls() call that succeeds
        finally:
            setattr(trainer_mod, self._PATCHED_ATTR, pristine)

        self.assertIs(getattr(trainer_mod, self._PATCHED_ATTR), pristine)

    def test_patch_restored_on_error(self):
        """The original function is back in place after a guarded section that raises."""
        import transformers.trainer as trainer_mod

        pristine = getattr(trainer_mod, self._PATCHED_ATTR)

        setattr(trainer_mod, self._PATCHED_ATTR, lambda model: None)
        with self.assertRaises(ValueError):
            try:
                raise ValueError("test error")
            finally:
                setattr(trainer_mod, self._PATCHED_ATTR, pristine)

        self.assertIs(getattr(trainer_mod, self._PATCHED_ATTR), pristine)
@pytest.fixture(name="base_cfg")
def fixture_base_cfg():
    """
    Normalized config holding the arguments shared by the SFT and RLHF fixtures
    """
    model_and_tokenizer = {
        "base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
        "sequence_len": 2048,
        "model_config_type": "llama",  # example type
    }
    basic_training = {
        "micro_batch_size": 2,
        "eval_batch_size": 2,
        "num_epochs": 1,
        "gradient_accumulation_steps": 1,
        "max_steps": 100,
        "val_set_size": 0,
    }
    optimizer = {
        "optimizer": "adamw_torch_fused",
        "learning_rate": 0.00005,
        "weight_decay": 0.01,
        "adam_beta1": 0.998,
        "adam_beta2": 0.9,
        "adam_epsilon": 0.00001,
        "max_grad_norm": 1.0,
    }
    lr_scheduler = {
        "lr_scheduler": "cosine",
        "lr_scheduler_kwargs": {"foo": "bar"},
        "warmup_steps": 10,
        "warmup_ratio": None,
        "cosine_min_lr_ratio": 0.1,
        "cosine_constant_lr_ratio": 0.2,
    }
    checkpointing = {
        "save_steps": 100,
        "output_dir": "./model-out",
        "save_total_limit": 4,
        "save_only_model": False,
    }
    hardware = {
        "gradient_checkpointing": False,
        "gradient_checkpointing_kwargs": {"use_reentrant": False},
        "dataloader_num_workers": 1,
        "dataloader_pin_memory": True,
        "dataloader_prefetch_factor": 2,
        "context_parallel_size": 1,
        "tensor_parallel_size": 1,
    }
    dtypes = {
        "fp16": False,
        "bf16": False,
        "tf32": False,
    }
    logging_and_eval = {
        "logging_steps": 10,
        "eval_steps": 50,
        "eval_strategy": "steps",
        "save_strategy": "steps",
        "include_tokens_per_second": True,
    }
    misc = {
        "seed": 42,
        "remove_unused_columns": True,
        "ddp_timeout": 1800,
        "ddp_bucket_cap_mb": 25,
        "ddp_broadcast_buffers": False,
        "dataset_num_proc": 4,
    }

    # Merge the sections in their original order into a single flat config.
    cfg = DictDefault(
        {
            **model_and_tokenizer,
            **basic_training,
            **optimizer,
            **lr_scheduler,
            **checkpointing,
            **hardware,
            **dtypes,
            **logging_and_eval,
            **misc,
        }
    )
    normalize_config(cfg)
    return cfg
@pytest.fixture(name="grpo_cfg")
def fixture_grpo_cfg(base_cfg):
    """GRPO variant of ``base_cfg``: TRL settings, a 1% GSM8K split and micro batch size 4."""
    trl_settings = DictDefault(
        {
            "beta": 0.001,
            "max_completion_length": 256,
            "use_vllm": False,  # run on CPU
            # "vllm_device": "auto",
            # "vllm_gpu_memory_utilization": 0.15,
            "num_generations": 4,
            "reward_funcs": ["rewards.rand_reward_func"],
        }
    )
    gsm8k_slice = {
        "path": "openai/gsm8k",
        "name": "main",
        "split": "train[:1%]",
    }
    grpo_overrides = {
        "rl": RLType.GRPO,
        "trl": trl_settings,
        # Must be evenly divisible by num_generations
        "micro_batch_size": 4,
        "datasets": [gsm8k_slice],
    }

    cfg = base_cfg.copy()
    cfg.update(grpo_overrides)
    return DictDefault(cfg)
@pytest.fixture(name="prm_cfg") def fixture_prm_cfg(sft_cfg): cfg = sft_cfg.copy() cfg.update( DictDefault( { "process_reward_model": True, "datasets": [ { "path": "trl-lib/math_shepherd", "type": "stepwise_supervised", "split": "train[:1%]", } ], } ) ) return cfg @pytest.fixture(name="tokenizer") def fixture_tokenizer(base_cfg): return load_tokenizer(base_cfg) @pytest.fixture(name="model") def fixture_model(base_cfg, tokenizer): model, _ = ModelLoader(base_cfg, tokenizer).load() return model class TestHFRLTrainerBuilder: """ TestCase class for RLHF trainer builders """ def _test_common_training_arguments(self, training_arguments, rl: str): """Helper to test common arguments across all variants""" # Basic training settings if rl == "grpo": # grpo_cfg's micro_batch_size is diff from others assert training_arguments.per_device_train_batch_size == 4 else: assert training_arguments.per_device_train_batch_size == 2 assert training_arguments.gradient_accumulation_steps == 1 assert training_arguments.max_steps == 100 # Optimizer settings assert training_arguments.learning_rate == 0.00005 assert training_arguments.weight_decay == 0.01 assert training_arguments.adam_beta1 == 0.998 assert training_arguments.adam_beta2 == 0.9 assert training_arguments.adam_epsilon == 0.00001 assert training_arguments.max_grad_norm == 1.0 # LR scheduler settings assert training_arguments.lr_scheduler_type == "cosine" assert training_arguments.warmup_steps == 10 assert training_arguments.cosine_min_lr_ratio == 0.1 assert training_arguments.cosine_constant_lr_ratio == 0.2 # Other settings assert training_arguments.dataloader_num_workers == 1 assert training_arguments.dataloader_pin_memory is True # TODO(wing): restore once trl releases 0.22.0 # assert training_arguments.gradient_checkpointing is True def test_dpo_training_arguments(self, dpo_cfg, model, tokenizer): builder = HFRLTrainerBuilder(dpo_cfg, model, tokenizer) training_arguments, _ = builder._build_training_arguments(100) 
self._test_common_training_arguments(training_arguments, rl=dpo_cfg.rl) # DPO specific assert training_arguments.beta == 0.1 assert hasattr(training_arguments, "use_weighting") assert training_arguments.use_weighting is True assert training_arguments.label_smoothing == 0.1 def test_orpo_training_arguments(self, orpo_cfg, model, tokenizer): builder = HFRLTrainerBuilder(orpo_cfg, model, tokenizer) training_arguments, _ = builder._build_training_arguments(100) self._test_common_training_arguments(training_arguments, rl=orpo_cfg.rl) # ORPO specific assert training_arguments.beta == 0.1 # maps from orpo_alpha def test_kto_training_arguments(self, kto_cfg, model, tokenizer): builder = HFRLTrainerBuilder(kto_cfg, model, tokenizer) training_arguments, _ = builder._build_training_arguments(100) self._test_common_training_arguments(training_arguments, rl=kto_cfg.rl) # KTO specific assert training_arguments.desirable_weight == 1.0 assert training_arguments.undesirable_weight == 1.0 def _write_rewards_file(self, rewards_dir: Path): """ Writes reward function to local tmp path to be loaded on trainer building """ # Create rewards.py in a directory we can import from rewards_dir.mkdir() rewards_file = rewards_dir / "rewards.py" rewards_file.write_text( """import random def rand_reward_func(prompts, completions) -> list[float]: return [random.uniform(0, 1) for _ in completions] """ ) def test_grpo_training_arguments(self, grpo_cfg, model, tokenizer, tmp_path): rewards_dir = tmp_path / "rewards_test" self._write_rewards_file(rewards_dir) # Add the directory to Python path so we can import the module sys.path.insert(0, str(rewards_dir)) try: builder = HFRLTrainerBuilder(grpo_cfg, model, tokenizer) training_arguments, _ = builder._build_training_arguments(100) builder.train_dataset = MagicMock() self._test_common_training_arguments(training_arguments, rl=grpo_cfg.rl) # GRPO specific assert training_arguments.beta == 0.001 assert training_arguments.max_completion_length == 256 assert 
    @pytest.mark.parametrize(
        ("cfg_string", "dataset_name"),
        [
            (
                "dpo_cfg",
                "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff",
            ),
            (
                "ipo_cfg",
                "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff",
            ),
            (
                "grpo_cfg",
                "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff",
            ),
            ("orpo_cfg", None),  # don't use fixture for orpo to use smaller split
            ("kto_cfg", None),  # no fixture for kto
            # (
            #     "simpo_cfg",
            #     "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff",
            # ),
        ],
    )
    def test_custom_optimizer_cls_and_kwargs(
        self,
        request,
        cfg_string,
        dataset_name,
        tmp_path,
        model,
        tokenizer,
    ):
        """
        Build an RL trainer with ``optimizer="muon"`` and check the optimizer wiring

        ``cfg_string`` names the config fixture to resolve; ``dataset_name`` names a
        dataset fixture to serve through a patched loader, or is ``None`` to load the
        configured dataset for real. The test asserts that the trainer exposes
        ``MuonOptimizerFactory`` with the lr / weight decay / betas / eps from the
        base config, and that ``create_optimizer()`` returns a ``Muon`` instance.
        """
        cfg = request.getfixturevalue(cfg_string)

        # The builder is created first and keeps a reference to cfg; the mutations
        # below are applied to that same object before build() is called.
        builder = HFRLTrainerBuilder(cfg, model, tokenizer)
        cfg["optimizer"] = "muon"

        # Pick the dataset config matching the RL variant.
        if cfg_string in ["dpo_cfg", "ipo_cfg", "grpo_cfg", "simpo_cfg"]:
            cfg["datasets"] = [DictDefault(ALPACA_MESSAGES_CONFIG_REVISION)]
        elif cfg_string == "kto_cfg":
            cfg["datasets"] = [
                DictDefault(
                    {
                        "path": "argilla/ultrafeedback-binarized-preferences-cleaned-kto",
                        "type": "llama3.ultra",
                        "split": "train[:1%]",
                    }
                )
            ]
        elif cfg_string == "orpo_cfg":
            cfg["datasets"] = [
                DictDefault(
                    {
                        "path": "argilla/ultrafeedback-binarized-preferences-cleaned",
                        "type": "chat_template.argilla",
                        "split": "train[:1%]",
                    }
                )
            ]
        else:
            raise ValueError(f"Unhandled cfg_string: {cfg_string}")
        cfg["dataset_num_proc"] = 4

        # rewards_dir is only bound for grpo_cfg; the finally block below guards
        # its use with the same cfg_string check.
        if cfg_string == "grpo_cfg":
            rewards_dir = tmp_path / "rewards_test"
            self._write_rewards_file(rewards_dir)
            # Add the directory to Python path so we can import the module
            sys.path.insert(0, str(rewards_dir))

        try:
            # When a dataset fixture is named, patch the loader so it returns that
            # fixture's dataset instead of loading from the configured path.
            if dataset_name is not None:
                with patch(
                    "axolotl.utils.data.rl.load_dataset_with_config"
                ) as mock_load_dataset:
                    mock_load_dataset.return_value = request.getfixturevalue(
                        dataset_name
                    )
                    train_dataset, eval_dataset = prepare_preference_datasets(
                        cfg, tokenizer
                    )
            else:
                # Load actual datasets for orpo_cfg and kto_cfg
                train_dataset, eval_dataset = prepare_preference_datasets(
                    cfg, tokenizer
                )

            builder.train_dataset = train_dataset
            builder.eval_dataset = eval_dataset

            trainer = builder.build(100)

            assert trainer.optimizer_cls_and_kwargs is not None

            from axolotl.contribs.mit.muon import MuonOptimizerFactory
            from axolotl.contribs.mit.muon.muon import Muon

            optimizer_cls, optimizer_kwargs = trainer.optimizer_cls_and_kwargs
            assert optimizer_cls is MuonOptimizerFactory
            # These values mirror learning_rate, weight_decay, adam_beta1/2 and
            # adam_epsilon as set in the base_cfg fixture.
            assert optimizer_kwargs["lr"] == 0.00005
            assert optimizer_kwargs["weight_decay"] == 0.01
            assert optimizer_kwargs["betas"] == (0.998, 0.9)
            assert optimizer_kwargs["eps"] == 0.00001

            # Ensure optimizer is created with correct class
            optim = trainer.create_optimizer()
            assert isinstance(optim, Muon)

        finally:
            # remove imported module from path
            if cfg_string == "grpo_cfg" and str(rewards_dir) in sys.path:
                sys.path.remove(str(rewards_dir))
None from axolotl.contribs.mit.muon import MuonOptimizerFactory from axolotl.contribs.mit.muon.muon import Muon optimizer_cls, optimizer_kwargs = trainer.optimizer_cls_and_kwargs assert optimizer_cls is MuonOptimizerFactory assert optimizer_kwargs["lr"] == 0.00005 assert optimizer_kwargs["weight_decay"] == 0.01 assert optimizer_kwargs["betas"] == (0.998, 0.9) assert optimizer_kwargs["eps"] == 0.00001 # Ensure optimizer is created with correct class optim = trainer.create_optimizer() assert isinstance(optim, Muon) class TestTrainerClsPlugin: """ TestCase class for trainer builder with plugin """ def test_trainer_cls_is_not_none_with_plugin(self, kto_cfg, model, tokenizer): """ Test that the trainer cls is not none with plugin Fixes #2693 """ cfg = kto_cfg.copy() cfg.plugins = ["axolotl.integrations.liger.LigerPlugin"] # Expected AttributeError as we don't pass regular model configs to RL trainer builder # If it throws `TypeError: None is not a callable object`, trainer_cls could be None try: builder = HFRLTrainerBuilder(cfg, model, tokenizer) builder.build(100) except TypeError as e: # Error raised if trainer_cls is None assert "'tuple' object has no attribute 'config'" not in str(e) except Exception: # Another error happens, so we passed trainer_cls to builder pass ================================================ FILE: tests/e2e/.gitignore ================================================ last_run_prepared ================================================ FILE: tests/e2e/__init__.py ================================================ ================================================ FILE: tests/e2e/integrations/test_cut_cross_entropy.py ================================================ """ Simple end-to-end test for Cut Cross Entropy integration """ import pytest from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils import get_pytorch_version from axolotl.utils.config import normalize_config, prepare_plugins, validate_config 
from axolotl.utils.dict import DictDefault from tests.e2e.utils import check_model_output_exists @pytest.fixture() def min_cfg(temp_dir): return { "base_model": "HuggingFaceTB/SmolLM2-135M", "plugins": [ "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin", ], "cut_cross_entropy": True, "sequence_len": 1024, "val_set_size": 0.02, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", }, ], "num_epochs": 1, "micro_batch_size": 8, "gradient_accumulation_steps": 1, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "output_dir": temp_dir, "lr_scheduler": "cosine", "max_steps": 10, "bf16": "auto", "save_first_step": False, } class TestCutCrossEntropyIntegration: """ e2e tests for cut_cross_entropy integration with Axolotl """ def test_llama_w_cce(self, min_cfg, temp_dir): cfg = DictDefault(min_cfg) cfg = validate_config(cfg) prepare_plugins(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) major, minor, _ = get_pytorch_version() if (major, minor) < (2, 4): with pytest.raises(ImportError): train(cfg=cfg, dataset_meta=dataset_meta) else: train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) def test_qwen2_w_cce(self, temp_dir): cfg = DictDefault( { "base_model": "Qwen/Qwen2.5-0.5B", "plugins": [ "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin", ], "cut_cross_entropy": True, "sequence_len": 1024, "val_set_size": 0.02, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", }, ], "num_epochs": 1, "micro_batch_size": 4, "gradient_accumulation_steps": 1, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "output_dir": temp_dir, "lr_scheduler": "cosine", "max_steps": 10, "bf16": "auto", "save_first_step": False, } ) cfg = validate_config(cfg) prepare_plugins(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) major, minor, _ = 
get_pytorch_version() if (major, minor) < (2, 4): with pytest.raises(ImportError): train(cfg=cfg, dataset_meta=dataset_meta) else: train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @pytest.mark.parametrize( "attention_type", [ "flash_attention", "sdp_attention", # "xformers_attention", ], ) def test_llama_w_cce_and_attention(self, min_cfg, temp_dir, attention_type): cfg = DictDefault( min_cfg | { attention_type: True, } ) cfg = validate_config(cfg) prepare_plugins(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) major, minor, _ = get_pytorch_version() if (major, minor) < (2, 4): with pytest.raises(ImportError): train(cfg=cfg, dataset_meta=dataset_meta) else: train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) ================================================ FILE: tests/e2e/integrations/test_fp8.py ================================================ """ Simple end-to-end smoke tests for FP8 mixed precision training """ from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault from tests.e2e.utils import check_model_output_exists, require_torch_2_7_0 class FP8IntegrationTestCase: """ e2e smoke tests for FP8 mixed precision training with Axolotl """ @require_torch_2_7_0 def test_fp8_single_gpu_smoke(self, temp_dir): """Smoke test for single GPU FP8 + torch.compile training""" cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "tokenizer_type": "AutoTokenizer", "trust_remote_code": True, "sequence_len": 512, "val_set_size": 0.05, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", }, ], "num_epochs": 1, "max_steps": 3, # Very short smoke test "micro_batch_size": 1, "gradient_accumulation_steps": 2, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": 
"adamw_torch_fused", "lr_scheduler": "cosine", "sdp_attention": True, "pad_to_seq_len": True, "sample_packing": True, "fp8": True, "torch_compile": True, "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) ================================================ FILE: tests/e2e/integrations/test_hooks.py ================================================ """ e2e tests to make sure all the hooks are fired on the plugin """ import os from pathlib import Path from axolotl.common.datasets import load_datasets from axolotl.integrations.base import BasePlugin from axolotl.train import train from axolotl.utils.config import normalize_config, prepare_plugins, validate_config from axolotl.utils.dict import DictDefault from tests.e2e.utils import check_model_output_exists class LogHooksPlugin(BasePlugin): """ fixture to capture in a log file each hook that was fired """ base_dir = Path("/tmp/axolotl-log-hooks") def __init__(self): self.base_dir.mkdir(parents=True, exist_ok=True) try: os.remove(self.base_dir.joinpath("plugin_hooks.log")) except FileNotFoundError: pass def post_trainer_create(self, cfg, trainer): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("post_trainer_create\n") def pre_model_load(self, cfg): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("pre_model_load\n") def post_model_build(self, cfg, model): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("post_model_build\n") def pre_lora_load(self, cfg, model): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("pre_lora_load\n") def post_lora_load(self, cfg, model): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("post_lora_load\n") def post_model_load(self, 
cfg, model): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("post_model_load\n") def create_optimizer(self, cfg, trainer): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("create_optimizer\n") def get_trainer_cls(self, cfg): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("get_trainer_cls\n") def create_lr_scheduler(self, cfg, trainer, optimizer, num_training_steps): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("create_lr_scheduler\n") def add_callbacks_pre_trainer(self, cfg, model): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("add_callbacks_pre_trainer\n") return [] def add_callbacks_post_trainer(self, cfg, trainer): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("add_callbacks_post_trainer\n") return [] def post_train(self, cfg, model): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("post_train\n") def post_train_unload(self, cfg): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("post_train_unload\n") class TestPluginHooks: """ e2e tests to make sure all the hooks are fired during the training """ def test_plugin_hooks(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "plugins": [ "tests.e2e.integrations.test_hooks.LogHooksPlugin", ], "tokenizer_type": "AutoTokenizer", "sequence_len": 1024, "adapter": "lora", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "val_set_size": 0.02, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", }, ], "num_epochs": 1, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, 
"optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "max_steps": 5, "flash_attention": True, "bf16": "auto", "save_first_step": False, } ) cfg = validate_config(cfg) prepare_plugins(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) with open( "/tmp/axolotl-log-hooks" + "/plugin_hooks.log", "r", encoding="utf-8" ) as f: file_contents = f.readlines() file_contents = "\n".join(file_contents) assert "post_trainer_create" in file_contents assert "pre_model_load" in file_contents assert "post_model_build" in file_contents assert "pre_lora_load" in file_contents assert "post_lora_load" in file_contents assert "post_model_load" in file_contents # assert "create_optimizer" in file_contents # not implemented yet assert "get_trainer_cls" in file_contents assert "create_lr_scheduler" in file_contents assert "add_callbacks_pre_trainer" in file_contents assert "add_callbacks_post_trainer" in file_contents assert "post_train" in file_contents # assert "post_train_unload" in file_contents # not called from test train call try: os.remove("/tmp/axolotl-log-hooks" + "/plugin_hooks.log") except FileNotFoundError: pass ================================================ FILE: tests/e2e/integrations/test_kd.py ================================================ """ e2e tests for kd trainer support in Axolotl """ from pathlib import Path import pytest import yaml from accelerate.test_utils import execute_subprocess_async, get_torch_dist_unique_port from axolotl.utils.dict import DictDefault from tests.e2e.utils import check_tensorboard, require_torch_2_5_1 @pytest.fixture(name="kd_min_cfg") def min_cfg(temp_dir): return { "base_model": "Qwen/Qwen3-0.6B", "tokenizer_config": "winglian/qwen3-14b-math", "plugins": [ "axolotl.integrations.kd.KDPlugin", "axolotl.integrations.liger.LigerPlugin", ], "liger_rms_norm": True, "liger_glu_activation": True, "torch_compile": True, "chat_template": 
"qwen3", "kd_trainer": True, "kd_ce_alpha": 0.1, "kd_alpha": 0.9, "kd_temperature": 1.0, "kd_beta": 0.0, "kd_normalize_topk": True, "dataloader_prefetch_factor": 8, "dataloader_num_workers": 4, "dataloader_pin_memory": True, "datasets": [ { "path": "winglian/OpenThoughts-114k-math-correct-qwen3-14b-math-prepared-topk128-normalized", "type": "chat_template", "split": "train", "split_thinking": True, "eot_tokens": ["<|im_end|>"], "data_files": ["train/batch-000000.parquet"], }, ], "skip_prepare_dataset": True, "val_set_size": 0.0, "sequence_len": 2048, "sample_packing": True, "pad_to_sequence_len": True, "gradient_accumulation_steps": 2, "micro_batch_size": 1, "num_epochs": 1, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", "learning_rate": 0.00001, "bf16": "auto", "gradient_checkpointing": True, "flash_attention": True, "special_tokens": { "pad_token": "<|end_of_text|>", "eos_token": "<|eot_id|>", }, "max_steps": 5, "output_dir": temp_dir, "use_tensorboard": True, "save_first_step": False, } class TestKnowledgeDistillation: """ Test case for Knowledge Distillation """ # While this will run on torch 2.4.x without torch_compile enabled # the VRAM requirement is higher than what is available in CI @require_torch_2_5_1 def test_llama_kd(self, temp_dir, kd_min_cfg): cfg = DictDefault(kd_min_cfg) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "1", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) assert (Path(temp_dir) / "model.safetensors").exists() check_tensorboard( temp_dir + "/runs", "train/loss", 1.4, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( "load_in_8bit", [True, False], ) def test_llama_lora_kd(self, temp_dir, kd_min_cfg, load_in_8bit): cfg = DictDefault( { "load_in_8bit": 
load_in_8bit, "torch_compile": False, "adapter": "lora", "peft_use_dora": True, "lora_target_linear": True, "lora_r": 16, "lora_alpha": 32, "lora_dropout": 0.0, "lora_modules_to_save": ["embed_tokens", "lm_head"], "lora_mlp_kernel": False, "lora_qkv_kernel": False, "lora_o_kernel": False, } | kd_min_cfg ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "1", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) assert (Path(temp_dir) / "adapter_model.safetensors").exists() check_tensorboard( temp_dir + "/runs", "train/loss", 1.2, "Train Loss (%s) is too high" ) ================================================ FILE: tests/e2e/integrations/test_liger.py ================================================ """ Simple end-to-end test for Liger integration """ import pytest from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, prepare_plugins, validate_config from axolotl.utils.dict import DictDefault from tests.e2e.utils import check_model_output_exists, require_torch_2_4_1 class LigerIntegrationTestCase: """ e2e tests for liger integration with Axolotl """ @require_torch_2_4_1 def test_llama_wo_flce(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "plugins": [ "axolotl.integrations.liger.LigerPlugin", ], "liger_rope": True, "liger_rms_norm": True, "liger_glu_activation": True, "liger_cross_entropy": True, "liger_fused_linear_cross_entropy": False, "sequence_len": 1024, "val_set_size": 0.05, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", }, ], "num_epochs": 1, "micro_batch_size": 2, "gradient_accumulation_steps": 2, 
"output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "bf16": "auto", "max_steps": 5, "save_first_step": False, } ) cfg = validate_config(cfg) prepare_plugins(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @require_torch_2_4_1 @pytest.mark.parametrize( "liger_use_token_scaling", [True, False], ) def test_llama_w_flce(self, temp_dir, liger_use_token_scaling): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "plugins": [ "axolotl.integrations.liger.LigerPlugin", ], "liger_rope": True, "liger_rms_norm": True, "liger_glu_activation": True, "liger_cross_entropy": False, "liger_fused_linear_cross_entropy": True, "liger_use_token_scaling": liger_use_token_scaling, "sequence_len": 1024, "val_set_size": 0.05, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", }, ], "num_epochs": 1, "micro_batch_size": 2, "gradient_accumulation_steps": 2, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "bf16": "auto", "max_steps": 5, "save_first_step": False, } ) cfg = validate_config(cfg) prepare_plugins(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) ================================================ FILE: tests/e2e/integrations/test_llm_compressor.py ================================================ """ E2E smoke tests for LLMCompressorPlugin integration """ from pathlib import Path import pytest from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, prepare_plugins, validate_config from axolotl.utils.dict import DictDefault from tests.e2e.utils import ( check_model_output_exists, require_llmcompressor, 
require_torch_2_4_1, ) MODELS = [ "nm-testing/llama2.c-stories42M-pruned2.4-compressed", "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed", ] @pytest.mark.parametrize( "base_model", MODELS, ids=["no-checkpoint-recipe", "with-checkpoint-recipe"] ) @pytest.mark.parametrize( "save_compressed", [True, False], ids=["save_compressed", "save_uncompressed"] ) class TestLLMCompressorIntegration: """ e2e tests for axolotl.integrations.llm_compressor.LLMCompressorPlugin """ @require_llmcompressor @require_torch_2_4_1 def test_llmcompressor_plugin( self, temp_dir, base_model: str, save_compressed: bool ): from llmcompressor import active_session # core cfg cfg = DictDefault( { "base_model": base_model, "plugins": ["axolotl.integrations.llm_compressor.LLMCompressorPlugin"], "sequence_len": 1024, "val_set_size": 0.05, "special_tokens": {"pad_token": "<|endoftext|>"}, "datasets": [{"path": "mhenrichsen/alpaca_2k_test", "type": "alpaca"}], "num_epochs": 1, "micro_batch_size": 2, "gradient_accumulation_steps": 2, "output_dir": temp_dir, "learning_rate": 1e-5, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "bf16": "auto", "max_steps": 5, "llmcompressor": { "recipe": { "finetuning_stage": { "finetuning_modifiers": { "ConstantPruningModifier": { "targets": [ "re:.*q_proj.weight", "re:.*k_proj.weight", "re:.*v_proj.weight", "re:.*o_proj.weight", "re:.*gate_proj.weight", "re:.*up_proj.weight", "re:.*down_proj.weight", ], "start": 0, }, }, }, }, "save_compressed": save_compressed, }, "save_first_step": False, } ) prepare_plugins(cfg) cfg = validate_config(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) try: train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) _check_llmcompressor_model_outputs(temp_dir, save_compressed) finally: active_session().reset() def _check_llmcompressor_model_outputs(temp_dir, save_compressed): if save_compressed: assert (Path(temp_dir) / "recipe.yaml").exists() from compressed_tensors import 
ModelCompressor from compressed_tensors.config import Sparse24BitMaskConfig compressor = ModelCompressor.from_pretrained(temp_dir) assert compressor is not None assert isinstance(compressor.sparsity_config, Sparse24BitMaskConfig) ================================================ FILE: tests/e2e/integrations/test_scattermoe_lora_kernels.py ================================================ # SPDX-License-Identifier: Apache-2.0 # Copyright (c) Axolotl AI # Licensed under the Apache License, Version 2.0 """ Tests for ScatterMoE + LoRA Fused Kernels ========================================== Tests verify correctness of: 1. Forward pass: fused kernel matches naive PyTorch reference 2. Backward pass: gradients for LoRA A, B, and input match reference 3. Frozen weights: expert weight gradients are correctly skipped 4. Various configurations: top-k, grouped_in/out, with/without bias 5. Numerical stability: bf16/fp16 outputs within tolerance of fp32 reference 6. HFScatterMoEGatedMLP with sigmoid routing (GLM/DeepSeek/MiniMax M2) Test strategy: - Reference implementation uses pure PyTorch ops (no Triton) - ScatterMoE routing (flatten_sort_count) is shared between reference and kernel - Tolerances account for tf32 accumulation in Triton kernels """ from types import SimpleNamespace import pytest import torch # Skip all tests if CUDA is not available pytestmark = pytest.mark.skipif( not torch.cuda.is_available(), reason="CUDA required for Triton kernels", ) _SMOE = "axolotl.integrations.kernels.libs.scattermoe_lora" # ============================================================================= # Helpers # =============================================================================


def flatten_sort_count_ref(expert_idxs: torch.Tensor, num_experts: int):
    """Reference implementation of routing.

    Flattens ``expert_idxs`` and sorts the flat expert ids, without tracking
    gradients.

    Args:
        expert_idxs: integer tensor of selected expert ids; it is flattened
            before sorting (``make_test_data`` passes an ``(M, k)`` tensor).
        num_experts: minimum length of the per-expert count vector
            (passed to ``bincount`` as ``minlength``).

    Returns:
        Tuple ``(sorted_expert_idxs, sorted_scattered_idxs, offsets)``:
        the sorted flat expert ids, the positions in the flattened input that
        produce that order (second output of ``torch.sort``), and the
        cumulative sum of the per-expert counts, i.e. the end position of
        each expert's run in the sorted order.
    """
    with torch.no_grad():
        flat = expert_idxs.flatten()
        sorted_expert_idxs, sorted_scattered_idxs = torch.sort(flat)
        counts = flat.bincount(minlength=num_experts)
        offsets = counts.cumsum(-1)
    return sorted_expert_idxs, sorted_scattered_idxs, offsets


def reference_parallel_linear_lora(
    X,
    W,
    k,
    sorted_expert_idxs,
    sorted_scattered_idxs,
    lora_A,
    lora_B,
    scaling,
    x_grouped=False,
    y_grouped=False,
    bias=None,
):
    """
    Pure PyTorch reference for:
        Y[i] = X[i] @ W[e] + scaling * (X[i] @ A[e]^T) @ B[e]^T + b[e]

    Rows are processed one at a time in a Python loop, in sorted order.

    Args:
        X: [M, K] input (token order)
        W: [E, K, N] expert weights
        k: fan-out; when ``x_grouped`` is False the input row for sorted
            position ``i`` is ``X[sorted_scattered_idxs[i] // k]``
        sorted_expert_idxs: [M*k] expert assignments (sorted)
        sorted_scattered_idxs: [M*k] original token indices (sorted)
        lora_A: [r*E, K] LoRA A weights
        lora_B: [N, r*E] LoRA B weights
        scaling: LoRA scaling factor
        x_grouped: if True, ``X`` is indexed directly by sorted position
            (``X[i]``) instead of through ``sorted_scattered_idxs``
        y_grouped: if True, row ``i`` of the result is written at position
            ``i``; otherwise at position ``sorted_scattered_idxs[i]``
        bias: optional per-expert bias indexed as ``bias[e]``; skipped
            when ``None``

    Returns:
        Tensor of shape ``[sorted_expert_idxs.size(0), N]`` with the device
        and dtype of ``X``.
    """
    E, K, N = W.shape
    # Per-expert LoRA rank: lora_A stacks E blocks of R rows each.
    R = lora_A.size(0) // E
    L = sorted_expert_idxs.size(0)  # M * k

    output = torch.zeros(L, N, device=X.device, dtype=X.dtype)

    for i in range(L):
        e = sorted_expert_idxs[i].item()
        if x_grouped:
            x_i = X[i]
        else:
            token_idx = sorted_scattered_idxs[i].item() // k
            x_i = X[token_idx]

        w_e = W[e]  # [K, N]
        a_e = lora_A[e * R : (e + 1) * R, :]  # [r, K]
        b_e = lora_B[:, e * R : (e + 1) * R]  # [N, r]

        # Y = X @ W + scaling * (X @ A^T) @ B^T
        base = x_i @ w_e  # [N]
        lora = scaling * ((x_i @ a_e.T) @ b_e.T)  # [N]
        out_i = base + lora

        if bias is not None:
            out_i = out_i + bias[e]

        if y_grouped:
            output[i] = out_i
        else:
            output[sorted_scattered_idxs[i]] = out_i

    return output


def reference_lora_backward(
    grad_out,
    X,
    W,
    lora_A,
    lora_B,
    scaling,
    sorted_expert_idxs,
    sorted_scattered_idxs,
    expert_offsets,
    k,
    E,
):
    """
    Pure PyTorch reference for LoRA backward pass on grouped data.

    ``grad_out`` and ``X`` are sliced per expert as contiguous row ranges
    ``[prev_offset:curr_offset]`` taken from ``expert_offsets``.

    Args:
        grad_out: output gradient, rows in grouped (expert-sorted) order
        X: input rows in the same grouped order as ``grad_out``
        W: expert weights, indexed as ``W[e]`` ([K, N] per expert)
        lora_A: [r*E, K] LoRA A weights
        lora_B: [N, r*E] LoRA B weights
        scaling: LoRA scaling factor
        sorted_expert_idxs: not used by this reference
        sorted_scattered_idxs: not used by this reference
        expert_offsets: per-expert end offsets into the grouped rows
        k: not used by this reference
        E: number of experts to iterate over

    Returns:
        dX: [M*k, K] input gradient (in grouped order)
        dA: [r*E, K] LoRA A gradient
        dB: [N, r*E] LoRA B gradient

        Slices belonging to experts with an empty row range stay zero.
    """
    R = lora_A.size(0) // E
    dA = torch.zeros_like(lora_A)
    dB = torch.zeros_like(lora_B)
    dX = torch.zeros_like(X)

    prev_offset = 0
    for e in range(E):
        curr_offset = expert_offsets[e].item()
        # Skip experts that received no rows.
        if curr_offset > prev_offset:
            dy_e = grad_out[prev_offset:curr_offset]  # [M_e, N]
            x_e = X[prev_offset:curr_offset]  # [M_e, K]
            a_e = lora_A[e * R : (e + 1) * R, :]  # [r, K]
            b_e = lora_B[:, e * R : (e + 1) * R]  # [N, r]
            w_e = W[e]  # [K, N]

            # Input gradient: dX = dY @ W^T + scaling * (dY @ B) @ A
            dx_base = dy_e @ w_e.T  # [M_e, K]
            dy_b = dy_e @ b_e  # [M_e, r]
            dx_lora = scaling * (dy_b @ a_e)  # [M_e, K]
            dX[prev_offset:curr_offset] = dx_base + dx_lora

            # LoRA A gradient: dA = scaling * (dY @ B)^T @ X
            xa = x_e @ a_e.T  # [M_e, r]
            dA[e * R : (e + 1) * R, :] = scaling * (dy_b.T @ x_e)

            # LoRA B gradient: dB = scaling * dY^T @ (X @ A^T)
            dB[:, e * R : (e + 1) * R] = scaling * (dy_e.T @ xa)

        prev_offset = curr_offset

    return dX, dA, dB


def make_test_data(
    M=32,
    K=64,
    N=128,
    E=4,
    R=8,
    k=2,
    dtype=torch.float32,
    device="cuda",
    seed=42,
):
    """Create test data for ScatterMoE + LoRA tests.

    Seeds the global torch RNG with ``seed`` and draws random inputs,
    expert weights, LoRA weights and an ``(M, k)`` expert assignment.

    Args:
        M: number of input rows
        K: input feature size
        N: output feature size
        E: number of experts
        R: LoRA rank per expert
        k: number of experts drawn per input row
        dtype: dtype of the floating-point tensors
        device: device all tensors are created on
        seed: value passed to ``torch.manual_seed``

    Returns:
        Dict with keys ``X`` [M, K], ``W`` [E, K, N], ``lora_A`` [R*E, K],
        ``lora_B`` [N, R*E], ``scaling`` (0.5), ``k``, ``E``, ``R`` and the
        three routing tensors from ``flatten_sort_count_ref``:
        ``sorted_expert_idxs``, ``sorted_scattered_idxs``, ``expert_offsets``.
    """
    torch.manual_seed(seed)

    X = torch.randn(M, K, device=device, dtype=dtype)
    # Weights are scaled down after sampling (0.02 for W, 0.01 for LoRA).
    W = torch.randn(E, K, N, device=device, dtype=dtype) * 0.02
    lora_A = torch.randn(R * E, K, device=device, dtype=dtype) * 0.01
    lora_B = torch.randn(N, R * E, device=device, dtype=dtype) * 0.01
    scaling = 0.5

    # Generate routing
    selected_experts = torch.randint(0, E, (M, k), device=device)
    sorted_expert_idxs, sorted_scattered_idxs, expert_offsets = flatten_sort_count_ref(
        selected_experts, E
    )

    return {
        "X": X,
        "W": W,
        "lora_A": lora_A,
        "lora_B": lora_B,
        "scaling": scaling,
        "k": k,
        "E": E,
        "R": R,
        "sorted_expert_idxs": sorted_expert_idxs,
        "sorted_scattered_idxs": sorted_scattered_idxs,
        "expert_offsets": expert_offsets,
    }


# ============================================================================= # Test: Forward Pass Correctness #
============================================================================= class TestForwardPass: """Test forward pass of fused scatter2scatter_lora kernel.""" def _run_forward_test( self, M, K, N, E, R, k, dtype=torch.float32, atol=1e-2, rtol=1e-2 ): from importlib import import_module lora_ops = import_module(f"{_SMOE}.kernels.lora_ops") data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k, dtype=dtype) # Reference ref_output = reference_parallel_linear_lora( data["X"], data["W"], data["k"], data["sorted_expert_idxs"], data["sorted_scattered_idxs"], data["lora_A"], data["lora_B"], data["scaling"], ) # Kernel kernel_output = lora_ops.scatter2scatter_lora( X=data["X"], W=data["W"], sorted_expert_idxs=data["sorted_expert_idxs"], sorted_scattered_idxs=data["sorted_scattered_idxs"], k=data["k"], lora_A=data["lora_A"], lora_B=data["lora_B"], scaling=data["scaling"], ) torch.testing.assert_close(kernel_output, ref_output, atol=atol, rtol=rtol) def test_basic(self): """Basic forward pass with small dimensions.""" self._run_forward_test(M=16, K=64, N=64, E=4, R=8, k=1) def test_topk2(self): """Forward pass with top-2 routing.""" self._run_forward_test(M=32, K=64, N=128, E=4, R=8, k=2) def test_larger_rank(self): """Forward pass with larger LoRA rank.""" self._run_forward_test(M=16, K=128, N=128, E=8, R=32, k=2) def test_small_rank(self): """Forward pass with very small LoRA rank.""" self._run_forward_test(M=32, K=64, N=64, E=4, R=4, k=1) def test_many_experts(self): """Forward with many experts, fewer tokens per expert.""" self._run_forward_test(M=64, K=64, N=64, E=16, R=8, k=2) def test_non_power_of_2_dims(self): """Test with dimensions that are not powers of 2.""" self._run_forward_test(M=17, K=96, N=80, E=6, R=16, k=2, atol=2e-2, rtol=2e-2) def test_single_token(self): """Test with a single token.""" self._run_forward_test(M=1, K=64, N=64, E=4, R=8, k=1) def test_bf16(self): """Test with bfloat16 precision.""" self._run_forward_test( M=32, K=64, N=128, E=4, R=8, k=2, 
dtype=torch.bfloat16, atol=5e-2, rtol=5e-2 ) def test_fp16(self): """Test with float16 precision.""" self._run_forward_test( M=32, K=64, N=128, E=4, R=8, k=2, dtype=torch.float16, atol=5e-2, rtol=5e-2 ) class TestForwardGrouped: """Test forward pass with grouped_in/grouped_out configurations.""" def _make_grouped_data(self, M=32, K=64, N=128, E=4, R=8, k=2, dtype=torch.float32): from importlib import import_module base_ops = import_module(f"{_SMOE}.kernels.ops") data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k, dtype=dtype) # Create grouped X grouped_X = base_ops.group(data["X"], data["sorted_scattered_idxs"], fan_out=k) data["grouped_X"] = grouped_X return data def test_x_grouped(self): """Forward with pre-grouped input.""" from importlib import import_module lora_ops = import_module(f"{_SMOE}.kernels.lora_ops") data = self._make_grouped_data() ref_output = reference_parallel_linear_lora( data["grouped_X"], data["W"], data["k"], data["sorted_expert_idxs"], data["sorted_scattered_idxs"], data["lora_A"], data["lora_B"], data["scaling"], x_grouped=True, ) kernel_output = lora_ops.scatter2scatter_lora( X=data["grouped_X"], W=data["W"], sorted_expert_idxs=data["sorted_expert_idxs"], sorted_scattered_idxs=data["sorted_scattered_idxs"], k=1, # When x_grouped, fan_out=1 (already expanded) lora_A=data["lora_A"], lora_B=data["lora_B"], scaling=data["scaling"], x_grouped=True, ) torch.testing.assert_close(kernel_output, ref_output, atol=1e-2, rtol=1e-2) def test_y_grouped(self): """Forward with grouped output.""" from importlib import import_module lora_ops = import_module(f"{_SMOE}.kernels.lora_ops") data = make_test_data() ref_output = reference_parallel_linear_lora( data["X"], data["W"], data["k"], data["sorted_expert_idxs"], data["sorted_scattered_idxs"], data["lora_A"], data["lora_B"], data["scaling"], y_grouped=True, ) kernel_output = lora_ops.scatter2scatter_lora( X=data["X"], W=data["W"], sorted_expert_idxs=data["sorted_expert_idxs"], 
sorted_scattered_idxs=data["sorted_scattered_idxs"], k=data["k"], lora_A=data["lora_A"], lora_B=data["lora_B"], scaling=data["scaling"], y_grouped=True, ) torch.testing.assert_close(kernel_output, ref_output, atol=1e-2, rtol=1e-2) # ============================================================================= # Test: Backward Pass Correctness (LoRA Gradients) # ============================================================================= class TestLoRAGradients: """Test backward LoRA gradient computation (dA, dB).""" def _run_lora_grad_test(self, M, K, N, E, R, k, atol=1e-2, rtol=1e-2): from importlib import import_module lora_ops = import_module(f"{_SMOE}.kernels.lora_ops") base_ops = import_module(f"{_SMOE}.kernels.ops") data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k) # Group X for backward grouped_X = base_ops.group(data["X"], data["sorted_scattered_idxs"], fan_out=k) # Create fake grad_out in grouped order grad_out = torch.randn( data["sorted_expert_idxs"].size(0), N, device="cuda", dtype=torch.float32, ) # Reference _, ref_dA, ref_dB = reference_lora_backward( grad_out, grouped_X, data["W"], data["lora_A"], data["lora_B"], data["scaling"], data["sorted_expert_idxs"], data["sorted_scattered_idxs"], data["expert_offsets"], k, E, ) # Kernel kernel_dA, kernel_dB = lora_ops.group_bwd_lora( DY=grad_out, X=grouped_X, lora_A=data["lora_A"], lora_B=data["lora_B"], expert_offsets=data["expert_offsets"], E=E, scaling=data["scaling"], ) torch.testing.assert_close(kernel_dA, ref_dA, atol=atol, rtol=rtol) torch.testing.assert_close(kernel_dB, ref_dB, atol=atol, rtol=rtol) def test_basic_lora_grads(self): self._run_lora_grad_test(M=32, K=64, N=128, E=4, R=8, k=2) def test_small_rank(self): self._run_lora_grad_test(M=16, K=64, N=64, E=4, R=4, k=1) def test_larger_rank(self): self._run_lora_grad_test( M=16, K=128, N=128, E=8, R=32, k=2, atol=5e-2, rtol=5e-2 ) def test_many_experts(self): self._run_lora_grad_test(M=64, K=64, N=64, E=16, R=8, k=2) def 
def test_input_gradient_matches_reference(self):
    """dX produced by ``ScatterMoELoRA`` autograd equals the reference dX (k=1)."""
    import importlib

    pll = importlib.import_module(f"{_SMOE}.parallel_linear_lora")
    plain_ops = importlib.import_module(f"{_SMOE}.kernels.ops")

    M, K, N, E, R, k = 16, 64, 64, 4, 8, 1
    data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k)
    expert_idxs = data["sorted_expert_idxs"]
    scatter_idxs = data["sorted_scattered_idxs"]
    offsets = data["expert_offsets"]

    # --- autograd path: fresh leaf tensors so .grad is populated ---
    x_leaf = data["X"].clone().requires_grad_(True)
    a_leaf = data["lora_A"].clone().requires_grad_(True)
    b_leaf = data["lora_B"].clone().requires_grad_(True)
    kernel_out = pll.ScatterMoELoRA.apply(
        x_leaf,
        data["W"],
        k,
        expert_idxs,
        scatter_idxs,
        offsets,
        a_leaf,
        b_leaf,
        data["scaling"],
        None,
        None,
        False,
        False,
    )
    upstream = torch.randn_like(kernel_out)
    kernel_out.backward(upstream)

    # --- reference path: operates on grouped X and grouped upstream gradient ---
    x_grouped = plain_ops.group(data["X"], scatter_idxs, fan_out=k)
    upstream_grouped = plain_ops.group(upstream, scatter_idxs, fan_out=1)
    want_dX_grouped, _, _ = reference_lora_backward(
        upstream_grouped,
        x_grouped,
        data["W"],
        data["lora_A"],
        data["lora_B"],
        data["scaling"],
        expert_idxs,
        scatter_idxs,
        offsets,
        k,
        E,
    )

    # The reference dX is in grouped order while x_leaf.grad is in original
    # order; scatter each grouped row back to its source position
    # (no reduction is needed since k=1).
    want_dX = torch.zeros_like(want_dX_grouped)
    want_dX[scatter_idxs] = want_dX_grouped

    torch.testing.assert_close(x_leaf.grad, want_dX, atol=5e-2, rtol=5e-2)
def test_zero_lora_weights_matches_base(self):
    """All-zero LoRA A and B make the fused kernel equal base scatter2scatter."""
    import importlib

    fused_ops = importlib.import_module(f"{_SMOE}.kernels.lora_ops")
    plain_ops = importlib.import_module(f"{_SMOE}.kernels.ops")

    data = make_test_data(M=32, K=64, N=128, E=4, R=8, k=2)
    a_zero = torch.zeros_like(data["lora_A"])
    b_zero = torch.zeros_like(data["lora_B"])

    # Keyword arguments common to the base and the fused call.
    common = {
        name: data[name]
        for name in ("X", "W", "sorted_expert_idxs", "sorted_scattered_idxs", "k")
    }

    expected = plain_ops.scatter2scatter(**common)
    actual = fused_ops.scatter2scatter_lora(
        **common,
        lora_A=a_zero,
        lora_B=b_zero,
        scaling=1.0,
    )

    torch.testing.assert_close(actual, expected, atol=1e-3, rtol=1e-3)
def test_set_and_clear_lora(self):
    """``set_lora`` stores the given objects; ``clear_lora`` resets A and B to None."""
    import importlib

    module = importlib.import_module(f"{_SMOE}.lora_ops")

    num_experts, in_dim, out_dim, rank = 4, 64, 128, 8
    scale = 0.5
    experts = module.ParallelExperts(num_experts, in_dim, out_dim).cuda()

    # LoRA factors: A is (rank * num_experts, in_dim), B is (out_dim, rank * num_experts).
    lora_a = torch.randn(rank * num_experts, in_dim, device="cuda")
    lora_b = torch.randn(out_dim, rank * num_experts, device="cuda")

    experts.set_lora(lora_a, lora_b, scale)
    assert experts._lora_A is lora_a
    assert experts._lora_B is lora_b
    assert experts._lora_scaling == 0.5

    experts.clear_lora()
    assert experts._lora_A is None
    assert experts._lora_B is None
def test_all_tokens_one_expert(self):
    """Kernel matches the reference when every token is routed to expert 0."""
    import importlib

    fused_ops = importlib.import_module(f"{_SMOE}.kernels.lora_ops")

    M, K, N, E, R, k = 16, 64, 64, 4, 8, 1
    scale = 0.5

    # Seeded draws; the order X, W, lora_A, lora_B fixes the generated values.
    torch.manual_seed(42)
    X = torch.randn(M, K, device="cuda")
    W = torch.randn(E, K, N, device="cuda") * 0.02
    lora_A = torch.randn(R * E, K, device="cuda") * 0.01
    lora_B = torch.randn(N, R * E, device="cuda") * 0.01

    # An all-zero selection sends each of the M tokens to expert index 0.
    routing = torch.zeros(M, k, device="cuda", dtype=torch.long)
    expert_idxs, scatter_idxs, _ = flatten_sort_count_ref(routing, E)

    expected = reference_parallel_linear_lora(
        X,
        W,
        k,
        expert_idxs,
        scatter_idxs,
        lora_A,
        lora_B,
        scale,
    )
    actual = fused_ops.scatter2scatter_lora(
        X=X,
        W=W,
        sorted_expert_idxs=expert_idxs,
        sorted_scattered_idxs=scatter_idxs,
        k=k,
        lora_A=lora_A,
        lora_B=lora_B,
        scaling=scale,
    )

    torch.testing.assert_close(actual, expected, atol=1e-2, rtol=1e-2)
def _run_fused_dX_test(
    self, M, K, N, E, R, k, dtype=torch.float32, atol=5e-2, rtol=5e-2
):
    """Compare ``scatter2scatter_lora_dX`` with the two-step reference.

    The reference adds ``scatter2scatter`` on the permuted base weight to
    ``_compute_lora_input_grad`` after scattering the latter to ungrouped order.

    Args:
        M, K, N, E, R, k: Forwarded to ``make_test_data``; ``N`` is also the
            width of the random gradient and ``E`` the expert count.
        dtype: dtype of the test data and of the random gradient.
        atol, rtol: Tolerances for ``torch.testing.assert_close``.
    """
    import importlib

    fused_ops = importlib.import_module(f"{_SMOE}.kernels.lora_ops")
    plain_ops = importlib.import_module(f"{_SMOE}.kernels.ops")
    pll = importlib.import_module(f"{_SMOE}.parallel_linear_lora")

    data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k, dtype=dtype)
    routing = {
        "sorted_expert_idxs": data["sorted_expert_idxs"],
        "sorted_scattered_idxs": data["sorted_scattered_idxs"],
    }

    # Random gradient with one row per routed slot, then grouped with fan_out=1.
    num_rows = data["sorted_expert_idxs"].size(0)
    dy = torch.randn(num_rows, N, device="cuda", dtype=dtype)
    dy_grouped = plain_ops.group(dy, data["sorted_scattered_idxs"], fan_out=1)

    # Reference, base term: grouped dY through the permuted W, ungrouped output.
    base_term = plain_ops.scatter2scatter(
        X=dy_grouped,
        x_grouped=True,
        W=data["W"].permute(0, 2, 1),
        k=1,
        y_grouped=False,
        **routing,
    )
    # Reference, LoRA term: computed in grouped order ...
    lora_term_grouped = pll._compute_lora_input_grad(
        dy_grouped,
        data["lora_A"],
        data["lora_B"],
        data["expert_offsets"],
        E,
        data["scaling"],
    )
    # ... and scattered into the same (ungrouped) row order as the base term.
    lora_term = torch.zeros_like(base_term)
    lora_term[data["sorted_scattered_idxs"]] = lora_term_grouped
    expected = base_term + lora_term

    actual = fused_ops.scatter2scatter_lora_dX(
        DY=dy_grouped,
        W=data["W"],
        k=1,
        lora_A=data["lora_A"],
        lora_B=data["lora_B"],
        scaling=data["scaling"],
        dy_grouped=True,
        dx_grouped=False,
        **routing,
    )

    torch.testing.assert_close(actual, expected, atol=atol, rtol=rtol)
def test_autograd_with_fused_dX(self):
    """Autograd round-trip gives matching results with and without fused dX."""
    import importlib

    pll = importlib.import_module(f"{_SMOE}.parallel_linear_lora")

    M, K, N, E, R, k = 32, 64, 128, 4, 8, 2
    data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k)

    def forward_backward(use_fused_dX):
        """Run apply() + sum().backward() on fresh leaves; return output and grads."""
        x = data["X"].clone().requires_grad_(True)
        a = data["lora_A"].clone().requires_grad_(True)
        b = data["lora_B"].clone().requires_grad_(True)
        out = pll.ScatterMoELoRA.apply(
            x,
            data["W"],
            k,
            data["sorted_expert_idxs"],
            data["sorted_scattered_idxs"],
            data["expert_offsets"],
            a,
            b,
            data["scaling"],
            None,
            None,
            False,
            False,
            use_fused_dX,
        )
        out.sum().backward()
        return out, x.grad, a.grad, b.grad

    out_plain, dx_plain, da_plain, db_plain = forward_backward(False)
    out_fused, dx_fused, da_fused, db_fused = forward_backward(True)

    # The flag only concerns backward, so the forward outputs are held to a
    # much tighter tolerance than the gradients.
    torch.testing.assert_close(out_plain, out_fused, atol=1e-5, rtol=1e-5)

    grad_tol = {"atol": 5e-2, "rtol": 5e-2}
    torch.testing.assert_close(dx_plain, dx_fused, **grad_tol)
    torch.testing.assert_close(da_plain, da_fused, **grad_tol)
    torch.testing.assert_close(db_plain, db_fused, **grad_tol)
class TestFusedGatherBackward:
    """Checks ``group_bwd_lora_fused`` against ``group()`` + ``group_bwd_lora``."""

    def _run_fused_gather_test(
        self, M, K, N, E, R, k, dtype=torch.float32, atol=5e-2, rtol=5e-2
    ):
        """Compare fused dA/dB with the group-then-backward reference.

        Args:
            M, K, N, E, R, k: Forwarded to ``make_test_data``; ``N`` is also
                the width of the random gradient, ``k`` the fan-out for X.
            dtype: dtype of the test data and of the random gradient.
            atol, rtol: Tolerances for ``torch.testing.assert_close``.
        """
        import importlib

        fused_ops = importlib.import_module(f"{_SMOE}.kernels.lora_ops")
        plain_ops = importlib.import_module(f"{_SMOE}.kernels.ops")

        data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k, dtype=dtype)
        scatter_idxs = data["sorted_scattered_idxs"]
        adapter = {
            "lora_A": data["lora_A"],
            "lora_B": data["lora_B"],
            "scaling": data["scaling"],
        }

        # Random gradient with one row per routed slot, left ungrouped.
        num_rows = data["sorted_expert_idxs"].size(0)
        dy = torch.randn(num_rows, N, device="cuda", dtype=dtype)

        # Reference: explicitly group dY and X, then run the unfused backward.
        dy_grouped = plain_ops.group(dy, scatter_idxs, fan_out=1)
        x_grouped = plain_ops.group(data["X"], scatter_idxs, fan_out=k)
        want_dA, want_dB = fused_ops.group_bwd_lora(
            DY=dy_grouped,
            X=x_grouped,
            expert_offsets=data["expert_offsets"],
            E=E,
            **adapter,
        )

        # Fused path: receives the ungrouped tensors plus the scatter indices.
        got_dA, got_dB = fused_ops.group_bwd_lora_fused(
            DY=dy,
            X=data["X"],
            expert_offsets=data["expert_offsets"],
            sorted_scattered_idxs=scatter_idxs,
            E=E,
            k=k,
            **adapter,
        )

        torch.testing.assert_close(got_dA, want_dA, atol=atol, rtol=rtol)
        torch.testing.assert_close(got_dB, want_dB, atol=atol, rtol=rtol)

    def test_basic(self):
        shape = {"M": 32, "K": 64, "N": 128, "E": 4, "R": 8, "k": 2}
        self._run_fused_gather_test(**shape)

    def test_large(self):
        shape = {"M": 256, "K": 256, "N": 512, "E": 8, "R": 16, "k": 2}
        self._run_fused_gather_test(**shape)

    def test_single_expert(self):
        shape = {"M": 64, "K": 128, "N": 256, "E": 1, "R": 8, "k": 1}
        self._run_fused_gather_test(**shape)

    def test_k1(self):
        shape = {"M": 64, "K": 64, "N": 128, "E": 4, "R": 8, "k": 1}
        self._run_fused_gather_test(**shape)

    def test_many_experts(self):
        shape = {"M": 128, "K": 64, "N": 128, "E": 16, "R": 8, "k": 4}
        self._run_fused_gather_test(**shape)

    def test_bf16(self):
        shape = {"M": 64, "K": 128, "N": 256, "E": 4, "R": 16, "k": 2}
        self._run_fused_gather_test(
            **shape, dtype=torch.bfloat16, atol=1e-1, rtol=1e-1
        )

    def test_autograd_with_fused_gather(self):
        """Autograd round-trip matches with and without ``use_fused_gather``."""
        import importlib

        pll = importlib.import_module(f"{_SMOE}.parallel_linear_lora")

        M, K, N, E, R, k = 32, 64, 128, 4, 8, 2
        data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k)

        def forward_backward(use_fused_gather):
            """Run apply() + sum().backward() on fresh leaves; return output and grads."""
            x = data["X"].clone().requires_grad_(True)
            a = data["lora_A"].clone().requires_grad_(True)
            b = data["lora_B"].clone().requires_grad_(True)
            out = pll.ScatterMoELoRA.apply(
                x,
                data["W"],
                k,
                data["sorted_expert_idxs"],
                data["sorted_scattered_idxs"],
                data["expert_offsets"],
                a,
                b,
                data["scaling"],
                None,
                None,
                False,
                False,
                False,  # use_fused_dX stays off in both runs
                use_fused_gather,
            )
            out.sum().backward()
            return out, x.grad, a.grad, b.grad

        out_plain, dx_plain, da_plain, db_plain = forward_backward(False)
        out_fused, dx_fused, da_fused, db_fused = forward_backward(True)

        # Forward outputs are compared at a tight tolerance.
        torch.testing.assert_close(out_plain, out_fused, atol=1e-5, rtol=1e-5)

        grad_tol = {"atol": 5e-2, "rtol": 5e-2}
        # dA/dB are the gradients the fused-gather flag is about.
        torch.testing.assert_close(da_plain, da_fused, **grad_tol)
        torch.testing.assert_close(db_plain, db_fused, **grad_tol)
        # dX is checked as well.
        torch.testing.assert_close(dx_plain, dx_fused, **grad_tol)
def test_round_with_fused_gather(self):
    """Fused gather fed rounded offsets matches ``group()`` + ``group_bwd_lora``."""
    import importlib

    fused_ops = importlib.import_module(f"{_SMOE}.kernels.lora_ops")
    plain_ops = importlib.import_module(f"{_SMOE}.kernels.ops")

    M, K, N, E, R, k = 64, 64, 128, 4, 8, 2
    data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k)
    scatter_idxs = data["sorted_scattered_idxs"]
    adapter = {
        "lora_A": data["lora_A"],
        "lora_B": data["lora_B"],
        "scaling": data["scaling"],
    }

    num_rows = data["sorted_expert_idxs"].size(0)
    dy = torch.randn(num_rows, N, device="cuda")

    # Reference: explicit grouping followed by the unfused backward kernel.
    dy_grouped = plain_ops.group(dy, scatter_idxs, fan_out=1)
    x_grouped = plain_ops.group(data["X"], scatter_idxs, fan_out=k)
    want_dA, want_dB = fused_ops.group_bwd_lora(
        DY=dy_grouped,
        X=x_grouped,
        expert_offsets=data["expert_offsets"],
        E=E,
        **adapter,
    )

    # Token rounding; the padded expert indices are not needed below.
    _, padded_scatter, padded_offsets, real_offsets = fused_ops.round_expert_counts(
        data["sorted_expert_idxs"],
        scatter_idxs,
        data["expert_offsets"],
        E=E,
    )

    # Fused gather driven by the padded layout, with the real offsets supplied.
    got_dA, got_dB = fused_ops.group_bwd_lora_fused(
        DY=dy,
        X=data["X"],
        expert_offsets=padded_offsets,
        sorted_scattered_idxs=padded_scatter,
        E=E,
        k=k,
        real_expert_offsets=real_offsets,
        **adapter,
    )

    torch.testing.assert_close(got_dA, want_dA, atol=5e-2, rtol=5e-2)
    torch.testing.assert_close(got_dB, want_dB, atol=5e-2, rtol=5e-2)
make_test_data(M=M, K=K, N=N, E=E, R=R, k=k) # Baseline: no optimizations X1 = data["X"].clone().requires_grad_(True) A1 = data["lora_A"].clone().requires_grad_(True) B1 = data["lora_B"].clone().requires_grad_(True) out1 = pll.ScatterMoELoRA.apply( X1, data["W"], k, data["sorted_expert_idxs"], data["sorted_scattered_idxs"], data["expert_offsets"], A1, B1, data["scaling"], None, None, False, False, False, False, # no optimizations ) out1.sum().backward() # Both optimizations X2 = data["X"].clone().requires_grad_(True) A2 = data["lora_A"].clone().requires_grad_(True) B2 = data["lora_B"].clone().requires_grad_(True) out2 = pll.ScatterMoELoRA.apply( X2, data["W"], k, data["sorted_expert_idxs"], data["sorted_scattered_idxs"], data["expert_offsets"], A2, B2, data["scaling"], None, None, False, False, True, True, # use_fused_dX=True, use_fused_gather=True ) out2.sum().backward() # Forward identical torch.testing.assert_close(out1, out2, atol=1e-5, rtol=1e-5) # All gradients match torch.testing.assert_close(X1.grad, X2.grad, atol=5e-2, rtol=5e-2) torch.testing.assert_close(A1.grad, A2.grad, atol=5e-2, rtol=5e-2) torch.testing.assert_close(B1.grad, B2.grad, atol=5e-2, rtol=5e-2) # ============================================================================= # Test: HFScatterMoEGatedMLP with Sigmoid Routing # ============================================================================= def _reference_moe_forward( hidden_states, gate_weight, gate_up_proj, down_proj, act_fn, routing_weights, selected_experts, num_experts, ): """Pure PyTorch reference for a full MoE forward pass. Args: hidden_states: [T, H] gate_weight: [E, H] gate_up_proj: [E, 2*FF, H] down_proj: [E, H, FF] act_fn: activation function (e.g. 
torch.nn.SiLU()) routing_weights: [T, K] routing weights selected_experts: [T, K] expert indices num_experts: int Returns: output: [T, H] """ T, H = hidden_states.shape K = selected_experts.shape[1] output = torch.zeros(T, H, device=hidden_states.device, dtype=hidden_states.dtype) for t in range(T): for j in range(K): e = selected_experts[t, j].item() w = routing_weights[t, j].item() # gate_up projection gup = hidden_states[t] @ gate_up_proj[e].T # [2*I] I_dim = gup.shape[0] // 2 gates = gup[:I_dim] up = gup[I_dim:] # activation h = act_fn(gates) * up # down projection out = h @ down_proj[e].T # [H] output[t] += w * out return output def _make_mock_sigmoid_moe_block( T=16, H=64, FF=32, E=8, K=2, n_group=2, topk_group=1, bias_on_gate=True ): """Create a mock MoE block with sigmoid routing for GPU testing.""" gate_up_proj = torch.randn(E, 2 * FF, H, device="cuda") * 0.02 down_proj = torch.randn(E, H, FF, device="cuda") * 0.02 act_fn = torch.nn.SiLU() experts = SimpleNamespace( gate_up_proj=gate_up_proj, down_proj=down_proj, act_fn=act_fn, num_experts=E, ) if bias_on_gate: gate = SimpleNamespace( weight=torch.randn(E, H, device="cuda") * 0.1, e_score_correction_bias=torch.zeros(E, device="cuda"), ) moe_block = SimpleNamespace( gate=gate, experts=experts, top_k=K, n_routed_experts=E, n_group=n_group, topk_group=topk_group, norm_topk_prob=True, routed_scaling_factor=1.0, ) else: # minimax_m2 style gate = SimpleNamespace( weight=torch.randn(E, H, device="cuda") * 0.1, top_k=K, ) moe_block = SimpleNamespace( gate=gate, experts=experts, top_k=K, e_score_correction_bias=torch.zeros(E, device="cuda"), ) return moe_block, T, H, FF, E, K class TestHFScatterMoESigmoidRouting: """Test HFScatterMoEGatedMLP forward with sigmoid routing on GPU.""" def test_forward_matches_reference_bias_on_gate(self): """Forward pass with sigmoid routing (bias on gate) matches reference.""" from axolotl.integrations.kernels.libs.scattermoe_lora.layers import ( HFScatterMoEGatedMLP, 
def test_softmax_routing_still_works(self):
    """Softmax-routed block: kernel forward agrees with the pure-PyTorch reference."""
    import axolotl.integrations.kernels.libs.scattermoe_lora.layers as smoe_layers

    T, H, FF, E, K = 16, 64, 32, 4, 2

    # Expert weights first, then the router weight, then the input: the draw
    # order determines the random values.
    gate_up_proj = torch.randn(E, 2 * FF, H, device="cuda") * 0.02
    down_proj = torch.randn(E, H, FF, device="cuda") * 0.02
    activation = torch.nn.SiLU()

    experts = SimpleNamespace(
        gate_up_proj=gate_up_proj,
        down_proj=down_proj,
        act_fn=activation,
        num_experts=E,
    )
    router = SimpleNamespace(
        weight=torch.randn(E, H, device="cuda") * 0.1,
        top_k=K,
        num_experts=E,
        norm_topk_prob=True,
    )
    block = SimpleNamespace(gate=router, experts=experts)

    hidden = torch.randn(1, T, H, device="cuda")
    tokens = hidden.view(-1, H)

    # Obtain the routing decision separately so the reference can reuse it.
    routing_weights, selected_experts, _, _ = smoe_layers._softmax_topk_route(
        block, router, tokens, router.weight, None
    )
    expected = _reference_moe_forward(
        tokens,
        router.weight,
        gate_up_proj,
        down_proj,
        activation,
        routing_weights,
        selected_experts,
        E,
    )

    actual = smoe_layers.HFScatterMoEGatedMLP.forward(block, hidden).view(-1, H)

    torch.testing.assert_close(
        actual.float(),
        expected.float(),
        atol=5e-2,
        rtol=5e-2,
    )
shared_experts=shared_experts_fn, top_k=K, n_routed_experts=E, n_group=1, norm_topk_prob=True, routed_scaling_factor=1.0, ) hidden = torch.randn(1, T, H, device="cuda") # Should not raise; output should include shared expert contribution output = HFScatterMoEGatedMLP.forward(moe_block, hidden) assert output.shape == (1, T, H) # Run without shared expert to verify it changes the output moe_block_no_shared = SimpleNamespace( gate=gate, experts=experts, top_k=K, n_routed_experts=E, n_group=1, norm_topk_prob=True, routed_scaling_factor=1.0, ) output_no_shared = HFScatterMoEGatedMLP.forward(moe_block_no_shared, hidden) assert not torch.equal(output, output_no_shared) def test_shared_expert_with_gate(self): """Qwen2MoE style: shared_expert + shared_expert_gate.""" from axolotl.integrations.kernels.libs.scattermoe_lora.layers import ( HFScatterMoEGatedMLP, ) T, H, FF, E, K = 8, 64, 32, 4, 2 gate_up_proj = torch.randn(E, 2 * FF, H, device="cuda") * 0.02 down_proj = torch.randn(E, H, FF, device="cuda") * 0.02 act_fn = torch.nn.SiLU() experts = SimpleNamespace( gate_up_proj=gate_up_proj, down_proj=down_proj, act_fn=act_fn, num_experts=E, ) shared_W = torch.randn(H, H, device="cuda") * 0.01 shared_expert_fn = lambda x: x @ shared_W.T # noqa: E731 # Gate that returns 0 -> sigmoid(0) = 0.5 gate_W = torch.zeros(H, H, device="cuda") shared_expert_gate_fn = lambda x: x @ gate_W.T # noqa: E731 gate = SimpleNamespace( weight=torch.randn(E, H, device="cuda") * 0.1, top_k=K, num_experts=E, norm_topk_prob=True, ) moe_block = SimpleNamespace( gate=gate, experts=experts, shared_expert=shared_expert_fn, shared_expert_gate=shared_expert_gate_fn, ) hidden = torch.randn(1, T, H, device="cuda") output = HFScatterMoEGatedMLP.forward(moe_block, hidden) assert output.shape == (1, T, H) ================================================ FILE: tests/e2e/integrations/test_scattermoe_lora_olmoe.py ================================================ # SPDX-License-Identifier: Apache-2.0 # Copyright (c) 
Axolotl AI # Licensed under the Apache License, Version 2.0 """ Integration tests: OLMoE + peft LoRA + ScatterMoE fused kernels. Validates that scattermoe_lora fused kernels produce correct results when used with HuggingFace OLMoE models and peft LoRA adapters applied via ``target_parameters``. Key things tested ----------------- - LoRA weight layout conversion between peft (rank-major) and scattermoe (expert-major) - Base forward equivalence: per-expert reference vs ScatterMoE kernels (no LoRA) - LoRA forward equivalence: peft merged-weight approach vs scattermoe fused kernels - Backward gradient correctness through the fused LoRA path - ``kernelize()`` integration via ``LocalLayerRepository`` """ from pathlib import Path import pytest import torch import torch.nn as nn import torch.nn.functional as F from peft import LoraConfig, get_peft_model from transformers import OlmoeConfig from transformers.models.olmoe.modeling_olmoe import OlmoeSparseMoeBlock _SMOE = "axolotl.integrations.kernels.libs.scattermoe_lora" # Try to import from axolotl's scattermoe_lora.layers; may fail on CPU without triton. try: from axolotl.integrations.kernels.libs.scattermoe_lora.layers import ( _unwrap_experts_lora, _unwrap_gate_lora, peft_lora_B_to_scattermoe, peft_lora_to_scattermoe, ) HAS_SCATTERMOE = True except (ImportError, ModuleNotFoundError): HAS_SCATTERMOE = False # Provide pure-torch fallbacks for CPU-only layout conversion tests. 
# Pure-torch fallbacks for CPU-only layout conversion tests.
# NOTE(review): the four helper definitions below follow the
# ``except (ImportError, ModuleNotFoundError)`` branch of the import guard and
# reuse the names it tries to import; they look like they belong inside that
# branch (only active when the axolotl import fails) -- confirm the nesting
# when restoring the file's original indentation.
def peft_lora_B_to_scattermoe(peft_B, num_experts, rank):
    """Reorder the columns of a peft ``lora_B`` from rank-major to expert-major.

    ``peft_B`` is read as ``[N, rank * num_experts]`` whose column axis unfolds
    to ``(rank, num_experts)``. The result has the same shape, but its column
    axis unfolds to ``(num_experts, rank)``, so columns
    ``e * rank : (e + 1) * rank`` hold expert ``e``.
    """
    N = peft_B.shape[0]
    return (
        peft_B.reshape(N, rank, num_experts)
        .permute(0, 2, 1)
        .contiguous()
        .reshape(N, num_experts * rank)
    )


def peft_lora_to_scattermoe(peft_A, peft_B, num_experts, rank):
    """Convert a peft ``(lora_A, lora_B)`` pair into the scattermoe pair.

    The roles of A and B are swapped and each matrix is transposed:

    * ``smoe_A`` is ``[rank * num_experts, peft_B.shape[0]]`` and equals the
      transpose of the expert-major reordering of ``peft_B``.
    * ``smoe_B`` is ``[peft_A.shape[1], rank * num_experts]`` and equals the
      transpose of the first ``rank * num_experts`` rows of ``peft_A``.

    Both outputs are freshly allocated, contiguous, and use the device and
    dtype of ``peft_A``.
    """
    total_rank = rank * num_experts
    peft_B_em = peft_lora_B_to_scattermoe(peft_B, num_experts, rank)
    K_inter, N_hidden = peft_B.shape[0], peft_A.shape[1]

    smoe_A = torch.empty(
        total_rank,
        K_inter,
        device=peft_A.device,
        dtype=peft_A.dtype,
    )
    smoe_B = torch.empty(
        N_hidden,
        total_rank,
        device=peft_A.device,
        dtype=peft_A.dtype,
    )
    # The per-expert slices tile the whole rank axis, so filling them one by
    # one is the same as copying the two transposes in a single shot.
    # ``copy_`` also performs the dtype/device conversion to match ``peft_A``.
    smoe_A.copy_(peft_B_em.T)
    smoe_B.copy_(peft_A[:total_rank, :].T)
    return smoe_A, smoe_B


def _unwrap_experts_lora(experts_module):
    """Fallback: return ``experts_module`` unchanged and report no LoRA.

    The result is ``(experts_module, None, None)``.
    """
    return experts_module, None, None


def _unwrap_gate_lora(gate_module):
    """Split a (possibly LoRA-wrapped) router gate into base gate and delta.

    Returns ``(base_gate, base_weight, delta)``. ``delta`` is
    ``scaling * (lora_B @ lora_A)`` for the first active adapter, or ``None``
    when the module has no ``base_layer``/``lora_A`` attributes or the adapter
    name is missing from ``lora_A``.
    """
    is_wrapped = hasattr(gate_module, "base_layer") and hasattr(gate_module, "lora_A")
    if not is_wrapped:
        return gate_module, gate_module.weight, None

    base_gate = gate_module.base_layer
    active = getattr(gate_module, "active_adapters", ["default"])
    name = active[0] if active else "default"
    lora_A_dict = getattr(gate_module, "lora_A", {})
    lora_B_dict = getattr(gate_module, "lora_B", {})
    scaling_dict = getattr(gate_module, "scaling", {})

    if name not in lora_A_dict:
        return base_gate, base_gate.weight, None

    lora_A = lora_A_dict[name].weight
    lora_B = lora_B_dict[name].weight
    delta = scaling_dict[name] * (lora_B @ lora_A)
    return base_gate, base_gate.weight, delta


# =============================================================================
# Configuration
# =============================================================================

FULL_OLMOE_CONFIG = dict(
    hidden_size=2048,
    intermediate_size=1024,
    num_experts=64,
    num_experts_per_tok=8,
    hidden_act="silu",
    norm_topk_prob=False,
)

SMALL_OLMOE_CONFIG = dict(
    hidden_size=128,
    intermediate_size=48,  # non-square: 2*inter=96 != hidden=128
    num_experts=8,
    num_experts_per_tok=2,
    hidden_act="silu",
    norm_topk_prob=False,
)
requires_cuda = pytest.mark.skipif( not torch.cuda.is_available(), reason="CUDA not available" ) def make_olmoe_config(use_full=False): cfg = dict(FULL_OLMOE_CONFIG if use_full else SMALL_OLMOE_CONFIG) cfg["experts_implementation"] = "grouped_mm" return OlmoeConfig(**cfg) # ============================================================================= # Layout conversion utilities (test-local helpers) # ============================================================================= def scattermoe_lora_B_to_peft(smoe_B, num_experts, rank): """Inverse of ``peft_lora_B_to_scattermoe``.""" N = smoe_B.shape[0] return ( smoe_B.reshape(N, num_experts, rank) .permute(0, 2, 1) .contiguous() .reshape(N, num_experts * rank) ) def peft_gate_up_lora_to_scattermoe(peft_A, peft_B, num_experts, rank): """Convert peft LoRA for gate_up_proj to scattermoe layout. Both gate_up_proj and down_proj need the A<->B swap because scattermoe transposes the parameter (W = param.T). """ return peft_lora_to_scattermoe(peft_A, peft_B, num_experts, rank) # ============================================================================= # Helpers # ============================================================================= def _init_expert_weights(moe_block): """Initialize OlmoeExperts parameters which use torch.empty (uninitialized). Without this, gate_up_proj and down_proj contain garbage/NaN values. 
""" with torch.no_grad(): nn.init.kaiming_uniform_(moe_block.experts.gate_up_proj) nn.init.kaiming_uniform_(moe_block.experts.down_proj) return moe_block class MinimalOLMoEModel(nn.Module): """Thin wrapper so peft's get_peft_model can attach adapters.""" def __init__(self, config): super().__init__() self.moe = OlmoeSparseMoeBlock(config) _init_expert_weights(self.moe) def forward(self, x): return self.moe(x) def _get_routing(moe_block, hidden_states): """Run the router and return (routing_weights, selected_experts).""" with torch.no_grad(): _, routing_weights, selected_experts = moe_block.gate( hidden_states.view(-1, hidden_states.size(-1)) ) return routing_weights, selected_experts def _reference_moe_forward( x_flat, gate_up_proj, down_proj, act_fn, top_k_index, top_k_weights, num_experts, ): """Pure-PyTorch per-expert reference MoE forward (no LoRA). Uses F.linear per expert for an apples-to-apples comparison with the ScatterMoE kernel path. """ final = torch.zeros_like(x_flat) expert_mask = F.one_hot(top_k_index, num_classes=num_experts).permute(2, 1, 0) for e in range(num_experts): top_k_pos, token_idx = torch.where(expert_mask[e]) if token_idx.numel() == 0: continue cur = x_flat[token_idx] gate_up = F.linear(cur, gate_up_proj[e]) g, u = gate_up.chunk(2, dim=-1) h = act_fn(g) * u out = F.linear(h, down_proj[e]) out = out * top_k_weights[token_idx, top_k_pos, None] final.index_add_(0, token_idx, out.to(final.dtype)) return final def _reference_moe_forward_with_lora( x_flat, gate_up_proj, down_proj, act_fn, top_k_index, top_k_weights, num_experts, gup_delta, down_delta, ): """Pure-PyTorch reference MoE forward with pre-computed weight deltas.""" merged_gup = gate_up_proj + gup_delta merged_down = down_proj + down_delta return _reference_moe_forward( x_flat, merged_gup, merged_down, act_fn, top_k_index, top_k_weights, num_experts, ) def _compute_delta_from_scattermoe_lora(lora_A, lora_B, scaling, E, r, param_shape): """Compute additive weight delta from 
scattermoe-layout LoRA weights. delta[e] = scaling * B_e @ A_e where A_e [r,K], B_e [N,r] -> [N,K]. """ delta = torch.zeros(param_shape, device=lora_A.device, dtype=lora_A.dtype) for e in range(E): A_e = lora_A[e * r : (e + 1) * r, :] B_e = lora_B[:, e * r : (e + 1) * r] delta[e] = scaling * (B_e @ A_e) return delta # ============================================================================= # Tests: Layout conversion # ============================================================================= class TestLoRABLayoutConversion: """Test the peft <-> scattermoe lora_B layout conversion.""" def test_roundtrip(self): E, r, N = 8, 4, 64 original = torch.randn(N, E * r) converted = peft_lora_B_to_scattermoe(original, E, r) back = scattermoe_lora_B_to_peft(converted, E, r) torch.testing.assert_close(back, original) def test_per_expert_slices(self): """After conversion, scattermoe slicing gives the same per-expert matrices as peft's reshape slicing.""" E, r, N = 4, 2, 16 peft_B = torch.randn(N, E * r) smoe_B = peft_lora_B_to_scattermoe(peft_B, E, r) peft_reshaped = peft_B.reshape(N, r, E) for e in range(E): torch.testing.assert_close( smoe_B[:, e * r : (e + 1) * r], peft_reshaped[:, :, e], ) def test_lora_A_already_compatible(self): """lora_A layout is identical between peft and scattermoe.""" E, r, K = 4, 2, 16 lora_A = torch.randn(E * r, K) peft_reshaped = lora_A.reshape(E, r, K) for e in range(E): torch.testing.assert_close( lora_A[e * r : (e + 1) * r, :], peft_reshaped[e], ) def test_delta_weight_equivalence(self): """peft's einsum delta matches per-expert B @ A with converted layouts.""" E, r, K, N = 8, 4, 32, 64 peft_A = torch.randn(E * r, K) peft_B = torch.randn(N, E * r) scaling = 2.0 A_r = peft_A.reshape(E, r, K) B_r = peft_B.reshape(N, r, E) delta_peft = torch.einsum("o r e, e r i -> e i o", B_r, A_r) * scaling smoe_B = peft_lora_B_to_scattermoe(peft_B, E, r) for e in range(E): A_e = peft_A[e * r : (e + 1) * r, :] B_e = smoe_B[:, e * r : (e + 1) * r] delta_e 
= scaling * (B_e @ A_e) torch.testing.assert_close(delta_e, delta_peft[e].T, atol=1e-5, rtol=1e-5) def test_down_proj_conversion(self): """Verify peft_lora_to_scattermoe produces correct delta.""" E, r = 4, 2 hidden, inter = 32, 16 scaling = 2.0 peft_A = torch.randn(E * r, hidden) peft_B = torch.randn(inter, E * r) A_r = peft_A.reshape(E, r, hidden) B_r = peft_B.reshape(inter, r, E) delta_peft = torch.einsum("o r e, e r i -> e i o", B_r, A_r) * scaling smoe_A, smoe_B = peft_lora_to_scattermoe(peft_A, peft_B, E, r) for e in range(E): A_e = smoe_A[e * r : (e + 1) * r, :] B_e = smoe_B[:, e * r : (e + 1) * r] delta_smoe_e = scaling * (B_e @ A_e) torch.testing.assert_close( delta_smoe_e, delta_peft[e], atol=1e-5, rtol=1e-5 ) def test_gate_up_proj_conversion(self): """Verify gate_up_proj LoRA conversion with non-square dims (Qwen3-like). gate_up_proj param: [E, 2*inter, hidden]. peft: in_features=2*inter, out_features=hidden. peft lora_A: [r*E, 2*inter], lora_B: [hidden, r*E]. scattermoe W = param.T = [E, hidden, 2*inter], K=hidden, N=2*inter. scattermoe needs: lora_A [r*E, K=hidden], lora_B [N=2*inter, r*E]. Uses non-square dims (hidden=32 != 2*inter=24) to catch A<->B swap bugs. 
""" E, r = 4, 2 hidden, inter = 32, 12 # 2*inter=24 != hidden=32 scaling = 2.0 # peft assigns: in_features=2*inter, out_features=hidden peft_A = torch.randn(E * r, 2 * inter) # [r*E, in_features=2*inter] peft_B = torch.randn(hidden, E * r) # [out_features=hidden, r*E] # peft delta via einsum: "o r e, e r i -> e i o" A_r = peft_A.reshape(E, r, 2 * inter) B_r = peft_B.reshape(hidden, r, E) delta_peft = torch.einsum("o r e, e r i -> e i o", B_r, A_r) * scaling # delta_peft[e] has shape [in_features, out_features] = [2*inter, hidden] # = param[e] shape [2*inter, hidden] smoe_A, smoe_B = peft_gate_up_lora_to_scattermoe(peft_A, peft_B, E, r) # smoe_A should be [r*E, K=hidden], smoe_B should be [N=2*inter, r*E] assert smoe_A.shape == (E * r, hidden), ( f"Expected {(E * r, hidden)}, got {smoe_A.shape}" ) assert smoe_B.shape == (2 * inter, E * r), ( f"Expected {(2 * inter, E * r)}, got {smoe_B.shape}" ) for e in range(E): A_e = smoe_A[e * r : (e + 1) * r, :] # [r, K=hidden] B_e = smoe_B[:, e * r : (e + 1) * r] # [N=2*inter, r] delta_smoe_e = scaling * (B_e @ A_e) # [2*inter, hidden] # Should match peft delta which is [2*inter, hidden] = param[e] torch.testing.assert_close( delta_smoe_e, delta_peft[e], atol=1e-5, rtol=1e-5 ) # ============================================================================= # Tests: peft weight extraction # ============================================================================= class TestPeftLoRAWeightExtraction: """Test extracting peft LoRA weights for OLMoE.""" def test_peft_creates_correct_shapes(self): config = make_olmoe_config(use_full=False) E, r = config.num_experts, 4 model = MinimalOLMoEModel(config) lora_config = LoraConfig( r=r, lora_alpha=16, target_modules=[], target_parameters=[ "gate.weight", "experts.gate_up_proj", "experts.down_proj", ], bias="none", ) peft_model = get_peft_model(model, lora_config) trainable = {n: p for n, p in peft_model.named_parameters() if p.requires_grad} # Gate router assert 
trainable["base_model.model.moe.gate.lora_A.default.weight"].shape == ( r, config.hidden_size, ) assert trainable["base_model.model.moe.gate.lora_B.default.weight"].shape == ( E, r, ) # gate_up_proj [E, 2*inter, hidden] # peft: in_features=2*inter (dim 1), out_features=hidden (dim 2) assert trainable[ "base_model.model.moe.experts.base_layer.lora_A.default.weight" ].shape == (E * r, 2 * config.intermediate_size) assert trainable[ "base_model.model.moe.experts.base_layer.lora_B.default.weight" ].shape == (config.hidden_size, E * r) # down_proj [E, hidden, inter] # peft: in_features=hidden (dim 1), out_features=inter (dim 2) assert trainable[ "base_model.model.moe.experts.lora_A.default.weight" ].shape == (E * r, config.hidden_size) assert trainable[ "base_model.model.moe.experts.lora_B.default.weight" ].shape == (config.intermediate_size, E * r) @requires_cuda def test_peft_forward_runs(self): """Smoke test: peft model forward pass completes (needs CUDA for grouped_mm).""" config = make_olmoe_config(use_full=False) model = MinimalOLMoEModel(config) lora_config = LoraConfig( r=4, lora_alpha=16, target_modules=[], target_parameters=[ "gate.weight", "experts.gate_up_proj", "experts.down_proj", ], bias="none", ) peft_model = get_peft_model(model, lora_config) x = torch.randn(1, 4, config.hidden_size) out = peft_model(x) assert out.shape == x.shape @pytest.mark.skipif( not HAS_SCATTERMOE, reason="scattermoe_lora not importable (no triton)" ) def test_unwrap_experts_lora(self): """Test that _unwrap_experts_lora correctly detects LoRA wrappers.""" config = make_olmoe_config(use_full=False) model = MinimalOLMoEModel(config) lora_config = LoraConfig( r=4, lora_alpha=16, target_modules=[], target_parameters=["experts.gate_up_proj", "experts.down_proj"], bias="none", ) peft_model = get_peft_model(model, lora_config) base_moe = peft_model.base_model.model.moe # Experts should be wrapped by ParamWrapper experts, gup_lora, down_lora = _unwrap_experts_lora(base_moe.experts) # Base 
experts should have the raw parameters assert hasattr(experts, "gate_up_proj") assert hasattr(experts, "down_proj") # LoRA should be detected assert gup_lora is not None, "gate_up_proj LoRA not detected" assert down_lora is not None, "down_proj LoRA not detected" # Check shapes (after peft->scattermoe conversion with A<->B swap) # gate_up_proj W = param.T = [E, hidden, 2*inter], K=hidden, N=2*inter E, r = config.num_experts, 4 gup_A, gup_B, gup_s = gup_lora assert gup_A.shape == (E * r, config.hidden_size), ( f"gate_up_proj smoe_A: expected [r*E, K=hidden]={(E * r, config.hidden_size)}, " f"got {gup_A.shape}" ) assert gup_B.shape == (2 * config.intermediate_size, E * r), ( f"gate_up_proj smoe_B: expected [N=2*inter, r*E]=" f"{(2 * config.intermediate_size, E * r)}, got {gup_B.shape}" ) # down_proj W = param.T = [E, inter, hidden], K=inter, N=hidden down_A, down_B, down_s = down_lora assert down_A.shape == (E * r, config.intermediate_size), ( f"down_proj smoe_A: expected [r*E, K=inter]={(E * r, config.intermediate_size)}, " f"got {down_A.shape}" ) assert down_B.shape == (config.hidden_size, E * r), ( f"down_proj smoe_B: expected [N=hidden, r*E]={(config.hidden_size, E * r)}, " f"got {down_B.shape}" ) def test_unwrap_no_lora(self): """Without peft, _unwrap_experts_lora returns no LoRA.""" config = make_olmoe_config(use_full=False) moe = OlmoeSparseMoeBlock(config) experts, gup_lora, down_lora = _unwrap_experts_lora(moe.experts) assert gup_lora is None assert down_lora is None assert hasattr(experts, "gate_up_proj") def test_unwrap_gate_lora(self): """Test that _unwrap_gate_lora detects LoRA on the router gate.""" config = make_olmoe_config(use_full=False) model = MinimalOLMoEModel(config) r = 4 lora_config = LoraConfig( r=r, lora_alpha=16, target_modules=[], target_parameters=["gate.weight"], bias="none", ) peft_model = get_peft_model(model, lora_config) base_moe = peft_model.base_model.model.moe # Set non-zero LoRA weights (peft initializes lora_B to zeros) with 
torch.no_grad(): base_moe.gate.lora_B["default"].weight.normal_(0, 0.01) base_gate, gate_weight, gate_delta = _unwrap_gate_lora(base_moe.gate) # Base gate should be the original router assert hasattr(base_gate, "top_k") assert hasattr(base_gate, "num_experts") assert base_gate.top_k == config.num_experts_per_tok assert base_gate.num_experts == config.num_experts # Gate weight should be the base weight (delta returned separately) assert gate_weight.shape == (config.num_experts, config.hidden_size) torch.testing.assert_close(gate_weight, base_gate.weight) # Delta should be non-zero (LoRA was applied) assert gate_delta is not None assert gate_delta.shape == (config.num_experts, config.hidden_size) assert gate_delta.abs().max() > 0, "Gate LoRA delta should be non-zero" def test_unwrap_gate_no_lora(self): """Without peft, _unwrap_gate_lora returns the original gate.""" config = make_olmoe_config(use_full=False) moe = OlmoeSparseMoeBlock(config) base_gate, gate_weight, gate_delta = _unwrap_gate_lora(moe.gate) assert base_gate is moe.gate torch.testing.assert_close(gate_weight, moe.gate.weight) assert gate_delta is None def test_gate_lora_delta_matches_peft(self): """Verify _unwrap_gate_lora computes the same delta as peft.""" config = make_olmoe_config(use_full=False) model = MinimalOLMoEModel(config) r = 4 lora_alpha = 16 scaling = lora_alpha / r lora_config = LoraConfig( r=r, lora_alpha=lora_alpha, target_modules=[], target_parameters=["gate.weight"], bias="none", ) peft_model = get_peft_model(model, lora_config) base_moe = peft_model.base_model.model.moe # Our unwrapped weight + delta _, gate_weight, gate_delta = _unwrap_gate_lora(base_moe.gate) # Manually compute expected delta lora_A = base_moe.gate.lora_A["default"].weight # [r, hidden] lora_B = base_moe.gate.lora_B["default"].weight # [E, r] base_weight = base_moe.gate.base_layer.weight # [E, hidden] expected_delta = scaling * (lora_B @ lora_A) torch.testing.assert_close(gate_weight, base_weight) 
torch.testing.assert_close(gate_delta, expected_delta) # Combined should match the old behavior torch.testing.assert_close( gate_weight + gate_delta, base_weight + expected_delta ) # ============================================================================= # Tests: Base forward equivalence (no LoRA) # ============================================================================= @requires_cuda class TestOLMoEReferenceVsScatterMoE: """Base forward equivalence: per-expert reference vs ScatterMoE kernels.""" def test_small(self): self._run(use_full=False, M=16) @pytest.mark.slow def test_full(self): self._run(use_full=True, M=32) def _run(self, use_full, M): from axolotl.integrations.kernels.libs.scattermoe_lora import ( flatten_sort_count, parallel_linear, ) config = make_olmoe_config(use_full=use_full) torch.manual_seed(42) moe = _init_expert_weights(OlmoeSparseMoeBlock(config)).cuda().float() E, k = config.num_experts, config.num_experts_per_tok x = torch.randn(1, M, config.hidden_size, device="cuda") x_flat = x.view(-1, config.hidden_size) with torch.no_grad(): # Shared routing for both paths _, rw, sel = moe.gate(x_flat) sei, ssi, eo = flatten_sort_count(sel, num_experts=E) # Per-expert reference ref_out = _reference_moe_forward( x_flat, moe.experts.gate_up_proj, moe.experts.down_proj, moe.experts.act_fn, sel, rw, E, ).view(1, M, config.hidden_size) # ScatterMoE kernel path gup = parallel_linear( x_flat, moe.experts.gate_up_proj.transpose(2, 1), k, sei, ssi, eo, grouped_in=False, grouped_out=True, ) g, u = gup.chunk(2, dim=-1) h = moe.experts.act_fn(g) * u smoe_out = parallel_linear( h, moe.experts.down_proj.transpose(2, 1), 1, sei, ssi, eo, grouped_in=True, grouped_out=False, gates=rw, ).view(1, M, config.hidden_size) torch.testing.assert_close(smoe_out, ref_out, atol=1e-3, rtol=1e-3) # ============================================================================= # Tests: LoRA forward equivalence (peft vs scattermoe fused) # 
============================================================================= @requires_cuda class TestOLMoEPeftLoRAForward: """Fused LoRA forward: peft merged-weight vs scattermoe_lora kernel.""" def test_small(self): self._run(use_full=False, M=16, r=4) @pytest.mark.slow def test_full(self): self._run(use_full=True, M=32, r=8) def _run(self, use_full, M, r): from axolotl.integrations.kernels.libs.scattermoe_lora import ( flatten_sort_count, parallel_linear_lora, ) config = make_olmoe_config(use_full=use_full) E, k = config.num_experts, config.num_experts_per_tok lora_alpha = 16 scaling = lora_alpha / r # Create peft model model = MinimalOLMoEModel(config).cuda().float() lora_config = LoraConfig( r=r, lora_alpha=lora_alpha, target_modules=[], target_parameters=["experts.gate_up_proj", "experts.down_proj"], bias="none", ) peft_model = get_peft_model(model, lora_config) torch.manual_seed(42) x = torch.randn(1, M, config.hidden_size, device="cuda") # peft forward with torch.no_grad(): peft_out = peft_model(x) # Extract base weights and LoRA weights base_moe = peft_model.base_model.model.moe base_experts = base_moe.experts.base_layer.base_layer gate_up_proj = base_experts.gate_up_proj down_proj = base_experts.down_proj act_fn = base_experts.act_fn # gate_up_proj LoRA gup_w = base_moe.experts.base_layer peft_gup_A = gup_w.lora_A["default"].weight.detach() peft_gup_B = gup_w.lora_B["default"].weight.detach() smoe_gup_A, smoe_gup_B = peft_gate_up_lora_to_scattermoe( peft_gup_A, peft_gup_B, E, r ) # down_proj LoRA down_w = base_moe.experts peft_down_A = down_w.lora_A["default"].weight.detach() peft_down_B = down_w.lora_B["default"].weight.detach() smoe_down_A, smoe_down_B = peft_lora_to_scattermoe( peft_down_A, peft_down_B, E, r ) # ScatterMoE fused forward -- gate is NOT peft-wrapped, access directly x_flat = x.view(-1, config.hidden_size) with torch.no_grad(): _, rw, sel = base_moe.gate(x_flat) sei, ssi, eo = flatten_sort_count(sel, num_experts=E) gup = 
parallel_linear_lora( x_flat, gate_up_proj.transpose(2, 1), k, sei, ssi, eo, lora_A=smoe_gup_A, lora_B=smoe_gup_B, scaling=scaling, grouped_in=False, grouped_out=True, ) g, u = gup.chunk(2, dim=-1) h = act_fn(g) * u smoe_out = parallel_linear_lora( h, down_proj.transpose(2, 1), 1, sei, ssi, eo, lora_A=smoe_down_A, lora_B=smoe_down_B, scaling=scaling, grouped_in=True, grouped_out=False, gates=rw, ).view(1, M, config.hidden_size) torch.testing.assert_close(smoe_out, peft_out, atol=5e-3, rtol=5e-3) # ============================================================================= # Tests: Backward gradient correctness # ============================================================================= @requires_cuda class TestOLMoEPeftLoRABackward: """Backward gradients through scattermoe_lora vs pure-PyTorch reference.""" def test_small(self): self._run(use_full=False, M=16, r=4) def _run(self, use_full, M, r): from axolotl.integrations.kernels.libs.scattermoe_lora import ( flatten_sort_count, parallel_linear_lora, ) config = make_olmoe_config(use_full=use_full) E, k = config.num_experts, config.num_experts_per_tok lora_alpha = 16 scaling = lora_alpha / r torch.manual_seed(42) moe = _init_expert_weights(OlmoeSparseMoeBlock(config)).cuda().float() x = torch.randn(1, M, config.hidden_size, device="cuda") x_flat = x.view(-1, config.hidden_size) gate_up_proj = moe.experts.gate_up_proj down_proj = moe.experts.down_proj # Create LoRA weights in scattermoe layout directly gup_A = torch.randn(r * E, config.hidden_size, device="cuda") * 0.01 gup_B = torch.randn(2 * config.intermediate_size, r * E, device="cuda") * 0.01 down_A = torch.randn(r * E, config.intermediate_size, device="cuda") * 0.01 down_B = torch.randn(config.hidden_size, r * E, device="cuda") * 0.01 rw, sel = _get_routing(moe, x) sei, ssi, eo = flatten_sort_count(sel, num_experts=E) # --- Reference --- gup_delta = _compute_delta_from_scattermoe_lora( gup_A, gup_B, scaling, E, r, gate_up_proj.shape ) down_delta = 
_compute_delta_from_scattermoe_lora( down_A, down_B, scaling, E, r, down_proj.shape ) x_ref = x_flat.clone().detach().requires_grad_(True) ref_out = _reference_moe_forward_with_lora( x_ref, gate_up_proj, down_proj, moe.experts.act_fn, sel, rw, E, gup_delta, down_delta, ) ref_out.sum().backward() # --- ScatterMoE fused path --- x_smoe = x_flat.clone().detach().requires_grad_(True) gup_A_s = gup_A.clone().requires_grad_(True) gup_B_s = gup_B.clone().requires_grad_(True) down_A_s = down_A.clone().requires_grad_(True) down_B_s = down_B.clone().requires_grad_(True) gup_out = parallel_linear_lora( x_smoe, gate_up_proj.transpose(2, 1), k, sei, ssi, eo, lora_A=gup_A_s, lora_B=gup_B_s, scaling=scaling, grouped_in=False, grouped_out=True, ) g, u = gup_out.chunk(2, dim=-1) h = moe.experts.act_fn(g) * u smoe_out = parallel_linear_lora( h, down_proj.transpose(2, 1), 1, sei, ssi, eo, lora_A=down_A_s, lora_B=down_B_s, scaling=scaling, grouped_in=True, grouped_out=False, gates=rw, ) smoe_out.sum().backward() torch.testing.assert_close( smoe_out.detach(), ref_out.detach(), atol=5e-3, rtol=5e-3, ) torch.testing.assert_close( x_smoe.grad, x_ref.grad, atol=5e-2, rtol=5e-2, ) # ============================================================================= # Tests: kernelize() integration via LocalLayerRepository # ============================================================================= @requires_cuda class TestKernelizeIntegration: """Test the HF kernels library integration with LocalLayerRepository.""" @staticmethod def _get_kernelize_imports(): """Import kernels library components, skip if not available.""" try: from kernels import ( LocalLayerRepository, Mode, kernelize, register_kernel_mapping, replace_kernel_forward_from_hub, ) return ( LocalLayerRepository, Mode, register_kernel_mapping, replace_kernel_forward_from_hub, kernelize, ) except ImportError: pytest.skip("kernels library not installed") @staticmethod def _get_repo_path(): """Get the path to scattermoe_lora within 
axolotl's plugin.""" return ( Path(__file__).parent.parent.parent / "src" / "axolotl" / "integrations" / "kernels" / "libs" / "scattermoe_lora" ) def _setup_kernels( self, LocalLayerRepository, Mode, register_kernel_mapping, replace_kernel_forward_from_hub, ): """Register kernel mapping for tests.""" repo_path = self._get_repo_path() local_repo = LocalLayerRepository( repo_path=repo_path, package_name="scattermoe_lora", layer_name="HFScatterMoEGatedMLP", ) replace_kernel_forward_from_hub( OlmoeSparseMoeBlock, "HFScatterMoEParallelExperts" ) register_kernel_mapping( { "HFScatterMoEParallelExperts": { "cuda": { Mode.TRAINING: local_repo, Mode.INFERENCE: local_repo, }, } } ) def test_base_forward_via_kernelize(self): """Kernelized OlmoeSparseMoeBlock (no LoRA) matches per-expert reference.""" ( LocalLayerRepository, Mode, register_kernel_mapping, replace_kernel_forward_from_hub, kernelize, ) = self._get_kernelize_imports() config = make_olmoe_config(use_full=False) E = config.num_experts # Create model torch.manual_seed(42) moe = _init_expert_weights(OlmoeSparseMoeBlock(config)).cuda().float() x = torch.randn(1, 8, config.hidden_size, device="cuda") x_flat = x.view(-1, config.hidden_size) # Compute reference BEFORE kernelizing with torch.no_grad(): _, rw, sel = moe.gate(x_flat) ref_out = _reference_moe_forward( x_flat, moe.experts.gate_up_proj, moe.experts.down_proj, moe.experts.act_fn, sel, rw, E, ).view(1, 8, config.hidden_size) # Set up kernel mapping self._setup_kernels( LocalLayerRepository, Mode, register_kernel_mapping, replace_kernel_forward_from_hub, ) # Kernelize the model kernelize(moe, mode=Mode.TRAINING, device="cuda") # Forward through kernelized model with torch.no_grad(): kern_out = moe(x) torch.testing.assert_close(kern_out, ref_out, atol=1e-3, rtol=1e-3) def test_lora_forward_via_kernelize(self): """Kernelized OlmoeSparseMoeBlock with peft LoRA matches reference.""" ( LocalLayerRepository, Mode, register_kernel_mapping, 
replace_kernel_forward_from_hub, kernelize, ) = self._get_kernelize_imports() config = make_olmoe_config(use_full=False) r = 4 # Create peft model torch.manual_seed(42) model = MinimalOLMoEModel(config).cuda().float() lora_config = LoraConfig( r=r, lora_alpha=16, target_modules=[], target_parameters=["experts.gate_up_proj", "experts.down_proj"], bias="none", ) peft_model = get_peft_model(model, lora_config) x = torch.randn(1, 8, config.hidden_size, device="cuda") # Reference: peft's own forward (uses _activate_lora context manager) with torch.no_grad(): ref_out = peft_model(x) # Set up kernel mapping self._setup_kernels( LocalLayerRepository, Mode, register_kernel_mapping, replace_kernel_forward_from_hub, ) # Kernelize the MoE block inside the peft model base_moe = peft_model.base_model.model.moe kernelize(base_moe, mode=Mode.TRAINING, device="cuda") # Forward through kernelized peft model with torch.no_grad(): kern_out = peft_model(x) torch.testing.assert_close(kern_out, ref_out, atol=5e-3, rtol=5e-3) def test_gate_lora_forward_via_kernelize(self): """Kernelized forward with gate LoRA matches peft reference.""" ( LocalLayerRepository, Mode, register_kernel_mapping, replace_kernel_forward_from_hub, kernelize, ) = self._get_kernelize_imports() config = make_olmoe_config(use_full=False) r = 4 # Create peft model with gate + experts LoRA torch.manual_seed(42) model = MinimalOLMoEModel(config).cuda().float() lora_config = LoraConfig( r=r, lora_alpha=16, target_modules=[], target_parameters=[ "gate.weight", "experts.gate_up_proj", "experts.down_proj", ], bias="none", ) peft_model = get_peft_model(model, lora_config) x = torch.randn(1, 8, config.hidden_size, device="cuda") # Reference: peft's own forward with torch.no_grad(): ref_out = peft_model(x) # Set up kernel mapping self._setup_kernels( LocalLayerRepository, Mode, register_kernel_mapping, replace_kernel_forward_from_hub, ) # Kernelize the MoE block inside the peft model base_moe = peft_model.base_model.model.moe 
class TestSharedExpertHandling:
    """Test that HFScatterMoEGatedMLP.forward handles shared experts.

    The OLMoE block used here has a ``shared_expert`` MLP and a
    ``shared_expert_gate`` attached by ``_make_shared_expert_block``; the
    tests compare against ``routed_output + sigmoid(gate(x)) * shared(x)``.
    """

    @staticmethod
    def _make_shared_expert_block(config):
        """Create an OlmoeSparseMoeBlock with a mock shared expert attached.

        Returns the block after ``_init_expert_weights`` has been applied and
        two extra attributes have been set on it: ``shared_expert`` (a gated
        SiLU MLP without biases) and ``shared_expert_gate`` (a bias-free
        ``hidden -> 1`` linear layer).
        """
        moe = OlmoeSparseMoeBlock(config)
        _init_expert_weights(moe)

        hidden = config.hidden_size
        inter = config.intermediate_size

        # Attach a simple shared expert MLP (mimics Qwen2MoE structure)
        class SharedExpertMLP(nn.Module):
            # Computes down_proj(act_fn(gate_proj(x)) * up_proj(x)).
            def __init__(self, hidden_size, intermediate_size):
                super().__init__()
                self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
                self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
                self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
                self.act_fn = nn.SiLU()

            def forward(self, x):
                return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))

        moe.shared_expert = SharedExpertMLP(hidden, inter)
        moe.shared_expert_gate = nn.Linear(hidden, 1, bias=False)
        return moe

    def test_shared_expert_is_used(self):
        """Verify the gated shared-expert contribution is non-zero.

        Only the shared-expert path is evaluated here; the routed experts and
        the block's own ``forward`` are not called by this test.
        """
        config = make_olmoe_config(use_full=False)
        moe = self._make_shared_expert_block(config)

        # Seeded input, flattened to (tokens, hidden) for the shared expert
        torch.manual_seed(42)
        x = torch.randn(1, 4, config.hidden_size)
        x_flat = x.view(-1, config.hidden_size)

        with torch.no_grad():
            # Shared expert contribution
            shared_out = moe.shared_expert(x_flat)
            gate_val = F.sigmoid(moe.shared_expert_gate(x_flat))
            shared_contribution = shared_out * gate_val

        # Verify shared expert produces non-zero output
        assert shared_contribution.abs().max() > 0

    @requires_cuda
    def test_shared_expert_forward_via_kernelize(self):
        """Kernelized forward with shared expert matches manual reference."""
        # The kernels package is optional; skip rather than fail when absent.
        try:
            from kernels import (
                LocalLayerRepository,
                Mode,
                kernelize,
                register_kernel_mapping,
                replace_kernel_forward_from_hub,
            )
        except ImportError:
            pytest.skip("kernels library not installed")

        config = make_olmoe_config(use_full=False)
        E = config.num_experts

        torch.manual_seed(42)
        moe = self._make_shared_expert_block(config).cuda().float()
        x = torch.randn(1, 8, config.hidden_size, device="cuda")
        x_flat = x.view(-1, config.hidden_size)

        # Compute reference: per-expert + shared expert
        # (must run before kernelize() below changes the block's forward)
        with torch.no_grad():
            _, rw, sel = moe.gate(x_flat)
            expert_out = _reference_moe_forward(
                x_flat,
                moe.experts.gate_up_proj,
                moe.experts.down_proj,
                moe.experts.act_fn,
                sel,
                rw,
                E,
            )
            shared_out = moe.shared_expert(x_flat)
            gate_val = F.sigmoid(moe.shared_expert_gate(x_flat))
            ref_out = (expert_out + shared_out * gate_val).view(
                1, 8, config.hidden_size
            )

        # Kernelize
        # The layer repository points at the scattermoe_lora sources inside
        # this repository, resolved relative to this test file.
        repo_path = (
            Path(__file__).parent.parent.parent
            / "src"
            / "axolotl"
            / "integrations"
            / "kernels"
            / "libs"
            / "scattermoe_lora"
        )
        local_repo = LocalLayerRepository(
            repo_path=repo_path,
            package_name="scattermoe_lora",
            layer_name="HFScatterMoEGatedMLP",
        )
        # NOTE(review): this call takes the block *class*, so it presumably
        # affects every OlmoeSparseMoeBlock in the process - confirm whether
        # other tests rely on the unmodified class.
        replace_kernel_forward_from_hub(
            OlmoeSparseMoeBlock, "HFScatterMoEParallelExperts"
        )
        # The same local repository is registered for both modes on CUDA.
        register_kernel_mapping(
            {
                "HFScatterMoEParallelExperts": {
                    "cuda": {
                        Mode.TRAINING: local_repo,
                        Mode.INFERENCE: local_repo,
                    },
                }
            }
        )
        kernelize(moe, mode=Mode.TRAINING, device="cuda")

        with torch.no_grad():
            kern_out = moe(x)

        torch.testing.assert_close(kern_out, ref_out, atol=1e-3, rtol=1e-3)
def _create_tiny_qwen3_config():
    """Build a deliberately small Qwen3MoE configuration so tests run quickly.

    Starts from ``AutoConfig.for_model("qwen3_moe")`` and overwrites the
    size-related fields listed below, in order.
    """
    from transformers import AutoConfig

    # (attribute, value) pairs applied in this exact order.
    tiny_overrides = (
        ("hidden_size", 512),
        ("intermediate_size", 1024),
        ("moe_intermediate_size", 64),
        ("num_attention_heads", 16),
        ("num_key_value_heads", 2),
        ("head_dim", 32),
        ("num_hidden_layers", 2),
        ("num_experts", 8),
        ("num_experts_per_tok", 2),
        ("vocab_size", 1000),
        ("max_position_embeddings", 128),
        ("norm_topk_prob", True),
        ("torch_dtype", torch.bfloat16),
    )

    config = AutoConfig.for_model("qwen3_moe")
    for field, value in tiny_overrides:
        setattr(config, field, value)
    return config


def _interleave_gate_up_weights(model):
    """Rewrite every ``gate_up_proj`` parameter of *model* in place.

    Each matching parameter is replaced by ``interleave_gate_up(param)``,
    the layout the SonicMoE-patched forward is fed in these tests.
    """
    from axolotl.integrations.kernels.sonicmoe.weight_converter import (
        interleave_gate_up,
    )

    with torch.no_grad():
        for param_name, tensor in model.named_parameters():
            if "gate_up_proj" not in param_name:
                continue
            tensor.copy_(interleave_gate_up(tensor))


def _unpatch_sonicmoe():
    """Undo the SonicMoE patch on the Qwen3MoE block classes, if present.

    A class counts as patched when it exposes ``_original_forward``; that
    attribute is restored as ``forward`` and then removed.
    """
    from axolotl.integrations.kernels.constants import resolve_moe_block_classes

    for block_cls in resolve_moe_block_classes("qwen3_moe"):
        if not hasattr(block_cls, "_original_forward"):
            continue
        block_cls.forward = block_cls._original_forward
        del block_cls._original_forward
class TestSonicMoEGradientCorrectness:
    """Compare gradients between original HuggingFace and SonicMoE-patched forward."""

    def teardown_method(self):
        # Restore the un-patched forward so later tests see the original class.
        _unpatch_sonicmoe()

    def test_gradients_match(self):
        """Verify all parameter gradients match between original and patched.

        The reference forward/backward runs first, before ``patch_sonicmoe``
        is called; the patched model then loads the same state dict, has its
        ``gate_up_proj`` weights interleaved, and its gradients are
        de-interleaved again before comparison.
        """
        from transformers import AutoModelForCausalLM

        from axolotl.integrations.kernels.sonicmoe.patch import patch_sonicmoe
        from axolotl.integrations.kernels.sonicmoe.weight_converter import (
            deinterleave_gate_up,
        )

        config = _create_tiny_qwen3_config()
        input_ids = torch.randint(0, config.vocab_size, (1, 16), device="cuda")

        # ---------- Original model ----------
        # Must complete before patch_sonicmoe() below; the teardown restores
        # a class attribute, which suggests the patch is class-wide.
        model_orig = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
        out_orig = model_orig(input_ids, labels=input_ids)
        out_orig.loss.backward()
        # Gradients are upcast to float32 copies so the comparison below is
        # not done in bf16.
        grads_orig = {
            n: p.grad.float().clone()
            for n, p in model_orig.named_parameters()
            if p.grad is not None
        }
        loss_orig = out_orig.loss.item()

        # ---------- SonicMoE-patched model (same weights, interleaved) ----------
        model_patched = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
        model_patched.load_state_dict(model_orig.state_dict())
        patch_sonicmoe("qwen3_moe")
        _interleave_gate_up_weights(model_patched)

        out_patched = model_patched(input_ids, labels=input_ids)
        out_patched.loss.backward()
        grads_patched = {}
        for n, p in model_patched.named_parameters():
            if p.grad is None:
                continue
            g = p.grad.float().clone()
            # gate_up_proj grads are in interleaved layout, de-interleave to match orig
            if "gate_up_proj" in n:
                g = deinterleave_gate_up(g)
            grads_patched[n] = g
        loss_patched = out_patched.loss.item()

        # ---------- Compare ----------
        assert abs(loss_orig - loss_patched) < 0.5, (
            f"Loss mismatch: orig={loss_orig:.4f}, patched={loss_patched:.4f}"
        )

        # All parameters with gradients in original should have them in patched
        missing = set(grads_orig.keys()) - set(grads_patched.keys())
        assert not missing, f"Missing gradients in patched model: {missing}"

        # Compare gradient values
        # bf16 with different GEMM impls (cuBLAS vs CUTLASS) can diverge,
        # so use generous tolerance: flag only if both rel >10% AND abs >1e-2
        mismatches = []
        for name in grads_orig:
            # Defensive only: the `missing` assertion above already guarantees
            # every key of grads_orig is present in grads_patched.
            if name not in grads_patched:
                continue
            g_orig = grads_orig[name]
            g_patched = grads_patched[name]
            max_diff = (g_orig - g_patched).abs().max().item()
            # Relative to the largest reference gradient entry; the epsilon
            # avoids dividing by zero for all-zero gradients.
            rel_diff = max_diff / (g_orig.abs().max().item() + 1e-8)
            if rel_diff > 0.1 and max_diff > 1e-2:
                mismatches.append(
                    f" {name}: max_abs_diff={max_diff:.6f}, rel_diff={rel_diff:.4f}"
                )

        assert not mismatches, (
            "Gradient mismatches (rel_diff > 10% and abs_diff > 1e-2):\n"
            + "\n".join(mismatches)
        )

    def test_router_weights_receive_gradients(self):
        """Verify that router (gate) weights get non-zero gradients."""
        from transformers import AutoModelForCausalLM

        from axolotl.integrations.kernels.sonicmoe.patch import patch_sonicmoe

        config = _create_tiny_qwen3_config()
        input_ids = torch.randint(0, config.vocab_size, (1, 16), device="cuda")

        model = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
        patch_sonicmoe("qwen3_moe")
        _interleave_gate_up_weights(model)

        out = model(input_ids, labels=input_ids)
        out.loss.backward()

        gate_grads_found = False
        for name, param in model.named_parameters():
            # NOTE(review): this is a substring match, so it selects any
            # parameter whose name contains both "gate" and "weight", not only
            # "...gate.weight" - confirm that only router weights match for
            # this model config.
            if "gate" in name and "weight" in name:
                gate_grads_found = True
                assert param.grad is not None, f"No gradient for router: {name}"
                assert param.grad.abs().max() > 0, f"Zero gradient for router: {name}"

        # Guards against the loop above silently matching nothing.
        assert gate_grads_found, "No gate.weight parameters found in model"
def test_geglu_forward_shape():
    """GEGLU forward returns a tensor with the inputs' shape, dtype and device."""
    expected_shape = (2, 3, 64)  # (batch, seq_len, hidden_dim)
    gate = torch.randn(*expected_shape, device="cuda")
    up = torch.randn(*expected_shape, device="cuda")

    result = geglu_forward(gate, up)

    assert result.shape == expected_shape
    assert result.dtype == gate.dtype
    assert result.device == gate.device


@pytest.mark.flaky(retries=1, delay=5)
@pytest.mark.parametrize(
    "torch_seed",
    [0, 42],
)
def test_geglu_forward_values(torch_seed):
    """GEGLU forward agrees with ``F.gelu(gate) * up`` for seeded inputs."""
    torch.manual_seed(torch_seed)
    dims = (2, 3, 64)
    gate = torch.randn(*dims, device="cuda")
    up = torch.randn(*dims, device="cuda")

    # The kernel gets clones so the reference below sees the original inputs.
    kernel_result = geglu_forward(gate.clone(), up.clone())
    reference = F.gelu(gate) * up

    assert torch.allclose(kernel_result, reference, rtol=1e-3)
def test_geglu_inplace_preservation():
    """Check that ``geglu_backward`` overwrites all three of its inputs.

    Despite the test's name, the assertions require each of ``gate``, ``up``
    and ``grad_output`` to differ from a copy taken before the call.
    """
    dims = (2, 3, 64)
    gate = torch.randn(*dims, device="cuda")
    up = torch.randn(*dims, device="cuda")
    grad_output = torch.randn(*dims, device="cuda")

    # Snapshots taken before the kernel runs.
    gate_before, up_before, grad_before = (
        t.clone() for t in (gate, up, grad_output)
    )

    geglu_backward(grad_output, gate, up)

    gate_unchanged = torch.equal(gate, gate_before)
    assert not gate_unchanged, "Gate should be modified in-place"
    up_unchanged = torch.equal(up, up_before)
    assert not up_unchanged, "Up should be modified in-place"
    grad_unchanged = torch.equal(grad_output, grad_before)
    assert not grad_unchanged, "Grad output should be modified in-place"
@pytest.fixture
def mock_quantstate():
    """Build a two-level mock ``QuantState`` on CUDA for the quantized tests.

    The returned state carries an ``offset`` tensor and a nested ``state2``;
    the nested state has neither.
    """
    shape = (64, 64)
    n_blocks = shape[0]  # Assuming blockwise quantization along first dimension

    def _unit_absmax():
        # One value per block
        return torch.ones(n_blocks, device="cuda")

    def _random_code():
        # torch.randint's upper bound is exclusive, so entries span 0..14
        return torch.randint(0, 15, shape, device="cuda")

    # Settings shared by the nested and the outer state.
    shared = {
        "shape": shape,
        "dtype": torch.float16,
        "blocksize": 64,
        "quant_type": "nf4",
    }

    # The nested state is created first, then wrapped by the main state.
    inner_state = QuantState(
        absmax=_unit_absmax(),
        code=_random_code(),
        offset=None,
        state2=None,
        **shared,
    )

    return QuantState(
        absmax=_unit_absmax(),
        code=_random_code(),
        offset=torch.zeros(n_blocks, dtype=torch.int32, device="cuda"),
        state2=inner_state,
        **shared,
    )
@pytest.mark.parametrize(
    "activation_forward,activation_backward",
    [(swiglu_forward, swiglu_backward), (geglu_forward, geglu_backward)],
)
def test_lora_mlp_direct(sample_tensors, activation_forward, activation_backward):
    """Tests LoRA_MLP directly with different activation functions

    Runs once per (forward, backward) activation pair with all quantization
    and adapter slots set to ``None``, then checks the output shape, that the
    output is NaN-free, and that a NaN-free gradient reaches ``X``.
    """
    X = sample_tensors["X"]
    shapes = sample_tensors["shapes"]
    hidden_dim = shapes["hidden"]
    out_dim = shapes["out"]

    # Create linear layers
    gate_proj = nn.Linear(hidden_dim, out_dim).to(device="cuda", dtype=torch.float16)
    up_proj = nn.Linear(hidden_dim, out_dim).to(device="cuda", dtype=torch.float16)
    down_proj = nn.Linear(out_dim, hidden_dim).to(device="cuda", dtype=torch.float16)

    # Run the fused MLP with the parametrized activation pair (SwiGLU or GEGLU).
    # Arguments are positional, so their order must stay exactly as listed.
    X.requires_grad = True
    output = LoRA_MLP.apply(
        X,
        gate_proj.weight,
        gate_proj.bias,
        None,  # gate_quant
        None,  # gate_A
        None,  # gate_B
        None,  # gate_scale
        up_proj.weight,
        up_proj.bias,
        None,  # up_quant
        None,  # up_A
        None,  # up_B
        None,  # up_scale
        down_proj.weight,
        down_proj.bias,
        None,  # down_quant
        None,  # down_A
        None,  # down_B
        None,  # down_scale
        activation_forward,
        activation_backward,
        True,  # inplace
    )

    assert output.shape == X.shape
    assert not torch.isnan(output).any()

    # Test backward pass
    loss = output.sum()
    loss.backward()
    assert X.grad is not None
    assert not torch.isnan(X.grad).any()
def test_lora_qkv(sample_tensors):
    """Exercise ``LoRA_QKV`` first without and then with LoRA adapters.

    Both passes check that Q, K and V share the input's shape and that a
    gradient reaches ``X``; the adapter pass additionally checks that every
    A/B matrix receives a NaN-free gradient.
    """
    X = sample_tensors["X"]
    dims = sample_tensors["shapes"]
    hidden_dim = dims["hidden"]
    rank = dims["rank"]

    def _randn_fp16(*size, requires_grad=False):
        return torch.randn(
            *size, device="cuda", dtype=torch.float16, requires_grad=requires_grad
        )

    # Base weights, drawn in q, k, v order.
    q_weight = _randn_fp16(hidden_dim, hidden_dim)
    k_weight = _randn_fp16(hidden_dim, hidden_dim)
    v_weight = _randn_fp16(hidden_dim, hidden_dim)
    base_weights = (q_weight, k_weight, v_weight)

    # LoRA matrices, drawn as (A, B) pairs in q, k, v order.
    adapters = []
    for _ in base_weights:
        lora_a = _randn_fp16(rank, hidden_dim, requires_grad=True)
        lora_b = _randn_fp16(hidden_dim, rank, requires_grad=True)
        adapters.append((lora_a, lora_b))
    scale = 0.5

    X.requires_grad = True

    # --- Pass 1: base weights only; every optional slot is None ---
    plain_args = []
    for weight in base_weights:
        plain_args.extend([weight, None, None, None, None, None])
    Q1, K1, V1 = LoRA_QKV.apply(X, *plain_args, True)

    assert Q1.shape == K1.shape == V1.shape == X.shape
    (Q1 + K1 + V1).sum().backward()
    assert X.grad is not None

    # Reset the input gradient before the adapter pass.
    X.grad = None

    # --- Pass 2: same weights plus an (A, B, scale) triple per projection ---
    adapter_args = []
    for weight, (lora_a, lora_b) in zip(base_weights, adapters):
        adapter_args.extend([weight, None, None, lora_a, lora_b, scale])
    Q2, K2, V2 = LoRA_QKV.apply(X, *adapter_args, True)

    assert Q2.shape == K2.shape == V2.shape == X.shape
    (Q2 + K2 + V2).sum().backward()

    # X first, then q_A, q_B, k_A, k_B, v_A, v_B.
    grad_holders = [X]
    for lora_a, lora_b in adapters:
        grad_holders.append(lora_a)
        grad_holders.append(lora_b)

    # Every tensor must have received a gradient ...
    for tensor in grad_holders:
        assert tensor.grad is not None

    # ... and none of those gradients may contain NaN.
    for tensor in grad_holders:
        assert not torch.isnan(tensor.grad).any()
@pytest.mark.parametrize(
    "batch,seq,hidden,rank,out",
    [
        (1, 1, 32, 4, 64),
        (2, 3, 64, 8, 128),
        (4, 5, 128, 16, 256),
    ],
)
def test_shapes_and_dimensions(batch, seq, hidden, rank, out):
    """``matmul_lora`` maps a (batch, seq, hidden) input to (batch, seq, out)."""
    tensor_opts = {"device": "cuda", "dtype": torch.float16}

    X = torch.randn(batch, seq, hidden, **tensor_opts)
    W = torch.randn(out, hidden, **tensor_opts)
    b = torch.randn(out, **tensor_opts)
    A = torch.randn(rank, hidden, **tensor_opts)
    B = torch.randn(out, rank, **tensor_opts)

    # No quantization state (None); LoRA scale fixed at 0.5.
    result = matmul_lora(X, W, b, None, A, B, 0.5)

    expected_shape = (batch, seq, out)
    assert result.shape == expected_shape
@pytest.mark.parametrize(
    "apply_function",
    [apply_lora_mlp_swiglu, apply_lora_mlp_geglu],
)
def test_inplace_operations(sample_tensors, apply_function):
    """The ``inplace=True`` and ``inplace=False`` paths give matching outputs."""
    X = sample_tensors["X"]
    dims = sample_tensors["shapes"]
    hidden, out = dims["hidden"], dims["out"]

    def _fp16_linear(in_features, out_features):
        layer = nn.Linear(in_features, out_features)
        return layer.to(device="cuda", dtype=torch.float16)

    # Projections are created in gate, up, down order.
    projections = {
        "gate_proj": _fp16_linear(hidden, out),
        "up_proj": _fp16_linear(hidden, out),
        "down_proj": _fp16_linear(out, hidden),
    }
    # A bare class (not an instance) carrying the three projections as
    # class attributes is what gets handed to the apply function.
    mlp = type("MLPModule", (), projections)

    # Each call gets its own clone of the input.
    inplace_out = apply_function(mlp, X.clone(), inplace=True)
    copy_out = apply_function(mlp, X.clone(), inplace=False)

    assert torch.allclose(inplace_out, copy_out, rtol=1e-3)
def test_dequantize_output_tensor():
    """``dequantize`` returns the very tensor passed as ``out``."""
    shape = (32, 32)
    rows = shape[0]
    W = torch.randn(shape, device="cuda")
    out = torch.empty(shape, dtype=torch.float16, device="cuda")

    # Settings common to the outer and the nested quantization state.
    common = {
        "shape": shape,
        "dtype": torch.float16,
        "blocksize": 32,
        "quant_type": "nf4",
    }

    # Outer-state tensors are created before the nested state's tensors.
    outer_absmax = torch.ones(rows)
    outer_code = torch.randint(0, 15, shape)
    outer_offset = torch.zeros(rows, dtype=torch.int32)

    nested_state = QuantState(
        absmax=torch.ones(rows),
        code=torch.randint(0, 15, shape),
        offset=None,
        state2=None,
        **common,
    )
    quant_state = QuantState(
        absmax=outer_absmax,
        code=outer_code,
        offset=outer_offset,
        state2=nested_state,
        **common,
    )

    returned = dequantize(W, quant_state, out=out)
    assert returned is out
def test_swiglu_forward_values():
    """Test SwiGLU forward pass matches PyTorch reference implementation"""
    # Seed the generators so the inputs are identical on every run and a
    # tolerance failure can be reproduced.
    torch.manual_seed(0)
    gate = torch.randn(2, 3, 64, device="cuda")
    up = torch.randn(2, 3, 64, device="cuda")

    # Custom implementation (clones keep the reference inputs untouched)
    triton_out = swiglu_forward(gate.clone(), up.clone())

    # PyTorch reference
    torch_out = F.silu(gate) * up

    assert torch.allclose(triton_out, torch_out, rtol=1e-3)


def test_swiglu_backward():
    """Test SwiGLU backward pass matches PyTorch autograd"""
    # Seed the generators so the inputs are identical on every run and a
    # tolerance failure can be reproduced.
    torch.manual_seed(0)
    gate = torch.randn(2, 3, 64, device="cuda", requires_grad=True)
    up = torch.randn(2, 3, 64, device="cuda", requires_grad=True)
    grad_output = torch.randn(2, 3, 64, device="cuda")

    # PyTorch reference - compute intermediates
    silu_gate = F.silu(gate)
    torch_out = silu_gate * up
    torch_out.backward(grad_output)

    # Custom backward pass; it receives detached clones, so the autograd
    # reference tensors above are not touched by the kernel.
    gate_clone = gate.clone().detach()
    up_clone = up.clone().detach()
    grad_output_clone = grad_output.clone()

    h, our_grad_gate, our_grad_up = swiglu_backward(
        grad_output_clone, gate_clone, up_clone
    )

    # Compare outputs and gradients
    assert torch.allclose(h, torch_out, rtol=1e-3)
    assert torch.allclose(our_grad_gate, gate.grad, rtol=1e-3)
    assert torch.allclose(our_grad_up, up.grad, rtol=1e-3)
class TestSequenceParallelism:
    """End-to-end training runs with sequence (context) parallelism enabled."""

    def _run_sequence_parallel_test(
        self,
        temp_dir,
        sample_packing=True,
        micro_batch_size=1,
        pad_to_sequence_len=True,
        ring_attn_func=None,
        threshold=2.0,
    ):
        """Write a config, launch a 2-process training run, and check the loss.

        The config is dumped to ``<temp_dir>/config.yaml``, training is
        started through ``accelerate launch``, and the logged
        ``train/train_loss`` is checked against *threshold*.
        """
        model_opts = {
            "base_model": "HuggingFaceTB/SmolLM2-135M",
            "load_in_8bit": False,
            "load_in_4bit": True,
            "strict": False,
            "sequence_len": 2048,
            "adapter": "qlora",
        }
        packing_opts = {
            "sample_packing": sample_packing,
            "eval_sample_packing": sample_packing,
            "pad_to_sequence_len": pad_to_sequence_len,
        }
        lora_opts = {
            "lora_r": 8,
            "lora_alpha": 16,
            "lora_dropout": 0.05,
            "lora_target_linear": True,
            "lora_modules_to_save": ["embed_tokens", "lm_head"],
        }
        data_opts = {
            "special_tokens": {"pad_token": "<|endoftext|>"},
            "datasets": [
                {
                    "path": "tatsu-lab/alpaca",
                    "type": "alpaca",
                    "split": "train[:10%]",
                },
            ],
        }
        training_opts = {
            "num_epochs": 1,
            "max_steps": 8,
            "micro_batch_size": micro_batch_size,
            "gradient_accumulation_steps": 2,
            "output_dir": temp_dir,
            "dataset_prepared_path": temp_dir + "/last_run_prepared",
            "learning_rate": 0.00001,
            "optimizer": "adamw_8bit",
            "lr_scheduler": "cosine",
            "flash_attention": True,
            "loss_watchdog_threshold": 5.0,
            "loss_watchdog_patience": 3,
            "bf16": "auto",
            "warmup_steps": 1,
            "saves_per_epoch": 1,
            "logging_steps": 1,
            "weight_decay": 0.0,
            "use_tensorboard": True,
            "context_parallel_size": 2,
            "ring_attn_func": ring_attn_func,
            "save_first_step": False,
        }
        cfg = DictDefault(
            {
                **model_opts,
                **packing_opts,
                **lora_opts,
                **data_opts,
                **training_opts,
            }
        )

        # Persist the config where the launched subprocess can read it.
        run_dir = Path(temp_dir)
        run_dir.mkdir(parents=True, exist_ok=True)
        config_file = run_dir / "config.yaml"
        serialized = yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)
        with open(config_file, "w", encoding="utf-8") as fout:
            fout.write(serialized)

        launch_cmd = [
            "accelerate",
            "launch",
            "--num-processes",
            "2",
            "--main_process_port",
            f"{get_torch_dist_unique_port()}",
            "-m",
            "axolotl.cli.train",
            str(config_file),
        ]
        execute_subprocess_async(launch_cmd)

        check_tensorboard(
            temp_dir + "/runs",
            "train/train_loss",
            threshold,
            "Train Loss (%s) is too high",
        )

    @pytest.mark.parametrize(
        "sample_packing, micro_batch_size, pad_to_sequence_len, ring_attn_func, threshold",
        [
            (True, 1, True, None, 2.5),  # defaults to varlen_llama3 ring_attn_func
            (False, 2, True, None, 2.5),  # defaults to batch_ring ring_attn_func
            # (False, 2, True, "batch_zigzag", 2.5),
            # (False, 2, False, None, 2.65),  # defaults to batch_ring ring_attn_func
        ],
        ids=[
            "sample_packing, varlen_llama3 ring_attn_func",
            "no sample_packing, pad_to_sequence_len, batch_ring ring_attn_func",
            # "no sample_packing, no pad_to_sequence_len, batch_zigzag ring_attn_func",
            # "no sample_packing, no pad_to_sequence_len, batch_ring ring_attn_func",
        ],
    )
    def test_sequence_parallel_training(
        self,
        temp_dir,
        sample_packing,
        micro_batch_size,
        pad_to_sequence_len,
        ring_attn_func,
        threshold,
    ):
        """Run the sequence-parallel helper once per parametrized case."""
        case_options = {
            "sample_packing": sample_packing,
            "micro_batch_size": micro_batch_size,
            "pad_to_sequence_len": pad_to_sequence_len,
            "ring_attn_func": ring_attn_func,
            "threshold": threshold,
        }
        self._run_sequence_parallel_test(temp_dir, **case_options)
@pytest.fixture(scope="session", autouse=True)
def download_model():
    """Fetch the base model snapshot via ``snapshot_download``.

    Declared as a session-scoped, autouse fixture, so it does not have to be
    requested explicitly by the tests in this module.
    """
    model_id = "HuggingFaceTB/SmolLM2-135M"
    snapshot_download(model_id)
def _utils_write_yaml_and_rewards(self, cfg, temp_dir, suffix=""):
    """Write the training config and the throwaway reward module to disk.

    ``cfg`` is dumped as YAML to ``<temp_dir>/config.yaml`` (the directory is
    created if needed). The reward/transform helpers are written to
    ``rewards_gdpo_<suffix>.py`` relative to the current working directory,
    which is the module name the test configs refer to.
    """
    out_dir = Path(temp_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    config_yaml = yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)
    (out_dir / "config.yaml").write_text(config_yaml, encoding="utf-8")

    rewards_source = """import random

def format_reward(prompts, completions, **kwargs) -> list[float]:
    return [1.0 if len(c) > 10 else 0.0 for c in completions]

def correctness_reward(prompts, completions, **kwargs) -> list[float]:
    return [random.uniform(-1, 3) for _ in completions]

def safety_reward(prompts, completions, **kwargs) -> list[float]:
    return [1.0 if 'error' not in c.lower() else 0.0 for c in completions]

def single_reward(prompts, completions, **kwargs) -> list[float]:
    return [random.uniform(0, 1) for _ in completions]

def oai_gsm8k_transform(cfg, *args, **kwargs):
    def transform_fn(example, tokenizer=None):
        label = example["answer"].split("####")[-1].strip().replace(",", "")
        return {
            "prompt": [{"role": "user", "content": example["question"]}],
            "answer": label,
        }
    return transform_fn, {"remove_columns": ["question"]}
"""
    Path(f"rewards_gdpo_{suffix}.py").write_text(rewards_source, encoding="utf-8")
@require_vllm
def test_gdpo_three_rewards(self, temp_dir):
    """Test GDPO with three reward functions (format, correctness, safety)."""
    rnd_suffix = str(random.randint(1000, 9999))
    # Module written by _utils_write_yaml_and_rewards for this run.
    rewards_module = f"rewards_gdpo_{rnd_suffix}"

    trl_settings = {
        "beta": 0.001,
        "max_completion_length": 256,
        "use_vllm": True,
        "num_generations": 4,
        "reward_funcs": [
            f"{rewards_module}.format_reward",
            f"{rewards_module}.correctness_reward",
            f"{rewards_module}.safety_reward",
        ],
        "reward_weights": [1.0, 2.0, 1.5],
    }
    vllm_settings = {
        "max_model_len": 800,
        "enable_prefix_caching": True,
    }
    gsm8k_dataset = {
        "path": "openai/gsm8k",
        "name": "main",
        "type": f"{rewards_module}.oai_gsm8k_transform",
    }
    cfg = DictDefault(
        {
            "base_model": "HuggingFaceTB/SmolLM2-135M",
            "chat_template": "llama3",
            "rl": "gdpo",
            "trl": trl_settings,
            "vllm": vllm_settings,
            "datasets": [gsm8k_dataset],
            "adapter": "lora",
            "lora_r": 8,
            "lora_alpha": 16,
            "lora_dropout": 0.05,
            "lora_target_linear": True,
            "flash_attention": True,
            "sequence_len": 1024,
            "special_tokens": {
                "pad_token": "<|endoftext|>",
            },
            "max_steps": 3,
            "num_epochs": 1,
            "micro_batch_size": 4,
            "gradient_accumulation_steps": 2,
            "warmup_steps": 10,
            "val_set_size": 0.0,
            "output_dir": temp_dir,
            "learning_rate": 0.0001,
            "optimizer": "adamw_torch_fused",
            "lr_scheduler": "cosine",
            "save_safetensors": True,
            "bf16": "auto",
        }
    )

    self._utils_write_yaml_and_rewards(cfg, temp_dir, suffix=rnd_suffix)

    current_env = os.environ.copy()

    # The inherited environment may override NCCL_P2P_LEVEL; the device pin
    # for the vLLM server is applied last so it always wins.
    vllm_env = {"NCCL_P2P_LEVEL": "LOC"}
    vllm_env.update(current_env)
    vllm_env["CUDA_VISIBLE_DEVICES"] = "1"

    vllm_process = start_vllm(
        cfg.base_model,
        env=vllm_env,
        quiet=True,
        wait=300,
        gpu_memory_utilization=0.15,
        max_model_len=cfg.vllm.max_model_len,
        enable_prefix_caching=cfg.vllm.enable_prefix_caching,
        host="0.0.0.0",
        port=8000,
    )

    try:
        train_cmd = [
            "axolotl",
            "train",
            str(Path(temp_dir) / "config.yaml"),
            "--num-processes",
            "1",
            "--main-process-port",
            str(get_torch_dist_unique_port()),
        ]
        train_env = {"NCCL_P2P_LEVEL": "LOC", "NCCL_DEBUG": "INFO"}
        train_env.update(current_env)
        execute_subprocess_async(train_cmd, env=train_env)
    finally:
        # Always tear the server down, whether or not training succeeded.
        recursive_kill(vllm_process)
"1", "--main-process-port", f"{get_torch_dist_unique_port()}", ], env={ "NCCL_P2P_LEVEL": "LOC", "NCCL_DEBUG": "INFO", **current_env, }, ) finally: recursive_kill(vllm_process) @require_vllm def test_gdpo_fft(self, temp_dir): """Test GDPO with full fine-tuning (no adapter).""" rnd_suffix = str(random.randint(1000, 9999)) cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "chat_template": "llama3", "rl": "gdpo", "trl": { "beta": 0.001, "max_completion_length": 256, "use_vllm": True, "num_generations": 4, "reward_funcs": [ f"rewards_gdpo_{rnd_suffix}.format_reward", f"rewards_gdpo_{rnd_suffix}.correctness_reward", ], "reward_weights": [1.0, 2.0], }, "vllm": { "max_model_len": 800, "enable_prefix_caching": True, }, "datasets": [ { "path": "openai/gsm8k", "name": "main", "type": f"rewards_gdpo_{rnd_suffix}.oai_gsm8k_transform", }, ], # No adapter - full fine-tuning "flash_attention": True, "sequence_len": 1024, "special_tokens": { "pad_token": "<|endoftext|>", }, "max_steps": 3, "num_epochs": 1, "micro_batch_size": 4, "gradient_accumulation_steps": 2, "warmup_steps": 10, "val_set_size": 0.0, "output_dir": temp_dir, "learning_rate": 0.0001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "save_safetensors": True, "bf16": "auto", } ) self._utils_write_yaml_and_rewards(cfg, temp_dir, suffix=rnd_suffix) current_env = os.environ.copy() env = { "NCCL_P2P_LEVEL": "LOC", **current_env, "CUDA_VISIBLE_DEVICES": "1", } vllm_process = start_vllm( cfg.base_model, env=env, quiet=True, wait=300, gpu_memory_utilization=0.15, max_model_len=cfg.vllm.max_model_len, enable_prefix_caching=cfg.vllm.enable_prefix_caching, host="0.0.0.0", port=8000, ) try: execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "1", "--main-process-port", f"{get_torch_dist_unique_port()}", ], env={ "NCCL_P2P_LEVEL": "LOC", "NCCL_DEBUG": "INFO", **current_env, }, ) finally: recursive_kill(vllm_process) @require_vllm def 
test_gdpo_sequence_parallel(self, temp_dir): """Test GDPO with sequence parallelism.""" rnd_suffix = str(random.randint(1000, 9999)) cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "chat_template": "llama3", "rl": "gdpo", "context_parallel_size": 2, "trl": { "beta": 0.001, "max_completion_length": 256, "use_vllm": True, "num_generations": 4, "reward_funcs": [ f"rewards_gdpo_{rnd_suffix}.format_reward", f"rewards_gdpo_{rnd_suffix}.correctness_reward", ], "reward_weights": [1.0, 2.0], }, "vllm": { "max_model_len": 800, "enable_prefix_caching": True, }, "datasets": [ { "path": "openai/gsm8k", "name": "main", "type": f"rewards_gdpo_{rnd_suffix}.oai_gsm8k_transform", }, ], "adapter": "lora", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "flash_attention": True, "sequence_len": 1024, "special_tokens": { "pad_token": "<|endoftext|>", }, "max_steps": 3, "num_epochs": 1, "micro_batch_size": 4, "gradient_accumulation_steps": 2, "warmup_steps": 10, "val_set_size": 0.0, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.0001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "save_safetensors": True, "bf16": "auto", } ) self._utils_write_yaml_and_rewards(cfg, temp_dir, suffix=rnd_suffix) current_env = os.environ.copy() env = { "NCCL_P2P_LEVEL": "LOC", **current_env, "CUDA_VISIBLE_DEVICES": "1", } vllm_process = start_vllm( cfg.base_model, env=env, quiet=True, wait=300, gpu_memory_utilization=0.15, max_model_len=cfg.vllm.max_model_len, enable_prefix_caching=cfg.vllm.enable_prefix_caching, host="0.0.0.0", port=8000, ) try: execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ], env={ "NCCL_P2P_LEVEL": "LOC", "NCCL_DEBUG": "INFO", **current_env, }, ) finally: recursive_kill(vllm_process) ================================================ FILE: 
def start_vllm(
    model: str, env: dict, wait: int | None = None, quiet=False, **kwargs
) -> subprocess.Popen:
    """
    helper function to start the VLLM server in the background, mostly for testing purposes

    Args:
        model: value passed to ``--model`` of ``trl.scripts.vllm_serve``.
        env: base environment for the server process; it is copied, and
            ``VLLM_LOGGING_CONFIG_PATH`` is added to the copy.
        wait: if truthy, the number of seconds to poll ``http://host:port``
            for a 200/404 response before giving up.
        quiet: send the server's stdout/stderr to ``DEVNULL`` instead of pipes.
        **kwargs: optional ``tensor_parallel_size``, ``host``, ``port``,
            ``gpu_memory_utilization``, ``dtype``, ``max_model_len`` and
            ``enable_prefix_caching``; each is turned into the matching CLI
            flag only when its value is truthy.

    Returns:
        The ``subprocess.Popen`` handle of the server process.

    Raises:
        RuntimeError: if ``wait`` is truthy and the server was not seen
            responding. This includes the case where ``host`` or ``port`` was
            not supplied, because no readiness probe can be made then.
    """
    host = kwargs.get("host")
    port = kwargs.get("port")

    cmd = [sys.executable, "-m", "trl.scripts.vllm_serve", "--model", model]

    # (kwarg name, CLI flag) in the order the flags are appended; a falsy or
    # missing value means the flag is left off the command line.
    valued_flags = (
        ("tensor_parallel_size", "--tensor-parallel-size"),
        ("host", "--host"),
        ("port", "--port"),
        ("gpu_memory_utilization", "--gpu-memory-utilization"),
        ("dtype", "--dtype"),
        ("max_model_len", "--max-model-len"),
    )
    for key, flag in valued_flags:
        if value := kwargs.get(key):
            cmd.extend([flag, str(value)])
    if kwargs.get("enable_prefix_caching"):
        cmd.extend(["--enable-prefix-caching", "True"])

    # print out the command to be executed
    print(" ".join(cmd))

    # Must stay in sync with the "filename" entry of the logging config below.
    vllm_log_path = "/tmp/vllm.log"

    vllm_logging_json = Path(tempfile.mkdtemp()) / "vllm_logging.json"
    with open(vllm_logging_json, "w", encoding="utf-8") as temp_file:
        temp_file.write(
            """{
    "formatters": {
        "json": {
            "class": "pythonjsonlogger.jsonlogger.JsonFormatter"
        }
    },
    "handlers": {
        "file": {
            "class": "logging.FileHandler",
            "formatter": "json",
            "level": "DEBUG",
            "filename": "/tmp/vllm.log",
            "mode": "a"
        }
    },
    "loggers": {
        "vllm": {
            "handlers": ["file"],
            "level": "DEBUG",
            "propagate": false
        }
    },
    "version": 1
}"""
        )

    cmd_env = env.copy()
    # Environment values are passed as plain strings, not Path objects.
    cmd_env.update({"VLLM_LOGGING_CONFIG_PATH": str(vllm_logging_json)})

    # start `trl vllm-serve` command in the background and capture the process id
    process = subprocess.Popen(
        cmd,
        env=cmd_env,
        stdout=subprocess.DEVNULL if quiet else subprocess.PIPE,
        stderr=subprocess.DEVNULL if quiet else subprocess.PIPE,
    )  # nosec B603

    # print out the process id so the user can easily kill it later
    print(f"VLLM server process started (PID: {process.pid})")

    # wait until the http server is ready, even if it 404s, but give up after
    # `wait` seconds
    period_seconds = 5
    started = False
    if wait and host and port:
        for i in range(0, int(wait), period_seconds):
            try:
                response = requests.get(f"http://{host}:{port}", timeout=1)
                print(f"{i}: VLLM server (status: {response.status_code})")
                if int(response.status_code) in [200, 404]:
                    started = True
                    break
            except requests.exceptions.RequestException as exc:
                print(f"{i}: VLLM server failed to start: {str(exc)}")

            # also check if the process.pid is still running
            if process.poll() is not None:
                break

            time.sleep(period_seconds)

    if wait and not started:
        print(
            f"VLLM server process did not start within {wait} seconds. Please check your server logs."
        )
        recursive_kill(process)
        # The server may have died before creating its log file; a missing
        # log must not replace the RuntimeError raised below.
        try:
            with open(vllm_log_path, "r", encoding="utf-8") as log_file:
                print(log_file.read())
            os.remove(vllm_log_path)
        except FileNotFoundError:
            pass
        raise RuntimeError(f"VLLM server process did not start within {wait} seconds.")

    # return the process
    return process


def recursive_kill(process: subprocess.Popen):
    """
    Recursively kill a process and its children

    Children are signalled before the parent. Processes that no longer exist
    (for example a server that already exited and was reaped by ``poll()``)
    are skipped silently, so this is safe to call from a ``finally`` block
    without hiding the exception that is already propagating.
    """
    try:
        parent = psutil.Process(process.pid)
        # Enumerate descendants before anything is killed.
        targets = parent.children(recursive=True)
    except psutil.NoSuchProcess:
        return
    targets.append(parent)

    for target in targets:
        try:
            target.terminate()
            # psutil's kill() sends SIGKILL itself and guards against pid
            # reuse, so no additional raw os.kill(pid, 9) is issued.
            target.kill()
        except psutil.NoSuchProcess:
            continue
"enable_prefix_caching": True, }, "datasets": [ { "path": "openai/gsm8k", "name": "main", "type": f"rewards_{rnd_reward_suffix}.oai_gsm8k_transform", }, ], "adapter": "lora", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "peft_use_dora": True, "flash_attention": True, "sequence_len": 1024, "special_tokens": { "pad_token": "<|endoftext|>", }, "max_steps": 3, "num_epochs": 1, "micro_batch_size": 4, "gradient_accumulation_steps": 2, "warmup_steps": 10, "val_set_size": 0.0, "output_dir": temp_dir, "learning_rate": 0.0001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "bf16": "auto", "use_tensorboard": True, "save_first_step": False, } ) self._utils_write_yaml_and_rewards(cfg, temp_dir, suffix=rnd_reward_suffix) current_env = os.environ.copy() env = { "NCCL_P2P_LEVEL": "LOC", **current_env, "CUDA_VISIBLE_DEVICES": "1", } vllm_process = start_vllm( cfg.base_model, env=env, quiet=True, wait=300, gpu_memory_utilization=0.15, max_model_len=cfg.vllm.max_model_len, enable_prefix_caching=cfg.vllm.enable_prefix_caching, host="0.0.0.0", port=8000, ) try: execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", str(num_gpus), "--main-process-port", f"{get_torch_dist_unique_port()}", ], env={ "NCCL_P2P_LEVEL": "LOC", "NCCL_DEBUG": "INFO", **current_env, }, ) finally: (recursive_kill(vllm_process)) @require_vllm def test_llama_lora_sp(self, temp_dir): rnd_reward_suffix = str(random.randint(1000, 9999)) cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "chat_template": "llama3", "rl": "grpo", "trl": { "beta": 0.001, "max_completion_length": 256, "use_vllm": True, "num_generations": 4, "reward_funcs": [f"rewards_{rnd_reward_suffix}.rand_reward_func"], }, "vllm": { "max_model_len": 800, "enable_prefix_caching": True, }, "datasets": [ { "path": "openai/gsm8k", "name": "main", "type": f"rewards_{rnd_reward_suffix}.oai_gsm8k_transform", }, ], "adapter": "lora", "lora_r": 8, 
"lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "context_parallel_size": 2, "flash_attention": True, "sequence_len": 1024, "special_tokens": { "pad_token": "<|endoftext|>", }, "max_steps": 3, "num_epochs": 1, "micro_batch_size": 4, "gradient_accumulation_steps": 2, "warmup_steps": 10, "val_set_size": 0.0, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.0001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "bf16": "auto", "use_tensorboard": True, "save_first_step": False, } ) self._utils_write_yaml_and_rewards(cfg, temp_dir, suffix=rnd_reward_suffix) current_env = os.environ.copy() env = { "NCCL_P2P_LEVEL": "LOC", **current_env, "CUDA_VISIBLE_DEVICES": "1", } vllm_process = start_vllm( cfg.base_model, env=env, quiet=True, wait=300, gpu_memory_utilization=0.15, max_model_len=cfg.vllm.max_model_len, enable_prefix_caching=cfg.vllm.enable_prefix_caching, host="0.0.0.0", port=8000, ) try: execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", str(2), "--main-process-port", f"{get_torch_dist_unique_port()}", ], env={ "NCCL_P2P_LEVEL": "LOC", "NCCL_DEBUG": "INFO", **current_env, }, ) finally: recursive_kill(vllm_process) @pytest.mark.parametrize( "num_gpus", [1, 2], ) @require_vllm def test_llama_fft(self, temp_dir, num_gpus): rnd_reward_suffix = str(random.randint(1000, 9999)) cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "chat_template": "llama3", "rl": "grpo", "trl": { "beta": 0.001, "max_completion_length": 256, "use_vllm": True, "num_generations": 4, "reward_funcs": [f"rewards_{rnd_reward_suffix}.rand_reward_func"], }, "vllm": { "max_model_len": 800, "enable_prefix_caching": True, }, "datasets": [ { "path": "openai/gsm8k", "name": "main", "type": f"rewards_{rnd_reward_suffix}.oai_gsm8k_transform", }, ], "flash_attention": True, "sequence_len": 1024, "special_tokens": { "pad_token": "<|endoftext|>", }, "max_steps": 3, 
"num_epochs": 1, "micro_batch_size": 4, "gradient_accumulation_steps": 2, "warmup_steps": 10, "val_set_size": 0.0, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.0001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "bf16": "auto", "use_tensorboard": True, "save_first_step": False, } ) self._utils_write_yaml_and_rewards(cfg, temp_dir, suffix=rnd_reward_suffix) current_env = os.environ.copy() env = { "NCCL_P2P_LEVEL": "LOC", # nccl can be brittle, assume P2P isn't reliable **current_env, "CUDA_VISIBLE_DEVICES": "1", } vllm_process = start_vllm( cfg.base_model, env=env, quiet=True, wait=300, gpu_memory_utilization=0.15, max_model_len=cfg.vllm.max_model_len, enable_prefix_caching=cfg.vllm.enable_prefix_caching, host="0.0.0.0", port=8000, ) try: execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", str(num_gpus), "--main-process-port", f"{get_torch_dist_unique_port()}", ], env={ "NCCL_P2P_LEVEL": "LOC", "NCCL_DEBUG": "INFO", **current_env, }, ) finally: recursive_kill(vllm_process) ================================================ FILE: tests/e2e/multigpu/test_dist_muon_fsdp2.py ================================================ """Test module for DistMuon optimizer with FSDP2 multi-GPU functionality.""" import os from pathlib import Path import torch import yaml from accelerate.test_utils import execute_subprocess_async from tbparse import SummaryReader from transformers.testing_utils import get_torch_dist_unique_port from axolotl.utils.dict import DictDefault from tests.e2e.utils import most_recent_subdir, require_torch_2_7_0 AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent def verify_training_success(temp_dir): """Verify that training completed successfully by checking artifacts and loss.""" output_path = Path(temp_dir) model_files = list(output_path.glob("*.bin")) + list( output_path.glob("*.safetensors") ) assert len(model_files) > 0, "No model 
files found - training may have failed" checkpoint_files = list(output_path.glob("checkpoint-*")) assert len(checkpoint_files) > 0, ( "No checkpoint files found - training may have failed" ) tb_log_path = most_recent_subdir(temp_dir + "/runs") if tb_log_path: event_files = sorted(os.listdir(tb_log_path)) if event_files: event_file = os.path.join(tb_log_path, event_files[0]) reader = SummaryReader(event_file) df = reader.scalars train_loss_df = df[df.tag == "train/train_loss"] if len(train_loss_df) > 0: final_loss = train_loss_df.value.values[-1] assert not torch.isnan(torch.tensor(final_loss)), ( f"Training loss is NaN: {final_loss}" ) class TestDistMuon: """Test class for DistMuon optimizer with FSDP2 functionality.""" @require_torch_2_7_0 def test_fft_sft(self, temp_dir): cfg = DictDefault( { "base_model": "Qwen/Qwen2.5-0.5B", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.02, "optimizer": "muon", "weight_decay": 0.01, "lr_scheduler": "cosine", "flash_attention": True, "fsdp_version": 2, "fsdp_config": { "offload_params": False, "cpu_ram_efficient_loading": False, "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", "state_dict_type": "FULL_STATE_DICT", "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "reshard_after_forward": True, }, "use_tensorboard": True, "bf16": True, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) verify_training_success(temp_dir) @require_torch_2_7_0 def test_lora_sft(self, temp_dir): cfg = 
DictDefault( { "base_model": "Qwen/Qwen2.5-0.5B", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "adapter": "lora", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "num_epochs": 1, "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.02, "optimizer": "muon", "weight_decay": 0.01, "lr_scheduler": "cosine", "flash_attention": True, "fsdp_version": 2, "fsdp_config": { "offload_params": False, "cpu_ram_efficient_loading": False, "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", "state_dict_type": "FULL_STATE_DICT", "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "reshard_after_forward": True, }, "use_tensorboard": True, "bf16": True, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) verify_training_success(temp_dir) ================================================ FILE: tests/e2e/multigpu/test_eval.py ================================================ """ E2E tests for multigpu eval """ from pathlib import Path import yaml from accelerate.test_utils import execute_subprocess_async from transformers.testing_utils import get_torch_dist_unique_port from axolotl.utils.dict import DictDefault from ..utils import check_tensorboard AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent class TestMultiGPUEval: """ Test case for MultiGPU Eval Sample Packing """ def test_eval_sample_packing(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "load_in_8bit": False, "load_in_4bit": True, "strict": False, "sequence_len": 2048, "adapter": 
"qlora", "sample_packing": True, "eval_sample_packing": True, "pad_to_sequence_len": True, "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "lora_modules_to_save": ["embed_tokens", "lm_head"], "val_set_size": 0.05, "special_tokens": {"pad_token": "<|endoftext|>"}, "datasets": [ { "path": "teknium/GPT4-LLM-Cleaned", "type": "alpaca", "split": "train[:5%]", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 2, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", "flash_attention": True, "loss_watchdog_threshold": 5.0, "loss_watchdog_patience": 3, "bf16": "auto", "warmup_steps": 1, "evals_per_epoch": 2, "eval_max_new_tokens": 128, "saves_per_epoch": 1, "logging_steps": 1, "weight_decay": 0.0, "use_tensorboard": True, "save_first_step": False, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "accelerate", "launch", "--num-processes", "2", "--main_process_port", f"{get_torch_dist_unique_port()}", "-m", "axolotl.cli.train", str(Path(temp_dir) / "config.yaml"), ] ) check_tensorboard(temp_dir + "/runs", "eval/loss", 2.5, "Eval Loss is too high") def test_eval(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "load_in_8bit": False, "load_in_4bit": True, "strict": False, "sequence_len": 2048, "adapter": "qlora", "sample_packing": True, "eval_sample_packing": False, "pad_to_sequence_len": True, "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "lora_modules_to_save": ["embed_tokens", "lm_head"], "val_set_size": 0.01, "special_tokens": {"pad_token": "<|endoftext|>"}, "datasets": [ { "path": "teknium/GPT4-LLM-Cleaned", "type": "alpaca", "split": 
"train[:5%]", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 2, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", "flash_attention": True, "loss_watchdog_threshold": 5.0, "loss_watchdog_patience": 3, "bf16": "auto", "warmup_steps": 1, "evals_per_epoch": 2, "eval_max_new_tokens": 128, "saves_per_epoch": 1, "logging_steps": 1, "weight_decay": 0.0, "use_tensorboard": True, "save_first_step": False, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "accelerate", "launch", "--num-processes", "2", "--main_process_port", f"{get_torch_dist_unique_port()}", "-m", "axolotl.cli.train", str(Path(temp_dir) / "config.yaml"), ] ) check_tensorboard(temp_dir + "/runs", "eval/loss", 2.9, "Eval Loss is too high") ================================================ FILE: tests/e2e/multigpu/test_fp8_fsdp2.py ================================================ """Test module for FP8 mixed precision with FSDP2 multi-GPU functionality.""" import os from pathlib import Path import torch import yaml from accelerate.test_utils import execute_subprocess_async from tbparse import SummaryReader from transformers.testing_utils import get_torch_dist_unique_port from axolotl.utils.dict import DictDefault from tests.e2e.utils import most_recent_subdir, require_torch_2_7_0, supports_fp8 AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent def verify_fp8_training_success(temp_dir): """Verify that FP8 training completed successfully by checking artifacts and loss.""" output_path = Path(temp_dir) model_files = list(output_path.glob("*.bin")) + list( output_path.glob("*.safetensors") ) assert len(model_files) > 0, "No model files found - training may have failed" 
checkpoint_files = list(output_path.glob("checkpoint-*")) assert len(checkpoint_files) > 0, ( "No checkpoint files found - training may have failed" ) tb_log_path = most_recent_subdir(temp_dir + "/runs") if tb_log_path: event_files = sorted(os.listdir(tb_log_path)) if event_files: event_file = os.path.join(tb_log_path, event_files[0]) reader = SummaryReader(event_file) df = reader.scalars train_loss_df = df[df.tag == "train/train_loss"] if len(train_loss_df) > 0: final_loss = train_loss_df.value.values[-1] assert not torch.isnan(torch.tensor(final_loss)), ( f"Training loss is NaN: {final_loss}" ) class TestFP8FSDP2: """Test class for FP8 mixed precision with FSDP2 functionality.""" @require_torch_2_7_0 @supports_fp8 def test_fp8_fsdp2_smoke(self, temp_dir): """Smoke test for 2-GPU FP8 + torch.compile + FSDP2 training""" cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "tokenizer_type": "AutoTokenizer", "trust_remote_code": True, "sequence_len": 512, "val_set_size": 0.05, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", }, ], "num_epochs": 1, "max_steps": 3, # Very short smoke test "micro_batch_size": 1, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", # Use standard optimizer for stability "lr_scheduler": "cosine", "sdp_attention": True, "pad_to_seq_len": True, "sample_packing": True, # FP8 configuration "fp8": True, "fp8_enable_fsdp_float8_all_gather": True, "torch_compile": True, # FSDP2 configuration "fsdp_version": 2, "fsdp_config": { "offload_params": False, "cpu_ram_efficient_loading": False, "transformer_layer_cls_to_wrap": "LlamaDecoderLayer", "state_dict_type": "FULL_STATE_DICT", "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "reshard_after_forward": True, }, "use_tensorboard": True, "save_first_step": False, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with 
open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) verify_fp8_training_success(temp_dir) ================================================ FILE: tests/e2e/multigpu/test_fsdp1.py ================================================ """Test module for FSDP1 multi-GPU functionality.""" import os from pathlib import Path import pytest import torch import yaml from accelerate.test_utils import execute_subprocess_async from tbparse import SummaryReader from transformers.testing_utils import get_torch_dist_unique_port from axolotl.utils.dict import DictDefault from tests.e2e.utils import most_recent_subdir AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent def verify_training_success(temp_dir): """Verify that training completed successfully by checking artifacts and loss.""" output_path = Path(temp_dir) model_files = list(output_path.glob("*.bin")) + list( output_path.glob("*.safetensors") ) assert len(model_files) > 0, "No model files found - training may have failed" checkpoint_files = list(output_path.glob("checkpoint-*")) assert len(checkpoint_files) > 0, ( "No checkpoint files found - training may have failed" ) tb_log_path = most_recent_subdir(temp_dir + "/runs") if tb_log_path: event_files = sorted(os.listdir(tb_log_path)) if event_files: event_file = os.path.join(tb_log_path, event_files[0]) reader = SummaryReader(event_file) df = reader.scalars train_loss_df = df[df.tag == "train/train_loss"] if len(train_loss_df) > 0: final_loss = train_loss_df.value.values[-1] assert not torch.isnan(torch.tensor(final_loss)), ( f"Training loss is NaN: {final_loss}" ) class TestFSDP1: """Test class for FSDP1 functionality.""" @pytest.mark.parametrize( "fsdp_cpu_ram_efficient_loading", [True, False], ) def test_fft_sft(self, temp_dir, 
fsdp_cpu_ram_efficient_loading): cfg = DictDefault( { "base_model": "Qwen/Qwen2.5-0.5B", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "fsdp_version": "1", "fsdp_config": { "fsdp_offload_params": False, "fsdp_cpu_ram_efficient_loading": fsdp_cpu_ram_efficient_loading, "fsdp_transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", "fsdp_state_dict_type": "FULL_STATE_DICT", "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "fsdp_sharding_strategy": "FULL_SHARD", "fsdp_sync_module_states": True, "fsdp_use_orig_params": False, }, "use_tensorboard": True, "bf16": True, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) verify_training_success(temp_dir) @pytest.mark.parametrize( "adapter_config", [ { "adapter": "lora", "load_in_4bit": False, }, { "adapter": "qlora", "load_in_4bit": True, }, ], ) def test_lora_sft(self, temp_dir, adapter_config): cfg = DictDefault( { "base_model": "Qwen/Qwen2.5-0.5B", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "adapter": adapter_config["adapter"], "load_in_4bit": adapter_config["load_in_4bit"], "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "num_epochs": 1, "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, 
"optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "fsdp_version": "1", "fsdp_config": { "fsdp_offload_params": False, "fsdp_cpu_ram_efficient_loading": True, "fsdp_transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", "fsdp_state_dict_type": "FULL_STATE_DICT", "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "fsdp_sharding_strategy": "FULL_SHARD", "fsdp_sync_module_states": True, "fsdp_use_orig_params": False, }, "use_tensorboard": True, "bf16": True, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) verify_training_success(temp_dir) @pytest.mark.skip(reason="slow test, deprecate fsdp1 asap") def test_dpo_fft(self, temp_dir): cfg = DictDefault( { "base_model": "Qwen/Qwen2.5-0.5B", "sequence_len": 2048, "val_set_size": 0.01, "rl": "dpo", "chat_template": "chatml", "datasets": [ { "path": "Intel/orca_dpo_pairs", "split": "train", "type": "chatml.intel", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "fsdp_version": "1", "fsdp_config": { "fsdp_offload_params": False, "fsdp_cpu_ram_efficient_loading": True, "fsdp_transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", "fsdp_state_dict_type": "FULL_STATE_DICT", "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "fsdp_sharding_strategy": "FULL_SHARD", "fsdp_sync_module_states": True, "fsdp_use_orig_params": False, }, "use_tensorboard": True, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") 
as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) verify_training_success(temp_dir) @pytest.mark.skip("broken in transformers v5") @pytest.mark.parametrize( "adapter_config", [ { "adapter": "lora", "load_in_4bit": False, }, { "adapter": "qlora", "load_in_4bit": True, }, ], ) def test_dpo_lora(self, temp_dir, adapter_config): cfg = DictDefault( { "base_model": "Qwen/Qwen2.5-0.5B", "load_in_4bit": adapter_config["load_in_4bit"], "rl": "dpo", "chat_template": "chatml", "sequence_len": 2048, "adapter": adapter_config["adapter"], "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "val_set_size": 0.01, "datasets": [ { "path": "Intel/orca_dpo_pairs", "split": "train", "type": "chatml.intel", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "fsdp_version": "1", "fsdp_config": { "fsdp_offload_params": False, "fsdp_cpu_ram_efficient_loading": True, "fsdp_transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", "fsdp_state_dict_type": "FULL_STATE_DICT", "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "fsdp_sharding_strategy": "FULL_SHARD", "fsdp_sync_module_states": True, "fsdp_use_orig_params": False, }, "use_tensorboard": True, "bf16": "auto", "tf32": True, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) verify_training_success(temp_dir) 
================================================ FILE: tests/e2e/multigpu/test_fsdp2.py ================================================ """Test module for FSDP2 multi-GPU functionality.""" import os from pathlib import Path import pytest import torch import yaml from accelerate.test_utils import execute_subprocess_async from tbparse import SummaryReader from transformers.testing_utils import get_torch_dist_unique_port from axolotl.utils.dict import DictDefault from tests.e2e.utils import most_recent_subdir, require_torch_2_7_0 AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent def verify_training_success(temp_dir): """Verify that training completed successfully by checking artifacts and loss.""" output_path = Path(temp_dir) model_files = list(output_path.glob("*.bin")) + list( output_path.glob("*.safetensors") ) assert len(model_files) > 0, "No model files found - training may have failed" checkpoint_files = list(output_path.glob("checkpoint-*")) assert len(checkpoint_files) > 0, ( "No checkpoint files found - training may have failed" ) tb_log_path = most_recent_subdir(temp_dir + "/runs") if tb_log_path: event_files = sorted(os.listdir(tb_log_path)) if event_files: event_file = os.path.join(tb_log_path, event_files[0]) reader = SummaryReader(event_file) df = reader.scalars train_loss_df = df[df.tag == "train/train_loss"] if len(train_loss_df) > 0: final_loss = train_loss_df.value.values[-1] assert not torch.isnan(torch.tensor(final_loss)), ( f"Training loss is NaN: {final_loss}" ) class TestFSDP2: """Test class for FSDP2 functionality.""" @require_torch_2_7_0 @pytest.mark.parametrize( "fsdp_cpu_ram_efficient_loading", [True, False], ) def test_fft_sft(self, temp_dir, fsdp_cpu_ram_efficient_loading): cfg = DictDefault( { "base_model": "Qwen/Qwen2.5-0.5B", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 2, 
"gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "fsdp_version": 2, "fsdp_config": { "offload_params": False, "cpu_ram_efficient_loading": fsdp_cpu_ram_efficient_loading, "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", "state_dict_type": "FULL_STATE_DICT", "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "reshard_after_forward": True, }, "use_tensorboard": True, "bf16": True, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) verify_training_success(temp_dir) @require_torch_2_7_0 @pytest.mark.parametrize("peft_use_dora", [True, False]) def test_lora_sft(self, temp_dir, peft_use_dora): cfg = DictDefault( { "base_model": "Qwen/Qwen2.5-0.5B", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "peft_use_dora": peft_use_dora, "adapter": "lora", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "num_epochs": 1, "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "fsdp_version": 2, "fsdp_config": { "offload_params": False, "cpu_ram_efficient_loading": False, "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", "state_dict_type": "FULL_STATE_DICT", "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "reshard_after_forward": True, }, "use_tensorboard": True, "bf16": True, # explicitly disable LORA kernels, as they may be auto-enabled "lora_mlp_kernel": False, 
"lora_qkv_kernel": False, "lora_o_kernel": False, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) verify_training_success(temp_dir) @require_torch_2_7_0 def test_lora_sft_kernels(self, temp_dir): cfg = DictDefault( { "base_model": "Qwen/Qwen2.5-0.5B", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "adapter": "lora", "lora_r": 8, "lora_alpha": 16, "lora_target_linear": True, "num_epochs": 1, "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "fsdp_version": 2, "fsdp_config": { "offload_params": False, "cpu_ram_efficient_loading": False, "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", "state_dict_type": "FULL_STATE_DICT", "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "reshard_after_forward": True, }, "use_tensorboard": True, "bf16": True, "lora_mlp_kernel": True, "lora_qkv_kernel": True, "lora_o_kernel": True, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) verify_training_success(temp_dir) @require_torch_2_7_0 def test_qlora_sft(self, temp_dir): cfg = DictDefault( { "base_model": "Qwen/Qwen2.5-0.5B", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ { 
"path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "load_in_4bit": True, "adapter": "qlora", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "num_epochs": 1, "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "fsdp_version": 2, "fsdp_config": { "offload_params": False, "cpu_ram_efficient_loading": False, "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", "state_dict_type": "FULL_STATE_DICT", "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "reshard_after_forward": True, }, "use_tensorboard": True, "bf16": True, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) verify_training_success(temp_dir) @require_torch_2_7_0 def test_qlora_sft_kernels(self, temp_dir): cfg = DictDefault( { "base_model": "Qwen/Qwen2.5-0.5B", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "load_in_4bit": True, "adapter": "qlora", "lora_r": 8, "lora_alpha": 16, "lora_target_linear": True, "num_epochs": 1, "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "fsdp_version": 2, "fsdp_config": { "offload_params": False, "cpu_ram_efficient_loading": False, "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", "state_dict_type": "FULL_STATE_DICT", "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "reshard_after_forward": True, }, 
"use_tensorboard": True, "bf16": True, "lora_mlp_kernel": True, "lora_qkv_kernel": True, "lora_o_kernel": True, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) verify_training_success(temp_dir) @pytest.mark.skip(reason="slow test w cu129 + torch 2.9.1 + py3.12") @require_torch_2_7_0 def test_dpo_fft(self, temp_dir): cfg = DictDefault( { "base_model": "Qwen/Qwen2.5-0.5B", "sequence_len": 2048, "val_set_size": 0.01, "rl": "dpo", "chat_template": "chatml", "datasets": [ { "path": "Intel/orca_dpo_pairs", "split": "train", "type": "chatml.intel", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "fsdp_version": 2, "fsdp_config": { "offload_params": False, "cpu_ram_efficient_loading": False, "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", "state_dict_type": "FULL_STATE_DICT", "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "reshard_after_forward": True, }, "use_tensorboard": True, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) verify_training_success(temp_dir) @pytest.mark.skip(reason="slow test w cu129 + torch 2.9.1 + py3.12") @require_torch_2_7_0 def test_dpo_lora(self, temp_dir): cfg = DictDefault( { "base_model": 
"Qwen/Qwen2.5-0.5B", "sequence_len": 2048, "rl": "dpo", "chat_template": "chatml", "datasets": [ { "path": "Intel/orca_dpo_pairs", "split": "train", "type": "chatml.intel", }, ], "adapter": "lora", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "num_epochs": 1, "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "fsdp_version": 2, "fsdp_config": { "offload_params": False, "cpu_ram_efficient_loading": False, "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", "state_dict_type": "FULL_STATE_DICT", "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "reshard_after_forward": True, }, "use_tensorboard": True, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) verify_training_success(temp_dir) ================================================ FILE: tests/e2e/multigpu/test_gemma3.py ================================================ """ E2E tests for multigpu lora tinyllama """ from pathlib import Path import pytest import yaml from accelerate.test_utils import execute_subprocess_async from huggingface_hub import snapshot_download from transformers.testing_utils import get_torch_dist_unique_port from axolotl.utils.dict import DictDefault from tests.e2e.utils import check_tensorboard AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent @pytest.fixture(scope="session", autouse=True) def download_model(): # download the model snapshot_download("axolotl-mirrors/gemma-3-4b-pt", repo_type="model") @pytest.mark.skip(reason="FIXME") class TestMultiGPUGemma3: """ Test case 
for Gemma3 models using LoRA """ def test_lora_ddp_packed(self, temp_dir): cfg = DictDefault( { "base_model": "axolotl-mirrors/gemma-3-4b-pt", "unfrozen_parameters": ["model.language_model.*", "lm_head"], "sequence_len": 2048, "ddp_find_unused_parameters": True, "sample_packing": True, "eval_sample_packing": False, "pad_to_sequence_len": True, "adapter": "lora", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "val_set_size": 0.0, "chat_template": "gemma3", "datasets": [ { "path": "mlabonne/FineTome-100k", "type": "chat_template", "split": "train[:10%]", "field_messages": "conversations", "message_field_role": "from", "message_field_content": "value", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 4, "gradient_checkpointing": True, "gradient_checkpointing_kwargs": { "use_reentrant": False, }, "gradient_accumulation_steps": 2, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.0001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", "flash_attention": True, "use_tensorboard": True, "bf16": True, "save_first_step": False, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) check_tensorboard( temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss (%s) is too high" ) ================================================ FILE: tests/e2e/multigpu/test_llama.py ================================================ """ E2E tests for multigpu lora tinyllama """ from pathlib import Path import pytest import transformers import yaml from accelerate.test_utils import execute_subprocess_async from huggingface_hub import snapshot_download from packaging import version 
from transformers.testing_utils import get_torch_dist_unique_port from axolotl.utils.dict import DictDefault from tests.e2e.utils import check_tensorboard, require_torch_2_6_0 AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent @pytest.fixture(scope="session", autouse=True) def download_model(): # download the model snapshot_download("HuggingFaceTB/SmolLM2-135M") def transformers_version_eq(required_version): return version.parse(transformers.__version__) == version.parse(required_version) class TestMultiGPULlama: """ Test case for Llama models using LoRA """ def test_lora_ddp(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "sequence_len": 2048, "adapter": "lora", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "val_set_size": 0.01, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 1, "gradient_accumulation_steps": 2, # "gradient_checkpointing": True, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", "flash_attention": True, "use_tensorboard": True, "bf16": True, "save_first_step": False, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) check_tensorboard( temp_dir + "/runs", "train/train_loss", 2.8, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( "gradient_accumulation_steps", [1, 2], ) def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps): cfg = DictDefault( { "base_model": 
"HuggingFaceTB/SmolLM2-135M", "sequence_len": 2048, "sample_packing": True, "eval_sample_packing": False, "pad_to_sequence_len": True, "adapter": "lora", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "val_set_size": 0.05, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:20%]", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 1, "gradient_accumulation_steps": gradient_accumulation_steps, # "gradient_checkpointing": True, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", "flash_attention": True, "use_tensorboard": True, "bf16": True, "save_first_step": False, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) check_tensorboard( temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) def test_dpo_lora_ddp(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "sequence_len": 2048, "sample_packing": False, "eval_sample_packing": False, "pad_to_sequence_len": True, "load_in_8bit": True, "adapter": "lora", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "val_set_size": 0.01, "special_tokens": { "pad_token": "<|endoftext|>", }, "rl": "dpo", "chat_template": "chatml", "datasets": [ { "path": "fozziethebeat/alpaca_messages_2k_dpo_test", "type": "chat_template.default", "field_messages": "conversation", "field_chosen": "chosen", "field_rejected": "rejected", "message_field_role": "role", "message_field_content": "content", "roles": { 
"system": ["system"], "user": ["user"], "assistant": ["assistant"], }, }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 2, "gradient_checkpointing": False, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "warmup_steps": 0, "learning_rate": 0.00001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", "flash_attention": True, "use_tensorboard": True, "bf16": True, "save_first_step": False, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) loss_threshold = 2.3 check_tensorboard( temp_dir + "/runs", "train/train_loss", loss_threshold, "Train Loss (%s) is too high", ) def test_dpo_qlora_ddp(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "sequence_len": 2048, "sample_packing": False, "eval_sample_packing": False, "pad_to_sequence_len": True, "load_in_4bit": True, "adapter": "qlora", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "val_set_size": 0.01, "special_tokens": { "pad_token": "<|endoftext|>", }, "rl": "dpo", "chat_template": "chatml", "datasets": [ { "path": "fozziethebeat/alpaca_messages_2k_dpo_test", "type": "chat_template.default", "field_messages": "conversation", "field_chosen": "chosen", "field_rejected": "rejected", "message_field_role": "role", "message_field_content": "content", "roles": { "system": ["system"], "user": ["user"], "assistant": ["assistant"], }, }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 2, "gradient_checkpointing": False, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "warmup_steps": 0, 
"learning_rate": 0.00001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", "flash_attention": True, "use_tensorboard": True, "bf16": True, "save_first_step": False, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) loss_threshold = 2.3 check_tensorboard( temp_dir + "/runs", "train/train_loss", loss_threshold, "Train Loss (%s) is too high", ) @pytest.mark.parametrize( "gradient_accumulation_steps", [1, 2], ) def test_fsdp(self, temp_dir, gradient_accumulation_steps): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "sequence_len": 2048, "val_set_size": 0.01, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": gradient_accumulation_steps, # "gradient_checkpointing": True, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "fsdp": [ "full_shard", "auto_wrap", ], "fsdp_config": { "fsdp_offload_params": False, "fsdp_sync_module_states": True, "fsdp_use_orig_params": False, "fsdp_cpu_ram_efficient_loading": False, "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer", "fsdp_state_dict_type": "FULL_STATE_DICT", "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", }, "use_tensorboard": True, "seed": 42, "save_first_step": False, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: 
fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) check_tensorboard( temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( "fsdp_state_dict_type", [ "FULL_STATE_DICT", # "SHARDED_STATE_DICT", # not supported since intermediate checkpoints fail with fsdp1 ], ) def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "sample_packing": True, "pad_to_sequence_len": True, "sequence_len": 1024, "val_set_size": 0.05, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "num_epochs": 1, "max_steps": 3, "save_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 2, # "gradient_checkpointing": True, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "fsdp": [ "full_shard", "auto_wrap", ], "fsdp_config": { "fsdp_offload_params": False, "fsdp_sync_module_states": True, "fsdp_use_orig_params": False, "fsdp_cpu_ram_efficient_loading": False, "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer", "fsdp_state_dict_type": fsdp_state_dict_type, "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", }, "use_tensorboard": True, "save_first_step": False, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) check_tensorboard( temp_dir 
+ "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @require_torch_2_6_0 @pytest.mark.parametrize( "attention_backend", ["flash", "flex"], ) @pytest.mark.parametrize( "fsdp_reshard_after_forward", [True, False], ) def test_fsdp2_packed( self, temp_dir, attention_backend, fsdp_reshard_after_forward ): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "sample_packing": True, "pad_to_sequence_len": True, "sequence_len": 2048, "val_set_size": 0.1, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 4, "gradient_accumulation_steps": 2, "gradient_checkpointing": True, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_8bit", "lr_scheduler": "cosine", "fsdp": [ "auto_wrap", ], "fsdp_config": { "fsdp_version": 2, # "fsdp_forward_prefetch": True, # not yet implemented in accelerate "fsdp_offload_params": False, "fsdp_cpu_ram_efficient_loading": False, "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer", "fsdp_state_dict_type": "SHARDED_STATE_DICT", "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "fsdp_reshard_after_forward": fsdp_reshard_after_forward, }, "use_tensorboard": True, "save_first_step": False, } ) if attention_backend == "flash": cfg.flash_attention = True elif attention_backend == "flex": cfg.flex_attention = True # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) check_tensorboard( temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high" ) def 
test_fsdp_qlora_prequant_packed(self, temp_dir): cfg = DictDefault( { "base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16", "adapter": "qlora", "mean_resizing_embeddings": True, "load_in_4bit": True, "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, # "lora_modules_to_save": [ # "embed_tokens", # "lm_head", # ], "sample_packing": True, "eval_sample_packing": False, "pad_to_sequence_len": True, "sequence_len": 1024, "val_set_size": 0.01, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 2, # "gradient_checkpointing": True, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "fsdp": [ "full_shard", "auto_wrap", ], "fsdp_config": { "fsdp_offload_params": False, "fsdp_sync_module_states": True, "fsdp_use_orig_params": False, "fsdp_cpu_ram_efficient_loading": True, "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer", "fsdp_state_dict_type": "FULL_STATE_DICT", "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", }, "use_tensorboard": True, "save_first_step": False, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) check_tensorboard( temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( "gradient_accumulation_steps", [1, 2], ) @pytest.mark.parametrize( "deepspeed", [ "deepspeed_configs/zero3_bf16.json", 
"deepspeed_configs/zero3_bf16_cpuoffload_all.json", # "deepspeed_configs/zero3_bf16_cpuoffload_params.json", ], ) @pytest.mark.parametrize( "qlora", [True, False], ) def test_ds_zero3_packed( self, temp_dir, gradient_accumulation_steps, deepspeed, qlora ): if qlora: adapter = { "adapter": "qlora", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "load_in_4bit": True, } else: adapter = {} cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "sample_packing": True, "pad_to_sequence_len": True, "sequence_len": 1024, "val_set_size": 0.05, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 1, "gradient_accumulation_steps": gradient_accumulation_steps, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "deepspeed": str(AXOLOTL_ROOT / deepspeed), "use_tensorboard": True, "save_first_step": False, **adapter, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) check_tensorboard( temp_dir + "/runs", "train/train_loss", 2.45, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( "gradient_accumulation_steps", [1, 2], ) @pytest.mark.parametrize( "qlora", [True, False], ) def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora): if qlora: adapter = { "adapter": "qlora", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "load_in_4bit": True, } else: adapter = {} cfg = 
DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "sample_packing": True, "pad_to_sequence_len": True, "sequence_len": 1024, "val_set_size": 0.01, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 1, "gradient_accumulation_steps": gradient_accumulation_steps, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"), "use_tensorboard": True, "seed": 42, "save_first_step": False, **adapter, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) check_tensorboard( temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( "gradient_accumulation_steps", [1, 2], ) @pytest.mark.parametrize( "qlora", [True, False], ) def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora): if qlora: adapter = { "adapter": "qlora", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "load_in_4bit": True, } else: adapter = {} cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "sample_packing": True, "pad_to_sequence_len": True, "sequence_len": 1024, "val_set_size": 0.01, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 1, "gradient_accumulation_steps": 
gradient_accumulation_steps, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"), "use_tensorboard": True, "save_first_step": False, **adapter, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) check_tensorboard( temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high" ) @pytest.mark.skip( reason="fix untrained tokens brittle with lots of edge cases in latest transformers" ) def test_fix_untrained_tokens(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "fix_untrained_tokens": True, "sequence_len": 512, "val_set_size": 0.0, "special_tokens": { "pad_token": "<|endoftext|>", "bos_token": "<|custom_im_start|>", "eos_token": "<|custom_im_end|>", }, "datasets": [ { "chat_template": "jinja", "chat_template_jinja": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|custom_im_start|>' + message['role'] + '\n' + message['content'] + '<|custom_im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|custom_im_start|>assistant\n' }}{% endif %}", "path": "mlabonne/FineTome-100k", "type": "chat_template", "split": "train[:10%]", "field_messages": "conversations", "message_field_role": "from", "message_field_content": "value", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 1, "gradient_accumulation_steps": 1, # "gradient_checkpointing": True, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": 
"adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "sample_packing": True, "bf16": True, # "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"), "use_tensorboard": True, "save_first_step": False, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) check_tensorboard( temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss (%s) is too high" ) ================================================ FILE: tests/e2e/multigpu/test_locking.py ================================================ """Tests for FileLockLoader class.""" import tempfile import threading import time from pathlib import Path from unittest.mock import MagicMock, Mock, patch import pytest from axolotl.utils.data.lock import FileLockLoader from axolotl.utils.dict import DictDefault class TestFileLockLoader: """Class with tests for FileLockLoader.""" @pytest.fixture def temp_dir(self): """Create a temporary directory for testing.""" with tempfile.TemporaryDirectory() as tmp_dir: yield Path(tmp_dir) @pytest.fixture def cfg(self, temp_dir): """Create a test configuration.""" return DictDefault({"dataset_prepared_path": str(temp_dir)}) @pytest.fixture def loader(self, cfg): """Create a FileLockLoader instance for testing.""" return FileLockLoader(cfg) def test_load_first_process(self, loader): """Test load() when no ready flag exists (first process).""" mock_load_fn = Mock(return_value="test_data") result = loader.load(mock_load_fn) # Should call the load function mock_load_fn.assert_called_once() assert result == "test_data" # Should create the ready flag assert loader.ready_flag_path.exists() def test_load_subsequent_process(self, loader): """Test load() 
when ready flag already exists (subsequent process).""" # Create ready flag first loader.ready_flag_path.touch() mock_load_fn = Mock(return_value="loaded_data") result = loader.load(mock_load_fn) # Should still call load function (to load the prepared data) mock_load_fn.assert_called_once() assert result == "loaded_data" def test_load_concurrent_processes(self, cfg): """Test that concurrent processes coordinate correctly.""" results = [] call_count = 0 def slow_load_fn(): nonlocal call_count call_count += 1 time.sleep(0.1) # Simulate slow loading return f"data_{call_count}" def worker(): loader = FileLockLoader(cfg) result = loader.load(slow_load_fn) results.append(result) # Start multiple threads simultaneously threads = [threading.Thread(target=worker) for _ in range(3)] for t in threads: t.start() for t in threads: t.join() # Only one thread should have done the initial loading # All should return data, but the load function should be called # once by the first process and once by each subsequent process assert len(results) == 3 assert all(result.startswith("data_") for result in results) @patch("time.sleep") def test_load_waiting_for_ready_flag(self, mock_sleep, loader): """Test that processes wait for the ready flag to appear.""" mock_load_fn = Mock(return_value="waiting_data") mock_ready_flag_path = Mock() exists_call_count = 0 def mock_exists(): nonlocal exists_call_count exists_call_count += 1 if exists_call_count == 1: # First check: ready flag exists (not first process) return True if exists_call_count <= 3: # While loop checks: flag doesn't exist yet return False return True mock_ready_flag_path.exists.side_effect = mock_exists # Replace the ready_flag_path with our mock original_path = loader.ready_flag_path loader.ready_flag_path = mock_ready_flag_path try: result = loader.load(mock_load_fn) finally: # Restore original path loader.ready_flag_path = original_path # Should have slept twice while waiting assert mock_sleep.call_count == 2 
mock_sleep.assert_called_with(1) # Should eventually call load function mock_load_fn.assert_called_once() assert result == "waiting_data" def test_complete_workflow_with_cleanup(self, loader): """Test the complete load -> cleanup workflow.""" mock_load_fn = Mock(return_value="test_data") # First process calls load (this should set up counter) result = loader.load(mock_load_fn) assert result == "test_data" assert loader.ready_flag_path.exists() assert loader.counter_path.exists() # Cleanup should remove everything since there's only one process loader.cleanup() assert not loader.ready_flag_path.exists() assert not loader.counter_path.exists() def test_multiple_processes_workflow(self, loader): """Test workflow with multiple processes.""" # Simulate multiple processes by manually setting up counter loader.ready_flag_path.touch() loader.counter_path.write_text("3") # 3 processes # First process cleanup loader.cleanup() assert loader.ready_flag_path.exists() assert loader.counter_path.read_text().strip() == "2" # Second process cleanup loader.cleanup() assert loader.ready_flag_path.exists() assert loader.counter_path.read_text().strip() == "1" # Last process cleanup loader.cleanup() assert not loader.ready_flag_path.exists() assert not loader.counter_path.exists() def test_load_exception_handling(self, loader): """Test behavior when load_fn raises an exception.""" def failing_load_fn(): raise ValueError("Load failed") with pytest.raises(ValueError, match="Load failed"): loader.load(failing_load_fn) # Ready flag should not be created on failure assert not loader.ready_flag_path.exists() def test_file_lock_called(self, loader): """Test that FileLock is properly used.""" mock_load_fn = Mock(return_value="locked_data") with patch("axolotl.utils.data.lock.FileLock") as mock_filelock: mock_context = MagicMock() mock_filelock.return_value.__enter__ = Mock(return_value=mock_context) mock_filelock.return_value.__exit__ = Mock(return_value=None) loader.load(mock_load_fn) # 
Verify FileLock was called with correct path mock_filelock.assert_called_once_with(str(loader.lock_file_path)) # Verify context manager was used mock_filelock.return_value.__enter__.assert_called_once() mock_filelock.return_value.__exit__.assert_called_once() ================================================ FILE: tests/e2e/multigpu/test_ray.py ================================================ """ E2E tests for multigpu post-training use Ray Train """ from pathlib import Path import pytest import yaml from accelerate.test_utils import execute_subprocess_async from axolotl.utils.dict import DictDefault from tests.e2e.utils import ( check_tensorboard, require_torch_2_7_0, ) AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent class TestMultiGPURay: """ Test cases for AnyScale Ray post training """ @require_torch_2_7_0 def test_lora_ddp(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "sequence_len": 1024, "adapter": "lora", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "val_set_size": 0.05, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 4, "gradient_accumulation_steps": 2, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", "flash_attention": True, "use_tensorboard": True, "use_ray": True, "ray_num_workers": 2, "save_first_step": False, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--use-ray", "--ray-num-workers", "2", ] ) check_tensorboard( temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss 
(%s) is too high" ) @require_torch_2_7_0 @pytest.mark.parametrize( "gradient_accumulation_steps", [1, 2], ) def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "sample_packing": True, "pad_to_sequence_len": True, "sequence_len": 1024, "val_set_size": 0.01, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 1, "gradient_accumulation_steps": gradient_accumulation_steps, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch", "lr_scheduler": "cosine", "flash_attention": True, "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"), "use_tensorboard": True, "save_first_step": False, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--use-ray", "--ray-num-workers", "2", ] ) check_tensorboard( temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @require_torch_2_7_0 @pytest.mark.parametrize( "gradient_accumulation_steps", [1, 2], ) def test_sft_fsdp2_packed(self, temp_dir, gradient_accumulation_steps): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "sample_packing": True, "pad_to_sequence_len": True, "sequence_len": 1024, "val_set_size": 0.01, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 1, "gradient_accumulation_steps": gradient_accumulation_steps, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + 
"/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch", "lr_scheduler": "cosine", "flash_attention": True, "fsdp_version": 2, "fsdp_config": { "offload_params": False, "cpu_ram_efficient_loading": False, "transformer_layer_cls_to_wrap": "LlamaDecoderLayer", "state_dict_type": "FULL_STATE_DICT", "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "reshard_after_forward": True, }, "use_tensorboard": True, "save_first_step": False, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--use-ray", "--ray-num-workers", "2", ] ) check_tensorboard( temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) ================================================ FILE: tests/e2e/multigpu/test_tp.py ================================================ """multigpu e2e test for tensor parallelism.""" from pathlib import Path import pytest import yaml from accelerate.test_utils import execute_subprocess_async, get_torch_dist_unique_port from axolotl.utils.dict import DictDefault from tests.e2e.utils import check_tensorboard, require_torch_2_7_0 class TestTensorParallel: """Test class for Tensor Parallel functionality.""" @pytest.mark.skip( reason="TP doesn't work with models with tied weights (embeddings)" ) @require_torch_2_7_0 def test_fft_sft(self, temp_dir): cfg = DictDefault( { "base_model": "Qwen/Qwen2.5-0.5B", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "num_epochs": 1, "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch", "tensor_parallel_size": 2, "lr_scheduler": "cosine", "flash_attention": True, "use_tensorboard": True, 
"bf16": True, } ) # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) execute_subprocess_async( [ "axolotl", "train", str(Path(temp_dir) / "config.yaml"), "--num-processes", "2", "--main-process-port", f"{get_torch_dist_unique_port()}", ] ) check_tensorboard( temp_dir + "/runs", "train/train_loss", 1.0, "Train Loss (%s) is too high" ) ================================================ FILE: tests/e2e/patched/__init__.py ================================================ ================================================ FILE: tests/e2e/patched/lora_kernels/__init__.py ================================================ ================================================ FILE: tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py ================================================ """Integration tests for LoRA activation and attention kernels.""" from pathlib import Path import pytest import torch import yaml from accelerate.state import PartialState from peft import PeftModelForCausalLM, get_peft_config from transformers import AutoModelForCausalLM, LlamaForCausalLM from transformers.models.llama.configuration_llama import LlamaConfig from transformers.models.llama.modeling_llama import LlamaAttention from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeAttention from axolotl.cli.config import load_cfg from axolotl.kernels.lora import ( apply_lora_mlp_geglu, apply_lora_mlp_swiglu, apply_lora_o, apply_lora_qkv, ) from axolotl.loaders.model import ModelLoader from axolotl.loaders.tokenizer import load_tokenizer from axolotl.monkeypatch.lora_kernels import ( apply_lora_kernel_patches, find_self_attn_in_layer, get_attention_cls_from_config, get_layers, patch_self_attn_lora, ) from axolotl.utils.dict import DictDefault MODEL_CONFIGS = [ { "name": "trl-internal-testing/tiny-MistralForCausalLM-0.2", 
"expected_activation": apply_lora_mlp_swiglu, "dtype": torch.float16, }, { "name": "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5", "expected_activation": apply_lora_mlp_swiglu, "dtype": torch.float16, }, { "name": "HuggingFaceTB/SmolLM2-135M", "expected_activation": apply_lora_mlp_swiglu, "dtype": torch.float32, }, { "name": "trl-internal-testing/tiny-Gemma2ForCausalLM", "expected_activation": apply_lora_mlp_geglu, "dtype": torch.float16, }, ] @pytest.fixture(autouse=True) def init_accelerate(): """Initialize Accelerate state before tests.""" _ = PartialState() @pytest.fixture def small_llama_model(): """Create a small LLaMA model for testing.""" config = { "vocab_size": 100, "hidden_size": 128, "intermediate_size": 256, "num_hidden_layers": 2, "num_attention_heads": 4, } return LlamaForCausalLM(LlamaConfig(**config)) @pytest.mark.parametrize( "model_name,attention_cls", [ ("HuggingFaceTB/SmolLM2-135M", LlamaAttention), ("Qwen/Qwen3-30B-A3B", Qwen3MoeAttention), ], ) def test_attention_patching_integration(model_name, attention_cls): """Test attention patching in integration context.""" cfg = DictDefault({"base_model": model_name}) # Store the original implementation original_forward = attention_cls.forward # Apply patch patch_self_attn_lora(cfg) # Get the new forward method patched_forward = attention_cls.forward # Check the forward method was replaced assert original_forward is not patched_forward assert patched_forward.__name__ == "axolotl_attn_forward" # Check original implementation was stored assert hasattr(attention_cls, "_original_forward") # Clean up attention_cls.forward = original_forward delattr(attention_cls, "_original_forward") def test_swiglu_mlp_integration(small_llama_model): """Test SwiGLU activation in LoRA MLP context.""" peft_config = get_peft_config( { "peft_type": "LORA", "task_type": "CAUSAL_LM", "r": 8, "lora_alpha": 16, "target_modules": ["gate_proj", "up_proj", "down_proj"], "lora_dropout": 0, "bias": "none", } ) model = 
def test_geglu_model_integration():
    """Check that a tiny Gemma2 LoRA model gets the GeGLU MLP kernel.

    Loads the model in float16 on ``cuda:0``, wraps it with a LoRA adapter
    on the MLP projections, applies the kernel patches with only
    ``lora_mlp_kernel`` enabled, and compares logits on random token ids.
    """
    base_model = AutoModelForCausalLM.from_pretrained(
        "trl-internal-testing/tiny-Gemma2ForCausalLM",
        dtype=torch.float16,
        device_map="cuda:0",
    )
    lora_settings = {
        "peft_type": "LORA",
        "task_type": "CAUSAL_LM",
        "r": 8,
        "lora_alpha": 16,
        "target_modules": ["gate_proj", "up_proj", "down_proj"],
        "lora_dropout": 0,
        "bias": "none",
    }
    model = PeftModelForCausalLM(base_model, get_peft_config(lora_settings))

    kernel_cfg = DictDefault({"lora_mlp_kernel": True})
    patched_model = apply_lora_kernel_patches(model, kernel_cfg)

    # The first decoder layer's MLP must now dispatch to the GeGLU kernel.
    first_layer = patched_model.model.model.layers[0]
    assert first_layer.mlp.forward.__func__ is apply_lora_mlp_geglu

    # End-to-end forward pass on random token ids.
    # NOTE(review): if apply_lora_kernel_patches mutates `model` in place and
    # returns it, both forward passes below run the same patched module and
    # this comparison cannot fail - confirm against the implementation.
    token_ids = torch.randint(0, 100, (1, 20), device=model.device, dtype=torch.long)
    with torch.no_grad():
        original_output = model(token_ids).logits
        patched_output = patched_model(token_ids).logits

    assert torch.allclose(original_output, patched_output, rtol=1e-4)
"model_name,expected_activation", [ ("HuggingFaceTB/SmolLM2-135M", apply_lora_mlp_swiglu), ("mhenrichsen/gemma-2b", apply_lora_mlp_geglu), ], ) def test_model_specific_activation(model_name, expected_activation): """Test that each model type gets the correct activation function.""" model = AutoModelForCausalLM.from_pretrained(model_name) peft_config = get_peft_config( { "peft_type": "LORA", "task_type": "CAUSAL_LM", "r": 8, "lora_alpha": 16, "target_modules": ["gate_proj", "up_proj", "down_proj"], "lora_dropout": 0, "bias": "none", } ) model = PeftModelForCausalLM(model, peft_config) cfg = DictDefault({"lora_mlp_kernel": True}) patched_model = apply_lora_kernel_patches(model, cfg) layer = patched_model.model.model.layers[0] assert layer.mlp.forward.__func__ is expected_activation def test_kernel_patch_conditions(): """Test various conditions that should prevent kernel patching.""" test_configs = [ # Dropout prevents patching { "peft_type": "LORA", "task_type": "CAUSAL_LM", "r": 8, "lora_alpha": 16, "target_modules": ["gate_proj", "up_proj", "down_proj"], "lora_dropout": 0.1, "bias": "none", }, # Bias prevents patching { "peft_type": "LORA", "task_type": "CAUSAL_LM", "r": 8, "lora_alpha": 16, "target_modules": ["gate_proj", "up_proj", "down_proj"], "lora_dropout": 0, "bias": "lora_only", }, ] for config in test_configs: model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M") peft_config = get_peft_config(config) model = PeftModelForCausalLM(model, peft_config) cfg = DictDefault({"lora_mlp_kernel": True}) # Should not patch patched_model = apply_lora_kernel_patches(model, cfg) layer = patched_model.model.model.layers[0].mlp # Verify no patches applied assert layer.forward.__func__ is not apply_lora_mlp_swiglu assert layer.forward.__func__ is not apply_lora_mlp_geglu def test_kernel_config_options(): """Test that kernel configuration options are respected.""" # Test different configurations test_configs = [ ( {"lora_mlp_kernel": True, 
"lora_qkv_kernel": False, "lora_o_kernel": False}, lambda layer: ( layer.mlp.forward.__func__ is apply_lora_mlp_swiglu and layer.self_attn.apply_qkv.__func__ is not apply_lora_qkv and layer.self_attn.apply_o.__func__ is not apply_lora_o ), ), ( {"lora_mlp_kernel": False, "lora_qkv_kernel": True, "lora_o_kernel": False}, lambda layer: ( layer.mlp.forward.__func__ is not apply_lora_mlp_swiglu and layer.self_attn.apply_qkv.__func__ is apply_lora_qkv and layer.self_attn.apply_o.__func__ is not apply_lora_o ), ), ( {"lora_mlp_kernel": False, "lora_qkv_kernel": False, "lora_o_kernel": True}, lambda layer: ( layer.mlp.forward.__func__ is not apply_lora_mlp_swiglu and layer.self_attn.apply_qkv.__func__ is not apply_lora_qkv and layer.self_attn.apply_o.__func__ is apply_lora_o ), ), ] for config_dict, check_fn in test_configs: # Create fresh model for each test config = { "vocab_size": 100, "hidden_size": 128, "intermediate_size": 256, "num_hidden_layers": 2, "num_attention_heads": 4, } small_llama_model = LlamaForCausalLM(LlamaConfig(**config)) peft_config = get_peft_config( { "peft_type": "LORA", "task_type": "CAUSAL_LM", "r": 8, "lora_alpha": 16, "target_modules": [ "gate_proj", "up_proj", "down_proj", "q_proj", "k_proj", "v_proj", "o_proj", ], "lora_dropout": 0, "bias": "none", } ) model = PeftModelForCausalLM(small_llama_model, peft_config).to("cuda") cfg = DictDefault(config_dict) patched_model = apply_lora_kernel_patches(model, cfg) # Verify only requested optimizations were applied for layer in patched_model.model.model.layers: assert check_fn(layer), f"Failed for config: {config_dict}" # Clean up del model del small_llama_model del patched_model def get_lora_config(): """Get standard LoRA configuration for testing.""" return { "peft_type": "LORA", "task_type": "CAUSAL_LM", "r": 8, "lora_alpha": 16, "target_modules": ["gate_proj", "up_proj", "down_proj"], "lora_dropout": 0, "bias": "none", } def get_test_inputs(model, seq_length=20): """Generate test inputs for 
@pytest.mark.parametrize("model_config", MODEL_CONFIGS)
def test_model_architecture(model_config):
    """Run the LoRA MLP kernel patch on each configured architecture.

    For every entry of ``MODEL_CONFIGS`` the model is loaded on ``cuda:0``
    with the entry's dtype, wrapped in a LoRA adapter, patched, and then
    checked twice: the first layer's MLP forward must be the entry's
    ``expected_activation``, and the logits of ``model`` and of the value
    returned by the patch call must agree within ``rtol=1e-4``.
    """
    pretrained = AutoModelForCausalLM.from_pretrained(
        model_config["name"],
        torch_dtype=model_config["dtype"],
        device_map="cuda:0",
    )

    # Attach the standard MLP-only LoRA adapter.
    lora = get_peft_config(get_lora_config())
    model = PeftModelForCausalLM(pretrained, lora)

    # Request only the MLP kernel.
    patched_model = apply_lora_kernel_patches(
        model, DictDefault({"lora_mlp_kernel": True})
    )

    # The bound forward must be the activation-specific kernel.
    first_layer = patched_model.model.model.layers[0]
    assert first_layer.mlp.forward.__func__ is model_config["expected_activation"], (
        f"Wrong activation for {model_config['name']}"
    )

    # NOTE(review): both forward passes below run after patching. If
    # apply_lora_kernel_patches mutates `model` in place and returns it, this
    # compares the patched model with itself - confirm whether a pre-patch
    # baseline was intended.
    token_ids = get_test_inputs(model)
    with torch.no_grad():
        original_output = model(token_ids).logits
        patched_output = patched_model(token_ids).logits

    assert torch.allclose(original_output, patched_output, rtol=1e-4), (
        f"Outputs don't match for {model_config['name']}"
    )
def test_kernel_training_integration_auto_enable(temp_dir):
    """Load a LoRA config with no kernel flags and expect them to be switched on.

    The config is round-tripped through a YAML file and ``load_cfg``; the three
    ``lora_*_kernel`` options must come back as ``True``. Loading the model
    must then replace the attention class's ``forward`` (keeping the old one
    under ``_original_forward``) and give at least one self-attention module
    callable ``apply_qkv`` / ``apply_o`` attributes.
    """
    # Minimal config: deliberately no lora_*_kernel keys.
    raw_cfg = DictDefault(
        {
            "base_model": "HuggingFaceTB/SmolLM2-135M",
            "tokenizer_config": "HuggingFaceTB/SmolLM2-135M",
            "learning_rate": 0.000001,
            "datasets": [
                {
                    "path": "mhenrichsen/alpaca_2k_test",
                    "type": "alpaca",
                }
            ],
            "micro_batch_size": 1,
            "gradient_accumulation_steps": 1,
            "adapter": "lora",
            "lora_r": 8,
            "lora_alpha": 16,
            "lora_dropout": 0.0,
            "lora_target_linear": True,
            "sequence_len": 1024,
        }
    )

    # Persist to YAML and read it back through the CLI loader.
    config_file = Path(temp_dir) / "config.yaml"
    config_file.write_text(
        yaml.dump(raw_cfg.to_dict(), Dumper=yaml.Dumper), encoding="utf-8"
    )
    cfg = load_cfg(str(config_file))

    # The loader must have enabled all three kernels.
    assert cfg.lora_mlp_kernel is True
    assert cfg.lora_qkv_kernel is True
    assert cfg.lora_o_kernel is True

    # Snapshot the attention class's forward before the model is loaded.
    attention_cls = get_attention_cls_from_config(cfg)
    forward_before = attention_cls.forward

    # Loading the model is what should apply the patches.
    tokenizer = load_tokenizer(cfg)
    model, _ = ModelLoader(cfg, tokenizer).load()

    # Class-level side effects of the self-attention LoRA patch.
    assert hasattr(attention_cls, "_original_forward")
    assert attention_cls.forward != forward_before

    # Look for the first attention module that has all four projections and
    # verify the kernel entry points were attached to it.
    projection_names = ["q_proj", "k_proj", "v_proj", "o_proj"]
    found_patched_attn = False
    for decoder_layer in model.model.model.layers:
        if not hasattr(decoder_layer, "self_attn"):
            continue
        attn = decoder_layer.self_attn
        if not all(hasattr(attn, name) for name in projection_names):
            continue
        assert hasattr(attn, "apply_qkv") and callable(attn.apply_qkv)
        assert hasattr(attn, "apply_o") and callable(attn.apply_o)
        found_patched_attn = True
        break

    assert found_patched_attn
class Test4dMultipackLlama(unittest.TestCase):
    """
    Test case for Llama models using 4d attention with multipack
    """

    def _train_packed_lora(self, temp_dir, *, sdp_attention):
        """Train the 8-bit LoRA packing config and check that output was saved.

        The two tests differ only in ``sdp_attention``; everything else
        (model, adapter, dataset, schedule) is shared and lives here.
        """
        settings = {
            "base_model": "HuggingFaceTB/SmolLM2-135M",
            "flash_attention": False,
            "sdp_attention": sdp_attention,
            "sample_packing": True,
            "pad_to_sequence_len": True,
            "load_in_8bit": True,
            "adapter": "lora",
            "lora_r": 32,
            "lora_alpha": 16,
            "lora_dropout": 0.05,
            "lora_target_linear": True,
            "sequence_len": 1024,
            "val_set_size": 0.02,
            "special_tokens": {
                "pad_token": "<|endoftext|>",
            },
            "datasets": [
                {
                    "path": "mhenrichsen/alpaca_2k_test",
                    "type": "alpaca",
                },
            ],
            "num_epochs": 2,
            "micro_batch_size": 2,
            "gradient_accumulation_steps": 1,
            "output_dir": temp_dir,
            "learning_rate": 0.00001,
            "optimizer": "adamw_torch_fused",
            "lr_scheduler": "cosine",
            "max_steps": 5,
            "save_steps": 3,
            "eval_steps": 4,
            "fp16": True,
            "save_first_step": False,
        }

        cfg = validate_config(DictDefault(settings))
        normalize_config(cfg)
        train(cfg=cfg, dataset_meta=load_datasets(cfg=cfg))
        check_model_output_exists(temp_dir, cfg)

    @with_temp_dir
    def test_sdp_lora_packing(self, temp_dir):
        # Packing with SDP attention enabled (flash attention off).
        self._train_packed_lora(temp_dir, sdp_attention=True)

    @with_temp_dir
    def test_torch_lora_packing(self, temp_dir):
        # Packing with both flash and SDP attention disabled.
        self._train_packed_lora(temp_dir, sdp_attention=False)
class TestPluginArgs:
    """
    test class for plugin args loaded from the config file
    """

    def test_liger_plugin_args(self, temp_dir):
        """Liger plugin flags written to YAML must survive ``load_cfg`` unchanged."""
        # Flags owned by the Liger plugin, with the exact values expected back.
        liger_flags = {
            "liger_layer_norm": True,
            "liger_rope": True,
            "liger_rms_norm": False,
            "liger_glu_activation": True,
            "liger_fused_linear_cross_entropy": True,
        }
        test_cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "learning_rate": 0.000001,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "plugins": ["axolotl.integrations.liger.LigerPlugin"],
                **liger_flags,
            }
        )

        config_file = Path(temp_dir) / "config.yaml"
        config_file.write_text(yaml.dump(test_cfg.to_dict()), encoding="utf-8")

        cfg = load_cfg(str(config_file))

        # Identity checks: the values must be real booleans, not truthy stand-ins.
        for flag_name, expected in liger_flags.items():
            assert getattr(cfg, flag_name) is expected
class TestFAXentropyLlama:
    """
    Test case for Llama models using LoRA w multipack
    """

    @pytest.mark.parametrize(
        "gradient_accumulation_steps",
        [1, 4],
    )
    def test_lora_packing_fa_cross_entropy(self, temp_dir, gradient_accumulation_steps):
        """Train 8-bit LoRA with flash-attn cross entropy and bound the logged loss.

        Runs for both gradient-accumulation settings, then checks that model
        output exists and that the tensorboard ``train/train_loss`` scalar
        passes the ``1.5`` threshold given to ``check_tensorboard``.
        """
        finetome_dataset = {
            "path": "mlabonne/FineTome-100k",
            "field_messages": "conversations",
            "message_field_content": "value",
            "message_field_role": "from",
            "type": "chat_template",
            "split": "train[:2%]",
        }
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 1024,
                "sample_packing": True,
                "flash_attention": True,
                "flash_attn_cross_entropy": True,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.05,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "chat_template": "chatml",
                "datasets": [finetome_dataset],
                "num_epochs": 1,
                "max_steps": 5,
                "save_steps": 5,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": gradient_accumulation_steps,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )

        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        precision_flag = "bf16" if is_torch_bf16_gpu_available() else "fp16"
        setattr(cfg, precision_flag, True)

        cfg = validate_config(cfg)
        normalize_config(cfg)

        train(cfg=cfg, dataset_meta=load_datasets(cfg=cfg))

        check_model_output_exists(temp_dir, cfg)
        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high"
        )
class TestFalconPatched(unittest.TestCase):
    """
    Test case for Falcon models
    """

    def _train_falcon(self, temp_dir, adapter_settings=None):
        """Train the tiny Falcon model with sample packing and verify saved output.

        ``adapter_settings`` holds the optional adapter/quantization keys; it is
        spliced in right after ``sequence_len`` so the resulting dict has the
        same key order as a hand-written config.
        """
        cfg = DictDefault(
            {
                "base_model": "illuin/tiny-random-FalconForCausalLM",
                "flash_attention": True,
                "sample_packing": True,
                "sequence_len": 2048,
                **(adapter_settings or {}),
                "val_set_size": 0.05,
                "special_tokens": {
                    "bos_token": "<|endoftext|>",
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)

        train(cfg=cfg, dataset_meta=load_datasets(cfg=cfg))
        check_model_output_exists(temp_dir, cfg)

    @pytest.mark.skip(reason="no tiny models for testing with safetensors")
    @with_temp_dir
    def test_qlora(self, temp_dir):
        # 4-bit QLoRA, additionally saving the embedding and output head modules.
        self._train_falcon(
            temp_dir,
            adapter_settings={
                "load_in_4bit": True,
                "adapter": "qlora",
                "lora_r": 16,
                "lora_alpha": 32,
                "lora_dropout": 0.1,
                "lora_target_linear": True,
                "lora_modules_to_save": ["word_embeddings", "lm_head"],
            },
        )

    @pytest.mark.skip(reason="no tiny models for testing with safetensors")
    @with_temp_dir
    def test_ft(self, temp_dir):
        # Full fine-tune: no adapter or quantization keys at all.
        self._train_falcon(temp_dir)
class TestFSDPPatchIntegration:
    """Test FSDP patch integration."""

    @pytest.mark.integration
    def test_fsdp2_init_patches(self):
        """Apply both FSDP2 QLoRA patches and confirm each one replaced its target.

        Both ``FSDPParam`` attributes are captured first, both patch functions
        are then called, and only afterwards are the attributes compared with
        the captured values.
        """
        from axolotl.monkeypatch.fsdp2_qlora import (
            apply_init_sharded_param_patch,
            apply_init_unsharded_param_patch,
        )

        # (attribute on FSDPParam, patch function, failure message)
        patch_table = (
            (
                "_init_sharded_param",
                apply_init_sharded_param_patch,
                "_init_sharded_param was not patched",
            ),
            (
                "init_unsharded_param",
                apply_init_unsharded_param_patch,
                "init_unsharded_param was not patched",
            ),
        )

        before = {attr: getattr(FSDPParam, attr) for attr, _, _ in patch_table}

        # NOTE(review): the patched attributes are not restored afterwards, so
        # they remain on FSDPParam for anything else running in this process -
        # confirm that is intended.
        for _, apply_patch, _ in patch_table:
            apply_patch()

        for attr, _, failure_message in patch_table:
            assert getattr(FSDPParam, attr) != before[attr], failure_message
@pytest.mark.skip(reason="FIXME?")
class TestLlamaShiftedSparseAttention(unittest.TestCase):
    """
    Test case for Llama models using S2 Attn
    """

    def _train_s2_attn(self, temp_dir, adapter_settings=None):
        """Train with ``s2_attention`` on LongAlpaca and verify saved output.

        ``adapter_settings`` carries the optional LoRA/8-bit keys and is placed
        directly after ``s2_attention`` so key order matches a literal config.
        """
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 16384,
                "sample_packing": False,
                "flash_attention": True,
                "s2_attention": True,
                **(adapter_settings or {}),
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "Yukang/LongAlpaca-12k",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 10,
                "save_steps": 5,
                "eval_steps": 5,
                "bf16": "auto",
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)

        train(cfg=cfg, dataset_meta=load_datasets(cfg=cfg))
        check_model_output_exists(temp_dir, cfg)

    @with_temp_dir
    def test_lora_s2_attn(self, temp_dir):
        # 8-bit LoRA variant.
        self._train_s2_attn(
            temp_dir,
            adapter_settings={
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 32,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
            },
        )

    @with_temp_dir
    def test_fft_s2_attn(self, temp_dir):
        # Full fine-tune variant: no adapter keys.
        self._train_s2_attn(temp_dir)
class TestMistral(unittest.TestCase):
    """
    Test case for Mistral models with sample packing (LoRA and full fine-tune)
    """

    def _train_packed_mistral(self, temp_dir, adapter_settings=None):
        """Train the tiny Mistral model with sample packing and verify saved output.

        ``adapter_settings`` carries the optional LoRA/8-bit keys and is placed
        directly after ``sequence_len`` so key order matches a literal config.
        """
        cfg = DictDefault(
            {
                "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
                "flash_attention": True,
                "sample_packing": True,
                "sequence_len": 1024,
                **(adapter_settings or {}),
                "val_set_size": 0.05,
                # NOTE(review): these empty strings look like special tokens
                # whose angle-bracket text (e.g. <unk>) was lost somewhere -
                # confirm against the upstream file before relying on them.
                "special_tokens": {
                    "unk_token": "",
                    "bos_token": "",
                    "eos_token": "",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "save_steps": 3,
                "eval_steps": 4,
                "bf16": "auto",
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)

        train(cfg=cfg, dataset_meta=load_datasets(cfg=cfg))
        check_model_output_exists(temp_dir, cfg)

    @require_torch_2_6_0
    @with_temp_dir
    def test_lora_packing(self, temp_dir):
        # 8-bit LoRA variant; only this test is gated by require_torch_2_6_0.
        self._train_packed_mistral(
            temp_dir,
            adapter_settings={
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 32,
                "lora_alpha": 64,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
            },
        )

    @with_temp_dir
    def test_ft_packing(self, temp_dir):
        # Full fine-tune variant: no adapter keys.
        self._train_packed_mistral(temp_dir)
class TestModelPatches(unittest.TestCase):
    """
    TestCases for the multipack monkey patches
    """

    @staticmethod
    def _load_with_multipack(temp_dir, base_model, extra_settings=None):
        """Validate a flash-attention + sample-packing config and load its model.

        ``extra_settings`` is inserted right after ``base_model`` (used for the
        optional ``tokenizer_config``). Nothing is trained; the model is only
        loaded through ``ModelLoader`` with ``inference=False``.
        """
        cfg = DictDefault(
            {
                "base_model": base_model,
                **(extra_settings or {}),
                "flash_attention": True,
                "sample_packing": True,
                "sequence_len": 2048,
                "val_set_size": 0.02,
                "special_tokens": {},
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        tokenizer = load_tokenizer(cfg)
        ModelLoader(cfg, tokenizer, inference=False).load()

    @with_temp_dir
    def test_mixtral_multipack(self, temp_dir):
        # Smoke test only: loading must complete without raising.
        self._load_with_multipack(
            temp_dir,
            "hf-internal-testing/Mixtral-tiny",
            extra_settings={"tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF"},
        )

    @with_temp_dir
    def test_mistral_multipack(self, temp_dir):
        self._load_with_multipack(
            temp_dir, "trl-internal-testing/tiny-MistralForCausalLM-0.2"
        )

        # After loading, transformers' _get_unpad_data must be a replacement
        # whose defining module name contains "torch.jit".
        unpad_module = (
            transformers.modeling_flash_attention_utils._get_unpad_data.__module__
        )
        assert "torch.jit" in unpad_module
class TestPhiMultipack(unittest.TestCase):
    """
    Test case for Phi2 models
    """

    @staticmethod
    def _packed_phi_cfg(adapter_settings, val_set_size, output_dir):
        """
        Assemble the packed phi-1_5 training config.

        `adapter_settings` is spliced in between the model keys and the data
        keys, so the resulting key order matches a hand-written config.
        """
        model_part = {
            "base_model": "microsoft/phi-1_5",
            "model_type": "PhiForCausalLM",
            "tokenizer_type": "AutoTokenizer",
            "sequence_len": 1024,
            "sample_packing": True,
            "flash_attention": True,
            "pad_to_sequence_len": True,
        }
        data_part = {
            "val_set_size": val_set_size,
            "special_tokens": {
                "pad_token": "<|endoftext|>",
            },
            "datasets": [
                {
                    "path": "mhenrichsen/alpaca_2k_test",
                    "type": "alpaca",
                },
            ],
            "dataset_shard_num": 10,
            "dataset_shard_idx": 0,
        }
        trainer_part = {
            "num_epochs": 1,
            "micro_batch_size": 1,
            "gradient_accumulation_steps": 1,
            "output_dir": output_dir,
            "learning_rate": 0.00001,
            "optimizer": "adamw_bnb_8bit",
            "lr_scheduler": "cosine",
            "max_steps": 5,
            "eval_steps": 3,
            "save_steps": 4,
            "bf16": "auto",
            "save_first_step": False,
        }
        return DictDefault(
            {**model_part, **adapter_settings, **data_part, **trainer_part}
        )

    @staticmethod
    def _train_and_check(cfg, output_dir):
        """Validate the config, train on the loaded datasets, check the output."""
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(output_dir, cfg)

    @with_temp_dir
    def test_ft_packed(self, temp_dir):
        # full fine-tune: no quantization, no adapter
        no_adapter = {
            "load_in_8bit": False,
            "adapter": None,
        }
        cfg = self._packed_phi_cfg(no_adapter, 0.05, temp_dir)
        self._train_and_check(cfg, temp_dir)

    @with_temp_dir
    def test_qlora_packed(self, temp_dir):
        qlora = {
            "load_in_4bit": True,
            "adapter": "qlora",
            "lora_r": 64,
            "lora_alpha": 32,
            "lora_dropout": 0.05,
            "lora_target_linear": True,
        }
        cfg = self._packed_phi_cfg(qlora, 0.02, temp_dir)
        self._train_and_check(cfg, temp_dir)
class TestResumeLlama:
    """
    Test case for resuming training of llama models
    """

    @require_torch_2_6_0
    def test_resume_lora_packed(self, temp_dir):
        """
        Train a packed LoRA run, resume it from `checkpoint-9`, and check that
        `total_num_tokens` is preserved through the resume and that the newest
        tensorboard run starts logging at step 10.
        """
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 1024,
                "sample_packing": True,
                "flash_attention": True,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.001,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "save_steps": 3,
                "save_total_limit": 5,
                "max_steps": 15,
                "use_tensorboard": True,
                "save_first_step": False,
                "include_tkps": True,
            }
        )
        if is_torch_bf16_gpu_available():
            cfg.bf16 = True
        else:
            cfg.fp16 = True

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        initial_total_num_tokens = cfg.total_num_tokens
        assert initial_total_num_tokens is not None, (
            "total_num_tokens should be calculated during load_datasets"
        )

        train(cfg=cfg, dataset_meta=dataset_meta)

        # save_steps=3, so checkpoint-9 is one of the checkpoints written
        # before max_steps=15 is reached.
        checkpoint_path = f"{temp_dir}/checkpoint-9"
        tokens_state_path = os.path.join(checkpoint_path, TOKENS_STATE_FILE)
        assert os.path.isfile(tokens_state_path), (
            f"{TOKENS_STATE_FILE} should exist in checkpoint at {tokens_state_path}"
        )

        resume_cfg = cfg | DictDefault(
            {
                "resume_from_checkpoint": f"{checkpoint_path}/",
            }
        )
        normalize_config(resume_cfg)

        assert resume_cfg.total_num_tokens == initial_total_num_tokens, (
            f"total_num_tokens should be preserved on resume. "
            f"Expected {initial_total_num_tokens}, got {resume_cfg.total_num_tokens}"
        )

        resume_dataset_meta = load_datasets(cfg=resume_cfg)

        assert resume_cfg.total_num_tokens == initial_total_num_tokens, (
            f"total_num_tokens should not be recalculated when resuming. "
            f"Expected {initial_total_num_tokens}, got {resume_cfg.total_num_tokens}"
        )

        train(cfg=resume_cfg, dataset_meta=resume_dataset_meta)

        assert resume_cfg.total_num_tokens == initial_total_num_tokens, (
            f"total_num_tokens should remain unchanged after resume training. "
            f"Expected {initial_total_num_tokens}, got {resume_cfg.total_num_tokens}"
        )

        check_model_output_exists(temp_dir, cfg)

        tb_log_path_1 = most_recent_subdir(temp_dir + "/runs")
        # Pass argv as a list instead of a shell string: the log path is
        # handed to tensorboard verbatim even if it contains spaces or shell
        # metacharacters, and no shell is spawned.
        res = subprocess.run(
            ["tensorboard", "--inspect", "--logdir", str(tb_log_path_1)],
            text=True,
            capture_output=True,
            check=True,
        )
        first_step_match = re.search(r"first_step\s+(\d+)", res.stdout)
        # Fail with the captured output rather than a bare IndexError when
        # the inspect report has no first_step line.
        assert first_step_match is not None, (
            f"first_step not found in tensorboard --inspect output:\n{res.stdout}"
        )
        # The run resumed from checkpoint-9, so the newest run starts at 10.
        first_steps = int(first_step_match.group(1))
        assert first_steps == 10
@pytest.mark.skip(
    reason="Unsloth integration will be broken going into latest transformers"
)
class TestUnslothQLoRA:
    """
    Test class for Unsloth QLoRA Llama models
    """

    @staticmethod
    def _unsloth_qlora_cfg(output_dir, **overrides):
        """
        Return the QLoRA + Unsloth config shared by every test in this class.

        Args:
            output_dir: directory the trainer writes to.
            **overrides: per-test keys (attention backend, packing,
                precision); they are applied last and replace shared keys
                of the same name.
        """
        return DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 1024,
                "unsloth_lora_mlp": True,
                "unsloth_lora_qkv": True,
                "unsloth_lora_o": True,
                "load_in_4bit": True,
                "adapter": "qlora",
                "lora_r": 16,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.05,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 5,
                "save_steps": 10,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
                "output_dir": output_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "use_tensorboard": True,
                "save_first_step": False,
                **overrides,
            }
        )

    @staticmethod
    def _train_and_check(cfg, output_dir):
        """
        Validate and normalize `cfg`, train, then verify that model output
        exists and that the logged train loss is checked against 2.0.
        """
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(output_dir, cfg)

        check_tensorboard(
            output_dir + "/runs",
            "train/train_loss",
            2.0,
            "Train Loss (%s) is too high",
        )

    @pytest.mark.parametrize(
        "sample_packing",
        [True, False],
    )
    def test_unsloth_llama_qlora_fa2(self, temp_dir, sample_packing):
        """Flash attention, with and without sample packing, bf16 auto."""
        cfg = self._unsloth_qlora_cfg(
            temp_dir,
            sample_packing=sample_packing,
            flash_attention=True,
            bf16="auto",
        )
        self._train_and_check(cfg, temp_dir)

    def test_unsloth_llama_qlora_unpacked(self, temp_dir):
        """No attention backend selected, no sample packing, bf16 auto."""
        cfg = self._unsloth_qlora_cfg(
            temp_dir,
            sample_packing=False,
            bf16="auto",
        )
        self._train_and_check(cfg, temp_dir)

    @pytest.mark.parametrize(
        "sdp_attention",
        [True, False],
    )
    def test_unsloth_llama_qlora_unpacked_no_fa2_fp16(self, temp_dir, sdp_attention):
        """No flash attention; SDP attention toggled; fp16 precision."""
        cfg = self._unsloth_qlora_cfg(
            temp_dir,
            sample_packing=False,
            sdp_attention=sdp_attention,
            fp16=True,
        )
        self._train_and_check(cfg, temp_dir)
class TestPackedFlex(unittest.TestCase):
    """
    Test case for Packed training of llama models
    """

    @require_torch_2_6_0
    @with_temp_dir
    def test_loss_llama(self, temp_dir):
        """Train 5 packed steps with flex attention and check the logged loss."""
        model_part = {
            "base_model": "HuggingFaceTB/SmolLM2-135M",
            "sequence_len": 1024,
            "sample_packing": True,
            "flex_attention": True,
        }
        data_part = {
            "val_set_size": 0.0,
            "special_tokens": {
                "pad_token": "<|endoftext|>",
            },
            "datasets": [
                {
                    "path": "tatsu-lab/alpaca",
                    "type": "alpaca",
                    "split": "train[:10%]",
                },
            ],
        }
        trainer_part = {
            "num_epochs": 1,
            "micro_batch_size": 2,
            "gradient_accumulation_steps": 2,
            "output_dir": temp_dir,
            "learning_rate": 0.00001,
            "optimizer": "adamw_torch_fused",
            "lr_scheduler": "cosine",
            "max_steps": 5,
            "use_tensorboard": True,
            "save_first_step": False,
        }
        cfg = DictDefault({**model_part, **data_part, **trainer_part})

        # bf16 when the GPU supports it, otherwise fall back to fp16
        precision_flag = "bf16" if is_torch_bf16_gpu_available() else "fp16"
        setattr(cfg, precision_flag, True)

        cfg = validate_config(cfg)
        normalize_config(cfg)

        train(cfg=cfg, dataset_meta=load_datasets(cfg=cfg))

        tensorboard_dir = temp_dir + "/runs"
        check_tensorboard(
            tensorboard_dir,
            "train/train_loss",
            2.1,
            "Train Loss (%s) is too high",
        )
class TestReLoraLlama(unittest.TestCase):
    """
    Test case for Llama models using ReLoRA
    """

    @with_temp_dir
    def test_relora(self, temp_dir):
        """Train past two restart intervals and check the step-100 artifacts."""
        model_part = {
            "base_model": "HuggingFaceTB/SmolLM2-135M",
            "sequence_len": 2048,
            "sample_packing": True,
            "pad_to_sequence_len": True,
            "flash_attention": True,
            "load_in_8bit": True,
        }
        adapter_part = {
            "adapter": "lora",
            "lora_r": 8,
            "lora_alpha": 16,
            "lora_dropout": 0.05,
            "lora_target_modules": ["q_proj", "v_proj"],
            "relora": True,
            "jagged_restart_steps": 50,
            "jagged_restart_warmup_steps": 10,
            "jagged_restart_anneal_steps": 10,
            "relora_prune_ratio": 0.9,
            "relora_cpu_offload": True,
        }
        data_part = {
            "val_set_size": 0.0,
            "special_tokens": {
                "pad_token": "<|endoftext|>",
            },
            "chat_template": "chatml",
            "datasets": [
                {
                    "path": "mlabonne/FineTome-100k",
                    "type": "chat_template",
                    "split": "train[:10%]",
                    "field_messages": "conversations",
                    "message_field_role": "from",
                    "message_field_content": "value",
                },
            ],
        }
        trainer_part = {
            "warmup_steps": 10,
            "num_epochs": 2,
            # just over 2x jagged_restart_steps (50), so step 100 is reached
            "max_steps": 105,
            "micro_batch_size": 2,
            "gradient_accumulation_steps": 1,
            "output_dir": temp_dir,
            "learning_rate": 0.00001,
            "optimizer": "adamw_8bit",
            "lr_scheduler": "cosine",
            "use_tensorboard": True,
            "save_first_step": False,
        }
        cfg = DictDefault(
            {**model_part, **adapter_part, **data_part, **trainer_part}
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)

        train(cfg=cfg, dataset_meta=load_datasets(cfg=cfg))

        step_100_dir = Path(temp_dir) / "checkpoint-100"
        check_model_output_exists(step_100_dir / "adapter", cfg)
        relora_weights = step_100_dir / "relora" / "model.safetensors"
        assert relora_weights.exists(), "Relora model checkpoint not found"

        check_tensorboard(
            temp_dir + "/runs",
            "train/grad_norm",
            0.2,
            "grad_norm is too high",
        )
class TestActivationOffloading:
    """
    E2E test cases for activation offloading
    """

    @pytest.mark.parametrize(
        "adapter",
        ["lora", "qlora", None],
    )
    def test_activation_offloading(
        self,
        temp_dir,
        adapter,
    ):
        """Train 2 steps with activation offloading for each adapter mode."""
        chat_dataset = {
            "chat_template": "chatml",
            "path": "mlabonne/FineTome-100k",
            "type": "chat_template",
            "split": "train[:10%]",
            "field_messages": "conversations",
            "message_field_role": "from",
            "message_field_content": "value",
        }
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 1024,
                "val_set_size": 0.0,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                    "eos_token": "<|im_end|>",
                },
                "datasets": [chat_dataset],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "sample_packing": True,
                "bf16": "auto",
                "gradient_checkpointing": True,
                "activation_offloading": True,
                "save_first_step": False,
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_target_linear": True,
            }
        )

        # Keys added per adapter mode; `None` (no adapter) adds nothing.
        adapter_keys = {
            "lora": {"adapter": "lora"},
            "qlora": {"adapter": "qlora", "load_in_4bit": True},
        }
        for key, value in adapter_keys.get(adapter, {}).items():
            cfg[key] = value

        cfg = validate_config(cfg)
        normalize_config(cfg)

        train(cfg=cfg, dataset_meta=load_datasets(cfg=cfg))
        check_model_output_exists(temp_dir, cfg)
class TestDeepseekV3:
    """
    Test case for DeepseekV3 models
    """

    @staticmethod
    def _model_settings(sample_packing):
        """Model keys shared by the LoRA and full fine-tune tests."""
        return {
            "base_model": "axolotl-ai-co/DeepSeek-V3-11M",
            "trust_remote_code": True,
            "sample_packing": sample_packing,
            "flash_attention": True,
            "sequence_len": 2048,
        }

    @staticmethod
    def _trainer_settings(output_dir):
        """Trainer keys shared by the LoRA and full fine-tune tests."""
        return {
            "num_epochs": 1,
            "micro_batch_size": 1,
            "gradient_accumulation_steps": 2,
            "output_dir": output_dir,
            "learning_rate": 0.00001,
            "optimizer": "adamw_bnb_8bit",
            "lr_scheduler": "cosine",
            "max_steps": 5,
            "bf16": True,
            "save_first_step": False,
        }

    @staticmethod
    def _run_training(cfg):
        """Validate and normalize `cfg`, then train on its datasets."""
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)

    @enable_hf_offline
    @pytest.mark.parametrize(
        "sample_packing",
        [True, False],
    )
    def test_lora_deepseekv3(self, temp_dir, sample_packing):
        cfg = DictDefault(
            {
                **self._model_settings(sample_packing),
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0,
                "datasets": [
                    {
                        "path": "mlabonne/FineTome-100k",
                        "type": "chat_template",
                        "field_messages": "conversations",
                        "message_property_mappings": {
                            "role": "from",
                            "content": "value",
                        },
                        "drop_system_message": True,
                        "split": "train[:1%]",
                    },
                ],
                "special_tokens": {
                    "bos_token": "<|begin▁of▁sentence|>",
                    "eos_token": "<|end▁of▁sentence|>",
                },
                "chat_template": "deepseek_v3",
                **self._trainer_settings(temp_dir),
            }
        )
        self._run_training(cfg)

        adapter_weights = Path(temp_dir) / "adapter_model.safetensors"
        assert adapter_weights.exists()

    @enable_hf_offline
    @pytest.mark.parametrize(
        "sample_packing",
        [True, False],
    )
    def test_fft_deepseekv3(self, temp_dir, sample_packing):
        cfg = DictDefault(
            {
                **self._model_settings(sample_packing),
                "val_set_size": 0,
                "datasets": [
                    {
                        "path": "mlabonne/FineTome-100k",
                        "type": "chat_template",
                        "field_messages": "conversations",
                        "message_field_role": "from",
                        "message_field_content": "value",
                        "split": "train[:1%]",
                    },
                ],
                "chat_template": "deepseek_v3",
                "special_tokens": {
                    "bos_token": "<|begin▁of▁sentence|>",
                    "eos_token": "<|end▁of▁sentence|>",
                },
                **self._trainer_settings(temp_dir),
            }
        )
        self._run_training(cfg)

        full_weights = Path(temp_dir) / "model.safetensors"
        assert full_weights.exists()
""" cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "tokenizer_type": "AutoTokenizer", "trust_remote_code": True, "sequence_len": 256, "val_set_size": 0.1, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", }, ], "num_epochs": 1, "max_steps": 3, "micro_batch_size": 1, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.0001, "optimizer": "adamw_torch", "lr_scheduler": "cosine", "bf16": True, "save_first_step": False, "logging_steps": 1, "eval_steps": 3, # Diffusion-specific config "plugins": ["axolotl.integrations.diffusion.DiffusionPlugin"], "diffusion": { # sample generation "generate_samples": True, "generation_interval": 1, "num_generation_samples": 1, "generation_steps": 2, "generation_max_length": 32, "generation_temperature": 0.0, # training-specific "mask_token_id": 16, "eps": 1e-3, "importance_weighting": False, }, } ) cfg = validate_config(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) def test_diffusion_sft_labels(self, temp_dir): """Test that diffusion training properly handles SFT data with labels.""" cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "tokenizer_type": "AutoTokenizer", "trust_remote_code": True, "sequence_len": 256, "val_set_size": 0.1, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", }, ], "num_epochs": 1, "max_steps": 3, "micro_batch_size": 1, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.0001, "optimizer": "adamw_torch", "lr_scheduler": "cosine", "bf16": True, "save_first_step": False, "logging_steps": 1, "eval_steps": 2, # Diffusion-specific config "plugins": ["axolotl.integrations.diffusion.DiffusionPlugin"], "diffusion": { # sample generation "generate_samples": True, "generation_interval": 1, 
"num_generation_samples": 1, "generation_steps": 2, "generation_max_length": 32, "generation_temperature": 0.0, # training-specific "mask_token_id": 16, "eps": 1e-3, "importance_weighting": True, }, # Ensure we have proper SFT labels "train_on_inputs": False, } ) cfg = validate_config(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) # Verify that the dataset has labels sample = dataset_meta.train_dataset[0] assert "labels" in sample, "SFT dataset should have labels" # Check that some labels are -100 (prompt tokens) labels = sample["labels"] if hasattr(labels, "tolist"): labels = labels.tolist() assert -100 in labels, "SFT dataset should have -100 labels for prompt tokens" train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) ================================================ FILE: tests/e2e/test_dpo.py ================================================ """E2E tests for lora llama""" import unittest from pathlib import Path import pytest from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_preference_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault from .utils import check_model_output_exists, with_temp_dir class TestDPOLlamaLora(unittest.TestCase): """ Test case for DPO Llama models using LoRA """ @with_temp_dir def test_dpo_lora(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "tokenizer_type": "AutoTokenizer", "sequence_len": 1024, "load_in_8bit": True, "adapter": "lora", "lora_r": 64, "lora_alpha": 32, "lora_dropout": 0.1, "lora_target_linear": True, "special_tokens": { "pad_token": "<|endoftext|>", }, "rl": "dpo", "datasets": [ { "path": "arcee-ai/distilabel-intel-orca-dpo-pairs-binarized", "type": "chatml.ultra", "split": "train", }, ], "num_epochs": 1, "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 
0.00001, "optimizer": "paged_adamw_8bit", "lr_scheduler": "cosine", "max_steps": 20, "save_steps": 10, "warmup_steps": 5, "gradient_checkpointing": True, "gradient_checkpointing_kwargs": {"use_reentrant": True}, "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) cli_args = TrainerCliArgs() dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(Path(temp_dir) / "checkpoint-20", cfg) @with_temp_dir def test_dpo_nll_lora(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "tokenizer_type": "AutoTokenizer", "sequence_len": 1024, "load_in_8bit": True, "adapter": "lora", "lora_r": 64, "lora_alpha": 32, "lora_dropout": 0.1, "lora_target_linear": True, "special_tokens": { "pad_token": "<|endoftext|>", }, "rl": "dpo", "rpo_alpha": 0.5, "datasets": [ { "path": "arcee-ai/distilabel-intel-orca-dpo-pairs-binarized", "type": "chatml.ultra", "split": "train", }, ], "num_epochs": 1, "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "paged_adamw_8bit", "lr_scheduler": "cosine", "max_steps": 20, "save_steps": 10, "warmup_steps": 5, "gradient_checkpointing": True, "gradient_checkpointing_kwargs": {"use_reentrant": True}, "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) cli_args = TrainerCliArgs() dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(Path(temp_dir) / "checkpoint-20", cfg) @with_temp_dir def test_dpo_use_weighting(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "tokenizer_type": "AutoTokenizer", "sequence_len": 1024, "load_in_8bit": True, "adapter": "lora", "lora_r": 64, "lora_alpha": 32, "lora_dropout": 0.1, "lora_target_linear": True, "special_tokens": { "pad_token": "<|endoftext|>", }, "rl": "dpo", 
"dpo_use_weighting": True, "datasets": [ { "path": "arcee-ai/distilabel-intel-orca-dpo-pairs-binarized", "type": "chatml.ultra", "split": "train", }, ], "num_epochs": 1, "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "paged_adamw_8bit", "lr_scheduler": "cosine", "max_steps": 20, "save_steps": 10, "warmup_steps": 5, "gradient_checkpointing": True, "gradient_checkpointing_kwargs": {"use_reentrant": True}, "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) cli_args = TrainerCliArgs() dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(Path(temp_dir) / "checkpoint-20", cfg) @pytest.mark.skip("kto_pair no longer supported in trl") @with_temp_dir def test_kto_pair_lora(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "tokenizer_type": "AutoTokenizer", "sequence_len": 1024, "load_in_8bit": True, "adapter": "lora", "lora_r": 64, "lora_alpha": 32, "lora_dropout": 0.1, "lora_target_linear": True, "special_tokens": { "pad_token": "<|endoftext|>", }, "rl": "kto_pair", "datasets": [ { "path": "arcee-ai/distilabel-intel-orca-dpo-pairs-binarized", "type": "chatml.ultra", "split": "train", }, ], "num_epochs": 1, "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "paged_adamw_8bit", "lr_scheduler": "cosine", "max_steps": 20, "save_steps": 10, "warmup_steps": 5, "gradient_checkpointing": True, "gradient_checkpointing_kwargs": {"use_reentrant": True}, "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) cli_args = TrainerCliArgs() dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(Path(temp_dir) / "checkpoint-20", cfg) @with_temp_dir def test_ipo_lora(self, temp_dir): cfg = DictDefault( { 
"base_model": "HuggingFaceTB/SmolLM2-135M", "tokenizer_type": "AutoTokenizer", "sequence_len": 1024, "load_in_8bit": True, "adapter": "lora", "lora_r": 64, "lora_alpha": 32, "lora_dropout": 0.1, "lora_target_linear": True, "special_tokens": { "pad_token": "<|endoftext|>", }, "rl": "ipo", "datasets": [ { "path": "arcee-ai/distilabel-intel-orca-dpo-pairs-binarized", "type": "chatml.ultra", "split": "train", }, ], "num_epochs": 1, "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "paged_adamw_8bit", "lr_scheduler": "cosine", "max_steps": 20, "save_steps": 10, "warmup_steps": 5, "gradient_checkpointing": True, "gradient_checkpointing_kwargs": {"use_reentrant": True}, "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) cli_args = TrainerCliArgs() dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(Path(temp_dir) / "checkpoint-20", cfg) @with_temp_dir def test_orpo_lora(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "tokenizer_type": "AutoTokenizer", "sequence_len": 1024, "load_in_8bit": True, "adapter": "lora", "lora_r": 64, "lora_alpha": 32, "lora_dropout": 0.1, "lora_target_linear": True, "special_tokens": { "pad_token": "<|endoftext|>", }, "rl": "orpo", "orpo_alpha": 0.1, "remove_unused_columns": False, "chat_template": "chatml", "datasets": [ { "path": "argilla/distilabel-capybara-dpo-7k-binarized", "type": "chat_template.argilla", "split": "train", }, ], "num_epochs": 1, "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "paged_adamw_8bit", "lr_scheduler": "cosine", "max_steps": 20, "save_steps": 10, "warmup_steps": 5, "gradient_checkpointing": True, "gradient_checkpointing_kwargs": {"use_reentrant": True}, "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) 
cli_args = TrainerCliArgs() dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(Path(temp_dir) / "checkpoint-20", cfg) @pytest.mark.skip(reason="Fix the implementation") @with_temp_dir def test_kto_lora(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "tokenizer_type": "LlamaTokenizer", "sequence_len": 1024, "load_in_8bit": True, "adapter": "lora", "lora_r": 64, "lora_alpha": 32, "lora_dropout": 0.1, "lora_target_linear": True, "special_tokens": { "pad_token": "<|endoftext|>", }, "rl": "kto", "rl_beta": 0.5, "kto_desirable_weight": 1.0, "kto_undesirable_weight": 1.0, "remove_unused_columns": False, "datasets": [ # { # "path": "argilla/kto-mix-15k", # "type": "chatml.argilla_chat", # "split": "train", # }, { "path": "argilla/ultrafeedback-binarized-preferences-cleaned-kto", "type": "chatml.ultra", "split": "train", }, # { # "path": "argilla/kto-mix-15k", # "type": "llama3.argilla_chat", # "split": "train", # }, { "path": "argilla/ultrafeedback-binarized-preferences-cleaned-kto", "type": "llama3.ultra", "split": "train", }, ], "num_epochs": 1, "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "paged_adamw_8bit", "lr_scheduler": "cosine", "max_steps": 20, "save_steps": 10, "warmup_steps": 5, "gradient_checkpointing": True, "gradient_checkpointing_kwargs": {"use_reentrant": True}, "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) cli_args = TrainerCliArgs() dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(Path(temp_dir) / "checkpoint-20", cfg) ================================================ FILE: tests/e2e/test_embeddings_lr.py ================================================ """ E2E tests for llama pretrain """ import unittest from axolotl.common.datasets import 
load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault from .utils import check_model_output_exists, check_tensorboard, with_temp_dir class TestEmbeddingsLrScale(unittest.TestCase): """ Test case for embedding_lr* """ @with_temp_dir def test_train_w_embedding_lr_scale(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "flash_attention": True, "sequence_len": 1024, "sample_packing": True, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", }, ], "max_steps": 5, "num_epochs": 1, "micro_batch_size": 1, "gradient_accumulation_steps": 1, "val_set_size": 0.0, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "embedding_lr_scale": 0.5, "lr_scheduler": "cosine", "bf16": "auto", "use_tensorboard": True, "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) check_tensorboard( temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high" ) @with_temp_dir def test_train_w_embedding_lr(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "flash_attention": True, "sequence_len": 1024, "sample_packing": True, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", }, ], "max_steps": 5, "num_epochs": 1, "micro_batch_size": 1, "gradient_accumulation_steps": 1, "val_set_size": 0.0, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "embedding_lr": 0.000005, "lr_scheduler": "cosine", "bf16": "auto", "use_tensorboard": True, "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, 
class TestE2eEvaluate:
    """Smoke test for the ``axolotl.cli.evaluate`` command-line entry point."""

    def test_evaluate(self, temp_dir):
        """
        Write a small config to ``config.yaml`` inside ``temp_dir`` and run
        ``axolotl.cli.evaluate`` on it through ``accelerate launch`` with two
        processes.

        There is no explicit assertion; the test relies on
        ``execute_subprocess_async`` to surface a failing subprocess
        (presumably by raising -- confirm against accelerate's test utils).
        """
        work_dir = Path(temp_dir)
        config_file = work_dir / "config.yaml"

        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 1024,
                "val_set_size": 0.02,
                "special_tokens": {"pad_token": "<|endoftext|>"},
                "datasets": [
                    {"path": "mhenrichsen/alpaca_2k_test", "type": "alpaca"},
                ],
                "num_epochs": 1,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_first_step": False,
            }
        )

        # The CLI reads its configuration from disk, so serialise it first.
        work_dir.mkdir(parents=True, exist_ok=True)
        serialized = yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)
        config_file.write_text(serialized, encoding="utf-8")

        launch_cmd = [
            "accelerate",
            "launch",
            "--num-processes",
            "2",
            # a fresh port keeps concurrently running launches from clashing
            "--main_process_port",
            str(get_torch_dist_unique_port()),
            "-m",
            "axolotl.cli.evaluate",
            str(config_file),
        ]
        execute_subprocess_async(launch_cmd)
for falcon """ @pytest.mark.skip(reason="no tiny models for testing with safetensors") @with_temp_dir def test_lora(self, temp_dir): cfg = DictDefault( { "base_model": "illuin/tiny-random-FalconForCausalLM", "flash_attention": True, "sequence_len": 1024, "load_in_8bit": True, "adapter": "lora", "lora_r": 32, "lora_alpha": 64, "lora_dropout": 0.05, "lora_target_linear": True, "lora_modules_to_save": [ "word_embeddings", "lm_head", ], "val_set_size": 0.02, "special_tokens": { "bos_token": "<|endoftext|>", "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", }, ], "num_epochs": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "max_steps": 20, "save_steps": 10, "eval_steps": 10, "bf16": "auto", "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @pytest.mark.skip(reason="no tiny models for testing with safetensors") @with_temp_dir def test_lora_added_vocab(self, temp_dir): cfg = DictDefault( { "base_model": "illuin/tiny-random-FalconForCausalLM", "flash_attention": True, "sequence_len": 1024, "load_in_8bit": True, "adapter": "lora", "lora_r": 32, "lora_alpha": 64, "lora_dropout": 0.05, "lora_target_linear": True, "lora_modules_to_save": [ "word_embeddings", "lm_head", ], "val_set_size": 0.02, "special_tokens": { "bos_token": "<|endoftext|>", "pad_token": "<|endoftext|>", }, "tokens": [ "<|im_start|>", "<|im_end|>", ], "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", }, ], "num_epochs": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "max_steps": 20, "save_steps": 10, "eval_steps": 10, "bf16": "auto", 
class TestGemma2:
    """
    Test case for Gemma2 models: LoRA and full fine-tuning, each with and
    without sample packing.
    """

    @staticmethod
    def _make_cfg(temp_dir, sample_packing, extra=None):
        """
        Build the config shared by both Gemma2 tests.

        ``extra`` is an optional dict of additional settings (the LoRA test
        passes its adapter options through it).
        """
        finetome_slice = {
            "path": "mlabonne/FineTome-100k",
            "type": "chat_template",
            "field_messages": "conversations",
            "message_property_mappings": {
                "role": "from",
                "content": "value",
            },
            "drop_system_message": True,
            "split": "train[:1%]",
        }
        options = {
            "base_model": "axolotl-ai-co/gemma-2-33M",
            "trust_remote_code": True,
            "sample_packing": sample_packing,
            "flash_attention": True,
            "sequence_len": 2048,
            "val_set_size": 0,
            "datasets": [finetome_slice],
            # NOTE(review): bos/eos are empty strings here -- confirm they
            # are not meant to be the tokenizer's real marker tokens.
            "special_tokens": {
                "bos_token": "",
                "eos_token": "",
            },
            "chat_template": "gemma",  # gemma2's template is same as gemma
            "num_epochs": 1,
            "micro_batch_size": 1,
            "gradient_accumulation_steps": 2,
            "output_dir": temp_dir,
            "learning_rate": 0.00001,
            "optimizer": "adamw_bnb_8bit",
            "lr_scheduler": "cosine",
            "max_steps": 5,
            "bf16": True,
        }
        if extra:
            options.update(extra)
        return DictDefault(options)

    @staticmethod
    def _fit(cfg):
        """Validate and normalize ``cfg``, load its datasets and train."""
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)
        train(cfg=cfg, dataset_meta=dataset_meta)

    @pytest.mark.parametrize(
        "sample_packing",
        [True, False],
    )
    def test_lora_gemma2(self, temp_dir, sample_packing):
        """LoRA run; expects ``adapter_model.safetensors`` in the output dir."""
        lora_options = {
            "adapter": "lora",
            "lora_r": 8,
            "lora_alpha": 16,
            "lora_dropout": 0.05,
            "lora_target_linear": True,
        }
        self._fit(self._make_cfg(temp_dir, sample_packing, lora_options))

        adapter_file = Path(temp_dir) / "adapter_model.safetensors"
        assert adapter_file.exists()

    @pytest.mark.parametrize(
        "sample_packing",
        [True, False],
    )
    def test_fft_gemma2(self, temp_dir, sample_packing):
        """Full fine-tune; expects ``model.safetensors`` in the output dir."""
        self._fit(self._make_cfg(temp_dir, sample_packing))

        weights_file = Path(temp_dir) / "model.safetensors"
        assert weights_file.exists()
load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault class TestGemma3Text: """ Test case for Gemma3Text models """ @pytest.mark.parametrize( "sample_packing", [True, False], ) def test_lora_gemma3_text(self, temp_dir, sample_packing): cfg = DictDefault( { "base_model": "axolotl-ai-co/gemma-3-34M", "trust_remote_code": True, "sample_packing": sample_packing, "flash_attention": True, "sequence_len": 2048, "adapter": "lora", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "val_set_size": 0, "datasets": [ { "path": "mlabonne/FineTome-100k", "type": "chat_template", "field_messages": "conversations", "message_property_mappings": { "role": "from", "content": "value", }, "split": "train[:1%]", }, ], "special_tokens": { "bos_token": "", "eos_token": "", }, "chat_template": "gemma3", "num_epochs": 1, "micro_batch_size": 1, "gradient_accumulation_steps": 2, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", "max_steps": 5, "bf16": True, "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) assert (Path(temp_dir) / "adapter_model.safetensors").exists() @pytest.mark.parametrize( "sample_packing", [True, False], ) def test_fft_gemma3_text(self, temp_dir, sample_packing): cfg = DictDefault( { "base_model": "axolotl-ai-co/gemma-3-34M", "trust_remote_code": True, "sample_packing": sample_packing, "flash_attention": True, "sequence_len": 2048, "val_set_size": 0, "datasets": [ { "path": "mlabonne/FineTome-100k", "type": "chat_template", "field_messages": "conversations", "message_property_mappings": { "role": "from", "content": "value", }, "split": "train[:1%]", }, ], "chat_template": "gemma3", "special_tokens": { "bos_token": "", "eos_token": "", }, "num_epochs": 1, "micro_batch_size": 1, 
class TestImports(unittest.TestCase):
    """
    Import smoke tests for submodules that have historically broken due to
    dependency issues.
    """

    def test_import_causal_trainer(self):
        """Placeholder for the causal-trainer import check.

        NOTE(review): the body executes no import statement, so this test
        cannot fail; confirm whether the import it was meant to exercise
        should be restored.
        """

    def test_import_rl_trainer(self):
        """Placeholder for the RL-trainer import check.

        NOTE(review): the body executes no import statement, so this test
        cannot fail; confirm whether the import it was meant to exercise
        should be restored.
        """
normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) def test_fix_untrained_tokens(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "fix_untrained_tokens": True, "sequence_len": 512, "val_set_size": 0.0, "special_tokens": { "pad_token": "<|endoftext|>", "bos_token": "<|custom_im_start|>", "eos_token": "<|custom_im_end|>", }, "datasets": [ { "chat_template": "jinja", "chat_template_jinja": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|custom_im_start|>' + message['role'] + '\n' + message['content'] + '<|custom_im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|custom_im_start|>assistant\n' }}{% endif %}", "path": "mlabonne/FineTome-100k", "type": "chat_template", "split": "train[:10%]", "field_messages": "conversations", "message_field_role": "from", "message_field_content": "value", }, ], "num_epochs": 1, "max_steps": 5, "micro_batch_size": 1, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", "flash_attention": True, "sample_packing": True, "bf16": True, "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) def test_fix_untrained_tokens_already_trained(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "fix_untrained_tokens": True, "sequence_len": 512, "val_set_size": 0.0, "special_tokens": { "pad_token": "<|endoftext|>", }, "chat_template": "chatml", "datasets": [ { "path": "mlabonne/FineTome-100k", "type": "chat_template", "split": "train[:10%]", "field_messages": "conversations", "message_field_role": "from", "message_field_content": "value", }, ], "num_epochs": 1, "max_steps": 5, 
class TestPretrainLlama:
    """Pretraining runs on a small Llama-style model (SmolLM2-135M)."""

    @pytest.mark.parametrize(
        ("sample_packing", "pretrain_multipack_attn"),
        [
            (False, False),
            (True, True),
            (True, False),
        ],
    )
    def test_pretrain(self, temp_dir, sample_packing, pretrain_multipack_attn):
        """
        Train for 5 steps on the ``allenai/c4`` "en" pretraining dataset and
        check both the saved model output and the logged train loss.

        The accepted loss is 3.6, relaxed to 6.5 when sample packing is used
        without ``pretrain_multipack_attn``.
        """
        c4_english = {
            "path": "allenai/c4",
            "name": "en",
            "type": "pretrain",
        }
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "flash_attention": True,
                "sequence_len": 1024,
                "sample_packing": sample_packing,
                "pretrain_multipack_attn": pretrain_multipack_attn,
                "dataset_num_proc": 1,
                "special_tokens": {"pad_token": "<|endoftext|>"},
                "pretraining_dataset": [c4_english],
                "max_steps": 5,
                "num_epochs": 1,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "val_set_size": 0.0,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "bf16": "auto",
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)

        train(cfg=cfg, dataset_meta=load_datasets(cfg=cfg))
        check_model_output_exists(temp_dir, cfg)

        packed_without_multipack_attn = (
            sample_packing and not pretrain_multipack_attn
        )
        max_train_loss = 6.5 if packed_without_multipack_attn else 3.6
        check_tensorboard(
            temp_dir + "/runs",
            "train/train_loss",
            max_train_loss,
            "Train Loss (%s) is too high",
        )
r"model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj", "val_set_size": 0, "chat_template": "llama3_2_vision", "datasets": [ { "path": "LDJnr/Puffin", "type": "chat_template", "field_messages": "conversations", "message_field_role": "from", "message_field_content": "value", }, ], "num_epochs": 1, "micro_batch_size": 1, "gradient_accumulation_steps": 2, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", "max_steps": 5, "bf16": True, "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @with_temp_dir def test_lora_llama_vision_multimodal_dataset(self, temp_dir): cfg = DictDefault( { "base_model": "axolotl-ai-co/Llama-3.2-39M-Vision", "processor_type": "AutoProcessor", "skip_prepare_dataset": True, "remove_unused_columns": False, "sample_packing": False, "sequence_len": 1024, "adapter": "lora", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_modules": r"model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj", "val_set_size": 0, "chat_template": "llama3_2_vision", "datasets": [ { "path": "axolotl-ai-co/llava-instruct-mix-vsft-small", "type": "chat_template", "split": "train", "field_messages": "messages", }, ], "num_epochs": 1, "micro_batch_size": 1, "gradient_accumulation_steps": 2, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", "max_steps": 5, "bf16": True, "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) ================================================ FILE: tests/e2e/test_load_model.py ================================================ """Module for testing ModelLoader.""" 
class TestLoadModelUtils:
    """
    Tests for ``ModelLoader`` helper methods.
    """

    def setup_method(self):
        """Create a LoRA config and a ``ModelLoader`` for every test."""
        # load config
        self.cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "tokenizer_config": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 1024,
                "load_in_8bit": False,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "tensor_parallel_size": 1,
                "context_parallel_size": 1,
            }
        )
        # The empty tokenizer is a placeholder; the test installs a real one
        # before calling ``load()``.
        self.model_loader = ModelLoader(
            cfg=self.cfg,
            tokenizer="",
            inference=False,
            reference_model=True,
        )

    @pytest.mark.parametrize("embedding_modules", ["embed_tokens", "lm_head"])
    @pytest.mark.parametrize(
        "dist_dtype", [torch.bfloat16, torch.float16, torch.float32]
    )
    @pytest.mark.parametrize("before_kbit_train_or_finetune", [True, False])
    def test_convert_embedding_modules_dtype(
        self, temp_dir, embedding_modules, dist_dtype, before_kbit_train_or_finetune
    ):
        """
        After ``_convert_embedding_modules_dtype`` every parameter of the norm
        modules, of ``.gate`` modules (only when
        ``before_kbit_train_or_finetune`` is set) and of the selected
        embedding module must have ``dist_dtype``.

        ``embedding_modules`` is parametrized as a single module name.
        """
        # Iterating the bare string would yield single characters, so
        # ``m in name`` would match almost every module; wrap the name in a
        # list so that whole names are compared.
        module_names = [embedding_modules]

        self.cfg.output_dir = temp_dir
        self.model_loader.tokenizer = load_tokenizer(self.cfg)
        self.model_loader.load()
        self.model_loader._convert_embedding_modules_dtype(
            module_names, dist_dtype, before_kbit_train_or_finetune
        )
        for name, module in self.model_loader.model.named_modules():
            if (
                "norm" in name
                or (before_kbit_train_or_finetune and name.endswith(".gate"))
                or (
                    any(m in name for m in module_names)
                    and hasattr(module, "weight")
                )
            ):
                for _, param in module.named_parameters():
                    assert param.dtype == dist_dtype
models """ @with_temp_dir def test_fft(self, temp_dir): cfg = DictDefault( { "base_model": "state-spaces/mamba-130m", "model_type": "MambaLMHeadModel", "tokenizer_type": "AutoTokenizer", "tokenizer_config": "EleutherAI/gpt-neox-20b", "flash_attention": False, "sequence_len": 1024, "load_in_8bit": False, "val_set_size": 0.0, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", }, ], "gradient_checkpointing": False, "num_epochs": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "max_steps": 20, "save_steps": 10, "eval_steps": None, "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) ================================================ FILE: tests/e2e/test_mistral.py ================================================ """ E2E tests for lora llama """ import unittest from transformers.utils import is_torch_bf16_gpu_available from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault from .utils import check_model_output_exists, with_temp_dir class TestMistral(unittest.TestCase): """ Test case for Llama models using LoRA """ @with_temp_dir def test_lora(self, temp_dir): cfg = DictDefault( { "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2", "flash_attention": True, "sequence_len": 1024, "load_in_8bit": True, "adapter": "lora", "lora_r": 32, "lora_alpha": 64, "lora_dropout": 0.05, "lora_target_linear": True, "val_set_size": 0.02, "special_tokens": { "unk_token": "", "bos_token": "", "eos_token": "", }, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", }, ], "num_epochs": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 
class TestMixtral(unittest.TestCase):
    """
    Test case for Mixtral models: QLoRA, 16-bit LoRA and full fine-tuning
    """

    @staticmethod
    def _build_cfg(temp_dir, flash_attention, **overrides):
        """Return the config shared by all Mixtral tests, plus ``overrides``."""
        settings = {
            "base_model": "hf-internal-testing/Mixtral-tiny",
            "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF",
            "flash_attention": flash_attention,
            "sequence_len": 1024,
            "val_set_size": 0.02,
            "special_tokens": {},
            "datasets": [
                {
                    "path": "mhenrichsen/alpaca_2k_test",
                    "type": "alpaca",
                },
            ],
            "num_epochs": 2,
            "micro_batch_size": 2,
            "gradient_accumulation_steps": 1,
            "output_dir": temp_dir,
            "learning_rate": 0.00001,
            "optimizer": "adamw_bnb_8bit",
            "lr_scheduler": "cosine",
            "max_steps": 20,
            "save_steps": 10,
            "eval_steps": 10,
            "save_first_step": False,
        }
        settings.update(overrides)
        return DictDefault(settings)

    @staticmethod
    def _lora_overrides(adapter, **extra):
        """Return the adapter settings used by the (Q)LoRA tests."""
        return {
            "adapter": adapter,
            "lora_r": 4,
            "lora_alpha": 8,
            "lora_dropout": 0.1,
            "lora_target_modules": [
                "o_proj",
                "w3",
                "k_proj",
                "v_proj",
                "w1",
                "q_proj",
                "w2",
            ],
            **extra,
        }

    @staticmethod
    def _select_half_precision(cfg):
        """Enable bf16 when the GPU supports it, otherwise fp16."""
        if is_torch_bf16_gpu_available():
            cfg.bf16 = True
        else:
            cfg.fp16 = True

    @staticmethod
    def _run_training(cfg, temp_dir, check_gate_dtype=True):
        """
        Validate and normalize ``cfg``, train, and check the model output.

        With ``check_gate_dtype`` the first layer's MoE gate weight is also
        asserted to be float32; this reaches through the adapter wrapper and
        is therefore only used by the (Q)LoRA tests.
        """
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        model, _, _ = train(cfg=cfg, dataset_meta=dataset_meta)
        if check_gate_dtype:
            assert (
                model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype
                == torch.float32
            )
        check_model_output_exists(temp_dir, cfg)

    @with_temp_dir
    def test_qlora_w_fa2(self, temp_dir):
        """4-bit QLoRA with flash attention."""
        cfg = self._build_cfg(
            temp_dir,
            True,
            **self._lora_overrides("qlora", load_in_4bit=True),
        )
        self._run_training(cfg, temp_dir)

    @with_temp_dir
    def test_qlora_wo_fa2(self, temp_dir):
        """4-bit QLoRA without flash attention."""
        cfg = self._build_cfg(
            temp_dir,
            False,
            **self._lora_overrides("qlora", load_in_4bit=True),
        )
        self._run_training(cfg, temp_dir)

    @with_temp_dir
    def test_16bit_lora_w_fa2(self, temp_dir):
        """16-bit LoRA with flash attention."""
        cfg = self._build_cfg(temp_dir, True, **self._lora_overrides("lora"))
        self._select_half_precision(cfg)
        self._run_training(cfg, temp_dir)

    @with_temp_dir
    def test_16bit_lora_wo_fa2(self, temp_dir):
        """16-bit LoRA without flash attention."""
        cfg = self._build_cfg(temp_dir, False, **self._lora_overrides("lora"))
        # The precision flag has to be set before validate_config and
        # normalize_config run (as in the other 16-bit tests); setting it
        # afterwards means neither step ever sees it.
        self._select_half_precision(cfg)
        self._run_training(cfg, temp_dir)

    @with_temp_dir
    def test_ft(self, temp_dir):
        """Full fine-tune with flash attention in half precision."""
        cfg = self._build_cfg(temp_dir, True)
        self._select_half_precision(cfg)
        self._run_training(cfg, temp_dir, check_gate_dtype=False)
"lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "val_set_size": 0.02, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", }, ], "num_epochs": 1, "micro_batch_size": 8, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "optimi_adamw", "max_steps": 5, "lr_scheduler": "cosine", "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) _, _, trainer = train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) assert trainer.optimizer.optimizer.__class__.__name__ == "AdamW" @with_temp_dir @require_torch_2_5_1 def test_adopt_adamw(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "model_type": "AutoModelForCausalLM", "tokenizer_type": "AutoTokenizer", "sequence_len": 1024, "load_in_8bit": True, "adapter": "lora", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, "val_set_size": 0.02, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", }, ], "num_epochs": 1, "max_steps": 5, "micro_batch_size": 8, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adopt_adamw", "lr_scheduler": "cosine", "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) _, _, trainer = train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) assert "ADOPT" in trainer.optimizer.optimizer.__class__.__name__ @with_temp_dir @require_torch_2_5_1 def test_muon(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "model_type": "AutoModelForCausalLM", "tokenizer_type": "AutoTokenizer", "sequence_len": 1024, "load_in_8bit": True, "adapter": "lora", "lora_r": 8, "lora_alpha": 16, "lora_dropout": 
@with_temp_dir
@require_torch_2_7_0
def test_dion(self, temp_dir):
    """Run 5 full fine-tuning steps on SmolLM2-135M with the ``dion`` optimizer.

    No adapter and no validation split are configured. After training, the
    class name of the trainer's inner optimizer must contain ``"Dion"``.
    """
    optimizer_settings = {
        "learning_rate": 1e-5,
        "optimizer": "dion",
        "dion_lr": 0.01,
        "dion_momentum": 0.95,
        "lr_scheduler": "cosine",
        "weight_decay": 0.01,
    }
    raw_cfg = DictDefault(
        {
            "base_model": "HuggingFaceTB/SmolLM2-135M",
            "model_type": "AutoModelForCausalLM",
            "tokenizer_type": "AutoTokenizer",
            "sequence_len": 1024,
            "val_set_size": 0.0,
            "special_tokens": {"pad_token": "<|endoftext|>"},
            "datasets": [
                {"path": "mhenrichsen/alpaca_2k_test", "type": "alpaca"},
            ],
            "num_epochs": 1,
            "max_steps": 5,
            "micro_batch_size": 8,
            "gradient_accumulation_steps": 1,
            "output_dir": temp_dir,
            **optimizer_settings,
            "save_first_step": False,
        }
    )

    cfg = validate_config(raw_cfg)
    normalize_config(cfg)
    _, _, trainer = train(cfg=cfg, dataset_meta=load_datasets(cfg=cfg))
    check_model_output_exists(temp_dir, cfg)

    optimizer_class_name = trainer.optimizer.optimizer.__class__.__name__
    assert "Dion" in optimizer_class_name
@with_temp_dir
@require_torch_2_6_0
def test_came_pytorch(self, temp_dir):
    """Run 5 8-bit LoRA steps on llama-68m with the ``came_pytorch`` optimizer.

    The special tokens are the Llama tokenizer's ``<unk>``, ``<s>`` and
    ``</s>``; registering empty strings as special tokens would not name any
    real vocabulary entry.
    """
    cfg = DictDefault(
        {
            "base_model": "JackFram/llama-68m",
            "tokenizer_type": "LlamaTokenizer",
            "sequence_len": 1024,
            "load_in_8bit": True,
            "adapter": "lora",
            "lora_r": 8,
            "lora_alpha": 16,
            "lora_dropout": 0.05,
            "lora_target_linear": True,
            "val_set_size": 0.1,
            "special_tokens": {
                "unk_token": "<unk>",
                "bos_token": "<s>",
                "eos_token": "</s>",
            },
            "datasets": [
                {
                    "path": "mhenrichsen/alpaca_2k_test",
                    "type": "alpaca",
                },
            ],
            "num_epochs": 1,
            "micro_batch_size": 8,
            "gradient_accumulation_steps": 1,
            "output_dir": temp_dir,
            "learning_rate": 0.00001,
            "optimizer": "came_pytorch",
            # CAME-specific hyperparameters (third beta and second epsilon).
            "adam_beta3": 0.9999,
            "adam_epsilon2": 1e-16,
            "max_steps": 5,
            "lr_scheduler": "cosine",
            "save_first_step": False,
        }
    )

    cfg = validate_config(cfg)
    normalize_config(cfg)
    dataset_meta = load_datasets(cfg=cfg)

    train(cfg=cfg, dataset_meta=dataset_meta)
    check_model_output_exists(temp_dir, cfg)
"<|endoftext|>", }, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", }, ], "num_epochs": 1, "micro_batch_size": 8, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": learning_rate, "optimizer": optimizer_name, "max_steps": 5, "lr_scheduler": "cosine", "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) _, _, trainer = train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) assert trainer.optimizer.optimizer.__class__.__name__ == expected_class ================================================ FILE: tests/e2e/test_packing_loss.py ================================================ """ E2E tests for packed training """ import unittest from transformers.utils import is_torch_bf16_gpu_available from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault from .utils import check_tensorboard, with_temp_dir class TestPackedLlama(unittest.TestCase): """ Test case for Packed training of llama models """ @with_temp_dir def test_loss_packed(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "sequence_len": 1024, "sample_packing": True, "flash_attention": True, "val_set_size": 0.0, "special_tokens": { "pad_token": "<|endoftext|>", }, "datasets": [ { "path": "tatsu-lab/alpaca", "type": "alpaca", "split": "train[:10%]", }, ], "num_epochs": 1, "micro_batch_size": 2, "gradient_accumulation_steps": 2, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "max_steps": 5, "use_tensorboard": True, "save_first_step": False, } ) if is_torch_bf16_gpu_available(): cfg.bf16 = True else: cfg.fp16 = True cfg = validate_config(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) 
class TestPhi(unittest.TestCase):
    """
    Training smoke tests for microsoft/phi-1_5 (full fine-tune and QLoRA)
    """

    def _train_phi(self, temp_dir, adapter_settings):
        """Train phi-1_5 for 10 steps with the given adapter/quantization keys.

        ``adapter_settings`` is spliced into the config right after
        ``sample_packing``; every other key is shared by both tests. The
        dataset is sharded into 10 pieces and only shard 0 is used.
        """
        raw_cfg = DictDefault(
            {
                "base_model": "microsoft/phi-1_5",
                "model_type": "AutoModelForCausalLM",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 2048,
                "sample_packing": False,
                **adapter_settings,
                "val_set_size": 0.02,
                "special_tokens": {"pad_token": "<|endoftext|>"},
                "datasets": [
                    {"path": "mhenrichsen/alpaca_2k_test", "type": "alpaca"},
                ],
                "dataset_shard_num": 10,
                "dataset_shard_idx": 0,
                "num_epochs": 1,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 1e-5,
                "optimizer": "paged_adamw_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "max_steps": 10,
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
                "save_first_step": False,
            }
        )
        cfg = validate_config(raw_cfg)
        normalize_config(cfg)
        train(cfg=cfg, dataset_meta=load_datasets(cfg=cfg))
        check_model_output_exists(temp_dir, cfg)

    @with_temp_dir
    def test_phi_ft(self, temp_dir):
        """Full fine-tune: no adapter and no 8-bit loading."""
        self._train_phi(
            temp_dir,
            {
                "load_in_8bit": False,
                "adapter": None,
            },
        )

    @with_temp_dir
    def test_phi_qlora(self, temp_dir):
        """QLoRA: 4-bit base weights with a rank-64 adapter on all linear layers."""
        self._train_phi(
            temp_dir,
            {
                "load_in_4bit": True,
                "adapter": "qlora",
                "lora_r": 64,
                "lora_alpha": 32,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
            },
        )
"last_run_prepared").exists() ================================================ FILE: tests/e2e/test_process_reward_model_smollm2.py ================================================ """ E2E tests for process reward model w/ lora llama """ import unittest from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault from .utils import check_model_output_exists, check_tensorboard, with_temp_dir class TestProcessRewardSmolLM2(unittest.TestCase): """ Test case for Llama process reward models using LoRA """ @with_temp_dir def test_prm(self, temp_dir): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "model_type": "AutoModelForTokenClassification", "num_labels": 2, "process_reward_model": True, "sequence_len": 512, "val_set_size": 0.0, "datasets": [ { "path": "trl-lib/math_shepherd", "type": "stepwise_supervised", "step_separator": "\n", "split": "train[:10%]", }, ], "max_steps": 100, "num_epochs": 1, "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.0005, "optimizer": "adamw_torch", "lr_scheduler": "cosine", "gradient_checkpointing": True, "warmup_ratio": 0.1, "use_tensorboard": True, "special_tokens": {"pad_token": "<|endoftext|>"}, "seed": 42, "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_tensorboard( temp_dir + "/runs", "train/train_loss", 2.7, "Train Loss (%s) is too high" ) check_model_output_exists(temp_dir, cfg) ================================================ FILE: tests/e2e/test_profiler.py ================================================ """ e2e gpu test for the pytorch profiler callback """ from pathlib import Path import pytest from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import 
class TestProfiler:
    """
    Tests that the profiler callback writes (or skips) ``snapshot.pickle``
    """

    @staticmethod
    def _train_with_profiler(base_cfg, temp_dir, **profiler_options):
        """Train for 5 steps with ``profiler_steps=3`` and return the snapshot path.

        ``profiler_options`` are extra config keys (e.g.
        ``profiler_steps_start``) merged on top of the base fixture config.
        """
        cfg = base_cfg | DictDefault(
            output_dir=temp_dir,
            max_steps=5,
            profiler_steps=3,
            **profiler_options,
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        train(cfg=cfg, dataset_meta=load_datasets(cfg=cfg))
        return Path(temp_dir) / "snapshot.pickle"

    def test_profiler_saves(self, profiler_base_cfg, temp_dir):
        """With no explicit start step, a snapshot is written."""
        snapshot = self._train_with_profiler(profiler_base_cfg, temp_dir)
        assert snapshot.exists()

    def test_profiler_saves_w_start(self, profiler_base_cfg, temp_dir):
        """Starting at step 1 leaves the 3-step window inside the 5-step run."""
        snapshot = self._train_with_profiler(
            profiler_base_cfg, temp_dir, profiler_steps_start=1
        )
        assert snapshot.exists()

    @pytest.mark.parametrize(
        "profiler_steps_start",
        [3, 5],
    )
    def test_profiler_saves_past_end(
        self, profiler_base_cfg, temp_dir, profiler_steps_start
    ):
        """A window that runs past the last training step still yields a snapshot."""
        snapshot = self._train_with_profiler(
            profiler_base_cfg,
            temp_dir,
            profiler_steps_start=profiler_steps_start,
        )
        assert snapshot.exists()

    def test_profiler_never_started(self, profiler_base_cfg, temp_dir):
        """A start step (6) beyond max_steps (5) must not produce a snapshot."""
        snapshot = self._train_with_profiler(
            profiler_base_cfg, temp_dir, profiler_steps_start=6
        )
        assert not snapshot.exists()
class TestMXFP4Schema:
    """Schema-level checks for the ``mxfp4`` quantization dtype"""

    def test_validate_mxfp4_dtype(self):
        """The string ``"mxfp4"`` resolves to the matching enum member."""
        assert validate_ao_dtype("mxfp4") == TorchAOQuantDType.mxfp4

    def test_qat_config_with_mxfp4(self):
        """QATConfig accepts ``mxfp4`` as weight_dtype and keeps group_size."""
        qat_cfg = QATConfig(
            weight_dtype="mxfp4",
            group_size=32,
            quantize_embedding=False,
        )
        assert qat_cfg.weight_dtype == TorchAOQuantDType.mxfp4
        assert qat_cfg.group_size == 32

    def test_qat_config_mxfp4_invalid_group_size(self):
        """QATConfig does not reject a group_size that mxfp4 cannot use.

        The schema performs no group_size compatibility check; per the
        original note, that validation happens later, at runtime, in
        ``get_quantization_config``.
        """
        # 16 is invalid for mxfp4, but the schema accepts it.
        qat_cfg = QATConfig(weight_dtype="mxfp4", group_size=16)
        assert qat_cfg.group_size == 16
@pytest.mark.parametrize(
    "weight_dtype,activation_dtype,group_size,expected_type",
    ptq_config_test_cases,
)
@requires_cuda_ge_8_9
@require_torch_2_8_0
def test_get_ptq_config(
    self, weight_dtype, activation_dtype, group_size, expected_type
):
    """Each dtype combination maps to the expected torchao config class."""
    ptq_config = get_quantization_config(
        weight_dtype,
        activation_dtype,
        group_size,
    )
    assert isinstance(ptq_config, expected_type)
@pytest.mark.parametrize(
    "weight_dtype,activation_dtype,group_size,quantize_embedding,expected_exception,expected_tensor_class",
    ptq_test_cases,
)
@requires_cuda_ge_8_9
@require_torch_2_8_0
def test_quantize_model_for_ptq(
    self,
    model,
    weight_dtype,
    activation_dtype,
    group_size,
    quantize_embedding,
    expected_exception,
    expected_tensor_class,
):
    """Quantize the fixture model and check weight tensor classes.

    Cases that carry an ``expected_exception`` only verify that
    ``quantize_model`` raises it. Otherwise the embedding weight (when
    requested) and the weight of every direct ``nn.Linear`` child of the
    model must be instances of ``expected_tensor_class``.
    """
    quantize_args = (
        model,
        weight_dtype,
        group_size,
        activation_dtype,
        quantize_embedding,
    )

    if expected_exception:
        with pytest.raises(expected_exception):
            quantize_model(*quantize_args)
        return

    quantize_model(*quantize_args)

    if quantize_embedding:
        assert isinstance(
            model.model.embed_tokens.weight, expected_tensor_class
        ), "Embedding weight should be quantized"

    # Only direct children of the top-level model are inspected.
    linear_children = [
        module
        for module in model.children()
        if isinstance(module, torch.nn.Linear)
    ]
    for linear in linear_children:
        assert isinstance(linear.weight, expected_tensor_class)
@pytest.mark.parametrize(
    "weight_dtype,activation_dtype,group_size,quantize_embedding",
    [
        (TorchAOQuantDType.int4, None, 8, False),
        (TorchAOQuantDType.int4, None, 16, True),
        (TorchAOQuantDType.int4, TorchAOQuantDType.int8, 8, False),
        (TorchAOQuantDType.int4, TorchAOQuantDType.int8, 16, True),
        (
            TorchAOQuantDType.float8_e4m3fn,
            TorchAOQuantDType.float8_e4m3fn,
            None,
            False,
        ),
        (TorchAOQuantDType.int4, TorchAOQuantDType.float8_e4m3fn, None, True),
    ],
)
@require_torch_2_8_0
@requires_cuda_ge_8_9
def test_prepare_model_for_qat(
    self, model, weight_dtype, activation_dtype, group_size, quantize_embedding
):
    """Prepare the fixture model for QAT and inspect the fake quantizers.

    Checks the embedding (when ``quantize_embedding`` is set) and every
    direct ``nn.Linear`` child of the model: module class, weight quantizer
    dtype and group size, and the activation quantizer (its dtype when
    ``activation_dtype`` is given, ``None`` otherwise).
    """
    prepare_model_for_qat(
        model,
        weight_dtype,
        group_size,
        activation_dtype,
        quantize_embedding,
    )

    if quantize_embedding:
        embedding = model.model.embed_tokens
        assert isinstance(embedding, FakeQuantizedEmbedding)
        assert hasattr(embedding, "weight_fake_quantizer")
        embedding_quant_cfg = embedding.weight_fake_quantizer.config
        assert embedding_quant_cfg.dtype == weight_dtype.value
        if group_size:
            assert embedding_quant_cfg.group_size == group_size

    for module in list(model.children()):
        if not isinstance(module, torch.nn.Linear):
            continue

        assert isinstance(module, FakeQuantizedLinear)
        assert hasattr(module, "weight_fake_quantizer")
        assert module.weight_fake_quantizer.config.dtype == weight_dtype.value
        if group_size:
            assert module.weight_fake_quantizer.config.group_size == group_size

        if not activation_dtype:
            # Weight-only QAT: no activation quantizer is installed.
            assert module.activation_fake_quantizer is None
            continue

        assert hasattr(module, "activation_fake_quantizer")
        assert (
            module.activation_fake_quantizer.config.dtype
            == activation_dtype.value
        )
@require_torch_2_8_0
@requires_cuda_ge_8_9
def test_convert_qat_model(self, model):
    """Fake-quantized modules are swapped back out by ``convert_qat_model``.

    After preparing the model for int4-weight / int8-activation QAT, the
    embedding and ``lm_head`` must be fake-quantized modules. After
    conversion they must no longer be, and their weights must be
    ``nn.Parameter`` instances.
    """
    qat_cfg = QATConfig(
        weight_dtype="int4",
        activation_dtype="int8",
        group_size=8,
        quantize_embedding=True,
    )

    # Step 1: insert the fake-quantized modules.
    prepare_model_for_qat(
        model,
        qat_cfg.weight_dtype,
        qat_cfg.group_size,
        qat_cfg.activation_dtype,
        qat_cfg.quantize_embedding,
    )
    assert isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
    assert isinstance(model.lm_head, FakeQuantizedLinear)

    # Step 2: convert back.
    convert_qat_model(model, qat_cfg.quantize_embedding)

    # The fake-quantized wrappers must be gone...
    assert not isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
    assert not isinstance(model.lm_head, FakeQuantizedLinear)

    # ...and both modules must still expose parameter weights.
    assert isinstance(model.model.embed_tokens.weight, nn.Parameter)
    assert isinstance(model.lm_head.weight, nn.Parameter)
@require_torch_2_8_0
def test_qat_callback_fake_quant_after_n_steps_is_none(self, model, trainer_state):
    """With ``fake_quant_after_n_steps=None`` fake quantization is never paused.

    The weight fake quantizers of the embedding and ``lm_head`` must be
    enabled right after preparation and must still be enabled after the
    callback handles the first step.
    """
    qat_cfg = QATConfig(
        weight_dtype="int4",
        activation_dtype="int8",
        group_size=8,
        quantize_embedding=True,
        fake_quant_after_n_steps=None,
    )
    prepare_model_for_qat(
        model,
        qat_cfg.weight_dtype,
        qat_cfg.group_size,
        qat_cfg.activation_dtype,
        qat_cfg.quantize_embedding,
    )

    # Preparation must have installed enabled fake quantizers.
    assert isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
    assert model.model.embed_tokens.weight_fake_quantizer.enabled
    assert isinstance(model.lm_head, FakeQuantizedLinear)
    assert model.lm_head.weight_fake_quantizer.enabled

    # Simulate the first training step (trainer_state.global_step == 0).
    callback = QATCallback(qat_cfg)
    callback.on_step_begin(
        args=None,
        state=trainer_state,
        control=None,
        model=model,
    )

    # Quantization stays on from the very first step.
    assert model.model.embed_tokens.weight_fake_quantizer.enabled
    assert model.lm_head.weight_fake_quantizer.enabled
class TestE2eQwen:
    """
    End-to-end DPO runs against Qwen base models
    """

    @pytest.mark.parametrize("base_model", ["Qwen/Qwen2-0.5B", "Qwen/Qwen2.5-0.5B"])
    def test_dpo(self, base_model, temp_dir):
        # preference dataset rendered through the chat template
        dpo_dataset = {
            "path": "fozziethebeat/alpaca_messages_2k_dpo_test",
            "split": "train",
            "type": "chat_template.default",
            "field_messages": "conversation",
            "field_chosen": "chosen",
            "field_rejected": "rejected",
            "message_property_mappings": {
                "role": "role",
                "content": "content",
            },
            "roles": {
                "system": ["system"],
                "user": ["user"],
                "assistant": ["assistant"],
            },
        }

        cfg = DictDefault(
            {
                "base_model": base_model,
                "rl": "dpo",
                "chat_template": "qwen_25",
                "sequence_len": 2048,
                "val_set_size": 0.0,
                "datasets": [dpo_dataset],
                "num_epochs": 1,
                "max_steps": 5,
                "warmup_steps": 20,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "bf16": "auto",
                "tf32": True,
                "gradient_checkpointing": True,
                "save_first_step": False,
            }
        )

        # write cfg to yaml file
        output_root = Path(temp_dir)
        output_root.mkdir(parents=True, exist_ok=True)
        config_path = output_root / "config.yaml"
        with open(config_path, "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        # train in a subprocess across two processes via accelerate
        launch_cmd = [
            "accelerate",
            "launch",
            "--num-processes",
            "2",
            "--main_process_port",
            f"{get_torch_dist_unique_port()}",
            "-m",
            "axolotl.cli.train",
            str(config_path),
        ]
        execute_subprocess_async(launch_cmd)
class TestSaveFirstStepCallback(unittest.TestCase):
    """Exercises the save_first_step config option in both its on and off states."""

    @staticmethod
    def _train(temp_dir, save_first_step):
        """Run a three-step training job and return the validated config it used."""
        raw_cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 512,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 3,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "sample_packing": True,
                "bf16": True,
                "save_first_step": save_first_step,
            }
        )

        validated = validate_config(raw_cfg)
        normalize_config(validated)
        train(cfg=validated, dataset_meta=load_datasets(cfg=validated))
        return validated

    @with_temp_dir
    def test_save_first_step(self, temp_dir):
        cfg = self._train(temp_dir, save_first_step=True)

        # with the option on, a checkpoint for step 1 must have been written
        first_checkpoint = str(Path(temp_dir) / "checkpoint-1")
        check_model_output_exists(first_checkpoint, cfg)

    @with_temp_dir
    def test_no_save_first_step(self, temp_dir):
        cfg = self._train(temp_dir, save_first_step=False)

        # with the option off, the existence check for checkpoint-1 is expected to fail
        first_checkpoint = str(Path(temp_dir) / "checkpoint-1")
        with pytest.raises(AssertionError):
            check_model_output_exists(first_checkpoint, cfg)
class TestStreamingDatasets:
    """End-to-end training runs that read the dataset in streaming mode"""

    @pytest.mark.parametrize(
        "sample_packing",
        [True, False],
    )
    def test_streaming_dataset(self, temp_dir, sample_packing):
        """Train for a few steps on a streamed dataset, with and without packing"""
        model_settings = {
            "base_model": "HuggingFaceTB/SmolLM2-135M",
            "flash_attention": True,
            "sequence_len": 1024,
            "sample_packing": sample_packing,
            "pretrain_multipack_attn": sample_packing,
            "streaming_multipack_buffer_size": 10000,
            "dataset_num_proc": 1,
            "special_tokens": {
                "pad_token": "<|endoftext|>",
            },
        }
        data_settings = {
            "datasets": [
                {
                    "path": "mhenrichsen/alpaca_2k_test",
                    "type": "alpaca",
                },
            ],
            # Streaming config
            "streaming": True,
        }
        training_settings = {
            "max_steps": 3,
            "micro_batch_size": 1,
            "gradient_accumulation_steps": 1,
            "val_set_size": 0.0,
            "output_dir": temp_dir,
            "learning_rate": 0.00001,
            "optimizer": "adamw_torch_fused",
            "lr_scheduler": "cosine",
            "bf16": "auto",
            "use_tensorboard": True,
            "save_first_step": False,
        }

        cfg = DictDefault({**model_settings, **data_settings, **training_settings})
        cfg = validate_config(cfg)
        normalize_config(cfg)

        train(cfg=cfg, dataset_meta=load_datasets(cfg=cfg))
        check_model_output_exists(temp_dir, cfg)

        # Verify training actually ran: the last logged train loss must stay
        # under the threshold
        tensorboard_dir = temp_dir + "/runs"
        check_tensorboard(
            tensorboard_dir,
            "train/train_loss",
            3.0,
            "Train Loss (%s) is too high",
        )
def with_temp_dir(test_func):
    """
    Decorator that hands the wrapped test a fresh temporary directory.

    The directory path is passed as the `temp_dir` keyword argument and the
    directory is removed once the wrapped function returns or raises.
    """

    @wraps(test_func)
    def wrapper(*args, **kwargs):
        # Create a temporary directory
        temp_dir = tempfile.mkdtemp()
        try:
            # Pass the temporary directory to the test function and forward
            # whatever it returns instead of silently dropping it
            return test_func(*args, temp_dir=temp_dir, **kwargs)
        finally:
            # Clean up the directory after the test
            shutil.rmtree(temp_dir)

    return wrapper


def most_recent_subdir(path):
    """
    Return the sub-directory of `path` with the latest ctime, or None if
    `path` contains no sub-directories.
    """
    subdirectories = [d for d in Path(path).iterdir() if d.is_dir()]
    return max(subdirectories, key=os.path.getctime, default=None)


def _torch_at_least(minimum):
    """Return True when the installed torch version is >= `minimum`."""
    return version.parse(torch.__version__) >= version.parse(minimum)


def _device_capability():
    """
    Return the compute capability tuple of the current CUDA device, or None
    when CUDA is unavailable (so callers never query a missing device).
    """
    if not torch.cuda.is_available():
        return None
    return torch.cuda.get_device_capability()


def require_torch_2_4_1(test_case):
    """
    Decorator marking a test that requires torch >= 2.4.1
    """
    return unittest.skipUnless(
        _torch_at_least("2.4.1"), "test requires torch>=2.4.1"
    )(test_case)


def require_torch_2_5_1(test_case):
    """
    Decorator marking a test that requires torch >= 2.5.1
    """
    return unittest.skipUnless(
        _torch_at_least("2.5.1"), "test requires torch>=2.5.1"
    )(test_case)


def require_torch_2_6_0(test_case):
    """
    Decorator marking a test that requires torch >= 2.6.0
    """
    return unittest.skipUnless(
        _torch_at_least("2.6.0"), "test requires torch>=2.6.0"
    )(test_case)


def require_torch_2_7_0(test_case):
    """
    Decorator marking a test that requires torch >= 2.7.0
    """
    return unittest.skipUnless(
        _torch_at_least("2.7.0"), "test requires torch>=2.7.0"
    )(test_case)


def require_torch_2_8_0(test_case):
    """
    Decorator marking a test that requires torch >= 2.8.0
    """
    return unittest.skipUnless(
        _torch_at_least("2.8.0"), "test requires torch>=2.8.0"
    )(test_case)


def require_torch_lt_2_6_0(test_case):
    """
    Decorator marking a test that requires torch < 2.6.0
    """
    return unittest.skipUnless(
        not _torch_at_least("2.6.0"), "test requires torch<2.6.0"
    )(test_case)


def require_vllm(test_case):
    """
    Decorator marking a test that requires a vllm to be installed
    """
    is_vllm_installed = importlib.util.find_spec("vllm") is not None
    return unittest.skipUnless(
        is_vllm_installed, "test requires vllm to be installed"
    )(test_case)


def require_llmcompressor(test_case):
    """
    Decorator marking a test that requires a llmcompressor to be installed
    """
    is_llmcompressor_installed = importlib.util.find_spec("llmcompressor") is not None
    return unittest.skipUnless(
        is_llmcompressor_installed, "test requires llmcompressor to be installed"
    )(test_case)


def requires_sm_ge_100(test_case):
    """
    Decorator marking a test that requires a CUDA build of torch and a device
    with compute capability >= (10, 0)
    """
    capability = _device_capability()
    is_sm_ge_100 = (
        capability is not None and bool(torch.version.cuda) and capability >= (10, 0)
    )
    return unittest.skipUnless(is_sm_ge_100, "test requires sm>=100")(test_case)


def requires_cuda_ge_8_9(test_case):
    """
    Decorator marking a test that requires a CUDA build of torch and a device
    with compute capability >= (8, 9)
    """
    capability = _device_capability()
    is_cuda_ge_8_9 = (
        capability is not None and bool(torch.version.cuda) and capability >= (8, 9)
    )
    return unittest.skipUnless(is_cuda_ge_8_9, "test requires cuda>=8.9")(test_case)


def is_hopper():
    """
    Return True when the current device reports compute capability (9, 0).

    Returns False rather than raising when CUDA is unavailable, so modules that
    apply `require_hopper` at import time can still be collected on CPU hosts.
    """
    return _device_capability() == (9, 0)


def require_hopper(test_case):
    """
    Decorator marking a test that requires a device with compute capability (9, 0)
    """
    return unittest.skipUnless(is_hopper(), "test requires h100/hopper GPU")(test_case)


def supports_fp8(test_case):
    """
    Decorator marking a test that requires a device with compute capability >= (9, 0)

    The test is skipped (instead of the decorator raising) when CUDA is unavailable.
    """
    capability = _device_capability()
    has_fp8 = capability is not None and capability >= (9, 0)
    return unittest.skipUnless(has_fp8, "test requires h100 or newer GPU")(test_case)


def check_tensorboard(
    temp_run_dir: str,
    tag: str,
    lt_val: float,
    assertion_err: str,
    rtol: float = 0.02,
    gt_zero: bool = True,
) -> None:
    """
    helper function to parse and check tensorboard logs

    Reads the first event file (sorted by name) of the most recent run
    directory under `temp_run_dir` and asserts that the last value logged for
    `tag` is below `lt_val` widened by the relative tolerance `rtol`. When
    `gt_zero` is set, the value must also exceed 1e-5. If `assertion_err`
    contains "%s", the observed value is interpolated into the message.

    Raises AssertionError when a check fails or when no run directory, event
    file or matching scalar is found.
    """
    tb_log_path = most_recent_subdir(temp_run_dir)
    assert tb_log_path is not None, (
        f"No tensorboard run directory found in {temp_run_dir}"
    )

    event_files = sorted(os.listdir(tb_log_path))
    assert event_files, f"No tensorboard event files found in {tb_log_path}"
    event_file = os.path.join(tb_log_path, event_files[0])

    reader = SummaryReader(event_file)
    df = reader.scalars
    df = df[(df.tag == tag)]
    assert len(df) > 0, f"No scalars logged under tag {tag!r} in {event_file}"

    last_value = df.value.values[-1]
    threshold = (1 + rtol) * lt_val
    if "%s" in assertion_err:
        assert last_value < threshold, assertion_err % last_value
    else:
        assert last_value < threshold, assertion_err
    if gt_zero:
        assert last_value > 1e-5, "Expected loss to be greater than zero"


def check_model_output_exists(temp_dir: str, cfg: DictDefault) -> None:
    """
    helper function to check if a model output file exists after training

    checks based on adapter or not (always safetensors in Transformers V5)
    """
    # adapter runs save only the adapter weights; full runs save the whole model
    expected_name = "adapter_model.safetensors" if cfg.adapter else "model.safetensors"
    assert (Path(temp_dir) / expected_name).exists()
Output these words in reverse order, with each word on its own line.", "input": "Words: ['Hello', 'world'].", "output": "['world', 'Hello']" }, { "instruction": "In this task, you're given a short description of an event. Your job is to order the steps involved in the event from first to last. Note that there may be multiple correct answers for each event.", "input": "Description: A man walks into a bar and orders a drink. He pays for his drink and leaves the bar.", "output": "1. The man walks into the bar.\n2. He orders a drink.\n3. He pays for his drink.\n4. He leaves the bar." } ] ================================================ FILE: tests/fixtures/conversation.json ================================================ { "conversations": [ { "from": "human", "value": "how could i implement a minesweeper algorithm that utilises algebraic topology to solve boards?" }, { "from": "gpt", "value": "It's possible to use algebraic topology to solve Minesweeper, but it would likely be quite complex. Algebraic topology is a branch of mathematics that studies the properties of spaces that are preserved under continuous deformations, such as stretching and bending. In the context of Minesweeper, this could be used to analyze the structure of the game board and determine the locations of mines based on certain topological invariants. However, implementing this would require a deep understanding of both algebraic topology and the game of Minesweeper. It's not something that can be explained in a simple answer." }, { "from": "human", "value": "could you give me the gist of how it could though?" }, { "from": "gpt", "value": "Sure. The basic idea would be to use algebraic topology to analyze the structure of the Minesweeper game board and determine the locations of mines based on certain topological invariants. 
This could involve representing the game board as a topological space and using topological invariants such as homotopy groups and Betti numbers to infer information about the location of mines on the board. However, as I mentioned before, implementing this would require a deep understanding of both algebraic topology and the game of Minesweeper, so it's not something that can be explained in a simple answer." }, { "from": "human", "value": "give me a specific example" }, { "from": "gpt", "value": "As a specific example, suppose we have a Minesweeper game board with the following configuration:\n```\n 1 1 1 1 1\n 1 ? ? ? 1\n 1 ? ? ? 1\n 1 ? ? ? 1\n 1 1 1 1 1\n```\nIn this case, the game board can be represented as a topological space, where each square on the board corresponds to a point in the space. The mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space. Using algebraic topology, we can analyze the structure of this space and determine the locations of the mines based on certain topological invariants.\n\nFor instance, one way to do this is to compute the homotopy groups of the space. Homotopy groups are a way of describing the holes in a space, and they can be used to infer information about the locations of the mines on the Minesweeper board. In the case of the board above, we can compute the first homotopy group of the space, which describes the number and \"shape\" of the holes in the space. In this case, the first homotopy group would tell us that there are four holes in the space, arranged in a square shape.\n\nBased on this information, we can infer that the mines in the game are located in the four corners of the board, as shown below:\n```\n 1 1 1 1 1\n 1 M ? ? 1\n 1 ? ? ? 1\n 1 ? ? M 1\n 1 1 1 1 1\n```\nOf course, this is just one example of how algebraic topology could be used to solve Minesweeper. 
There are many other possible approaches, and the specific details of how to implement them would depend on the specific game board and the topological invariants being used. As I mentioned before, implementing a solution like this would require a deep understanding of both algebraic topology and the game of Minesweeper." }, { "from": "human", "value": "assuming i was someone with a deep understanding of both, could you explain the same concept to me with that assumption?" }, { "from": "gpt", "value": "Sure. Assuming you have a deep understanding of both algebraic topology and the game of Minesweeper, here is how you might use algebraic topology to solve a Minesweeper game board:\n\n1. Represent the Minesweeper game board as a topological space. Each square on the board corresponds to a point in the space, and the mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space.\n2. Use topological invariants to infer information about the locations of the mines on the board. There are many possible topological invariants that could be used for this, such as homotopy groups, Betti numbers, or persistent homology.\n3. Use the information inferred from the topological invariants to determine the locations of the mines on the board. For example, if the first homotopy group of the space indicates that there are four holes in the space, arranged in a square shape, then we can infer that the mines are located in the four corners of the board.\n4. Use this information to solve the Minesweeper game. Once you know the locations of the mines, you can use this information to avoid them and clear the board.\n\nOf course, this is just one possible approach to using algebraic topology to solve Minesweeper. There may be other ways to do it, depending on the specific game board and the topological invariants being used. However, this should give you a general idea of how algebraic topology can be used to solve Minesweeper." 
}, { "from": "human", "value": "how would i do it with persistent homology or any of the other topological invariants?" }, { "from": "gpt", "value": "To use persistent homology or any of the other topological invariants to solve a Minesweeper game board, you would follow the same basic steps as described above, but with some differences in the specific details.\n\nFor instance, here is how you might use persistent homology to solve a Minesweeper game board:\n\n1. Represent the Minesweeper game board as a topological space. Each square on the board corresponds to a point in the space, and the mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space.\n2. Compute the persistent homology of the space. Persistent homology is a way of describing the holes in a space over time, and it can be used to infer information about the locations of the mines on the Minesweeper board.\n3. Use the information from the persistent homology to determine the locations of the mines on the board. For example, if the persistent homology indicates that there are four holes in the space, arranged in a square shape, then we can infer that the mines are located in the four corners of the board.\n4. Use this information to solve the Minesweeper game. Once you know the locations of the mines, you can use this information to avoid them and clear the board.\n\nOf course, this is just one possible approach to using persistent homology to solve Minesweeper. There may be other ways to do it, depending on the specific game board and the specific details of the persistent homology computation. However, this should give you a general idea of how persistent homology can be used to solve Minesweeper." }, { "from": "human", "value": "how could i implement this in python?" }, { "from": "gpt", "value": "To implement a Minesweeper solver that uses algebraic topology in Python, you would need to do the following:\n\n1. 
Install a Python package for algebraic topology. There are several options available, such as Gudhi, Dionysus, or scikit-tda. These packages provide algorithms and data structures for computing topological invariants, such as homotopy groups, Betti numbers, or persistent homology.\n2. Represent the Minesweeper game board as a topological space. This could be done using a data structure provided by the algebraic topology package, such as a simplicial complex or a Vietoris-Rips complex. Each square on the board would correspond to a point in the space, and the mines on the board would be represented as \"holes\" in the space.\n3. Compute the topological invariants of the space. This could be done using algorithms provided by the algebraic topology package. For example, you could compute the persistent homology of the space using the persistent homology algorithm.\n4. Use the information from the topological invariants to determine the locations of the mines on the board. This could be done using a combination of mathematical reasoning and programming. For example, if the persistent homology indicates that there are four holes in the space, arranged in a square shape, then you could use a piece of code to infer that the mines are located in the four corners of the board.\n5. Use this information to solve the Minesweeper game. Once you know the locations of the mines, you could use a piece of code to automatically click on the safe squares and clear the board.\n\nOf course, this is just one possible approach to implementing a Minesweeper solver that uses algebraic topology in Python. There may be other ways to do it, depending on the specific details of the implementation. However, this should give you a general idea of how to get started with such a project." 
} ] } ================================================ FILE: tests/fixtures/conversation.missingturns.json ================================================ { "conversations": [ { "from": "human", "value": "how could i implement a minesweeper algorithm that utilises algebraic topology to solve boards?" }, { "from": "gpt", "value": "" }, { "from": "human", "value": "could you give me the gist of how it could though?" }, { "from": "gpt", "value": "Sure. The basic idea would be to use algebraic topology to analyze the structure of the Minesweeper game board and determine the locations of mines based on certain topological invariants. This could involve representing the game board as a topological space and using topological invariants such as homotopy groups and Betti numbers to infer information about the location of mines on the board. However, as I mentioned before, implementing this would require a deep understanding of both algebraic topology and the game of Minesweeper, so it's not something that can be explained in a simple answer." }, { "from": "human", "value": "give me a specific example" }, { "from": "gpt", "value": "As a specific example, suppose we have a Minesweeper game board with the following configuration:\n```\n 1 1 1 1 1\n 1 ? ? ? 1\n 1 ? ? ? 1\n 1 ? ? ? 1\n 1 1 1 1 1\n```\nIn this case, the game board can be represented as a topological space, where each square on the board corresponds to a point in the space. The mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space. Using algebraic topology, we can analyze the structure of this space and determine the locations of the mines based on certain topological invariants.\n\nFor instance, one way to do this is to compute the homotopy groups of the space. Homotopy groups are a way of describing the holes in a space, and they can be used to infer information about the locations of the mines on the Minesweeper board. 
In the case of the board above, we can compute the first homotopy group of the space, which describes the number and \"shape\" of the holes in the space. In this case, the first homotopy group would tell us that there are four holes in the space, arranged in a square shape.\n\nBased on this information, we can infer that the mines in the game are located in the four corners of the board, as shown below:\n```\n 1 1 1 1 1\n 1 M ? ? 1\n 1 ? ? ? 1\n 1 ? ? M 1\n 1 1 1 1 1\n```\nOf course, this is just one example of how algebraic topology could be used to solve Minesweeper. There are many other possible approaches, and the specific details of how to implement them would depend on the specific game board and the topological invariants being used. As I mentioned before, implementing a solution like this would require a deep understanding of both algebraic topology and the game of Minesweeper." }, { "from": "human", "value": "assuming i was someone with a deep understanding of both, could you explain the same concept to me with that assumption?" }, { "from": "gpt", "value": "Sure. Assuming you have a deep understanding of both algebraic topology and the game of Minesweeper, here is how you might use algebraic topology to solve a Minesweeper game board:\n\n1. Represent the Minesweeper game board as a topological space. Each square on the board corresponds to a point in the space, and the mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space.\n2. Use topological invariants to infer information about the locations of the mines on the board. There are many possible topological invariants that could be used for this, such as homotopy groups, Betti numbers, or persistent homology.\n3. Use the information inferred from the topological invariants to determine the locations of the mines on the board. 
For example, if the first homotopy group of the space indicates that there are four holes in the space, arranged in a square shape, then we can infer that the mines are located in the four corners of the board.\n4. Use this information to solve the Minesweeper game. Once you know the locations of the mines, you can use this information to avoid them and clear the board.\n\nOf course, this is just one possible approach to using algebraic topology to solve Minesweeper. There may be other ways to do it, depending on the specific game board and the topological invariants being used. However, this should give you a general idea of how algebraic topology can be used to solve Minesweeper." }, { "from": "human", "value": "how would i do it with persistent homology or any of the other topological invariants?" }, { "from": "gpt", "value": "To use persistent homology or any of the other topological invariants to solve a Minesweeper game board, you would follow the same basic steps as described above, but with some differences in the specific details.\n\nFor instance, here is how you might use persistent homology to solve a Minesweeper game board:\n\n1. Represent the Minesweeper game board as a topological space. Each square on the board corresponds to a point in the space, and the mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space.\n2. Compute the persistent homology of the space. Persistent homology is a way of describing the holes in a space over time, and it can be used to infer information about the locations of the mines on the Minesweeper board.\n3. Use the information from the persistent homology to determine the locations of the mines on the board. For example, if the persistent homology indicates that there are four holes in the space, arranged in a square shape, then we can infer that the mines are located in the four corners of the board.\n4. Use this information to solve the Minesweeper game. 
Once you know the locations of the mines, you can use this information to avoid them and clear the board.\n\nOf course, this is just one possible approach to using persistent homology to solve Minesweeper. There may be other ways to do it, depending on the specific game board and the specific details of the persistent homology computation. However, this should give you a general idea of how persistent homology can be used to solve Minesweeper." }, { "from": "human", "value": "how could i implement this in python?" }, { "from": "gpt", "value": "To implement a Minesweeper solver that uses algebraic topology in Python, you would need to do the following:\n\n1. Install a Python package for algebraic topology. There are several options available, such as Gudhi, Dionysus, or scikit-tda. These packages provide algorithms and data structures for computing topological invariants, such as homotopy groups, Betti numbers, or persistent homology.\n2. Represent the Minesweeper game board as a topological space. This could be done using a data structure provided by the algebraic topology package, such as a simplicial complex or a Vietoris-Rips complex. Each square on the board would correspond to a point in the space, and the mines on the board would be represented as \"holes\" in the space.\n3. Compute the topological invariants of the space. This could be done using algorithms provided by the algebraic topology package. For example, you could compute the persistent homology of the space using the persistent homology algorithm.\n4. Use the information from the topological invariants to determine the locations of the mines on the board. This could be done using a combination of mathematical reasoning and programming. For example, if the persistent homology indicates that there are four holes in the space, arranged in a square shape, then you could use a piece of code to infer that the mines are located in the four corners of the board.\n5. 
Use this information to solve the Minesweeper game. Once you know the locations of the mines, you could use a piece of code to automatically click on the safe squares and clear the board.\n\nOf course, this is just one possible approach to implementing a Minesweeper solver that uses algebraic topology in Python. There may be other ways to do it, depending on the specific details of the implementation. However, this should give you a general idea of how to get started with such a project." } ] } ================================================ FILE: tests/fixtures/conversation.tokenized.json ================================================ {"input_ids": [1, 319, 13563, 1546, 263, 12758, 1404, 322, 385, 23116, 21082, 20255, 29889, 450, 20255, 4076, 8444, 29892, 13173, 29892, 322, 1248, 568, 6089, 304, 278, 1404, 29915, 29879, 5155, 29889, 29871, 3148, 1001, 29901, 920, 1033, 474, 2334, 263, 29086, 705, 11356, 5687, 393, 3667, 4637, 21531, 20159, 304, 4505, 1045, 3163, 29973, 29871, 319, 1799, 9047, 13566, 29901, 739, 29915, 29879, 1950, 304, 671, 21531, 20159, 304, 4505, 341, 1475, 705, 11356, 29892, 541, 372, 723, 5517, 367, 3755, 4280, 29889, 838, 5869, 293, 20159, 338, 263, 5443, 310, 23964, 393, 11898, 278, 4426, 310, 8162, 393, 526, 21634, 1090, 9126, 316, 689, 800, 29892, 1316, 408, 16116, 292, 322, 289, 2548, 29889, 512, 278, 3030, 310, 341, 1475, 705, 11356, 29892, 445, 1033, 367, 1304, 304, 27599, 278, 3829, 310, 278, 3748, 7613, 322, 8161, 278, 14354, 310, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 2398, 29892, 16049, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29889, 739, 29915, 29879, 451, 1554, 393, 508, 367, 10824, 297, 263, 2560, 1234, 21106, 29879, 29958, 2, 3148, 1001, 29901, 1033, 366, 2367, 592, 278, 330, 391, 310, 920, 372, 1033, 2466, 29973, 29871, 319, 1799, 9047, 13566, 29901, 18585, 29889, 450, 6996, 2969, 723, 367, 304, 671, 21531, 20159, 304, 27599, 278, 3829, 310, 278, 341, 
1475, 705, 11356, 3748, 7613, 322, 8161, 278, 14354, 310, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 910, 1033, 25135, 15783, 278, 3748, 7613, 408, 263, 25002, 2913, 322, 773, 25002, 15939, 1934, 1316, 408, 3632, 327, 2270, 6471, 322, 350, 9890, 3694, 304, 10115, 2472, 1048, 278, 4423, 310, 29086, 373, 278, 7613, 29889, 2398, 29892, 408, 306, 5276, 1434, 29892, 16049, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29892, 577, 372, 29915, 29879, 451, 1554, 393, 508, 367, 10824, 297, 263, 2560, 1234, 21106, 29879, 29958, 2, 3148, 1001, 29901, 2367, 592, 263, 2702, 1342, 29871, 319, 1799, 9047, 13566, 29901, 1094, 263, 2702, 1342, 29892, 7755, 591, 505, 263, 341, 1475, 705, 11356, 3748, 7613, 411, 278, 1494, 5285, 29901, 13, 28956, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 28956, 13, 797, 445, 1206, 29892, 278, 3748, 7613, 508, 367, 9875, 408, 263, 25002, 2913, 29892, 988, 1269, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29889, 450, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 5293, 21531, 20159, 29892, 591, 508, 27599, 278, 3829, 310, 445, 2913, 322, 8161, 278, 14354, 310, 278, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 13, 13, 2831, 2777, 29892, 697, 982, 304, 437, 445, 338, 304, 10272, 278, 3632, 327, 2270, 6471, 310, 278, 2913, 29889, 15089, 327, 2270, 6471, 526, 263, 982, 310, 20766, 278, 26532, 297, 263, 2913, 29892, 322, 896, 508, 367, 1304, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 341, 1475, 705, 11356, 7613, 29889, 512, 278, 1206, 310, 278, 7613, 
2038, 29892, 591, 508, 10272, 278, 937, 3632, 327, 2270, 2318, 310, 278, 2913, 29892, 607, 16612, 278, 1353, 322, 376, 12181, 29908, 310, 278, 26532, 297, 278, 2913, 29889, 512, 445, 1206, 29892, 278, 937, 3632, 327, 2270, 2318, 723, 2649, 502, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29889, 13, 13, 29933, 1463, 373, 445, 2472, 29892, 591, 508, 10115, 393, 278, 29086, 297, 278, 3748, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29892, 408, 4318, 2400, 29901, 13, 28956, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 29871, 29896, 29871, 341, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 341, 259, 29896, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 28956, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1342, 310, 920, 21531, 20159, 1033, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 526, 1784, 916, 1950, 13501, 29892, 322, 278, 2702, 4902, 310, 920, 304, 2334, 963, 723, 8839, 373, 278, 2702, 3748, 7613, 322, 278, 25002, 15939, 1934, 1641, 1304, 29889, 1094, 306, 5276, 1434, 29892, 16049, 263, 1650, 763, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 21106, 29879, 29958, 2, 3148, 1001, 29901, 10241, 474, 471, 4856, 411, 263, 6483, 8004, 310, 1716, 29892, 1033, 366, 5649, 278, 1021, 6964, 304, 592, 411, 393, 11833, 29973, 29871, 319, 1799, 9047, 13566, 29901, 18585, 29889, 17090, 366, 505, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29892, 1244, 338, 920, 366, 1795, 671, 21531, 20159, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29901, 13, 13, 29896, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 7806, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 508, 367, 2714, 310, 
408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 13, 29906, 29889, 4803, 25002, 15939, 1934, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1670, 526, 1784, 1950, 25002, 15939, 1934, 393, 1033, 367, 1304, 363, 445, 29892, 1316, 408, 3632, 327, 2270, 6471, 29892, 350, 9890, 3694, 29892, 470, 28152, 3632, 3002, 29889, 13, 29941, 29889, 4803, 278, 2472, 10115, 1127, 515, 278, 25002, 15939, 1934, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1152, 1342, 29892, 565, 278, 937, 3632, 327, 2270, 2318, 310, 278, 2913, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 591, 508, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29946, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 508, 671, 445, 2472, 304, 4772, 963, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 773, 21531, 20159, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 3748, 7613, 322, 278, 25002, 15939, 1934, 1641, 1304, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 21531, 20159, 508, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 21106, 29879, 29958, 2, 3148, 1001, 29901, 920, 723, 474, 437, 372, 411, 28152, 3632, 3002, 470, 738, 310, 278, 916, 25002, 15939, 1934, 29973, 29871, 319, 1799, 9047, 13566, 29901, 1763, 671, 28152, 3632, 3002, 470, 738, 310, 278, 916, 25002, 15939, 1934, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29892, 366, 723, 1101, 278, 1021, 6996, 6576, 408, 5439, 2038, 29892, 541, 411, 777, 12651, 297, 278, 2702, 4902, 29889, 13, 13, 2831, 2777, 29892, 1244, 338, 920, 366, 1795, 671, 28152, 3632, 3002, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29901, 13, 13, 29896, 29889, 16314, 
278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 7806, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 13, 29906, 29889, 11796, 29872, 278, 28152, 3632, 3002, 310, 278, 2913, 29889, 9034, 9696, 3632, 3002, 338, 263, 982, 310, 20766, 278, 26532, 297, 263, 2913, 975, 931, 29892, 322, 372, 508, 367, 1304, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 341, 1475, 705, 11356, 7613, 29889, 13, 29941, 29889, 4803, 278, 2472, 515, 278, 28152, 3632, 3002, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1152, 1342, 29892, 565, 278, 28152, 3632, 3002, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 591, 508, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29946, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 508, 671, 445, 2472, 304, 4772, 963, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 773, 28152, 3632, 3002, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 3748, 7613, 322, 278, 2702, 4902, 310, 278, 28152, 3632, 3002, 16287, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 28152, 3632, 3002, 508, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 21106, 29879, 29958, 2, 3148, 1001, 29901, 920, 1033, 474, 2334, 445, 297, 3017, 29973, 29871, 319, 1799, 9047, 13566, 29901, 1763, 2334, 263, 341, 1475, 705, 11356, 899, 369, 393, 3913, 21531, 20159, 297, 5132, 29892, 366, 723, 817, 304, 437, 278, 1494, 29901, 13, 13, 29896, 29889, 16052, 263, 5132, 3577, 363, 21531, 20159, 29889, 1670, 526, 3196, 3987, 3625, 29892, 1316, 408, 402, 566, 2918, 29892, 
360, 291, 952, 375, 29892, 470, 4560, 7354, 29899, 29873, 1388, 29889, 4525, 9741, 3867, 14009, 322, 848, 12286, 363, 20602, 25002, 15939, 1934, 29892, 1316, 408, 3632, 327, 2270, 6471, 29892, 350, 9890, 3694, 29892, 470, 28152, 3632, 3002, 29889, 13, 29906, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 910, 1033, 367, 2309, 773, 263, 848, 3829, 4944, 491, 278, 21531, 20159, 3577, 29892, 1316, 408, 263, 3053, 506, 616, 4280, 470, 263, 478, 2035, 29367, 29899, 29934, 4512, 4280, 29889, 7806, 6862, 373, 278, 7613, 723, 3928, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 723, 367, 9875, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29889, 13, 29941, 29889, 11796, 29872, 278, 25002, 15939, 1934, 310, 278, 2913, 29889, 910, 1033, 367, 2309, 773, 14009, 4944, 491, 278, 21531, 20159, 3577, 29889, 1152, 1342, 29892, 366, 1033, 10272, 278, 28152, 3632, 3002, 310, 278, 2913, 773, 278, 28152, 3632, 3002, 5687, 29889, 13, 29946, 29889, 4803, 278, 2472, 515, 278, 25002, 15939, 1934, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 910, 1033, 367, 2309, 773, 263, 10296, 310, 19475, 24481, 322, 8720, 29889, 1152, 1342, 29892, 565, 278, 28152, 3632, 3002, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 366, 1033, 671, 263, 8424, 310, 775, 304, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29945, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 1033, 671, 263, 8424, 310, 775, 304, 6336, 2828, 373, 278, 9109, 25256, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 16049, 263, 341, 1475, 705, 11356, 899, 369, 393, 3913, 21531, 20159, 297, 5132, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 4902, 310, 278, 5314, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 
2969, 310, 920, 304, 679, 4687, 411, 1316, 263, 2060, 21106, 29879, 29958, 2], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1], "labels": [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 739, 29915, 29879, 1950, 304, 671, 21531, 20159, 304, 4505, 341, 1475, 705, 11356, 29892, 541, 372, 723, 5517, 367, 3755, 4280, 29889, 838, 5869, 293, 20159, 338, 263, 5443, 310, 23964, 393, 11898, 278, 4426, 310, 8162, 393, 526, 21634, 1090, 9126, 316, 689, 800, 29892, 1316, 408, 16116, 292, 322, 289, 2548, 29889, 512, 278, 3030, 310, 341, 1475, 705, 11356, 29892, 445, 1033, 367, 1304, 304, 27599, 278, 3829, 310, 278, 3748, 7613, 322, 8161, 278, 14354, 310, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 2398, 29892, 16049, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29889, 739, 29915, 29879, 451, 1554, 393, 508, 367, 10824, 297, 263, 2560, 1234, 21106, 29879, 29958, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 18585, 29889, 450, 6996, 2969, 723, 367, 304, 671, 21531, 20159, 304, 27599, 278, 3829, 310, 278, 341, 1475, 705, 11356, 3748, 7613, 322, 8161, 278, 14354, 310, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 910, 1033, 25135, 15783, 278, 3748, 7613, 408, 263, 25002, 2913, 322, 773, 25002, 15939, 1934, 1316, 408, 3632, 327, 2270, 6471, 322, 350, 9890, 3694, 304, 10115, 2472, 1048, 278, 4423, 310, 29086, 373, 278, 7613, 29889, 2398, 29892, 408, 306, 5276, 1434, 29892, 16049, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29892, 577, 372, 29915, 29879, 451, 1554, 393, 508, 367, 10824, 297, 263, 2560, 1234, 21106, 29879, 29958, 2, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, 1094, 263, 2702, 1342, 29892, 7755, 591, 505, 263, 341, 1475, 705, 11356, 3748, 7613, 411, 278, 1494, 5285, 29901, 13, 28956, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 28956, 13, 797, 445, 1206, 29892, 278, 3748, 7613, 508, 367, 9875, 408, 263, 25002, 2913, 29892, 988, 1269, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29889, 450, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 5293, 21531, 20159, 29892, 591, 508, 27599, 278, 3829, 310, 445, 2913, 322, 8161, 278, 14354, 310, 278, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 13, 13, 2831, 2777, 29892, 697, 982, 304, 437, 445, 338, 304, 10272, 278, 3632, 327, 2270, 6471, 310, 278, 2913, 29889, 15089, 327, 2270, 6471, 526, 263, 982, 310, 20766, 278, 26532, 297, 263, 2913, 29892, 322, 896, 508, 367, 1304, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 341, 1475, 705, 11356, 7613, 29889, 512, 278, 1206, 310, 278, 7613, 2038, 29892, 591, 508, 10272, 278, 937, 3632, 327, 2270, 2318, 310, 278, 2913, 29892, 607, 16612, 278, 1353, 322, 376, 12181, 29908, 310, 278, 26532, 297, 278, 2913, 29889, 512, 445, 1206, 29892, 278, 937, 3632, 327, 2270, 2318, 723, 2649, 502, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29889, 13, 13, 29933, 1463, 373, 445, 2472, 29892, 591, 508, 10115, 393, 278, 29086, 297, 278, 3748, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29892, 408, 4318, 2400, 29901, 13, 28956, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 29871, 29896, 29871, 341, 29871, 1577, 29871, 1577, 
259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 341, 259, 29896, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 28956, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1342, 310, 920, 21531, 20159, 1033, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 526, 1784, 916, 1950, 13501, 29892, 322, 278, 2702, 4902, 310, 920, 304, 2334, 963, 723, 8839, 373, 278, 2702, 3748, 7613, 322, 278, 25002, 15939, 1934, 1641, 1304, 29889, 1094, 306, 5276, 1434, 29892, 16049, 263, 1650, 763, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 21106, 29879, 29958, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 18585, 29889, 17090, 366, 505, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29892, 1244, 338, 920, 366, 1795, 671, 21531, 20159, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29901, 13, 13, 29896, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 7806, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 13, 29906, 29889, 4803, 25002, 15939, 1934, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1670, 526, 1784, 1950, 25002, 15939, 1934, 393, 1033, 367, 1304, 363, 445, 29892, 1316, 408, 3632, 327, 2270, 6471, 29892, 350, 9890, 3694, 29892, 470, 28152, 3632, 3002, 29889, 13, 29941, 29889, 4803, 278, 2472, 10115, 1127, 515, 278, 25002, 15939, 1934, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1152, 1342, 29892, 565, 278, 937, 3632, 327, 2270, 2318, 310, 278, 2913, 14088, 393, 727, 
526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 591, 508, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29946, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 508, 671, 445, 2472, 304, 4772, 963, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 773, 21531, 20159, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 3748, 7613, 322, 278, 25002, 15939, 1934, 1641, 1304, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 21531, 20159, 508, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 21106, 29879, 29958, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 1763, 671, 28152, 3632, 3002, 470, 738, 310, 278, 916, 25002, 15939, 1934, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29892, 366, 723, 1101, 278, 1021, 6996, 6576, 408, 5439, 2038, 29892, 541, 411, 777, 12651, 297, 278, 2702, 4902, 29889, 13, 13, 2831, 2777, 29892, 1244, 338, 920, 366, 1795, 671, 28152, 3632, 3002, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29901, 13, 13, 29896, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 7806, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 13, 29906, 29889, 11796, 29872, 278, 28152, 3632, 3002, 310, 278, 2913, 29889, 9034, 9696, 3632, 3002, 338, 263, 982, 310, 20766, 278, 26532, 297, 263, 2913, 975, 931, 29892, 322, 372, 508, 367, 1304, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 341, 1475, 705, 11356, 7613, 29889, 13, 29941, 29889, 4803, 
278, 2472, 515, 278, 28152, 3632, 3002, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1152, 1342, 29892, 565, 278, 28152, 3632, 3002, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 591, 508, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29946, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 508, 671, 445, 2472, 304, 4772, 963, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 773, 28152, 3632, 3002, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 3748, 7613, 322, 278, 2702, 4902, 310, 278, 28152, 3632, 3002, 16287, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 28152, 3632, 3002, 508, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 21106, 29879, 29958, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 1763, 2334, 263, 341, 1475, 705, 11356, 899, 369, 393, 3913, 21531, 20159, 297, 5132, 29892, 366, 723, 817, 304, 437, 278, 1494, 29901, 13, 13, 29896, 29889, 16052, 263, 5132, 3577, 363, 21531, 20159, 29889, 1670, 526, 3196, 3987, 3625, 29892, 1316, 408, 402, 566, 2918, 29892, 360, 291, 952, 375, 29892, 470, 4560, 7354, 29899, 29873, 1388, 29889, 4525, 9741, 3867, 14009, 322, 848, 12286, 363, 20602, 25002, 15939, 1934, 29892, 1316, 408, 3632, 327, 2270, 6471, 29892, 350, 9890, 3694, 29892, 470, 28152, 3632, 3002, 29889, 13, 29906, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 910, 1033, 367, 2309, 773, 263, 848, 3829, 4944, 491, 278, 21531, 20159, 3577, 29892, 1316, 408, 263, 3053, 506, 616, 4280, 470, 263, 478, 2035, 29367, 29899, 29934, 4512, 4280, 29889, 7806, 6862, 373, 278, 7613, 723, 3928, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 
7613, 723, 367, 9875, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29889, 13, 29941, 29889, 11796, 29872, 278, 25002, 15939, 1934, 310, 278, 2913, 29889, 910, 1033, 367, 2309, 773, 14009, 4944, 491, 278, 21531, 20159, 3577, 29889, 1152, 1342, 29892, 366, 1033, 10272, 278, 28152, 3632, 3002, 310, 278, 2913, 773, 278, 28152, 3632, 3002, 5687, 29889, 13, 29946, 29889, 4803, 278, 2472, 515, 278, 25002, 15939, 1934, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 910, 1033, 367, 2309, 773, 263, 10296, 310, 19475, 24481, 322, 8720, 29889, 1152, 1342, 29892, 565, 278, 28152, 3632, 3002, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 366, 1033, 671, 263, 8424, 310, 775, 304, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29945, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 1033, 671, 263, 8424, 310, 775, 304, 6336, 2828, 373, 278, 9109, 25256, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 16049, 263, 341, 1475, 705, 11356, 899, 369, 393, 3913, 21531, 20159, 297, 5132, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 4902, 310, 278, 5314, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 304, 679, 4687, 411, 1316, 263, 2060, 21106, 29879, 29958, 2]} ================================================ FILE: tests/fixtures/conversation.tokenized_llama2chat.json ================================================ {"input_ids": [1, 518, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 29892, 3390, 1319, 322, 15993, 20255, 29889, 29849, 1234, 408, 1371, 3730, 408, 1950, 29892, 1550, 1641, 9109, 29889, 3575, 6089, 881, 451, 3160, 738, 10311, 1319, 29892, 443, 621, 936, 29892, 11021, 391, 29892, 7916, 391, 29892, 304, 27375, 29892, 18215, 29892, 470, 27302, 2793, 29889, 3529, 9801, 393, 
596, 20890, 526, 5374, 635, 443, 5365, 1463, 322, 6374, 297, 5469, 29889, 13, 13, 3644, 263, 1139, 947, 451, 1207, 738, 4060, 29892, 470, 338, 451, 2114, 1474, 16165, 261, 296, 29892, 5649, 2020, 2012, 310, 22862, 1554, 451, 1959, 29889, 960, 366, 1016, 29915, 29873, 1073, 278, 1234, 304, 263, 1139, 29892, 3113, 1016, 29915, 29873, 6232, 2089, 2472, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 3525, 1033, 474, 2334, 263, 29086, 705, 11356, 5687, 393, 3667, 4637, 21531, 20159, 304, 4505, 1045, 3163, 29973, 518, 29914, 25580, 29962, 739, 29915, 29879, 1950, 304, 671, 21531, 20159, 304, 4505, 341, 1475, 705, 11356, 29892, 541, 372, 723, 5517, 367, 3755, 4280, 29889, 838, 5869, 293, 20159, 338, 263, 5443, 310, 23964, 393, 11898, 278, 4426, 310, 8162, 393, 526, 21634, 1090, 9126, 316, 689, 800, 29892, 1316, 408, 16116, 292, 322, 289, 2548, 29889, 512, 278, 3030, 310, 341, 1475, 705, 11356, 29892, 445, 1033, 367, 1304, 304, 27599, 278, 3829, 310, 278, 3748, 7613, 322, 8161, 278, 14354, 310, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 2398, 29892, 16049, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29889, 739, 29915, 29879, 451, 1554, 393, 508, 367, 10824, 297, 263, 2560, 1234, 29889, 29871, 2, 1, 518, 25580, 29962, 1033, 366, 2367, 592, 278, 330, 391, 310, 920, 372, 1033, 2466, 29973, 518, 29914, 25580, 29962, 18585, 29889, 450, 6996, 2969, 723, 367, 304, 671, 21531, 20159, 304, 27599, 278, 3829, 310, 278, 341, 1475, 705, 11356, 3748, 7613, 322, 8161, 278, 14354, 310, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 910, 1033, 25135, 15783, 278, 3748, 7613, 408, 263, 25002, 2913, 322, 773, 25002, 15939, 1934, 1316, 408, 3632, 327, 2270, 6471, 322, 350, 9890, 3694, 304, 10115, 2472, 1048, 278, 4423, 310, 29086, 373, 278, 7613, 29889, 2398, 29892, 408, 306, 5276, 1434, 29892, 16049, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29892, 577, 372, 
29915, 29879, 451, 1554, 393, 508, 367, 10824, 297, 263, 2560, 1234, 29889, 29871, 2, 1, 518, 25580, 29962, 2367, 592, 263, 2702, 1342, 518, 29914, 25580, 29962, 1094, 263, 2702, 1342, 29892, 7755, 591, 505, 263, 341, 1475, 705, 11356, 3748, 7613, 411, 278, 1494, 5285, 29901, 13, 28956, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 28956, 13, 797, 445, 1206, 29892, 278, 3748, 7613, 508, 367, 9875, 408, 263, 25002, 2913, 29892, 988, 1269, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29889, 450, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 5293, 21531, 20159, 29892, 591, 508, 27599, 278, 3829, 310, 445, 2913, 322, 8161, 278, 14354, 310, 278, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 13, 13, 2831, 2777, 29892, 697, 982, 304, 437, 445, 338, 304, 10272, 278, 3632, 327, 2270, 6471, 310, 278, 2913, 29889, 15089, 327, 2270, 6471, 526, 263, 982, 310, 20766, 278, 26532, 297, 263, 2913, 29892, 322, 896, 508, 367, 1304, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 341, 1475, 705, 11356, 7613, 29889, 512, 278, 1206, 310, 278, 7613, 2038, 29892, 591, 508, 10272, 278, 937, 3632, 327, 2270, 2318, 310, 278, 2913, 29892, 607, 16612, 278, 1353, 322, 376, 12181, 29908, 310, 278, 26532, 297, 278, 2913, 29889, 512, 445, 1206, 29892, 278, 937, 3632, 327, 2270, 2318, 723, 2649, 502, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29889, 13, 13, 29933, 1463, 373, 445, 2472, 29892, 591, 508, 10115, 393, 278, 29086, 297, 278, 3748, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29892, 408, 4318, 2400, 29901, 13, 
28956, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 29871, 29896, 29871, 341, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 341, 259, 29896, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 28956, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1342, 310, 920, 21531, 20159, 1033, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 526, 1784, 916, 1950, 13501, 29892, 322, 278, 2702, 4902, 310, 920, 304, 2334, 963, 723, 8839, 373, 278, 2702, 3748, 7613, 322, 278, 25002, 15939, 1934, 1641, 1304, 29889, 1094, 306, 5276, 1434, 29892, 16049, 263, 1650, 763, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29889, 29871, 2, 1, 518, 25580, 29962, 10241, 474, 471, 4856, 411, 263, 6483, 8004, 310, 1716, 29892, 1033, 366, 5649, 278, 1021, 6964, 304, 592, 411, 393, 11833, 29973, 518, 29914, 25580, 29962, 18585, 29889, 17090, 366, 505, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29892, 1244, 338, 920, 366, 1795, 671, 21531, 20159, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29901, 13, 13, 29896, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 7806, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 13, 29906, 29889, 4803, 25002, 15939, 1934, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1670, 526, 1784, 1950, 25002, 15939, 1934, 393, 1033, 367, 1304, 363, 445, 29892, 1316, 408, 3632, 327, 2270, 6471, 29892, 350, 9890, 3694, 29892, 470, 28152, 3632, 3002, 29889, 13, 29941, 29889, 4803, 278, 2472, 10115, 1127, 515, 278, 25002, 15939, 1934, 304, 8161, 278, 14354, 310, 278, 29086, 
373, 278, 7613, 29889, 1152, 1342, 29892, 565, 278, 937, 3632, 327, 2270, 2318, 310, 278, 2913, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 591, 508, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29946, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 508, 671, 445, 2472, 304, 4772, 963, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 773, 21531, 20159, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 3748, 7613, 322, 278, 25002, 15939, 1934, 1641, 1304, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 21531, 20159, 508, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 29889, 29871, 2, 1, 518, 25580, 29962, 920, 723, 474, 437, 372, 411, 28152, 3632, 3002, 470, 738, 310, 278, 916, 25002, 15939, 1934, 29973, 518, 29914, 25580, 29962, 1763, 671, 28152, 3632, 3002, 470, 738, 310, 278, 916, 25002, 15939, 1934, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29892, 366, 723, 1101, 278, 1021, 6996, 6576, 408, 5439, 2038, 29892, 541, 411, 777, 12651, 297, 278, 2702, 4902, 29889, 13, 13, 2831, 2777, 29892, 1244, 338, 920, 366, 1795, 671, 28152, 3632, 3002, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29901, 13, 13, 29896, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 7806, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 13, 29906, 29889, 11796, 29872, 278, 28152, 3632, 3002, 310, 278, 2913, 29889, 9034, 9696, 3632, 3002, 338, 263, 982, 310, 20766, 278, 26532, 297, 263, 2913, 975, 931, 29892, 322, 372, 508, 367, 1304, 304, 10115, 2472, 1048, 278, 
14354, 310, 278, 29086, 373, 278, 341, 1475, 705, 11356, 7613, 29889, 13, 29941, 29889, 4803, 278, 2472, 515, 278, 28152, 3632, 3002, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1152, 1342, 29892, 565, 278, 28152, 3632, 3002, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 591, 508, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29946, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 508, 671, 445, 2472, 304, 4772, 963, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 773, 28152, 3632, 3002, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 3748, 7613, 322, 278, 2702, 4902, 310, 278, 28152, 3632, 3002, 16287, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 28152, 3632, 3002, 508, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 29889, 29871, 2, 1, 518, 25580, 29962, 920, 1033, 474, 2334, 445, 297, 3017, 29973, 518, 29914, 25580, 29962, 1763, 2334, 263, 341, 1475, 705, 11356, 899, 369, 393, 3913, 21531, 20159, 297, 5132, 29892, 366, 723, 817, 304, 437, 278, 1494, 29901, 13, 13, 29896, 29889, 16052, 263, 5132, 3577, 363, 21531, 20159, 29889, 1670, 526, 3196, 3987, 3625, 29892, 1316, 408, 402, 566, 2918, 29892, 360, 291, 952, 375, 29892, 470, 4560, 7354, 29899, 29873, 1388, 29889, 4525, 9741, 3867, 14009, 322, 848, 12286, 363, 20602, 25002, 15939, 1934, 29892, 1316, 408, 3632, 327, 2270, 6471, 29892, 350, 9890, 3694, 29892, 470, 28152, 3632, 3002, 29889, 13, 29906, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 910, 1033, 367, 2309, 773, 263, 848, 3829, 4944, 491, 278, 21531, 20159, 3577, 29892, 1316, 408, 263, 3053, 506, 616, 4280, 470, 263, 478, 2035, 29367, 29899, 29934, 4512, 4280, 29889, 7806, 6862, 373, 278, 7613, 
723, 3928, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 723, 367, 9875, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29889, 13, 29941, 29889, 11796, 29872, 278, 25002, 15939, 1934, 310, 278, 2913, 29889, 910, 1033, 367, 2309, 773, 14009, 4944, 491, 278, 21531, 20159, 3577, 29889, 1152, 1342, 29892, 366, 1033, 10272, 278, 28152, 3632, 3002, 310, 278, 2913, 773, 278, 28152, 3632, 3002, 5687, 29889, 13, 29946, 29889, 4803, 278, 2472, 515, 278, 25002, 15939, 1934, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 910, 1033, 367, 2309, 773, 263, 10296, 310, 19475, 24481, 322, 8720, 29889, 1152, 1342, 29892, 565, 278, 28152, 3632, 3002, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 366, 1033, 671, 263, 8424, 310, 775, 304, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29945, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 1033, 671, 263, 8424, 310, 775, 304, 6336, 2828, 373, 278, 9109, 25256, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 16049, 263, 341, 1475, 705, 11356, 899, 369, 393, 3913, 21531, 20159, 297, 5132, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 4902, 310, 278, 5314, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 304, 679, 4687, 411, 1316, 263, 2060, 29889, 29871, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "labels": [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 739, 29915, 29879, 1950, 304, 671, 21531, 20159, 304, 4505, 341, 1475, 705, 11356, 29892, 541, 372, 723, 5517, 367, 3755, 4280, 29889, 838, 5869, 293, 20159, 338, 263, 5443, 310, 23964, 393, 11898, 278, 4426, 310, 8162, 393, 526, 21634, 1090, 9126, 316, 689, 800, 29892, 1316, 408, 16116, 292, 322, 289, 2548, 29889, 512, 278, 3030, 310, 341, 1475, 705, 11356, 29892, 445, 1033, 367, 1304, 304, 27599, 278, 3829, 310, 278, 3748, 7613, 322, 8161, 278, 14354, 310, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 2398, 29892, 16049, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29889, 739, 29915, 29879, 451, 1554, 393, 508, 367, 10824, 297, 263, 2560, 1234, 29889, 29871, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 18585, 29889, 450, 6996, 2969, 723, 367, 304, 671, 21531, 20159, 304, 27599, 278, 3829, 310, 278, 341, 1475, 705, 11356, 3748, 7613, 322, 8161, 278, 14354, 310, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 910, 1033, 25135, 15783, 278, 3748, 7613, 408, 263, 25002, 2913, 322, 773, 25002, 15939, 1934, 1316, 408, 3632, 327, 2270, 6471, 322, 350, 9890, 3694, 304, 10115, 2472, 1048, 278, 4423, 310, 29086, 373, 278, 7613, 29889, 2398, 29892, 408, 306, 5276, 1434, 29892, 16049, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 
278, 3748, 310, 341, 1475, 705, 11356, 29892, 577, 372, 29915, 29879, 451, 1554, 393, 508, 367, 10824, 297, 263, 2560, 1234, 29889, 29871, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 1094, 263, 2702, 1342, 29892, 7755, 591, 505, 263, 341, 1475, 705, 11356, 3748, 7613, 411, 278, 1494, 5285, 29901, 13, 28956, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 28956, 13, 797, 445, 1206, 29892, 278, 3748, 7613, 508, 367, 9875, 408, 263, 25002, 2913, 29892, 988, 1269, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29889, 450, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 5293, 21531, 20159, 29892, 591, 508, 27599, 278, 3829, 310, 445, 2913, 322, 8161, 278, 14354, 310, 278, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 13, 13, 2831, 2777, 29892, 697, 982, 304, 437, 445, 338, 304, 10272, 278, 3632, 327, 2270, 6471, 310, 278, 2913, 29889, 15089, 327, 2270, 6471, 526, 263, 982, 310, 20766, 278, 26532, 297, 263, 2913, 29892, 322, 896, 508, 367, 1304, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 341, 1475, 705, 11356, 7613, 29889, 512, 278, 1206, 310, 278, 7613, 2038, 29892, 591, 508, 10272, 278, 937, 3632, 327, 2270, 2318, 310, 278, 2913, 29892, 607, 16612, 278, 1353, 322, 376, 12181, 29908, 310, 278, 26532, 297, 278, 2913, 29889, 512, 445, 1206, 29892, 278, 937, 3632, 327, 2270, 2318, 723, 2649, 502, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29889, 13, 13, 29933, 1463, 373, 445, 2472, 29892, 591, 508, 10115, 393, 278, 29086, 297, 278, 3748, 526, 5982, 297, 278, 3023, 
26995, 310, 278, 7613, 29892, 408, 4318, 2400, 29901, 13, 28956, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 29871, 29896, 29871, 341, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 341, 259, 29896, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 28956, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1342, 310, 920, 21531, 20159, 1033, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 526, 1784, 916, 1950, 13501, 29892, 322, 278, 2702, 4902, 310, 920, 304, 2334, 963, 723, 8839, 373, 278, 2702, 3748, 7613, 322, 278, 25002, 15939, 1934, 1641, 1304, 29889, 1094, 306, 5276, 1434, 29892, 16049, 263, 1650, 763, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29889, 29871, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 18585, 29889, 17090, 366, 505, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29892, 1244, 338, 920, 366, 1795, 671, 21531, 20159, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29901, 13, 13, 29896, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 7806, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 13, 29906, 29889, 4803, 25002, 15939, 1934, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1670, 526, 1784, 1950, 25002, 15939, 1934, 393, 1033, 367, 1304, 363, 445, 29892, 1316, 408, 3632, 327, 2270, 6471, 29892, 350, 9890, 3694, 29892, 470, 28152, 3632, 3002, 29889, 13, 29941, 29889, 4803, 278, 2472, 10115, 1127, 515, 
278, 25002, 15939, 1934, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1152, 1342, 29892, 565, 278, 937, 3632, 327, 2270, 2318, 310, 278, 2913, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 591, 508, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29946, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 508, 671, 445, 2472, 304, 4772, 963, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 773, 21531, 20159, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 3748, 7613, 322, 278, 25002, 15939, 1934, 1641, 1304, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 21531, 20159, 508, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 29889, 29871, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 1763, 671, 28152, 3632, 3002, 470, 738, 310, 278, 916, 25002, 15939, 1934, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29892, 366, 723, 1101, 278, 1021, 6996, 6576, 408, 5439, 2038, 29892, 541, 411, 777, 12651, 297, 278, 2702, 4902, 29889, 13, 13, 2831, 2777, 29892, 1244, 338, 920, 366, 1795, 671, 28152, 3632, 3002, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29901, 13, 13, 29896, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 7806, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 13, 29906, 29889, 11796, 29872, 278, 28152, 3632, 3002, 310, 278, 2913, 29889, 9034, 9696, 3632, 3002, 338, 263, 982, 310, 20766, 278, 26532, 297, 263, 2913, 975, 
931, 29892, 322, 372, 508, 367, 1304, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 341, 1475, 705, 11356, 7613, 29889, 13, 29941, 29889, 4803, 278, 2472, 515, 278, 28152, 3632, 3002, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1152, 1342, 29892, 565, 278, 28152, 3632, 3002, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 591, 508, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29946, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 508, 671, 445, 2472, 304, 4772, 963, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 773, 28152, 3632, 3002, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 3748, 7613, 322, 278, 2702, 4902, 310, 278, 28152, 3632, 3002, 16287, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 28152, 3632, 3002, 508, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 29889, 29871, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 1763, 2334, 263, 341, 1475, 705, 11356, 899, 369, 393, 3913, 21531, 20159, 297, 5132, 29892, 366, 723, 817, 304, 437, 278, 1494, 29901, 13, 13, 29896, 29889, 16052, 263, 5132, 3577, 363, 21531, 20159, 29889, 1670, 526, 3196, 3987, 3625, 29892, 1316, 408, 402, 566, 2918, 29892, 360, 291, 952, 375, 29892, 470, 4560, 7354, 29899, 29873, 1388, 29889, 4525, 9741, 3867, 14009, 322, 848, 12286, 363, 20602, 25002, 15939, 1934, 29892, 1316, 408, 3632, 327, 2270, 6471, 29892, 350, 9890, 3694, 29892, 470, 28152, 3632, 3002, 29889, 13, 29906, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 910, 1033, 367, 2309, 773, 263, 848, 3829, 4944, 491, 278, 21531, 20159, 3577, 29892, 1316, 408, 263, 3053, 506, 616, 4280, 470, 263, 478, 2035, 
29367, 29899, 29934, 4512, 4280, 29889, 7806, 6862, 373, 278, 7613, 723, 3928, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 723, 367, 9875, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29889, 13, 29941, 29889, 11796, 29872, 278, 25002, 15939, 1934, 310, 278, 2913, 29889, 910, 1033, 367, 2309, 773, 14009, 4944, 491, 278, 21531, 20159, 3577, 29889, 1152, 1342, 29892, 366, 1033, 10272, 278, 28152, 3632, 3002, 310, 278, 2913, 773, 278, 28152, 3632, 3002, 5687, 29889, 13, 29946, 29889, 4803, 278, 2472, 515, 278, 25002, 15939, 1934, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 910, 1033, 367, 2309, 773, 263, 10296, 310, 19475, 24481, 322, 8720, 29889, 1152, 1342, 29892, 565, 278, 28152, 3632, 3002, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 366, 1033, 671, 263, 8424, 310, 775, 304, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29945, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 1033, 671, 263, 8424, 310, 775, 304, 6336, 2828, 373, 278, 9109, 25256, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 16049, 263, 341, 1475, 705, 11356, 899, 369, 393, 3913, 21531, 20159, 297, 5132, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 4902, 310, 278, 5314, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 304, 679, 4687, 411, 1316, 263, 2060, 29889, 29871, 2, 1, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], "attention_mask": [true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, 
true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, 
true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, 
true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, 
true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, 
true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, 
true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, 
true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, 
false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, 
false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, 
false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, 
false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, 
false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, 
false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, 
"""
test utils for helpers and decorators
"""

import os
from contextlib import contextmanager
from functools import wraps

_HF_OFFLINE_ENV = "HF_HUB_OFFLINE"

# Spellings of the environment variable that are read back as "offline enabled".
# NOTE(review): chosen to match the truthy spellings huggingface_hub is believed
# to accept for boolean environment variables - confirm against the installed
# version.
_TRUTHY_ENV_VALUES = frozenset({"1", "ON", "YES", "TRUE"})


def reload_modules(hf_hub_offline):
    """
    Reload ``huggingface_hub.constants`` and ``datasets.config`` and force their
    ``HF_HUB_OFFLINE`` attribute to the given value.

    :param hf_hub_offline: value assigned to ``HF_HUB_OFFLINE`` on both modules
    :return: None
    """
    # Force reload of the modules that check this variable
    import importlib

    import datasets
    import huggingface_hub.constants

    # from huggingface_hub.utils import reset_sessions

    # Reload the constants module first, as others depend on it
    importlib.reload(huggingface_hub.constants)
    huggingface_hub.constants.HF_HUB_OFFLINE = hf_hub_offline
    importlib.reload(datasets.config)
    datasets.config.HF_HUB_OFFLINE = hf_hub_offline


def _offline_flag(value):
    """
    Interpret an ``HF_HUB_OFFLINE`` value as a boolean.

    Strings are compared case-insensitively against ``_TRUTHY_ENV_VALUES`` so
    that values such as ``"0"`` or ``"false"`` are *not* treated as enabled
    (a plain ``bool("0")`` would be ``True``). Any other object falls back to
    its ordinary truthiness, so ``True``/``False``/``None`` behave as expected.

    :param value: a string read from the environment, a bool, or None
    :return: True when the value means "offline", False otherwise
    """
    if isinstance(value, str):
        return value.strip().upper() in _TRUTHY_ENV_VALUES
    return bool(value)


@contextmanager
def _hf_offline_override(env_value, offline):
    """
    Temporarily set ``HF_HUB_OFFLINE`` and reload the dependent modules.

    :param env_value: string written to the environment variable
    :param offline: boolean handed to ``reload_modules`` while overridden
    :return: A context manager.
    """
    # Save the original value of HF_HUB_OFFLINE environment variable
    previous = os.environ.get(_HF_OFFLINE_ENV)
    os.environ[_HF_OFFLINE_ENV] = env_value
    reload_modules(offline)
    try:
        yield
    finally:
        # Restore the original state even if the wrapped code raised.
        if previous is None:
            # pop() instead of del: the wrapped code may already have removed
            # the variable, and a KeyError here would hide the real failure.
            os.environ.pop(_HF_OFFLINE_ENV, None)
            reload_modules(False)
        else:
            os.environ[_HF_OFFLINE_ENV] = previous
            # Parse the saved string; bool("0") would wrongly report offline.
            reload_modules(_offline_flag(previous))


def enable_hf_offline(test_func):
    """
    test decorator that sets HF_HUB_OFFLINE environment variable to True and
    restores it after the test even if the test fails.

    :param test_func: the callable to run with HF_HUB_OFFLINE set to "1"
    :return: the wrapped callable; it returns whatever ``test_func`` returns
    """

    @wraps(test_func)
    def wrapper(*args, **kwargs):
        with _hf_offline_override("1", True):
            # Run the test function
            return test_func(*args, **kwargs)

    return wrapper


def disable_hf_offline(test_func):
    """
    test decorator that sets HF_HUB_OFFLINE environment variable to False and
    restores it after the wrapped func, even if the wrapped func fails.

    :param test_func: the callable to run with HF_HUB_OFFLINE set to "0"
    :return: the wrapped callable; it returns whatever ``test_func`` returns
    """

    @wraps(test_func)
    def wrapper(*args, **kwargs):
        with _hf_offline_override("0", False):
            # Run the test function
            return test_func(*args, **kwargs)

    return wrapper


@contextmanager
def hf_offline_context(hf_hub_offline):
    """
    Context manager that sets HF_HUB_OFFLINE environment variable to the given
    value and restores the previous state on exit, including when the body of
    the ``with`` statement raises.

    The environment variable receives ``str(hf_hub_offline)`` (so ``True``
    becomes ``"True"``), while the reloaded modules receive the boolean
    interpretation of the value.

    :param hf_hub_offline: The new value for HF_HUB_OFFLINE.
    :return: A context manager.
    """
    with _hf_offline_override(str(hf_hub_offline), _offline_flag(hf_hub_offline)):
        yield
test_forward_process_basic(self, diffusion_trainer_instance): """Test basic forward process without labels.""" input_ids = torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long) noisy_batch, masked_indices, p_mask = ( diffusion_trainer_instance._forward_process(input_ids, eps=0.1) ) # Check shapes assert noisy_batch.shape == input_ids.shape assert masked_indices.shape == input_ids.shape assert p_mask.shape == input_ids.shape # Check that special tokens are not masked special_token_positions = (input_ids == 1) | (input_ids == 2) | (input_ids == 0) assert not masked_indices[special_token_positions].any() # Check that mask token is applied mask_token_id = diffusion_trainer_instance.cfg.diffusion.mask_token_id masked_positions = masked_indices if masked_positions.any(): assert (noisy_batch[masked_positions] == mask_token_id).all() def test_forward_process_with_labels(self, diffusion_trainer_instance): """Test forward process with SFT labels.""" input_ids = torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long) labels = torch.tensor([[-100, -100, 20, 30, 2]], dtype=torch.long) noisy_batch, masked_indices, p_mask = ( diffusion_trainer_instance._forward_process( input_ids, labels=labels, eps=0.1 ) ) # Check shapes assert noisy_batch.shape == input_ids.shape assert masked_indices.shape == input_ids.shape assert p_mask.shape == input_ids.shape # Check that only answer tokens can be masked (where labels != -100) non_answer_mask = labels == -100 # No masking should occur on non-answer tokens assert not masked_indices[non_answer_mask].any() # p_mask should be the same for all positions (sampled timestep), # but masking is only applied to answer tokens assert p_mask.shape == input_ids.shape # Verify that masked_indices respects the answer mask assert not masked_indices[non_answer_mask].any() def test_forward_process_with_attention_mask(self, diffusion_trainer_instance): """Test forward process with attention mask.""" input_ids = torch.tensor([[1, 10, 20, 0]], dtype=torch.long) 
attention_mask = torch.tensor([[1, 1, 1, 0]], dtype=torch.long) _, masked_indices, p_mask = diffusion_trainer_instance._forward_process( input_ids, attention_mask=attention_mask, eps=0.1 ) # Check that padding tokens are not masked padding_positions = attention_mask == 0 assert not masked_indices[padding_positions].any() assert (p_mask[padding_positions] == 0).all() def test_bidirectional_attention_mask_no_packing(self, diffusion_trainer_instance): """Test bidirectional attention mask without sample packing.""" input_ids = torch.tensor([[1, 10, 20, 2]], dtype=torch.long) mask = create_bidirectional_attention_mask(input_ids) # Should be all-to-all attention expected_shape = (1, 1, 4, 4) assert mask.shape == expected_shape assert mask.all() def test_bidirectional_attention_mask_with_packing( self, diffusion_trainer_instance ): """Test bidirectional attention mask with sample packing.""" diffusion_trainer_instance.cfg.sample_packing = True input_ids = torch.tensor([[1, 10, 20, 30, 40, 2]], dtype=torch.long) # Sample IDs: first sample (1), second sample (2) attention_mask = torch.tensor([[1, 1, 1, 2, 2, 2]], dtype=torch.long) mask = create_bidirectional_attention_mask( input_ids, attention_mask, sample_packing=True ) # Check that tokens within same sample can attend to each other # but not across samples assert mask[0, 0, 0, 1].item() # First sample tokens can attend to each other assert mask[0, 0, 1, 2].item() assert not mask[0, 0, 0, 3].item() # Can't attend across samples assert not mask[0, 0, 2, 4].item() assert mask[0, 0, 3, 4].item() # Second sample tokens can attend to each other def test_compute_loss_basic(self, diffusion_trainer_instance): """Test basic loss computation.""" # Mock model that returns logits mock_model = Mock() mock_outputs = Mock() vocab_size = 1000 seq_len = 5 mock_outputs.logits = torch.randn(1, seq_len, vocab_size, requires_grad=True) mock_model.return_value = mock_outputs mock_model.training = True input_ids = torch.tensor([[1, 10, 20, 30, 
2]], dtype=torch.long) loss, outputs = diffusion_trainer_instance._compute_diffusion_loss( mock_model, input_ids ) # Check that loss is computed assert isinstance(loss, torch.Tensor) assert loss.requires_grad assert outputs == mock_outputs # Check that metrics were stored diffusion_trainer_instance.store_metrics.assert_called_once() def test_compute_loss_sft(self, diffusion_trainer_instance): """Test loss computation with SFT labels.""" # Mock model mock_model = Mock() mock_outputs = Mock() vocab_size = 1000 seq_len = 5 mock_outputs.logits = torch.randn(1, seq_len, vocab_size, requires_grad=True) mock_model.return_value = mock_outputs mock_model.training = True diffusion_trainer_instance.cfg.datasets = Mock() input_ids = torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long) labels = torch.tensor([[-100, -100, 20, 30, 2]], dtype=torch.long) loss, _ = diffusion_trainer_instance._compute_diffusion_loss( mock_model, input_ids, labels=labels ) # Check that loss is computed assert isinstance(loss, torch.Tensor) assert loss.requires_grad # Check that SFT metrics were added call_args = diffusion_trainer_instance.store_metrics.call_args[0][0] assert "answer_ratio" in call_args assert "avg_answer_length" in call_args def test_compute_loss_no_masked_tokens(self, diffusion_trainer_instance): """Test loss computation when no tokens are masked.""" # Mock model mock_model = Mock() mock_outputs = Mock() vocab_size = 1000 seq_len = 3 mock_outputs.logits = torch.randn(1, seq_len, vocab_size) mock_model.return_value = mock_outputs mock_model.training = True # Only special tokens (which won't be masked) input_ids = torch.tensor([[1, 0, 2]], dtype=torch.long) loss, _ = diffusion_trainer_instance._compute_diffusion_loss( mock_model, input_ids ) # Loss should be zero when no tokens are masked assert loss.item() == 0.0 assert loss.requires_grad def test_cache_special_token_ids(self, mock_tokenizer): """Test caching of special token IDs.""" trainer = object.__new__(DiffusionTrainer) 
trainer.processing_class = mock_tokenizer trainer._cache_special_token_ids() assert trainer._special_token_ids == {0, 1, 2} def test_cache_special_token_ids_no_tokenizer(self): """Test caching when no tokenizer is available.""" trainer = object.__new__(DiffusionTrainer) trainer.processing_class = None trainer._cache_special_token_ids() assert trainer._special_token_ids == set() def test_main_compute_loss_interface(self, diffusion_trainer_instance): """Test the main compute_loss interface.""" # Mock model mock_model = Mock() mock_outputs = Mock() mock_outputs.logits = torch.randn(1, 5, 1000) mock_model.return_value = mock_outputs mock_model.training = True inputs = { "input_ids": torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long), "attention_mask": torch.tensor([[1, 1, 1, 1, 1]], dtype=torch.long), "labels": torch.tensor([[-100, -100, 20, 30, 2]], dtype=torch.long), } # Test without return_outputs loss = diffusion_trainer_instance.compute_loss(mock_model, inputs) assert isinstance(loss, torch.Tensor) # Test with return_outputs loss, outputs = diffusion_trainer_instance.compute_loss( mock_model, inputs, return_outputs=True ) assert isinstance(loss, torch.Tensor) assert outputs == mock_outputs def test_missing_input_ids_raises_error(self, diffusion_trainer_instance): """Test that missing input_ids raises ValueError.""" mock_model = Mock() inputs = {"attention_mask": torch.tensor([[1, 1, 1]])} with pytest.raises(ValueError, match="input_ids is required"): diffusion_trainer_instance.compute_loss(mock_model, inputs) ================================================ FILE: tests/integrations/test_diffusion_callback.py ================================================ """Tests for diffusion generation callback dataloader selection and triggering.""" from types import SimpleNamespace from unittest.mock import Mock import pytest from axolotl.integrations.diffusion import DiffusionGenerationCallback class DummyTrainer: """Minimal trainer double with required attributes/methods 
@pytest.mark.parametrize("use_eval", [False, True])
def test_callback_uses_correct_dataloader(monkeypatch, use_eval):
    """The callback asks for the eval loader when an eval dataset exists, else train."""
    trainer = DummyTrainer(use_eval=use_eval)
    callback = DiffusionGenerationCallback(trainer)

    seen = {}

    def fake_generate_samples(**kwargs):
        # Record which dataloader the callback handed over
        seen["dataloader"] = kwargs.get("dataloader")
        # One dummy sample so the logging path is exercised as well
        sample = {
            "original": "o",
            "masked": "m",
            "generated": "g",
            "mask_ratio": 0.5,
            "masked_tokens": 1,
            "total_tokens": 2,
        }
        return [sample]

    # Patch generate_samples in the callback module's namespace
    monkeypatch.setattr(
        "axolotl.integrations.diffusion.callbacks.generate_samples",
        fake_generate_samples,
    )

    # Step 1 with generation_interval=1 triggers generation
    callback.on_step_end(
        args=SimpleNamespace(),
        state=SimpleNamespace(global_step=1),
        control=SimpleNamespace(),
    )

    if use_eval:
        expected_request, expected_loader = "eval", trainer._eval_loader
    else:
        expected_request, expected_loader = "train", trainer._train_loader
    assert trainer.requested[0] == expected_request
    assert seen["dataloader"] is expected_loader
class TestChatTemplateStrategyWithKDv2:
    """Test v2 strategy correctly handles target_token_ids"""

    @pytest.fixture
    def v2_strategy(self):
        """Create v2 strategy instance with mocked dependencies"""
        # Prompter stand-in carrying just the attributes configured here
        prompter = Mock(
            roles={"user": "user", "assistant": "assistant"},
            chat_template_msg_variables=["role", "content"],
            chat_template="{{ messages }}",
        )

        # Tokenizer stand-in with fixed special tokens and canned encodings
        tokenizer = Mock(
            pad_token_id=0,
            eos_token_id=2,
            bos_token_id=1,
            eos_token="<|endoftext|>",
            apply_chat_template=Mock(return_value=[1, 10, 20, 30, 2]),
            encode=Mock(return_value=[2]),
        )

        return ChatTemplateStrategyWithKDv2(
            prompter=prompter,
            tokenizer=tokenizer,
            train_on_inputs=False,
            sequence_len=512,
            logprobs_field="logprobs",
            gen_temperature=1.0,
            kd_temperature=1.0,
        )

    def test_v2_prepare_kd_fields_adds_target_token_ids(self, v2_strategy):
        """
        Test that v2's _prepare_kd_fields hook adds target_token_ids.

        Validates the Template Method pattern fix where v2 overrides the hook
        to add target_token_ids before transform.
        """
        token_ids = [1, 10, 20, 30, 2]
        tokenized = {"input_ids": list(token_ids), "labels": list(token_ids)}
        original = {"target_token_ids": [[10, 20], [30, 40]]}

        prepared = v2_strategy._prepare_kd_fields(tokenized, original)

        assert "target_token_ids" in prepared
        assert prepared["target_token_ids"] == [[10, 20], [30, 40]]

    def test_v2_prepare_kd_fields_handles_missing_field(self, v2_strategy):
        """Test hook handles missing target_token_ids gracefully"""
        token_ids = [1, 10, 20, 30, 2]
        tokenized = {"input_ids": list(token_ids), "labels": list(token_ids)}

        prepared = v2_strategy._prepare_kd_fields(tokenized, {})

        assert "target_token_ids" not in prepared

    def test_v2_transform_requires_target_token_ids(self, v2_strategy):
        """
        Test v2's transform fails without target_token_ids.

        Validates the bug fix - transform expects target_token_ids to be added
        by the hook.
        """
        sample_without_targets = {
            "input_ids": [1, 10, 20, 30, 2],
            "labels": [1, 10, 20, 30, 2],
            "logprobs": [[-0.1, -0.2], [-0.3, -0.4]],
        }

        with pytest.raises(KeyError, match="target_token_ids"):
            v2_strategy.transform_logprobs(sample_without_targets)
minimal_liger_cfg): test_cfg = DictDefault( { "liger_swiglu": False, } | minimal_liger_cfg ) with self._caplog.at_level("WARNING", logger="axolotl.integrations.liger.args"): prepare_plugins(test_cfg) updated_cfg = validate_config(test_cfg) # TODO this test is brittle in CI # assert ( # "The 'liger_swiglu' argument is deprecated" # in self._caplog.records[0].message # ) assert updated_cfg.liger_swiglu is None assert updated_cfg.liger_glu_activation is False def test_conflict_swiglu_ligergluactivation(self, minimal_liger_cfg): test_cfg = DictDefault( { "liger_swiglu": False, "liger_glu_activation": True, } | minimal_liger_cfg ) with pytest.raises( ValueError, match=r".*You cannot have both `liger_swiglu` and `liger_glu_activation` set.*", ): prepare_plugins(test_cfg) validate_config(test_cfg) def test_use_token_scaling_require_flce(self, minimal_liger_cfg): test_cfg = DictDefault( { "liger_fused_linear_cross_entropy": False, "liger_use_token_scaling": True, } | minimal_liger_cfg ) with pytest.raises( ValueError, match=r"`liger_use_token_scaling: true` requires `liger_fused_linear_cross_entropy` enabled.", ): prepare_plugins(test_cfg) validate_config(test_cfg) ================================================ FILE: tests/integrations/test_routing_parity.py ================================================ # SPDX-License-Identifier: Apache-2.0 # Copyright (c) Axolotl AI # Licensed under the Apache License, Version 2.0 """ Parity tests between scattermoe-lora and sonicmoe routing implementations. These tests verify that both implementations produce numerically identical results for the same inputs, ensuring safe centralization of the routing code. ScatterMoE returns 2D tensors [T, K]; SonicMoE returns flattened 1D [T*K]. The core algorithm should be identical — only the output format differs. 
""" from types import SimpleNamespace import pytest import torch def _require_triton(): pytest.importorskip("triton") # ============================================================================ # Fixtures / helpers # ============================================================================ def _make_softmax_block(T=8, H=16, E=4, K=2): """Qwen/OLMoE-style block usable by both implementations.""" gate = SimpleNamespace( weight=torch.randn(E, H), top_k=K, num_experts=E, norm_topk_prob=True, ) moe_block = SimpleNamespace(gate=gate) hidden = torch.randn(T, H) return moe_block, gate, hidden, T, H, E, K def _make_sigmoid_block( T=8, H=16, E=16, K=4, n_group=2, topk_group=1, bias_on_gate=True ): """GLM/DeepSeek-style block usable by both implementations.""" if bias_on_gate: gate = SimpleNamespace( weight=torch.randn(E, H), e_score_correction_bias=torch.zeros(E), ) moe_block = SimpleNamespace( gate=gate, top_k=K, n_routed_experts=E, n_group=n_group, topk_group=topk_group, norm_topk_prob=True, routed_scaling_factor=1.0, ) else: # minimax_m2 style: bias on block gate = SimpleNamespace( weight=torch.randn(E, H), top_k=K, ) moe_block = SimpleNamespace( gate=gate, top_k=K, e_score_correction_bias=torch.zeros(E), ) return moe_block, gate, hidden_states(T, H), T, H, E, K def hidden_states(T, H): return torch.randn(T, H) # ============================================================================ # 1. 
class TestSoftmaxRoutingParity:
    """Verify scattermoe and sonicmoe softmax routing produce identical results."""

    @pytest.fixture(autouse=True)
    def _require(self):
        _require_triton()

    @staticmethod
    def _route_both(moe_block, gate, hidden, T, K):
        """Run both routers on the same inputs.

        Returns the raw ScatterMoE 4-tuple plus SonicMoE's scores and expert
        indices reshaped from flat ``[T*K]`` to ``[T, K]``.
        """
        # Imported lazily so the triton skip in ``_require`` runs first
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _softmax_topk_route,
        )
        from axolotl.integrations.kernels.sonicmoe.routing import softmax_topk_routing

        # ScatterMoE path (no LoRA delta)
        scatter_out = _softmax_topk_route(moe_block, gate, hidden, gate.weight, None)
        # SonicMoE path
        sonic_scores, _, sonic_exp_idx, _ = softmax_topk_routing(hidden, moe_block)
        return scatter_out, sonic_scores.reshape(T, K), sonic_exp_idx.reshape(T, K)

    def test_weights_match(self):
        """2D weights from scattermoe == reshaped 1D weights from sonicmoe."""
        moe_block, gate, hidden, T, _, E, K = _make_softmax_block()

        scatter_out, sonic_weights, sonic_experts = self._route_both(
            moe_block, gate, hidden, T, K
        )
        sm_weights, sm_experts, sm_topk, sm_num_experts = scatter_out

        assert sm_topk == K
        assert sm_num_experts == E
        # Same experts selected, same weights assigned
        assert torch.equal(sm_experts, sonic_experts.to(sm_experts.dtype))
        assert torch.allclose(sm_weights, sonic_weights, atol=1e-6)

    def test_logits_not_returned_by_scattermoe(self):
        """ScatterMoE doesn't return logits; SonicMoE does — verify SonicMoE logits shape."""
        from axolotl.integrations.kernels.sonicmoe.routing import softmax_topk_routing

        moe_block, _, hidden, T, _, E, _ = _make_softmax_block()

        routed = softmax_topk_routing(hidden, moe_block)
        logits = routed[3]

        assert logits.shape == (T, E)

    def test_no_renorm(self):
        """With norm_topk_prob=False, both should skip renormalization."""
        moe_block, gate, hidden, T, _, _, K = _make_softmax_block()
        gate.norm_topk_prob = False

        scatter_out, sonic_weights, sonic_experts = self._route_both(
            moe_block, gate, hidden, T, K
        )
        sm_weights, sm_experts = scatter_out[0], scatter_out[1]

        assert torch.equal(sm_experts, sonic_experts.to(sm_experts.dtype))
        assert torch.allclose(sm_weights, sonic_weights, atol=1e-6)

    def test_various_expert_counts(self):
        """Parity across different E and K values."""
        expert_configs = [(2, 1), (8, 2), (16, 4), (32, 8)]

        for E, K in expert_configs:
            moe_block, gate, hidden, T, _, _, _ = _make_softmax_block(E=E, K=K)

            scatter_out, sonic_weights, sonic_experts = self._route_both(
                moe_block, gate, hidden, T, K
            )
            sm_weights, sm_experts = scatter_out[0], scatter_out[1]

            assert torch.equal(sm_experts, sonic_experts.to(sm_experts.dtype)), (
                f"Expert mismatch for E={E}, K={K}"
            )
            assert torch.allclose(sm_weights, sonic_weights, atol=1e-6), (
                f"Weight mismatch for E={E}, K={K}"
            )
class TestSigmoidRoutingParity:
    """Verify scattermoe and sonicmoe sigmoid routing produce identical results."""

    @pytest.fixture(autouse=True)
    def _require(self):
        _require_triton()

    @staticmethod
    def _route_both(moe_block, gate, hidden, T, K):
        """Run both sigmoid routers on the same inputs.

        Returns the raw ScatterMoE 4-tuple plus SonicMoE's scores and expert
        indices reshaped to ``[T, K]``.
        """
        # Imported lazily so the triton skip in ``_require`` runs first
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _sigmoid_topk_route,
        )
        from axolotl.integrations.kernels.sonicmoe.routing import sigmoid_topk_routing

        scatter_out = _sigmoid_topk_route(moe_block, gate, hidden, gate.weight, None)
        sonic_scores, _, sonic_exp_idx, _ = sigmoid_topk_routing(hidden, moe_block)
        return scatter_out, sonic_scores.reshape(T, K), sonic_exp_idx.reshape(T, K)

    @staticmethod
    def _assert_same_selection(sm_weights, sm_experts, sonic_weights, sonic_experts):
        """Compare both routers' picks independent of per-token expert order."""
        # Sort experts within each token to handle different topk orderings
        sm_sorted, sm_order = sm_experts.sort(dim=-1)
        sonic_sorted, sonic_order = sonic_experts.to(sm_experts.dtype).sort(dim=-1)
        assert torch.equal(sm_sorted, sonic_sorted)

        # Gather weights in the same sorted order before comparing
        sm_weights_sorted = sm_weights.gather(1, sm_order)
        sonic_weights_sorted = sonic_weights.gather(1, sonic_order)
        assert torch.allclose(sm_weights_sorted, sonic_weights_sorted, atol=1e-6)

    def _check_parity(self, moe_block, gate, hidden, T, K):
        """Route through both implementations and assert they agree."""
        scatter_out, sonic_weights, sonic_experts = self._route_both(
            moe_block, gate, hidden, T, K
        )
        self._assert_same_selection(
            scatter_out[0], scatter_out[1], sonic_weights, sonic_experts
        )

    def test_weights_match_with_groups(self):
        """Both implementations should produce identical weights with group selection."""
        moe_block, gate, hidden, T, _, E, K = _make_sigmoid_block(
            E=16, K=4, n_group=2, topk_group=1, bias_on_gate=True
        )

        scatter_out, sonic_weights, sonic_experts = self._route_both(
            moe_block, gate, hidden, T, K
        )
        sm_weights, sm_experts, sm_topk, sm_num_experts = scatter_out

        assert sm_topk == K
        assert sm_num_experts == E
        self._assert_same_selection(
            sm_weights, sm_experts, sonic_weights, sonic_experts
        )

    def test_weights_match_no_groups(self):
        """Both implementations match without group selection (n_group=1)."""
        moe_block, gate, hidden, T, _, _, K = _make_sigmoid_block(
            E=16, K=4, n_group=1, topk_group=1, bias_on_gate=True
        )
        self._check_parity(moe_block, gate, hidden, T, K)

    def test_bias_on_block_parity(self):
        """minimax_m2 style: bias on block, not gate."""
        moe_block, gate, hidden, T, _, _, K = _make_sigmoid_block(
            E=16, K=4, n_group=1, bias_on_gate=False
        )
        self._check_parity(moe_block, gate, hidden, T, K)

    def test_scaling_factor_parity(self):
        """routed_scaling_factor applied identically by both."""
        moe_block, gate, hidden, T, _, _, K = _make_sigmoid_block(
            n_group=1, bias_on_gate=True
        )
        moe_block.routed_scaling_factor = 2.5
        self._check_parity(moe_block, gate, hidden, T, K)

    def test_no_renorm_parity(self):
        """norm_topk_prob=False produces same results in both."""
        moe_block, gate, hidden, T, _, _, K = _make_sigmoid_block(
            n_group=1, bias_on_gate=True
        )
        moe_block.norm_topk_prob = False
        self._check_parity(moe_block, gate, hidden, T, K)
class TestSharedExpertParity:
    """Verify both _compute_shared_expert implementations behave identically."""

    @pytest.fixture(autouse=True)
    def _require(self):
        _require_triton()

    def _get_both_fns(self):
        """Return ``(scattermoe_fn, sonicmoe_fn)``, imported lazily."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _compute_shared_expert as scatter_compute,
        )
        from axolotl.integrations.kernels.sonicmoe.patch import (
            _compute_shared_expert as sonic_compute,
        )

        return scatter_compute, sonic_compute

    def _assert_same_for_attr(self, attr_name):
        """Both implementations agree when the block exposes ``attr_name``."""
        scatter_fn, sonic_fn = self._get_both_fns()
        expert_out = torch.randn(4, 8)
        block = SimpleNamespace(**{attr_name: lambda x: expert_out})
        hidden = torch.randn(4, 8)

        assert torch.equal(scatter_fn(block, hidden), sonic_fn(block, hidden))

    def test_shared_expert_singular(self):
        self._assert_same_for_attr("shared_expert")

    def test_shared_experts_plural(self):
        self._assert_same_for_attr("shared_experts")

    def test_shared_mlp(self):
        self._assert_same_for_attr("shared_mlp")

    def test_no_shared_expert(self):
        scatter_fn, sonic_fn = self._get_both_fns()
        bare_block = SimpleNamespace()
        hidden = torch.randn(4, 8)

        for compute in (scatter_fn, sonic_fn):
            assert compute(bare_block, hidden) is None

    def test_shared_expert_gate_only_in_scattermoe(self):
        """ScatterMoE's _compute_shared_expert handles shared_expert_gate;
        SonicMoE's patch.py handles it externally in the forward function.

        This documents the known divergence: the scattermoe version applies
        sigmoid gating inline, while sonicmoe applies it in the forward.
        """
        scatter_fn, sonic_fn = self._get_both_fns()

        H = 8
        expert_out = torch.ones(4, H)

        def gate_fn(x):
            # All-zero gate logits, so sigmoid(0) = 0.5
            return torch.zeros(4, H)

        block = SimpleNamespace(
            shared_expert=lambda x: expert_out,
            shared_expert_gate=gate_fn,
        )
        hidden = torch.randn(4, H)

        scatter_result = scatter_fn(block, hidden)
        sonic_result = sonic_fn(block, hidden)

        # ScatterMoE applies the gate: expert_out * sigmoid(0) = 0.5
        assert torch.allclose(scatter_result, expert_out * 0.5, atol=1e-6)

        # SonicMoE does NOT apply the gate here (it does it in the forward)
        assert torch.equal(sonic_result, expert_out)
direct_e, direct_k, direct_E = _sigmoid_topk_route( moe_block, gate, hidden, gate.weight, None ) assert torch.equal(route_w, direct_w) assert torch.equal(route_e, direct_e) assert route_k == direct_k assert route_E == direct_E ================================================ FILE: tests/integrations/test_scattermoe_autotune_telemetry.py ================================================ """Tests for scattermoe autotune telemetry integration. These tests use mocking to verify the collection and reporting logic without requiring Triton or CUDA. """ import sys from types import SimpleNamespace from unittest.mock import MagicMock, patch # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- # Simulate the hash-suffixed module name that LocalLayerRepository creates. _FAKE_MODULE_NAME = "scattermoe_lora_abc123.kernels.lora_ops" # Patch target for _find_lora_ops_module inside the collector module. 
_FIND_MODULE_PATH = (
    "axolotl.integrations.kernels.autotune_collector._find_lora_ops_module"
)


def _make_mock_config(kwargs, num_warps=4, num_stages=3):
    """Build a stand-in for a ``triton.Config``.

    The returned namespace exposes exactly three attributes: ``kwargs``,
    ``num_warps`` and ``num_stages``.
    """
    return SimpleNamespace(
        kwargs=kwargs,
        num_warps=num_warps,
        num_stages=num_stages,
    )


def _make_mock_kernel(cache=None):
    """Build a stand-in autotuned kernel carrying a ``.cache`` mapping.

    A caller-supplied *cache* is stored as-is (same object); ``None`` yields a
    fresh empty dict so kernels never share state by accident.
    """
    return SimpleNamespace(cache={} if cache is None else cache)


def _make_mock_lora_ops(
    fwd_cache=None, dx_cache=None, bwd_cache=None, fused_cache=None
):
    """Build a stand-in ``lora_ops`` module exposing the four kernel attributes."""
    cache_by_attr = {
        "_scatter2scatter_lora": fwd_cache,
        "_scatter2scatter_lora_dX": dx_cache,
        "_group_bwd_lora": bwd_cache,
        "_group_bwd_lora_fused": fused_cache,
    }
    kernels = {
        attr: _make_mock_kernel(cache) for attr, cache in cache_by_attr.items()
    }
    return SimpleNamespace(**kernels)


def _real_lora_ops_module_names():
    """List the ``sys.modules`` keys that look like a loaded ``lora_ops`` module.

    A key qualifies when its module entry is not ``None``, the key contains
    ``"lora_ops"``, and the module has a ``_scatter2scatter_lora`` attribute.
    Tests use the result to temporarily hide real modules that other tests in
    the same xdist worker may have imported, so only an injected mock is found.
    """

    def _qualifies(name, mod):
        if mod is None:
            return False
        return "lora_ops" in name and hasattr(mod, "_scatter2scatter_lora")

    # Snapshot the items first: sys.modules may change while we iterate.
    snapshot = list(sys.modules.items())
    return [name for name, mod in snapshot if _qualifies(name, mod)]
def test_extra_key_elements_stored(self):
    """Cache-key elements beyond ``(M, N, K)`` (e.g. dtypes) are kept under ``_extra``."""
    from axolotl.integrations.kernels.autotune_collector import (
        collect_autotune_configs,
    )

    dims = (512, 1024, 256)
    extras = ("float16", "float16")
    tuned = _make_mock_config({"BLOCK_N": 64, "BLOCK_K": 32})
    fake_ops = _make_mock_lora_ops(fwd_cache={dims + extras: tuned})

    # Patch discovery so real lora_ops modules in sys.modules cannot interfere.
    with patch(_FIND_MODULE_PATH, return_value=fake_ops):
        collected = collect_autotune_configs()

    assert len(collected) == 1
    key = collected[0]["key"]
    assert key["M"] == 512
    assert key["N"] == 1024
    assert key["K"] == 256
    assert key["_extra"] == ["float16", "float16"]
class TestAutotuneReportCallback:
    """Exercise when ``AutotuneReportCallback`` sends its telemetry event and what it sends."""

    # Dotted paths handed to ``patch``; these are runtime strings and must stay exact.
    _COLLECT_TARGET = (
        "axolotl.integrations.kernels.autotune_collector.collect_autotune_configs"
    )
    _TELEMETRY_TARGET = "axolotl.telemetry.manager.TelemetryManager"

    @staticmethod
    def _state_at(step):
        """Return a mock trainer state whose ``global_step`` is *step*."""
        state = MagicMock()
        state.global_step = step
        return state

    @staticmethod
    def _fire(callback, state):
        """Invoke ``on_step_end`` with throwaway ``args`` / ``control`` mocks."""
        callback.on_step_end(args=MagicMock(), state=state, control=MagicMock())

    @staticmethod
    def _install_manager(manager_cls, enabled):
        """Make the patched ``TelemetryManager.get_instance()`` return a mock manager."""
        manager = MagicMock()
        manager.enabled = enabled
        manager_cls.get_instance.return_value = manager
        return manager

    def test_reports_once_on_first_step(self):
        """``send_event`` is called exactly once, even if the callback fires again."""
        from axolotl.integrations.kernels.autotune_callback import (
            AutotuneReportCallback,
        )

        callback = AutotuneReportCallback()
        state = self._state_at(1)
        configs = [{"kernel": "test_fwd", "key": {}, "config": {}}]

        with (
            patch(self._COLLECT_TARGET, return_value=configs),
            patch(self._TELEMETRY_TARGET) as manager_cls,
        ):
            manager = self._install_manager(manager_cls, enabled=True)

            self._fire(callback, state)
            assert manager.send_event.call_count == 1
            sent = manager.send_event.call_args[1]
            assert sent["event_type"] == "scattermoe-autotune"
            assert sent["properties"]["kernel_count"] == 1

            # A second invocation must not produce a second event.
            self._fire(callback, state)
            assert manager.send_event.call_count == 1

    def test_retries_until_step_5_then_gives_up(self):
        """With no configs ever found, the callback ends up marked as reported."""
        from axolotl.integrations.kernels.autotune_callback import (
            AutotuneReportCallback,
        )

        callback = AutotuneReportCallback()

        with patch(self._COLLECT_TARGET, return_value=[]):
            # Steps 1..6: one step beyond the retry window named in the test.
            for step in range(1, 7):
                self._fire(callback, self._state_at(step))

        assert callback._reported is True

    def test_reports_on_retry_when_data_arrives(self):
        """An empty first collection defers the report to the next step."""
        from axolotl.integrations.kernels.autotune_callback import (
            AutotuneReportCallback,
        )

        callback = AutotuneReportCallback()
        configs = [{"kernel": "fwd", "key": {}, "config": {}}]
        attempts = []

        def _collector():
            # First call returns nothing; every later call returns data.
            attempts.append(None)
            return [] if len(attempts) == 1 else configs

        with (
            patch(self._COLLECT_TARGET, side_effect=_collector),
            patch(self._TELEMETRY_TARGET) as manager_cls,
        ):
            manager = self._install_manager(manager_cls, enabled=True)

            # Step 1 — empty, no report
            self._fire(callback, self._state_at(1))
            assert manager.send_event.call_count == 0

            # Step 2 — data arrives, report
            self._fire(callback, self._state_at(2))
            assert manager.send_event.call_count == 1

    def test_includes_gpu_info(self):
        """GPU identification and shared-memory capacity appear in the event properties."""
        from axolotl.integrations.kernels.autotune_callback import (
            AutotuneReportCallback,
        )

        callback = AutotuneReportCallback()
        state = self._state_at(1)
        configs = [{"kernel": "fwd", "key": {}, "config": {}}]
        gpu_info = {
            "gpu_name": "NVIDIA H100",
            "gpu_compute_capability": "9.0",
            "gpu_memory_bytes": 85899345920,
        }
        smem_info = {"smem_capacity_bytes": 233472}

        with (
            patch(self._COLLECT_TARGET, return_value=configs),
            patch(
                "axolotl.integrations.kernels.autotune_callback._get_gpu_info",
                return_value=gpu_info,
            ),
            patch(
                "axolotl.integrations.kernels.autotune_callback._get_smem_capacity",
                return_value=smem_info,
            ),
            patch(self._TELEMETRY_TARGET) as manager_cls,
        ):
            manager = self._install_manager(manager_cls, enabled=True)
            self._fire(callback, state)

            props = manager.send_event.call_args[1]["properties"]
            assert props["gpu_name"] == "NVIDIA H100"
            assert props["gpu_compute_capability"] == "9.0"
            assert props["gpu_memory_bytes"] == 85899345920
            assert props["smem_capacity_bytes"] == 233472

    def test_skips_send_when_telemetry_disabled(self):
        """A disabled telemetry manager receives no event, yet the callback stops retrying."""
        from axolotl.integrations.kernels.autotune_callback import (
            AutotuneReportCallback,
        )

        callback = AutotuneReportCallback()
        state = self._state_at(1)

        with (
            patch(
                self._COLLECT_TARGET,
                return_value=[{"kernel": "fwd", "key": {}, "config": {}}],
            ),
            patch(self._TELEMETRY_TARGET) as manager_cls,
        ):
            manager = self._install_manager(manager_cls, enabled=False)
            self._fire(callback, state)

            assert manager.send_event.call_count == 0
            # Should still mark as reported so we don't retry.
            assert callback._reported is True
class TestKernelsArgsValidator:
    """Check how ``KernelsArgs.disable_mlp_kernel`` rewrites the MLP-kernel flags.

    The validator classmethod is invoked directly on raw dicts because
    ``lora_mlp_kernel`` / ``mlp_kernel`` are not declared model fields.
    """

    @staticmethod
    def _validate(**flags):
        """Run the validator on ``use_kernels=True`` plus the given *flags*."""
        from axolotl.integrations.kernels.args import KernelsArgs

        return KernelsArgs.disable_mlp_kernel({"use_kernels": True, **flags})

    def test_disables_lora_mlp_kernel_when_scattermoe(self):
        """With scattermoe on, ``lora_mlp_kernel=True`` is flipped to False."""
        result = self._validate(use_scattermoe=True, lora_mlp_kernel=True)

        assert result["lora_mlp_kernel"] is False
        assert result["mlp_kernel"] is False

    def test_mlp_kernel_disabled_without_lora(self):
        """``mlp_kernel`` is disabled even when ``lora_mlp_kernel`` was never supplied."""
        result = self._validate(use_scattermoe=True)

        assert result["mlp_kernel"] is False
        # The validator must not invent a key the caller did not provide.
        assert "lora_mlp_kernel" not in result

    def test_lora_mlp_kernel_false_unchanged(self):
        """An explicit ``lora_mlp_kernel=False`` stays False."""
        result = self._validate(use_scattermoe=True, lora_mlp_kernel=False)

        assert result["lora_mlp_kernel"] is False

    def test_no_change_when_scattermoe_disabled(self):
        """With scattermoe off, ``lora_mlp_kernel`` is left as given."""
        result = self._validate(use_scattermoe=False, lora_mlp_kernel=True)

        assert result["lora_mlp_kernel"] is True
def test_scaling_none_defaults_to_one(self):
    """When ``set_lora`` was never called, ``forward`` must pass ``scaling=1.0``."""
    pytest.importorskip("triton")
    from axolotl.integrations.kernels.libs.scattermoe_lora.lora_ops import (
        ParallelExperts,
    )

    experts = ParallelExperts(num_experts=2, input_size=4, output_size=4)
    # set_lora is deliberately not called, so no LoRA scaling has been configured.

    with patch(
        "axolotl.integrations.kernels.libs.scattermoe_lora.lora_ops.parallel_linear_lora"
    ) as mock_pll:
        mock_pll.return_value = torch.randn(4, 4)
        experts.forward(
            inputs=torch.randn(2, 4),
            k=1,
            sorted_expert_idxs=torch.tensor([0, 0, 1, 1]),
            sorted_scattered_idxs=torch.tensor([0, 1, 0, 1]),
            expert_offsets=torch.tensor([2, 4]),
        )
        # ``call_args.kwargs`` and ``call_args[1]`` are the same mapping, so a
        # single lookup is sufficient.
        scaling_val = mock_pll.call_args.kwargs.get("scaling")

    assert scaling_val == 1.0, (
        f"Expected scaling=1.0 for None but got {scaling_val}"
    )
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
class TestSingle2ScatterBounds:
    """Run ``single2scatter`` on K/N sizes that are not multiples of 128."""

    @staticmethod
    def _check_against_matmul(K, N):
        """Compare ``single2scatter`` with a plain per-expert matmul for one (K, N)."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.kernels.single import (
            single2scatter,
        )

        E = 2
        W = torch.randn(E, K, N, device="cuda", dtype=torch.float32)
        X = torch.randn(1, K, device="cuda", dtype=torch.float32)
        # One token routed to expert 0 and expert 1.
        expert_idxs = torch.tensor([[0, 1]], device="cuda", dtype=torch.long)

        Y = single2scatter(X, W, expert_idxs)
        assert Y.shape == (2, N)

        # Row i of the output must equal the token multiplied by expert i's weights.
        references = [X[0] @ W[expert] for expert in (0, 1)]
        for row, expected in enumerate(references):
            torch.testing.assert_close(Y[row], expected, atol=1e-2, rtol=1e-2)

    def test_non_aligned_k(self):
        """K not a multiple of BLOCK_K should produce correct results."""
        self._check_against_matmul(K=100, N=128)  # K=100 not a multiple of 128

    def test_non_aligned_n(self):
        """N not a multiple of BLOCK_N should produce correct results."""
        self._check_against_matmul(K=128, N=100)  # N=100 not a multiple of 128

    def test_non_aligned_both(self):
        """Both K and N not aligned should produce correct results."""
        self._check_against_matmul(K=100, N=100)  # Neither aligned to 128
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
class TestGroupCoeffNone:
    """Check that ``group()`` runs both with ``coeff=None`` and with a real coefficient."""

    @staticmethod
    def _inputs(M, K):
        """Return a random ``(M, K)`` CUDA matrix and the index tensor ``[0, 1, 2, 3]``."""
        A = torch.randn(M, K, device="cuda", dtype=torch.float32)
        sorted_expert_idxs = torch.tensor(
            [0, 1, 2, 3], device="cuda", dtype=torch.long
        )
        return A, sorted_expert_idxs

    def test_group_with_none_coeff(self):
        """``coeff=None`` must be accepted and keep the input shape."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.kernels.ops import group

        M, K = 4, 32
        A, sorted_expert_idxs = self._inputs(M, K)

        # This should not raise a TypeError
        Y = group(A, sorted_expert_idxs, coeff=None, fan_out=1)
        assert Y.shape == (M, K)

    def test_group_with_coeff(self):
        """A tensor coefficient (all 0.5) must also work and keep the input shape."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.kernels.ops import group

        M, K = 4, 32
        A, sorted_expert_idxs = self._inputs(M, K)
        coeff = torch.full((M,), 0.5, device="cuda", dtype=torch.float32)

        Y = group(A, sorted_expert_idxs, coeff=coeff, fan_out=1)
        assert Y.shape == (M, K)
class TestLayerReturnValues:
    """Static checks on the ``forward`` contracts of the ScatterMoE layer classes."""

    def test_hf_scatter_moe_returns_single_tensor(self):
        """``HFScatterMoEGatedMLP.forward`` takes ``self`` and ``layer_input``.

        Only the signature is inspected here; the forward pass is not executed.
        """
        pytest.importorskip("triton")
        import inspect

        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            HFScatterMoEGatedMLP,
        )

        forward_params = inspect.signature(HFScatterMoEGatedMLP.forward).parameters
        param_names = list(forward_params)
        assert "self" in param_names
        assert "layer_input" in param_names

    def test_scatter_moe_gated_mlp_docstring_no_router_logits(self):
        """``ScatterMoEGatedMLP.forward``'s docstring names an output tensor, not router logits."""
        pytest.importorskip("triton")
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            ScatterMoEGatedMLP,
        )

        docstring = ScatterMoEGatedMLP.forward.__doc__
        assert docstring is not None
        # Case-insensitive match; this also covers the exact "Output tensor" spelling.
        assert "output tensor" in docstring.lower()
        assert "Router logits" not in docstring, (
            "Docstring should not mention 'Router logits' in Returns section"
        )
def _make_softmax_gate(E=4, H=16, K=2):
    """Build a mock softmax-style gate (Qwen/OLMoE).

    The namespace carries a random ``(E, H)`` weight plus ``top_k``,
    ``num_experts`` and ``norm_topk_prob=True``.
    """
    gate_weight = torch.randn(E, H)
    return SimpleNamespace(
        weight=gate_weight,
        top_k=K,
        num_experts=E,
        norm_topk_prob=True,
    )


def _make_sigmoid_gate_with_bias(E=16, H=16):
    """Build a mock sigmoid-style gate that owns a zero ``e_score_correction_bias``."""
    gate_weight = torch.randn(E, H)
    return SimpleNamespace(
        weight=gate_weight,
        e_score_correction_bias=torch.zeros(E),
    )


def _make_sigmoid_moe_block(
    T=8, H=16, E=16, K=4, n_group=2, topk_group=1, bias_on_gate=True
):
    """Build a mock GLM/DeepSeek-style MoE block for sigmoid routing tests.

    With ``bias_on_gate=True`` the correction bias lives on the gate and the
    block carries the group-routing attributes. Otherwise (minimax_m2 style)
    the bias lives on the block itself and the gate only has ``weight`` and
    ``top_k``.

    Returns ``(moe_block, T, H, E, K)``; ``T`` is passed through untouched so
    callers can size their hidden states.
    """
    if not bias_on_gate:
        # minimax_m2 style: bias on block, not gate
        plain_gate = SimpleNamespace(weight=torch.randn(E, H), top_k=K)
        block = SimpleNamespace(
            gate=plain_gate,
            top_k=K,
            e_score_correction_bias=torch.zeros(E),
        )
        return block, T, H, E, K

    block = SimpleNamespace(
        gate=_make_sigmoid_gate_with_bias(E=E, H=H),
        top_k=K,
        n_routed_experts=E,
        n_group=n_group,
        topk_group=topk_group,
        norm_topk_prob=True,
        routed_scaling_factor=1.0,
    )
    return block, T, H, E, K


def _skip_without_triton():
    """Skip the calling test when ``triton`` cannot be imported."""
    pytest.importorskip("triton")
def test_group_selection_restricts_experts(self):
    """With n_group=4 and topk_group=1, each token's experts share one group."""
    from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
        _sigmoid_topk_route,
    )

    moe_block, T, H, E, K = _make_sigmoid_moe_block(
        E=16, K=2, n_group=4, topk_group=1
    )
    gate = moe_block.gate
    hidden = torch.randn(T, H)

    _, expert_idx, _, _ = _sigmoid_topk_route(
        moe_block, gate, hidden, gate.weight, None
    )

    # Experts are numbered contiguously per group, so integer division by the
    # group size (E // n_group = 4) recovers each expert's group id.
    group_size = E // moe_block.n_group
    for token in range(T):
        group_ids = expert_idx[token] // group_size
        assert (group_ids == group_ids[0]).all()
def test_gate_lora_delta_applied(self):
    """Passing a gate LoRA delta must change the routing weights."""
    from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
        _sigmoid_topk_route,
    )

    moe_block, T, H, E, K = _make_sigmoid_moe_block(n_group=1)
    gate = moe_block.gate
    hidden = torch.randn(T, H)

    baseline, _, _, _ = _sigmoid_topk_route(
        moe_block, gate, hidden, gate.weight, None
    )

    # Scale the delta by 10 so it is large compared with the unit-variance weights.
    delta = torch.randn(E, H) * 10.0
    perturbed, _, _, _ = _sigmoid_topk_route(
        moe_block, gate, hidden, gate.weight, delta
    )

    assert not torch.equal(baseline, perturbed)
class TestRoutingStrategyDetection:
    """Check the output of ``_route`` for softmax-style and sigmoid-style blocks."""

    @pytest.fixture(autouse=True)
    def _require_triton(self):
        _skip_without_triton()

    def test_softmax_for_qwen_style(self):
        """A block without ``e_score_correction_bias`` yields weights that sum to 1 per token."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import _route

        T, H, E, K = 8, 16, 4, 2
        gate = _make_softmax_gate(E=E, H=H, K=K)
        moe_block = SimpleNamespace(gate=gate)
        hidden = torch.randn(T, H)

        weights, experts, top_k, num_experts = _route(
            moe_block, gate, hidden, gate.weight, None
        )

        assert weights.shape == (8, 2)
        assert experts.shape == (8, 2)
        assert top_k == 2
        assert num_experts == 4
        # The gate sets norm_topk_prob=True, so each row is expected to sum to one.
        row_totals = weights.sum(dim=-1)
        assert torch.allclose(row_totals, torch.ones(8), atol=1e-5)

    def test_sigmoid_for_glm_style(self):
        """A block whose gate carries ``e_score_correction_bias`` yields non-negative weights."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import _route

        moe_block, T, H, E, K = _make_sigmoid_moe_block(bias_on_gate=True, n_group=1)
        gate = moe_block.gate
        hidden = torch.randn(T, H)

        weights, experts, top_k, num_experts = _route(
            moe_block, gate, hidden, gate.weight, None
        )

        assert weights.shape == (T, K)
        assert experts.shape == (T, K)
        assert (weights >= 0).all()

    def test_sigmoid_for_minimax_m2_style(self):
        """A block carrying the bias itself (not on the gate) yields non-negative weights."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import _route

        moe_block, T, H, E, K = _make_sigmoid_moe_block(bias_on_gate=False)
        gate = moe_block.gate
        hidden = torch.randn(T, H)

        weights, experts, top_k, num_experts = _route(
            moe_block, gate, hidden, gate.weight, None
        )

        assert weights.shape == (T, K)
        assert (weights >= 0).all()
* 0.5 # sigmoid(0) = 0.5 assert torch.allclose(result, expected, atol=1e-6) def test_no_shared_expert(self): """No shared expert attributes returns None.""" from axolotl.integrations.kernels.libs.scattermoe_lora.layers import ( _compute_shared_expert, ) moe_block = SimpleNamespace() result = _compute_shared_expert(moe_block, torch.randn(4, 8)) assert result is None ================================================ FILE: tests/integrations/test_scattermoe_lora_kernels.py ================================================ # SPDX-License-Identifier: Apache-2.0 # Copyright (c) Axolotl AI # Licensed under the Apache License, Version 2.0 """ Unit tests for ScatterMoE LoRA Triton kernels. Tests correctness of: - scatter2scatter_lora (forward) - scatter2scatter_lora_dX (backward input gradient) - group_bwd_lora (backward LoRA weight gradients via split dA/dB) - ScatterMoELoRA autograd function (full forward + backward) Each kernel is tested against a pure PyTorch per-expert-loop reference implementation at multiple model shapes and LoRA ranks. 
def _reference_fwd(x, W, sei, ssi, eo, k, lora_A, lora_B, scaling, E):
    """Per-expert loop reference: Y = X@W + scaling*(X@A^T)@B^T.

    The input is grouped with ``base_ops.group``; each expert's row slice is
    computed in float32, cast to ``DTYPE``, and finally written back through
    ``ssi``. ``sei`` is not used by this function.
    """
    tokens = base_ops.group(x, ssi, fan_out=k)
    num_rows = tokens.size(0)
    out_dim = W.size(2)
    rank = lora_A.size(0) // E
    grouped_out = torch.zeros(num_rows, out_dim, device=DEVICE, dtype=DTYPE)

    start = 0
    for expert in range(E):
        # eo[expert] is used as the exclusive end row of this expert's slice;
        # the previous expert's end is this expert's start.
        stop = eo[expert].item()
        if stop != start:
            cols = slice(expert * rank, (expert + 1) * rank)
            rows = tokens[start:stop].float()
            base = rows @ W[expert].float()
            low_rank = scaling * (rows @ lora_A[cols].float().T)
            delta = low_rank @ lora_B[:, cols].float().T
            grouped_out[start:stop] = (base + delta).to(DTYPE)
        start = stop

    scattered = torch.zeros(num_rows, out_dim, device=DEVICE, dtype=DTYPE)
    scattered[ssi] = grouped_out
    return scattered
def _reference_bwd_lora(dy, grouped_x, lora_A, lora_B, eo, E, scaling):
    """Per-expert loop reference: dA, dB for LoRA weight gradients.

    Returns ``(dA, dB)`` shaped like ``lora_A`` and ``lora_B``. Slices belonging
    to experts with an empty row range are left at zero.
    """
    rank = lora_A.size(0) // E
    grad_a = torch.zeros_like(lora_A)
    grad_b = torch.zeros_like(lora_B)

    start = 0
    for expert in range(E):
        # eo[expert] is used as the exclusive end row of this expert's slice.
        stop = eo[expert].item()
        if stop != start:
            cols = slice(expert * rank, (expert + 1) * rank)
            rows = grouped_x[start:stop].float()
            grad_rows = dy[start:stop].float()
            a_slice = lora_A[cols].float()
            b_slice = lora_B[:, cols].float()

            scaled_dy_b_t = scaling * (grad_rows @ b_slice).T
            grad_a[cols] = (scaled_dy_b_t @ rows).to(DTYPE)

            scaled_dy_t = scaling * grad_rows.T
            grad_b[:, cols] = (scaled_dy_t @ (rows @ a_slice.T)).to(DTYPE)
        start = stop

    return grad_a, grad_b
class TestScatter2ScatterLoRADX:
    """Compare the scatter2scatter_lora_dX backward kernel with the loop reference."""

    @pytest.fixture(params=CONFIGS_SMALL + CONFIGS_REAL)
    def config(self, request):
        """Parametrize over every small and real-model shape config."""
        return request.param

    def test_matches_reference(self, config):
        """Kernel dX must stay within a max absolute error of 1.0 of the reference."""
        E, K, N, T, k, R, desc = config
        inputs, weights, lora_a, lora_b, expert_idxs, scattered_idxs, offsets = (
            _setup(E, K, N, T, k, R)
        )

        # Only the row count of the grouped input is needed to size dY.
        num_rows = base_ops.group(inputs, scattered_idxs, fan_out=k).size(0)
        grad_out = torch.randn(num_rows, N, device=DEVICE, dtype=DTYPE)

        kernel_kwargs = {
            "DY": grad_out,
            "W": weights,
            "sorted_expert_idxs": expert_idxs,
            "sorted_scattered_idxs": scattered_idxs,
            "k": 1,
            "lora_A": lora_a,
            "lora_B": lora_b,
            "scaling": SCALING,
            "dy_grouped": True,
            "dx_grouped": False,
        }
        kernel_dx = lora_ops.scatter2scatter_lora_dX(**kernel_kwargs)
        ref_dx = _reference_dX(
            grad_out,
            weights,
            expert_idxs,
            scattered_idxs,
            offsets,
            lora_a,
            lora_b,
            SCALING,
            E,
        )

        err = (kernel_dx.float() - ref_dx.float()).abs().max().item()
        assert err < 1.0, f"[{desc}] dX max_err={err}"
class TestScatterMoELoRAAutograd:
    """Test full forward + backward through ScatterMoELoRA autograd function."""

    @pytest.fixture(params=CONFIGS_SMALL + CONFIGS_REAL[:2])
    def config(self, request):
        """Parametrize over the small configs plus the first two real-model configs."""
        return request.param

    def test_gradients_exist_and_finite(self, config):
        """Backward populates finite gradients for x, lora_A and lora_B."""
        E, K, N, T, k, R, desc = config
        x, W, lA, lB, sei, ssi, eo = _setup(E, K, N, T, k, R)
        x = x.requires_grad_(True)
        lA = lA.requires_grad_(True)
        lB = lB.requires_grad_(True)

        # Positional arguments follow ScatterMoELoRA.forward's parameter order.
        out = ScatterMoELoRA.apply(
            x,
            W,
            k,
            sei,
            ssi,
            eo,
            lA,
            lB,
            SCALING,
            None,
            None,
            False,
            False,
            True,
            False,
        )
        out.sum().backward()

        assert x.grad is not None, f"[{desc}] x.grad is None"
        assert lA.grad is not None, f"[{desc}] lA.grad is None"
        assert lB.grad is not None, f"[{desc}] lB.grad is None"
        assert torch.isfinite(x.grad).all(), f"[{desc}] x.grad has non-finite"
        assert torch.isfinite(lA.grad).all(), f"[{desc}] lA.grad has non-finite"
        assert torch.isfinite(lB.grad).all(), f"[{desc}] lB.grad has non-finite"
        assert x.grad.abs().sum() > 0, f"[{desc}] x.grad all zero"
        assert lA.grad.abs().sum() > 0, f"[{desc}] lA.grad all zero"

    def test_split_matches_fused(self):
        """Split dispatch (for few large experts) matches fused kernel."""
        # Use a shape where split would be dispatched (large K*N, few E)
        E, K, N, T, k, R = 8, 512, 1024, 128, 2, 16
        x, W, lA, lB, sei, ssi, eo = _setup(E, K, N, T, k, R)

        # The threshold is a module-level global shared by every test in the
        # process, so it must be restored even when a kernel call raises;
        # otherwise later tests would silently run with a forced dispatch path.
        orig = lora_ops._SPLIT_LORA_FWD_THRESHOLD
        try:
            # Force fused path
            lora_ops._SPLIT_LORA_FWD_THRESHOLD = 10**18
            out_fused = lora_ops.scatter2scatter_lora(
                X=x,
                W=W,
                sorted_expert_idxs=sei,
                sorted_scattered_idxs=ssi,
                k=k,
                lora_A=lA,
                lora_B=lB,
                scaling=SCALING,
            )

            # Force split path
            lora_ops._SPLIT_LORA_FWD_THRESHOLD = 0
            out_split = lora_ops.scatter2scatter_lora(
                X=x,
                W=W,
                sorted_expert_idxs=sei,
                sorted_scattered_idxs=ssi,
                k=k,
                lora_A=lA,
                lora_B=lB,
                scaling=SCALING,
            )
        finally:
            lora_ops._SPLIT_LORA_FWD_THRESHOLD = orig

        norm_err = (
            (out_fused.float() - out_split.float()).norm()
            / (out_fused.float().norm() + 1e-6)
        ).item()
        assert norm_err < 0.01, f"split vs fused norm_err={norm_err}"

    def test_scaling_zero_gives_base_only(self):
        """With scaling=0.0, LoRA contribution vanishes. Output = X@W."""
        E, K, N, T, k, R = 16, 64, 32, 32, 2, 4
        x, W, lA, lB, sei, ssi, eo = _setup(E, K, N, T, k, R)

        out_lora = ScatterMoELoRA.apply(
            x,
            W,
            k,
            sei,
            ssi,
            eo,
            lA,
            lB,
            0.0,
            None,
            None,
            False,
            False,
            True,
            False,
        )
        out_base = base_ops.scatter2scatter(
            X=x,
            W=W,
            sorted_expert_idxs=sei,
            sorted_scattered_idxs=ssi,
            k=k,
        )

        err = (out_lora.float() - out_base.float()).abs().max().item()
        assert err < 0.01, f"scaling=0 should match base: err={err}"
"""Create a test tensor [E=2, 2*I=4, H=3] with distinct gate/up values.""" E, I, H = 2, 2, 3 # noqa: E741 gate = torch.arange(1, E * I * H + 1, dtype=torch.float32).reshape(E, I, H) up = torch.arange(100, 100 + E * I * H, dtype=torch.float32).reshape(E, I, H) return torch.cat([gate, up], dim=1) def test_interleave_rows_alternate(self, sample_tensor): op = ConcatenatedToInterleaved(dim=1) result = op.convert( {"test": sample_tensor}, source_patterns=["test"], target_patterns=["test"], ) interleaved = result["test"] # For expert 0: even rows should be gate, odd rows should be up E, two_I, H = sample_tensor.shape I = two_I // 2 # noqa: E741 gate_orig = sample_tensor[:, :I, :] up_orig = sample_tensor[:, I:, :] assert torch.equal(interleaved[:, 0::2, :], gate_orig) assert torch.equal(interleaved[:, 1::2, :], up_orig) def test_interleave_handles_list_input(self, sample_tensor): op = ConcatenatedToInterleaved(dim=1) result = op.convert( {"test": [sample_tensor]}, source_patterns=["test"], target_patterns=["test"], ) assert result["test"].shape == sample_tensor.shape def test_reverse_op_type(self): op = ConcatenatedToInterleaved(dim=1) assert isinstance(op.reverse_op, InterleavedToConcatenated) assert op.reverse_op.dim == 1 class TestInterleavedToConcatenated: @pytest.fixture def interleaved_tensor(self): """Create an interleaved tensor [E=2, 2*I=4, H=3].""" E, I, H = 2, 2, 3 # noqa: E741 gate = torch.arange(1, E * I * H + 1, dtype=torch.float32).reshape(E, I, H) up = torch.arange(100, 100 + E * I * H, dtype=torch.float32).reshape(E, I, H) interleaved = torch.empty(E, 2 * I, H) interleaved[:, 0::2, :] = gate interleaved[:, 1::2, :] = up return interleaved def test_deinterleave_gate_up_separated(self, interleaved_tensor): op = InterleavedToConcatenated(dim=1) result = op.convert( {"test": interleaved_tensor}, source_patterns=["test"], target_patterns=["test"], ) concatenated = result["test"] E, two_I, H = concatenated.shape I = two_I // 2 # noqa: E741 # First half should be 
class TestWeightConverterRegistration:
    """Check how register_sonicmoe_weight_converter edits the conversion mapping."""

    @staticmethod
    def _find_gate_up_converter(mapping):
        """Return the first entry that has ``operations`` and targets gate_up_proj.

        Returns None when no entry matches, so callers can assert on presence.
        """
        return next(
            (
                conv
                for conv in mapping
                if hasattr(conv, "operations")
                and any("gate_up_proj" in pat for pat in conv.target_patterns)
            ),
            None,
        )

    def test_register_appends_interleave_op(self):
        """After registration the gate_up_proj converter ends with the interleave op."""
        from transformers.conversion_mapping import get_checkpoint_conversion_mapping

        register_sonicmoe_weight_converter("qwen3_moe")

        modified = get_checkpoint_conversion_mapping("qwen3_moe")
        gate_up_converter = self._find_gate_up_converter(modified)
        assert gate_up_converter is not None
        assert isinstance(gate_up_converter.operations[-1], ConcatenatedToInterleaved)

    def test_double_registration_is_idempotent(self):
        """Registering twice leaves exactly one interleave op on the converter."""
        from transformers.conversion_mapping import get_checkpoint_conversion_mapping

        register_sonicmoe_weight_converter("qwen3_moe")
        register_sonicmoe_weight_converter("qwen3_moe")

        modified = get_checkpoint_conversion_mapping("qwen3_moe")
        gate_up_converter = self._find_gate_up_converter(modified)
        # Fail loudly when the converter is missing: asserting only inside a
        # search loop would let the test pass without checking anything.
        assert gate_up_converter is not None, (
            "no gate_up_proj converter found for qwen3_moe"
        )

        interleave_count = sum(
            isinstance(op, ConcatenatedToInterleaved)
            for op in gate_up_converter.operations
        )
        assert interleave_count == 1, (
            f"Expected 1 ConcatenatedToInterleaved op, got {interleave_count}"
        )

    def test_register_unsupported_model_type_warns(self):
        """Registering an unknown model type must not raise."""
        # A model type with no conversion mapping should warn but not raise.
        # NOTE(review): only the "does not raise" half is checked here; the
        # warning itself is not asserted.
        register_sonicmoe_weight_converter("nonexistent_model_type_xyz")
E, K def _make_minimax_m2_moe_block(T=8, H=16, E=16, K=4): """Create a mock minimax_m2-style MoE block for routing tests. minimax_m2 uses sigmoid->topk WITHOUT group selection: - e_score_correction_bias is on the moe_block (not on gate) - No n_group / topk_group attributes - Always normalizes (norm_topk_prob defaults to True) - No routed_scaling_factor (defaults to 1.0) """ gate = SimpleNamespace( weight=torch.randn(E, H), top_k=K, ) moe_block = SimpleNamespace( gate=gate, top_k=K, e_score_correction_bias=torch.zeros(E), ) return moe_block, T, H, E, K class TestSoftmaxTopkRouting: def test_output_shapes(self): moe_block, T, H, E, K = _make_qwen_moe_block() hidden = torch.randn(T, H) scores, token_idx, expert_idx, logits = softmax_topk_routing(hidden, moe_block) assert scores.shape == (T * K,) assert token_idx.shape == (T * K,) assert expert_idx.shape == (T * K,) assert logits.shape == (T, E) def test_scores_are_float32(self): moe_block, T, H, E, K = _make_qwen_moe_block() hidden = torch.randn(T, H) scores, _, _, _ = softmax_topk_routing(hidden, moe_block) assert scores.dtype == torch.float32 def test_token_indices_sorted_ascending(self): moe_block, T, H, E, K = _make_qwen_moe_block() hidden = torch.randn(T, H) _, token_idx, _, _ = softmax_topk_routing(hidden, moe_block) # Token indices must be sorted ascending (SonicMoE requirement) diffs = token_idx[1:] - token_idx[:-1] assert (diffs >= 0).all() def test_expert_indices_in_range(self): moe_block, T, H, E, K = _make_qwen_moe_block() hidden = torch.randn(T, H) _, _, expert_idx, _ = softmax_topk_routing(hidden, moe_block) assert (expert_idx >= 0).all() assert (expert_idx < E).all() def test_renormalized_scores_sum_to_one(self): moe_block, T, H, E, K = _make_qwen_moe_block() hidden = torch.randn(T, H) scores, _, _, _ = softmax_topk_routing(hidden, moe_block) per_token_sums = scores.reshape(T, K).sum(dim=-1) assert torch.allclose(per_token_sums, torch.ones(T), atol=1e-5) class TestSigmoidTopkRouting: def 
test_output_shapes(self): moe_block, T, H, E, K = _make_glm_moe_block() hidden = torch.randn(T, H) scores, token_idx, expert_idx, logits = sigmoid_topk_routing(hidden, moe_block) assert scores.shape == (T * K,) assert token_idx.shape == (T * K,) assert expert_idx.shape == (T * K,) assert logits.shape == (T, E) def test_scores_are_float32(self): moe_block, T, H, E, K = _make_glm_moe_block() hidden = torch.randn(T, H) scores, _, _, _ = sigmoid_topk_routing(hidden, moe_block) assert scores.dtype == torch.float32 def test_token_indices_sorted_ascending(self): moe_block, T, H, E, K = _make_glm_moe_block() hidden = torch.randn(T, H) _, token_idx, _, _ = sigmoid_topk_routing(hidden, moe_block) diffs = token_idx[1:] - token_idx[:-1] assert (diffs >= 0).all() def test_expert_indices_in_range(self): moe_block, T, H, E, K = _make_glm_moe_block() hidden = torch.randn(T, H) _, _, expert_idx, _ = sigmoid_topk_routing(hidden, moe_block) assert (expert_idx >= 0).all() assert (expert_idx < E).all() def test_scores_are_nonnegative(self): """Sigmoid outputs are in [0, 1], so scores should be non-negative.""" moe_block, T, H, E, K = _make_glm_moe_block() hidden = torch.randn(T, H) scores, _, _, _ = sigmoid_topk_routing(hidden, moe_block) assert (scores >= 0).all() def test_scaling_factor_applied(self): moe_block, T, H, E, K = _make_glm_moe_block() hidden = torch.randn(T, H) # Get scores with scaling_factor=1.0 scores_1x, _, _, _ = sigmoid_topk_routing(hidden, moe_block) # Get scores with scaling_factor=2.0 moe_block.routed_scaling_factor = 2.0 scores_2x, _, _, _ = sigmoid_topk_routing(hidden, moe_block) assert torch.allclose(scores_2x, scores_1x * 2.0, atol=1e-5) def test_group_selection_restricts_experts(self): """With n_group=4 and topk_group=1, only 1/4 of experts should be selectable.""" moe_block, T, H, E, K = _make_glm_moe_block(E=16, K=2, n_group=4, topk_group=1) hidden = torch.randn(T, H) _, _, expert_idx, _ = sigmoid_topk_routing(hidden, moe_block) # Each token's experts 
class TestMiniMaxM2SigmoidRouting:
    """Tests for minimax_m2 routing: sigmoid->topk without group selection."""

    def test_output_shapes(self):
        """Validates getattr defaults work: n_group=1, E from gate.weight.shape[0]."""
        moe_block, T, H, E, K = _make_minimax_m2_moe_block()
        routed = sigmoid_topk_routing(torch.randn(T, H), moe_block)
        scores, token_idx, expert_idx, logits = routed

        flat_shape = (T * K,)
        assert scores.shape == flat_shape
        assert token_idx.shape == flat_shape
        assert expert_idx.shape == flat_shape
        assert logits.shape == (T, E)

    def test_bias_on_block_not_gate(self):
        """Verify that e_score_correction_bias on the block (not gate) is used."""
        T, H, E, K = 8, 16, 8, 2
        router = SimpleNamespace(
            weight=torch.randn(E, H),
            top_k=K,
        )

        # Large positive bias on expert 0 should make it selected more often
        correction = torch.zeros(E)
        correction[0] = 100.0
        moe_block = SimpleNamespace(
            gate=router,
            top_k=K,
            e_score_correction_bias=correction,
        )

        _, _, expert_idx, _ = sigmoid_topk_routing(torch.randn(T, H), moe_block)

        # Expert 0 should appear for every token due to the large bias
        per_token = expert_idx.reshape(T, K)
        assert (per_token == 0).any(dim=-1).all()
""" import torch from axolotl.integrations.kernels.sonicmoe.routing import ( sigmoid_topk_routing, softmax_topk_routing, ) _GC_EPS = 1e-3 _GC_ATOL = 1e-3 _GC_RTOL = 1e-3 def _make_softmax_moe_block(weight): gate = torch.nn.Module() gate.weight = weight gate.top_k = 2 gate.norm_topk_prob = True moe_block = torch.nn.Module() moe_block.gate = gate return moe_block def _make_sigmoid_moe_block(weight, bias): gate = torch.nn.Module() gate.weight = weight gate.e_score_correction_bias = bias moe_block = torch.nn.Module() moe_block.gate = gate moe_block.top_k = 2 moe_block.n_routed_experts = weight.shape[0] moe_block.n_group = 1 moe_block.norm_topk_prob = True moe_block.routed_scaling_factor = 1.0 return moe_block class TestSoftmaxTopkRoutingGradcheck: """Numerical gradient verification for softmax_topk_routing.""" def test_gradcheck_wrt_gate_weight(self): T, H, E = 4, 8, 4 hidden = torch.randn(T, H, dtype=torch.float32) def fn(weight): moe_block = _make_softmax_moe_block(weight) scores, _, _, _ = softmax_topk_routing(hidden, moe_block) return scores weight = torch.randn(E, H, dtype=torch.float32, requires_grad=True) torch.autograd.gradcheck( fn, (weight,), eps=_GC_EPS, atol=_GC_ATOL, rtol=_GC_RTOL ) def test_gradcheck_wrt_hidden_states(self): T, H, E = 4, 8, 4 weight = torch.randn(E, H, dtype=torch.float32) moe_block = _make_softmax_moe_block(weight) def fn(hidden): scores, _, _, _ = softmax_topk_routing(hidden, moe_block) return scores hidden = torch.randn(T, H, dtype=torch.float32, requires_grad=True) torch.autograd.gradcheck( fn, (hidden,), eps=_GC_EPS, atol=_GC_ATOL, rtol=_GC_RTOL ) def test_gradcheck_wrt_router_logits(self): T, H, E = 4, 8, 4 hidden = torch.randn(T, H, dtype=torch.float32) def fn(weight): moe_block = _make_softmax_moe_block(weight) _, _, _, router_logits = softmax_topk_routing(hidden, moe_block) return router_logits weight = torch.randn(E, H, dtype=torch.float32, requires_grad=True) torch.autograd.gradcheck( fn, (weight,), eps=_GC_EPS, atol=_GC_ATOL, 
class TestSigmoidTopkRoutingGradcheck:
    """Numerical gradient verification for sigmoid_topk_routing."""

    @staticmethod
    def _check(fn, leaf):
        """Run gradcheck on ``fn`` w.r.t. ``leaf`` using the module tolerances."""
        torch.autograd.gradcheck(
            fn, (leaf,), eps=_GC_EPS, atol=_GC_ATOL, rtol=_GC_RTOL
        )

    def test_gradcheck_wrt_gate_weight(self):
        """Scores are differentiable w.r.t. the gate weight."""
        T, H, E = 4, 8, 4
        hidden = torch.randn(T, H, dtype=torch.float32)
        bias = torch.zeros(E, dtype=torch.float32)

        def scores_from_weight(weight):
            # The block is rebuilt on each call so it wraps the given tensor.
            block = _make_sigmoid_moe_block(weight, bias)
            scores, _, _, _ = sigmoid_topk_routing(hidden, block)
            return scores

        gate_weight = torch.randn(E, H, dtype=torch.float32, requires_grad=True)
        self._check(scores_from_weight, gate_weight)

    def test_gradcheck_wrt_hidden_states(self):
        """Scores are differentiable w.r.t. the hidden states."""
        T, H, E = 4, 8, 4
        block = _make_sigmoid_moe_block(
            torch.randn(E, H, dtype=torch.float32),
            torch.zeros(E, dtype=torch.float32),
        )

        def scores_from_hidden(hidden):
            scores, _, _, _ = sigmoid_topk_routing(hidden, block)
            return scores

        hidden_states = torch.randn(T, H, dtype=torch.float32, requires_grad=True)
        self._check(scores_from_hidden, hidden_states)

    def test_gradcheck_wrt_bias(self):
        """Scores are differentiable w.r.t. the score-correction bias."""
        T, H, E = 4, 8, 4
        hidden = torch.randn(T, H, dtype=torch.float32)
        weight = torch.randn(E, H, dtype=torch.float32)

        def scores_from_bias(bias):
            block = _make_sigmoid_moe_block(weight, bias)
            scores, _, _, _ = sigmoid_topk_routing(hidden, block)
            return scores

        correction_bias = torch.zeros(E, dtype=torch.float32, requires_grad=True)
        self._check(scores_from_bias, correction_bias)
Copyright 2024 Axolotl AI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Unit tests for SwanLab Integration Plugin. Tests conflict detection, configuration validation, and multi-logger warnings. """ import importlib.util import logging import os import time from unittest.mock import MagicMock, patch import pytest from pydantic import ValidationError from axolotl.integrations.swanlab.args import SwanLabConfig from axolotl.integrations.swanlab.plugins import SwanLabPlugin SWANLAB_INSTALLED = importlib.util.find_spec("swanlab") is not None @pytest.mark.skipif(not SWANLAB_INSTALLED, reason="swanlab package not installed") class TestSwanLabConfigValidators: """Tests for Pydantic field validators in SwanLabConfig.""" def test_valid_swanlab_mode_cloud(self): """Test that 'cloud' mode is valid.""" config = SwanLabConfig(swanlab_mode="cloud") assert config.swanlab_mode == "cloud" def test_valid_swanlab_mode_local(self): """Test that 'local' mode is valid.""" config = SwanLabConfig(swanlab_mode="local") assert config.swanlab_mode == "local" def test_valid_swanlab_mode_offline(self): """Test that 'offline' mode is valid.""" config = SwanLabConfig(swanlab_mode="offline") assert config.swanlab_mode == "offline" def test_valid_swanlab_mode_disabled(self): """Test that 'disabled' mode is valid.""" config = SwanLabConfig(swanlab_mode="disabled") assert config.swanlab_mode == "disabled" def test_invalid_swanlab_mode(self): """Test that invalid mode raises ValueError.""" 
with pytest.raises(ValidationError) as exc_info: SwanLabConfig(swanlab_mode="invalid") error_msg = str(exc_info.value) assert "Invalid swanlab_mode" in error_msg assert "cloud" in error_msg assert "local" in error_msg assert "offline" in error_msg assert "disabled" in error_msg def test_swanlab_mode_none_allowed(self): """Test that None mode is allowed (will use default).""" config = SwanLabConfig(swanlab_mode=None) assert config.swanlab_mode is None def test_valid_swanlab_project(self): """Test that valid project name is accepted.""" config = SwanLabConfig(swanlab_project="my-project") assert config.swanlab_project == "my-project" def test_swanlab_project_none_allowed(self): """Test that None project is allowed.""" config = SwanLabConfig(swanlab_project=None) assert config.swanlab_project is None def test_empty_swanlab_project_rejected(self): """Test that empty string project name is rejected.""" with pytest.raises(ValidationError) as exc_info: SwanLabConfig(swanlab_project="") error_msg = str(exc_info.value) assert "cannot be an empty string" in error_msg def test_whitespace_only_project_rejected(self): """Test that whitespace-only project name is rejected.""" with pytest.raises(ValidationError) as exc_info: SwanLabConfig(swanlab_project=" ") error_msg = str(exc_info.value) assert "cannot be an empty string" in error_msg def test_use_swanlab_true_requires_project(self): """Test that use_swanlab=True requires swanlab_project.""" with pytest.raises(ValidationError) as exc_info: SwanLabConfig(use_swanlab=True, swanlab_project=None) error_msg = str(exc_info.value) assert "swanlab_project" in error_msg.lower() assert "not set" in error_msg.lower() def test_use_swanlab_true_with_project_valid(self): """Test that use_swanlab=True with project is valid.""" config = SwanLabConfig(use_swanlab=True, swanlab_project="my-project") assert config.use_swanlab is True assert config.swanlab_project == "my-project" def test_use_swanlab_false_no_project_valid(self): """Test that 
def test_register_invalid_mode(self):
    """``register`` raises ``ValueError`` when ``swanlab_mode`` is not a known mode."""
    plugin = SwanLabPlugin()
    bad_cfg = dict(
        use_swanlab=True,
        swanlab_project="my-project",
        swanlab_mode="invalid-mode",
    )

    with pytest.raises(ValueError) as raised:
        plugin.register(bad_cfg)

    # The message must flag the bad value and mention at least these valid modes.
    message = str(raised.value)
    for token in ("Invalid swanlab_mode", "cloud", "local"):
        assert token in message
def test_register_cloud_mode_without_api_key_warns(self, caplog):
    """Registering in cloud mode with an empty environment logs an API-key warning."""
    cloud_cfg = {
        "use_swanlab": True,
        "swanlab_project": "my-project",
        "swanlab_mode": "cloud",
    }
    plugin = SwanLabPlugin()

    # Run with a completely empty environment so no API key can be picked up;
    # patch.dict restores the real environment when the block exits.
    with patch.dict(os.environ, {}, clear=True), caplog.at_level(logging.WARNING):
        plugin.register(cloud_cfg)

    assert any("API key" in record.message for record in caplog.records)
def test_pre_model_load_import_error(self):
    """Test that missing swanlab package raises clear ImportError.

    Only the ``swanlab`` import is made to fail. Replacing
    ``builtins.__import__`` wholesale would break *every* import executed by
    ``pre_model_load``, so the test could not tell which import produced the
    error.
    """
    import sys

    plugin = SwanLabPlugin()
    cfg = MagicMock()
    cfg.use_swanlab = True

    # A ``None`` entry in sys.modules makes ``import swanlab`` raise
    # ModuleNotFoundError (a subclass of ImportError); patch.dict restores the
    # real entry when the block exits.
    with patch.dict(sys.modules, {"swanlab": None}):
        with pytest.raises(ImportError) as exc_info:
            plugin.pre_model_load(cfg)

    error_msg = str(exc_info.value)
    assert "SwanLab is not installed" in error_msg
    assert "pip install swanlab" in error_msg
def test_private_deployment_hosts_passed_directly(self):
    """Private-deployment host settings end up as ``swanlab.init()`` keyword arguments."""
    from axolotl.utils.dict import DictDefault

    # init kwarg name -> value configured through the matching swanlab_* option
    expected_hosts = {
        "web_host": "https://swanlab.company.com",
        "api_host": "https://api-swanlab.company.com",
    }
    cfg = DictDefault(
        {
            "use_swanlab": True,
            "swanlab_project": "internal-project",
            "swanlab_web_host": "https://swanlab.company.com",
            "swanlab_api_host": "https://api-swanlab.company.com",
        }
    )

    init_kwargs = SwanLabPlugin()._get_swanlab_init_kwargs(cfg)

    for kwarg_name, url in expected_hosts.items():
        assert kwarg_name in init_kwargs
        assert init_kwargs[kwarg_name] == url
def test_env_vars_not_set_for_api_params(self):
    """Test that environment variables are NOT set for API parameters.

    The process environment is snapshotted with ``patch.dict`` so that
    clearing the SwanLab variables here cannot leak into other tests, and the
    variables are asserted to still be absent after ``pre_model_load`` runs.
    ``swanlab.init`` is mocked, so SwanLab itself cannot set them during the call.
    """
    from axolotl.utils.dict import DictDefault

    env_keys = (
        "SWANLAB_API_KEY",
        "SWANLAB_WEB_HOST",
        "SWANLAB_API_HOST",
        "SWANLAB_MODE",
    )

    plugin = SwanLabPlugin()
    cfg = DictDefault(
        {
            "use_swanlab": True,
            "swanlab_project": "test-project",
            "swanlab_api_key": "test-key",
            "swanlab_web_host": "https://test.com",
            "swanlab_api_host": "https://api-test.com",
            "swanlab_mode": "cloud",
        }
    )

    with (
        patch.dict(os.environ),  # snapshot; original environment restored on exit
        patch("axolotl.utils.distributed.is_main_process", return_value=True),
        patch("swanlab.init"),
    ):
        # Start from a state where none of the variables exist.
        for key in env_keys:
            os.environ.pop(key, None)

        plugin.pre_model_load(cfg)

        # Collect inside the block, before the snapshot is restored.
        leaked = [key for key in env_keys if key in os.environ]

    assert not leaked, f"plugin exported SwanLab settings via env vars: {leaked}"
def test_lark_callback_not_registered_without_webhook(self):
    """No SwanLab callbacks are registered when the Lark webhook URL is unset."""
    cfg = MagicMock(
        use_swanlab=True,
        swanlab_project="test-project",
        swanlab_mode="local",
        swanlab_lark_webhook_url=None,  # No webhook
        swanlab_lark_secret=None,
    )
    plugin = SwanLabPlugin()

    with (
        patch("swanlab.init"),
        patch("swanlab.__version__", "0.3.0"),
        patch("swanlab.register_callbacks") as register_spy,
        patch("axolotl.utils.distributed.is_main_process", return_value=True),
        patch("axolotl.utils.distributed.get_world_size", return_value=1),
    ):
        plugin.pre_model_load(cfg)

    # The mock keeps its call history after the patch is undone.
    register_spy.assert_not_called()
def test_lark_warning_for_missing_secret(self, caplog):
    """A Lark webhook configured without an HMAC secret produces a warning."""
    cfg = MagicMock(
        use_swanlab=True,
        swanlab_project="test-project",
        swanlab_mode="local",
        swanlab_lark_webhook_url=(
            "https://open.feishu.cn/open-apis/bot/v2/hook/test-webhook"
        ),
        swanlab_lark_secret=None,  # No secret
    )
    plugin = SwanLabPlugin()

    with (
        patch("swanlab.init"),
        patch("swanlab.__version__", "0.3.0"),
        patch("swanlab.register_callbacks"),
        patch("axolotl.utils.distributed.is_main_process", return_value=True),
        patch("axolotl.utils.distributed.get_world_size", return_value=1),
        patch("swanlab.plugin.notification.LarkCallback"),
        caplog.at_level(logging.WARNING),
    ):
        plugin.pre_model_load(cfg)

    captured = [record.message for record in caplog.records]
    # One message must describe the problem, one must name the config key.
    assert any("no secret configured" in text.lower() for text in captured)
    assert any("swanlab_lark_secret" in text for text in captured)
def test_lifecycle_with_multi_logger_warning(self, caplog):
    """Registering with both SwanLab and WandB enabled logs a multi-logger warning."""
    both_loggers_cfg = dict(
        use_swanlab=True,
        swanlab_project="test-project",
        use_wandb=True,
    )
    plugin = SwanLabPlugin()

    with caplog.at_level(logging.WARNING):
        plugin.register(both_loggers_cfg)

    assert any(
        "Multiple logging tools" in record.message for record in caplog.records
    )
def test_add_dpo_completion(self):
    """A DPO completion is buffered with every supplied field stored under its name."""
    from axolotl.integrations.swanlab.completion_logger import CompletionLogger

    sample = {
        "step": 0,
        "prompt": "What is AI?",
        "chosen": "Artificial Intelligence is...",
        "rejected": "AI means...",
        "reward_diff": 0.5,
    }

    logger = CompletionLogger(maxlen=10)
    logger.add_dpo_completion(**sample)

    assert len(logger) == 1

    stored = logger.data[0]
    for field, value in sample.items():
        assert stored[field] == value
def test_memory_bounded_buffer(self):
    """With ``maxlen=3``, adding five entries keeps only the three newest, in order."""
    from axolotl.integrations.swanlab.completion_logger import CompletionLogger

    capacity, total_added = 3, 5
    logger = CompletionLogger(maxlen=capacity)

    step = 0
    while step < total_added:
        logger.add_dpo_completion(
            step=step,
            prompt=f"Prompt {step}",
            chosen=f"Chosen {step}",
            rejected=f"Rejected {step}",
        )
        step += 1

    assert len(logger) == 3

    # Steps 0 and 1 were dropped; index 0 is the oldest survivor.
    kept_steps = [logger.data[index]["step"] for index in range(3)]
    assert kept_steps == [2, 3, 4]
def test_log_to_swanlab_success(self):
    """With an active run, ``log_to_swanlab`` returns True and logs one table row."""
    from axolotl.integrations.swanlab.completion_logger import CompletionLogger

    logger = CompletionLogger(maxlen=10)
    logger.add_dpo_completion(
        step=0,
        prompt="Test prompt",
        chosen="Chosen response",
        rejected="Rejected response",
        reward_diff=0.5,
    )

    table_stub = MagicMock()
    with (
        # A non-None run object stands in for an initialized SwanLab session.
        patch("swanlab.get_run", return_value=MagicMock()),
        patch("swanlab.log") as log_spy,
        patch("swanlab.echarts.Table", return_value=table_stub),
    ):
        outcome = logger.log_to_swanlab(table_name="test_table")

    assert outcome is True
    log_spy.assert_called_once()
    table_stub.add.assert_called_once()
def test_on_train_end_logs_completions(self):
    """``on_train_end`` logs buffered completions exactly once."""
    from axolotl.integrations.swanlab.callbacks import SwanLabRLHFCompletionCallback

    callback = SwanLabRLHFCompletionCallback()
    callback.trainer_type = "dpo"

    # Leave one entry in the buffer so there is something to log.
    callback.logger.add_dpo_completion(
        step=0,
        prompt="Test",
        chosen="A",
        rejected="B",
    )

    hook_kwargs = {
        "args": MagicMock(),
        "state": MagicMock(global_step=100),
        "control": MagicMock(),
    }
    with patch.object(callback.logger, "log_to_swanlab") as log_spy:
        callback.on_train_end(**hook_kwargs)

    log_spy.assert_called_once()
def test_completion_callback_not_registered_for_non_rlhf_trainer(self):
    """``post_trainer_create`` adds no completion callback to an ``AxolotlTrainer`` mock."""
    from axolotl.utils.dict import DictDefault

    cfg_obj = DictDefault(
        {
            "use_swanlab": True,
            "swanlab_project": "test-project",
            "swanlab_log_completions": True,
        }
    )

    # Trainer whose class name is the plain "AxolotlTrainer" (not RLHF).
    sft_trainer = MagicMock(
        state=MagicMock(max_steps=1000),
        args=MagicMock(),
    )
    sft_trainer.__class__.__name__ = "AxolotlTrainer"

    plugin = SwanLabPlugin()
    plugin.swanlab_initialized = True

    with patch("swanlab.config.update"):
        plugin.post_trainer_create(cfg_obj, sft_trainer)

    sft_trainer.add_callback.assert_not_called()
def test_profiling_context_logs_duration(self):
    """The profiling context logs one duration metric named after trainer class and function."""
    from axolotl.integrations.swanlab.profiling import swanlab_profiling_context

    metric_key = "profiling/Time taken: TestTrainer.test_function"

    trainer = MagicMock()
    trainer.cfg = MagicMock(use_swanlab=True)
    trainer.__class__.__name__ = "TestTrainer"

    with (
        # A non-None run object stands in for an initialized SwanLab session.
        patch("swanlab.get_run", return_value=MagicMock()),
        patch("swanlab.log") as log_spy,
    ):
        with swanlab_profiling_context(trainer, "test_function"):
            time.sleep(0.01)  # Simulate work

    log_spy.assert_called_once()
    payload = log_spy.call_args[0][0]
    assert metric_key in payload
    # The recorded duration must cover at least the 10 ms that were slept.
    assert payload[metric_key] >= 0.01
def test_profiling_decorator(self):
    """``swanlab_profile`` leaves the method's result intact and logs one timing metric."""
    from axolotl.integrations.swanlab.profiling import swanlab_profile

    class MockTrainer:
        """Minimal trainer stand-in exposing a ``cfg`` with SwanLab enabled."""

        def __init__(self):
            self.cfg = MagicMock(use_swanlab=True)

        @swanlab_profile
        def expensive_method(self, x):
            time.sleep(0.01)
            return x * 2

    trainer = MockTrainer()

    with (
        patch("swanlab.get_run", return_value=MagicMock()),
        patch("swanlab.log") as log_spy,
    ):
        doubled = trainer.expensive_method(5)

    # The wrapped method still returns its own value.
    assert doubled == 10

    # One metric was logged, keyed by the class and method names.
    log_spy.assert_called_once()
    payload = log_spy.call_args[0][0]
    assert "profiling/Time taken: MockTrainer.expensive_method" in payload
False # 2nd call assert config.should_log("func3", 0.002) is False # 3rd call assert config.should_log("func3", 0.002) is False # 4th call assert config.should_log("func3", 0.002) is True # 5th call (interval=5) def test_profiling_config_when_disabled(self): """Test ProfilingConfig when disabled.""" from axolotl.integrations.swanlab.profiling import ProfilingConfig config = ProfilingConfig(enabled=False) # Should never log when disabled assert config.should_log("func1", 100.0) is False def test_profiling_context_advanced(self): """Test advanced profiling context with custom config.""" from axolotl.integrations.swanlab.profiling import ( ProfilingConfig, swanlab_profiling_context_advanced, ) mock_trainer = MagicMock() mock_trainer.cfg = MagicMock(use_swanlab=True) mock_trainer.__class__.__name__ = "TestTrainer" # Config that filters out very fast operations config = ProfilingConfig(min_duration_ms=10.0) # 10ms minimum with patch("swanlab.get_run") as mock_get_run, patch("swanlab.log") as mock_log: mock_get_run.return_value = MagicMock() # Fast operation (< 10ms) - should NOT log with swanlab_profiling_context_advanced(mock_trainer, "fast_op", config): time.sleep(0.001) # 1ms mock_log.assert_not_called() # Slow operation (> 10ms) - should log with swanlab_profiling_context_advanced(mock_trainer, "slow_op", config): time.sleep(0.015) # 15ms mock_log.assert_called_once() def test_profiling_with_exception(self): """Test that profiling still logs even when exception occurs.""" from axolotl.integrations.swanlab.profiling import swanlab_profiling_context mock_trainer = MagicMock() mock_trainer.cfg = MagicMock(use_swanlab=True) mock_trainer.__class__.__name__ = "TestTrainer" with patch("swanlab.get_run") as mock_get_run, patch("swanlab.log") as mock_log: mock_get_run.return_value = MagicMock() try: with swanlab_profiling_context(mock_trainer, "error_function"): time.sleep(0.01) raise ValueError("Test error") except ValueError: pass # Expected # Should still log duration 
even with exception mock_log.assert_called_once() ================================================ FILE: tests/monkeypatch/test_llama_attn_hijack_flash.py ================================================ """ Unit tests for the monkeypatch utils """ import unittest import torch from axolotl.monkeypatch.utils import ( get_cu_seqlens, get_cu_seqlens_from_pos_ids, get_max_seqlen_in_batch, get_unpad_data, ) class TestMonkeyPatchUtils(unittest.TestCase): """ Unit test class for monkeypatch utils """ def test_get_cu_seqlens_1d(self): attn_mask = torch.tensor([[1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 0, 0]]) target_res = torch.tensor([0, 4, 7, 12, 14, 16], dtype=torch.int32) self.assertTrue(torch.allclose(get_cu_seqlens(attn_mask)[0], target_res)) def test_get_cu_seqlens_from_pos_ids_1d(self): position_ids = torch.tensor([[0, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 4, 0, 1, 0, 0]]) target_res = torch.tensor([0, 4, 7, 12, 14, 16], dtype=torch.int32) self.assertTrue( torch.allclose(get_cu_seqlens_from_pos_ids(position_ids)[0], target_res) ) def test_get_cu_seqlens_from_pos_ids_2d(self): position_ids = torch.tensor( [ [0, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 4, 0, 1, 0, 0], [0, 1, 2, 3, 4, 0, 1, 2, 0, 1, 2, 3, 4, 5, 6, 0], ] ) target_res = torch.tensor( [[0, 4, 7, 12, 14, 16], [0, 5, 8, 15, 16, 16]], dtype=torch.int32 ) self.assertTrue( torch.allclose(get_cu_seqlens_from_pos_ids(position_ids)[0], target_res) ) def test_get_max_seqlen_in_batch(self): attn_mask = torch.tensor([[1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 0, 0]]) target_res = torch.tensor([4, 3, 5, 2], dtype=torch.int32) self.assertTrue(torch.allclose(get_max_seqlen_in_batch(attn_mask), target_res)) def test_get_unpad_data(self): attn_mask = torch.tensor([[1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 0, 0]]) target_indices = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]) target_cu_seqlen = torch.tensor([0, 4, 7, 12, 14], dtype=torch.int32) target_max_seqlen_in_batch = 5 indices, cu_seqlen, max_seqlen_in_batch = 
get_unpad_data(attn_mask) self.assertTrue(torch.allclose(target_indices, indices)) self.assertTrue(torch.allclose(target_cu_seqlen, cu_seqlen)) self.assertEqual(target_max_seqlen_in_batch, max_seqlen_in_batch) attn_mask = torch.tensor( [ [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 0, 0], [1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5], ] ) target_indices = torch.tensor( [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ] ) target_cu_seqlen = torch.tensor( [0, 4, 7, 12, 14, 17, 22, 24, 27, 30], dtype=torch.int32 ) target_max_seqlen_in_batch = 5 indices, cu_seqlen, max_seqlen_in_batch = get_unpad_data(attn_mask) self.assertTrue(torch.allclose(target_indices, indices)) self.assertTrue(torch.allclose(target_cu_seqlen, cu_seqlen)) self.assertEqual(target_max_seqlen_in_batch, max_seqlen_in_batch) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/monkeypatch/test_pixtral_flash_attention_patch.py ================================================ """Integration tests for Pixtral Flash Attention patches.""" import pytest import torch class TestPixtralFlashAttentionPatchIntegration: """Test Pixtral Flash Attention patch integration.""" @pytest.mark.integration def test_pixtral_flash_attention_patch(self): """Test that Pixtral Flash Attention patch can be applied and works correctly.""" try: from transformers import modeling_flash_attention_utils except ImportError: pytest.skip("Flash Attention utils not available") from axolotl.monkeypatch.models.pixtral.modeling_flash_attention_utils import ( apply_patch_is_packed_sequence, ) # Store original method original_is_packed_sequence = modeling_flash_attention_utils._is_packed_sequence # Apply patch and get unpatch function unpatch_fn = apply_patch_is_packed_sequence() # Verify patch was applied assert ( modeling_flash_attention_utils._is_packed_sequence != original_is_packed_sequence ), "_is_packed_sequence was not 
patched" # Test the patched function with 1D position_ids patched_fn = modeling_flash_attention_utils._is_packed_sequence # Test 1D position_ids 1 sequence position_ids_1d = torch.tensor([0, 1, 2, 3]) result = patched_fn(position_ids_1d, batch_size=1) assert isinstance(result, bool), "Function should return a boolean" assert result is False, "1D sequential position_ids should not be packed" # Test 1D packed 2 sequences position_ids_1d_packed = torch.tensor([0, 1, 2, 0, 1, 2]) result = patched_fn(position_ids_1d_packed, batch_size=1) assert isinstance(result, bool), "Function should return a boolean" assert result is True, "1D packed position_ids should be detected as packed" # Test 2D packed 2 sequences position_ids_2d_packed = torch.tensor([[0, 1, 2, 3, 0, 1]]) result = patched_fn(position_ids_2d_packed, batch_size=1) assert isinstance(result, bool), "Function should return a boolean" assert result is True, "2D packed position_ids should be detected as packed" # Test 2D 1 sequence position_ids_2d_normal = torch.tensor([[0, 1, 2, 3, 4, 5]]) result = patched_fn(position_ids_2d_normal, batch_size=1) assert isinstance(result, bool), "Function should return a boolean" assert result is False, "2D sequential position_ids should not be packed" # Test 2D batch size 2 position_ids_2d_normal = torch.tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8]]) result = patched_fn(position_ids_2d_normal, batch_size=2) assert isinstance(result, bool), "Function should return a boolean" assert result is False, "2D position_ids batch 2 should not be packed" # Test None case result = patched_fn(None, batch_size=1) assert isinstance(result, bool), "Function should return a boolean" assert result is False, "None position_ids should return False" # Test unpatch function unpatch_fn() assert ( modeling_flash_attention_utils._is_packed_sequence == original_is_packed_sequence ), "unpatch function did not restore original method" ================================================ FILE: 
tests/monkeypatch/test_qwen3_next_modeling_patch.py ================================================ """Integration tests for Qwen3 Next modeling patches.""" import pytest import torch # Skip entire module if qwen3_next not available qwen3_next = pytest.importorskip("transformers.models.qwen3_next.modeling_qwen3_next") class TestQwen3NextModelingPatchIntegration: """Test Qwen3 Next modeling patch integration.""" @pytest.mark.integration def test_qwen3_next_decoder_layer_patch(self): """Test that Qwen3Next decoder layer patch can be applied.""" from axolotl.monkeypatch.models.qwen3_next.modeling import ( patch_qwen3_next_decoder_layer, ) # Store original method original_forward = qwen3_next.Qwen3NextDecoderLayer.forward # Apply patch and get unpatch function unpatch_fn = patch_qwen3_next_decoder_layer() # Verify patch was applied assert qwen3_next.Qwen3NextDecoderLayer.forward != original_forward, ( "decoder layer forward method was not patched" ) # Verify the method is still callable assert callable(qwen3_next.Qwen3NextDecoderLayer.forward), ( "Patched method is not callable" ) # Test unpatch function if unpatch_fn: unpatch_fn() assert qwen3_next.Qwen3NextDecoderLayer.forward == original_forward, ( "unpatch function did not restore original method" ) @pytest.mark.integration def test_qwen3_next_gateddelta_layer_patch(self): """Test that Qwen3Next GatedDeltaNet patch can be applied.""" from axolotl.monkeypatch.models.qwen3_next.modeling import ( patch_qwen3_next_gateddelta_layer, ) # Store original method original_forward = qwen3_next.Qwen3NextGatedDeltaNet.forward # Apply patch and get unpatch function unpatch_fn = patch_qwen3_next_gateddelta_layer() # Verify patch was applied assert qwen3_next.Qwen3NextGatedDeltaNet.forward != original_forward, ( "GatedDeltaNet forward method was not patched" ) # Verify the method is still callable assert callable(qwen3_next.Qwen3NextGatedDeltaNet.forward), ( "Patched method is not callable" ) # Test unpatch function if 
unpatch_fn: unpatch_fn() assert qwen3_next.Qwen3NextGatedDeltaNet.forward == original_forward, ( "unpatch function did not restore original method" ) @pytest.mark.integration def test_qwen3_next_imports_patch(self): """Test that Qwen3Next imports patch can be applied without errors.""" from axolotl.monkeypatch.models.qwen3_next.modeling import ( patch_qwen3_next_imports, ) # Apply patch - should not raise any exceptions even if modules unavailable unpatch_fn = patch_qwen3_next_imports() # Test that unpatch function is returned (or None if skipped) assert unpatch_fn is None or callable(unpatch_fn), ( "patch_qwen3_next_imports should return None or callable unpatch function" ) @pytest.mark.integration def test_qwen3_next_modeling_packing_patch(self): """Test that all Qwen3Next modeling patches can be applied together.""" from axolotl.monkeypatch.models.qwen3_next.modeling import ( patch_qwen3_next_modeling_packing, ) # This should not raise any exceptions patch_qwen3_next_modeling_packing() @pytest.mark.integration def test_get_cu_seqlens_utility(): """Test the get_cu_seqlens utility function.""" from axolotl.monkeypatch.models.qwen3_next.modeling import get_cu_seqlens # Test with simple position_ids position_ids = torch.tensor([[0, 1, 2, 0, 1]]) cu_seqlens = get_cu_seqlens(position_ids) assert cu_seqlens.dtype == torch.int32, "Should be int32 dtype" # Should return tensor with start positions and total length expected = torch.tensor([0, 3, 5], dtype=torch.int32) assert torch.equal(cu_seqlens, expected), f"Expected {expected}, got {cu_seqlens}" ================================================ FILE: tests/monkeypatch/test_trainer_accelerator_args.py ================================================ """ Unit tests for trainer accelerator args monkeypatch """ import unittest from axolotl.monkeypatch.trainer_accelerator_args import ( check_create_accelerate_code_is_patchable, ) class TestTrainerAcceleratorArgs(unittest.TestCase): """ Unit test class for trainer 
accelerator args monkeypatch """ def test_check_create_accelerate_code_is_patchable(self): """ Test that the upstream transformers code is still patchable. This will fail if the patched code changes upstream. """ assert check_create_accelerate_code_is_patchable() if __name__ == "__main__": unittest.main() ================================================ FILE: tests/monkeypatch/test_trainer_context_parallel_patch.py ================================================ """Tests for the HF Trainer context parallel patch.""" import pytest from transformers import Trainer from axolotl.monkeypatch.transformers.trainer_context_parallel import ( GUARD_PATTERN, PATCHED_GUARD, patch_prepare_context_parallel_inputs, ) @pytest.fixture def restore_trainer_prepare_method(): """Ensure Trainer._prepare_context_parallel_inputs is restored after a test.""" original_method = getattr( Trainer, "_original_prepare_context_parallel_inputs", Trainer._prepare_context_parallel_inputs, ) patched_attr_present = hasattr( Trainer, "_axolotl_prepare_context_parallel_inputs_patched" ) yield Trainer._prepare_context_parallel_inputs = original_method if patched_attr_present: delattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched") if hasattr(Trainer, "_original_prepare_context_parallel_inputs"): delattr(Trainer, "_original_prepare_context_parallel_inputs") if hasattr(Trainer, "_axolotl_prepare_context_parallel_inputs_source"): delattr(Trainer, "_axolotl_prepare_context_parallel_inputs_source") def test_patch_attention_guard(restore_trainer_prepare_method): """Patch should swap the guard to allow sdpa or flash attention.""" # Ensure we start from the unpatched method if hasattr(Trainer, "_original_prepare_context_parallel_inputs"): Trainer._prepare_context_parallel_inputs = ( Trainer._original_prepare_context_parallel_inputs ) delattr(Trainer, "_original_prepare_context_parallel_inputs") if hasattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched"): delattr(Trainer, 
"_axolotl_prepare_context_parallel_inputs_patched") patch_prepare_context_parallel_inputs() patched_method = Trainer._prepare_context_parallel_inputs assert patched_method is not None assert getattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched", False) source = Trainer._axolotl_prepare_context_parallel_inputs_source assert GUARD_PATTERN not in source assert PATCHED_GUARD in source def test_patch_is_idempotent(restore_trainer_prepare_method): """Calling the patch twice should leave the same patched function in place.""" patch_prepare_context_parallel_inputs() first_patched = Trainer._prepare_context_parallel_inputs patch_prepare_context_parallel_inputs() second_patched = Trainer._prepare_context_parallel_inputs assert first_patched is second_patched ================================================ FILE: tests/monkeypatch/test_trainer_loss_calc.py ================================================ """Unit tests for trainer loss calc monkeypatch.""" import unittest from axolotl.monkeypatch.transformers.trainer_loss_calc import ( check_evaluation_loop_is_patchable, check_maybe_log_save_evaluate_is_patchable, ) class TestTrainerLossCalc(unittest.TestCase): """ Unit test class for trainer loss calc monkeypatch """ def test_trainer_loss_calc_is_patchable(self): """ Test that the upstream transformers code is still patchable. This will fail if the patched code changes upstream. """ assert check_evaluation_loop_is_patchable() assert check_maybe_log_save_evaluate_is_patchable() if __name__ == "__main__": unittest.main() ================================================ FILE: tests/monkeypatch/test_trl_vllm.py ================================================ """Unit tests for TRL vLLM monkeypatches. 
Tests:
- split_tensor_dict: scalar type preservation (int/float/bool)
- shuffle_sequence_dict: scalar type preservation
- extract_logprobs: NaN → 0.0 replacement
- VLLMClient.batch_update_named_params: method exists after patch
- VLLMGeneration: weight_sync_chunk_size attribute after patch
- Patch idempotency: applying patch twice doesn't break anything
"""

import unittest
from dataclasses import dataclass
from unittest.mock import MagicMock

import torch


class TestSplitTensorDict(unittest.TestCase):
    """Tests for patched split_tensor_dict."""

    def setUp(self):
        # Imported lazily so the module can be collected before the import
        # is attempted; each test calls the helper through ``self.split``.
        from axolotl.monkeypatch.trainer.trl_vllm import _patched_split_tensor_dict

        self.split = _patched_split_tensor_dict

    def test_scalar_int_preserved(self):
        """An int value is expected unchanged in every chunk."""
        d = {"a": torch.randn(4, 3), "count": 42}
        chunks = self.split(d, 2)
        self.assertEqual(len(chunks), 2)
        self.assertEqual(chunks[0]["count"], 42)
        self.assertEqual(chunks[1]["count"], 42)

    def test_scalar_float_preserved(self):
        """A float value is expected unchanged in every chunk."""
        d = {"a": torch.randn(6, 2), "lr": 1e-5}
        chunks = self.split(d, 3)
        for c in chunks:
            self.assertEqual(c["lr"], 1e-5)

    def test_scalar_bool_preserved(self):
        """A True flag is expected to stay truthy in every chunk."""
        d = {"a": torch.randn(4, 2), "flag": True}
        chunks = self.split(d, 2)
        for c in chunks:
            self.assertTrue(c["flag"])

    def test_none_preserved(self):
        """A None value is expected to stay None in every chunk."""
        d = {"a": torch.randn(4, 2), "b": None}
        chunks = self.split(d, 2)
        for c in chunks:
            self.assertIsNone(c["b"])

    def test_tensor_split(self):
        """A 4-row tensor split in two yields the first and last two rows."""
        t = torch.arange(8).reshape(4, 2)
        d = {"a": t, "n": 10}
        chunks = self.split(d, 2)
        self.assertEqual(chunks[0]["a"].shape, (2, 2))
        self.assertEqual(chunks[1]["a"].shape, (2, 2))
        torch.testing.assert_close(chunks[0]["a"], t[:2])
        torch.testing.assert_close(chunks[1]["a"], t[2:])

    def test_0d_tensor_preserved(self):
        """A 0-d tensor is expected to keep its value in every chunk."""
        d = {"a": torch.randn(4, 2), "scalar_t": torch.tensor(3.14)}
        chunks = self.split(d, 2)
        for c in chunks:
            self.assertAlmostEqual(c["scalar_t"].item(), 3.14, places=5)

    def test_list_split(self):
        """A 4-item list split in two yields the first and last two items."""
        d = {"a": torch.randn(4, 2), "names": ["a", "b", "c", "d"]}
        chunks = self.split(d, 2)
        self.assertEqual(chunks[0]["names"], ["a", "b"])
        self.assertEqual(chunks[1]["names"], ["c", "d"])


class TestShuffleSequenceDict(unittest.TestCase):
    """Tests for patched shuffle_sequence_dict."""

    def setUp(self):
        from axolotl.monkeypatch.trainer.trl_vllm import _patched_shuffle_sequence_dict

        self.shuffle = _patched_shuffle_sequence_dict

    def test_scalar_int_preserved(self):
        """An int value is expected unchanged after shuffling."""
        d = {"a": torch.randn(4, 3), "count": 42}
        result = self.shuffle(d)
        self.assertEqual(result["count"], 42)

    def test_scalar_float_preserved(self):
        """A float value is expected unchanged after shuffling."""
        d = {"a": torch.randn(4, 3), "lr": 1e-5}
        result = self.shuffle(d)
        self.assertEqual(result["lr"], 1e-5)

    def test_scalar_bool_preserved(self):
        """A False flag is expected to stay falsy after shuffling."""
        d = {"a": torch.randn(4, 3), "flag": False}
        result = self.shuffle(d)
        self.assertFalse(result["flag"])

    def test_none_preserved(self):
        """A None value is expected to stay None after shuffling."""
        d = {"a": torch.randn(4, 3), "b": None}
        result = self.shuffle(d)
        self.assertIsNone(result["b"])

    def test_tensor_permuted(self):
        """Shuffling keeps a tensor's elements and shape; order may change."""
        torch.manual_seed(42)
        t = torch.arange(4).float()
        d = {"a": t}
        result = self.shuffle(d)
        # Same elements, possibly different order
        self.assertEqual(sorted(result["a"].tolist()), sorted(t.tolist()))
        self.assertEqual(result["a"].shape, t.shape)

    def test_list_permuted(self):
        """Shuffling keeps a list's items and length; order may change."""
        torch.manual_seed(42)
        d = {"a": torch.randn(3, 2), "names": ["x", "y", "z"]}
        result = self.shuffle(d)
        self.assertEqual(sorted(result["names"]), ["x", "y", "z"])
        self.assertEqual(len(result["names"]), 3)

    def test_0d_tensor_preserved(self):
        """A 0-d tensor is expected to keep its value after shuffling."""
        d = {"a": torch.randn(4, 2), "scalar_t": torch.tensor(3.14)}
        result = self.shuffle(d)
        self.assertAlmostEqual(result["scalar_t"].item(), 3.14, places=5)


class TestExtractLogprobs(unittest.TestCase):
    """Tests for patched extract_logprobs (NaN → 0.0)."""

    def setUp(self):
        from axolotl.monkeypatch.trainer.trl_vllm import _patched_extract_logprobs

        self.extract = _patched_extract_logprobs

    def _make_output(self, logprob_values):
        """Create a mock vLLM RequestOutput with given logprob values."""

        @dataclass
        class LogprobItem:
            logprob: float
            rank: int

        @dataclass
        class SeqOutput:
            logprobs: list[dict[int, LogprobItem]] | None

        @dataclass
        class RequestOutput:
            outputs: list[SeqOutput]

        # One dict per inner list of ``logprob_values``; keys (used as token
        # ids) and ranks are simply the enumerate index of each value.
        logprobs_list = []
        for vals in logprob_values:
            lp_dict = {i: LogprobItem(logprob=v, rank=i) for i, v in enumerate(vals)}
            logprobs_list.append(lp_dict)
        return RequestOutput(outputs=[SeqOutput(logprobs=logprobs_list)])

    def test_nan_replaced_with_zero(self):
        """NaN logprobs come back as 0.0; finite values are unchanged."""
        output = self._make_output([[float("nan"), 0.5], [-0.3, float("nan")]])
        logprobs, token_ids = self.extract([output])
        self.assertEqual(logprobs[0][0][0], 0.0)  # NaN → 0.0
        self.assertEqual(logprobs[0][0][1], 0.5)
        self.assertEqual(logprobs[0][1][0], -0.3)
        self.assertEqual(logprobs[0][1][1], 0.0)  # NaN → 0.0

    def test_normal_values_preserved(self):
        """Finite logprobs are returned (approximately) unchanged."""
        output = self._make_output([[-0.5, -1.2], [-0.1, -2.0]])
        logprobs, token_ids = self.extract([output])
        self.assertAlmostEqual(logprobs[0][0][0], -0.5)
        self.assertAlmostEqual(logprobs[0][0][1], -1.2)

    def test_none_logprobs_returns_none(self):
        """When a sequence has no logprobs, both return values are None."""

        @dataclass
        class SeqOutput:
            logprobs: None = None

        @dataclass
        class RequestOutput:
            outputs: list

        output = RequestOutput(outputs=[SeqOutput()])
        logprobs, token_ids = self.extract([output])
        self.assertIsNone(logprobs)
        self.assertIsNone(token_ids)

    def test_token_ids_extracted(self):
        """Token ids are taken from the keys of each logprob dict."""
        output = self._make_output([[-0.5]])
        logprobs, token_ids = self.extract([output])
        self.assertEqual(token_ids[0][0], [0])  # token_id=0 from enumerate


class TestPatchApplication(unittest.TestCase):
    """Tests for patch_trl_vllm() application."""

    def test_batch_update_added_to_client(self):
        """After patching, VLLMClient has ``batch_update_named_params``."""
        from axolotl.monkeypatch.trainer.trl_vllm import patch_trl_vllm

        patch_trl_vllm()
        from trl.generation.vllm_client import VLLMClient

        self.assertTrue(hasattr(VLLMClient, "batch_update_named_params"))

    def test_extract_logprobs_patched(self):
        """After patching, TRL's ``extract_logprobs`` is the patched function."""
        from axolotl.monkeypatch.trainer.trl_vllm import (
            _patched_extract_logprobs,
            patch_trl_vllm,
        )

        patch_trl_vllm()
        from trl.generation import vllm_generation

        self.assertIs(vllm_generation.extract_logprobs, _patched_extract_logprobs)

    def test_utils_patched(self):
        """After patching, both TRL trainer utils are the patched functions."""
        from axolotl.monkeypatch.trainer.trl_vllm import (
            _patched_shuffle_sequence_dict,
            _patched_split_tensor_dict,
            patch_trl_vllm,
        )

        patch_trl_vllm()
        import trl.trainer.utils

        self.assertIs(trl.trainer.utils.split_tensor_dict, _patched_split_tensor_dict)
        self.assertIs(
            trl.trainer.utils.shuffle_sequence_dict, _patched_shuffle_sequence_dict
        )

    def test_patch_idempotent(self):
        """Applying the patch twice must not raise or remove the new method."""
        from axolotl.monkeypatch.trainer.trl_vllm import patch_trl_vllm

        patch_trl_vllm()
        patch_trl_vllm()  # second call should not error
        from trl.generation.vllm_client import VLLMClient

        self.assertTrue(hasattr(VLLMClient, "batch_update_named_params"))


class TestBatchUpdateChunking(unittest.TestCase):
    """Tests for batch_update_named_params chunking logic."""

    def test_no_chunk_single_batch(self):
        """With ``chunk_size=None`` all params are sent in one HTTP call."""
        from axolotl.monkeypatch.trainer.trl_vllm import _batch_update_named_params

        # Test that with chunk_size=None, all params go in one chunk
        client = MagicMock()
        client.base_url = "http://localhost:8000"
        client.session.post.return_value = MagicMock(status_code=200)
        client.communicator = MagicMock()
        client.communicator.group = MagicMock()
        client.rank = 0

        params = [
            ("layer.0.weight", torch.randn(10, 10)),
            ("layer.1.weight", torch.randn(10, 10)),
        ]

        _batch_update_named_params(client, params, chunk_size=None)

        # Should make exactly 1 HTTP call
        self.assertEqual(client.session.post.call_count, 1)

    def test_chunk_splits_params(self):
        """Three 100-element params with ``chunk_size=150`` give three calls."""
        from axolotl.monkeypatch.trainer.trl_vllm import _batch_update_named_params

        client = MagicMock()
        client.base_url = "http://localhost:8000"
        client.session.post.return_value = MagicMock(status_code=200)
        client.communicator = MagicMock()
        client.communicator.group = MagicMock()
        client.rank = 0

        params = [
            ("a", torch.randn(100)),  # 100 elements
            ("b", torch.randn(100)),  # 100 elements
            ("c", torch.randn(100)),  # 100 elements
        ]

        _batch_update_named_params(client, params, chunk_size=150)

        # Each param alone fits (100 < 150) but any two together do not
        # (100 + 100 = 200 > 150), so every param is expected to be flushed
        # as its own chunk: [a], [b], [c] → 3 HTTP calls.
        self.assertEqual(client.session.post.call_count, 3)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/monkeypatch/test_voxtral_modeling_patch.py
================================================
"""Integration tests for Voxtral modeling patches."""

import pytest


class TestVoxtralModelingPatchIntegration:
    """Test Voxtral modeling patch integration."""

    @pytest.mark.integration
    def test_voxtral_conditional_generation_patch(self):
        """Test that Voxtral conditional generation patch can be applied."""
        # Skip (rather than fail) when the installed transformers build does
        # not ship the Voxtral model.
        try:
            from transformers.models.voxtral.modeling_voxtral import (
                VoxtralForConditionalGeneration,
            )
        except ImportError:
            pytest.skip("VoxtralForConditionalGeneration not available")

        from axolotl.monkeypatch.models.voxtral.modeling import (
            patch_voxtral_conditional_generation_forward,
        )

        # Store original method
        original_forward = VoxtralForConditionalGeneration.forward

        # Apply patch and get unpatch function
        unpatch_fn = patch_voxtral_conditional_generation_forward()

        # Verify patch was applied
        assert VoxtralForConditionalGeneration.forward != original_forward, (
            "forward method was not patched"
        )

        # Verify the method is still callable
        assert callable(VoxtralForConditionalGeneration.forward), (
            "Patched method is not callable"
        )

        # Test unpatch function
        unpatch_fn()
        assert VoxtralForConditionalGeneration.forward == original_forward, (
            "unpatch function did not restore original method"
        )


================================================
FILE: tests/patched/test_validation.py
================================================
"""Module for testing the validation module"""

import os
import warnings
from typing import Optional

import pytest
from pydantic
import ValidationError from axolotl.loaders.utils import check_model_config from axolotl.utils import is_comet_available from axolotl.utils.config import validate_config from axolotl.utils.dict import DictDefault from axolotl.utils.mlflow_ import setup_mlflow_env_vars from axolotl.utils.schemas.config import AxolotlConfigWCapabilities from axolotl.utils.wandb_ import setup_wandb_env_vars warnings.filterwarnings("error") @pytest.fixture(name="minimal_cfg") def fixture_cfg(): return DictDefault( { "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6", "learning_rate": 0.000001, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", } ], "micro_batch_size": 1, "gradient_accumulation_steps": 1, } ) class BaseValidation: """ Base validation module to setup the log capture """ _caplog: Optional[pytest.LogCaptureFixture] = None @pytest.fixture(autouse=True) def inject_fixtures(self, caplog): self._caplog = caplog class TestValidation(BaseValidation): """ Test the validation module """ def test_defaults(self, minimal_cfg): test_cfg = DictDefault( { "weight_decay": None, } | minimal_cfg ) cfg = validate_config(test_cfg) assert cfg.train_on_inputs is False assert cfg.weight_decay is None def test_zero3_qlora_use_reentrant_false(self, minimal_cfg): test_cfg = DictDefault( { "deepspeed": "deepspeed_configs/zero3_bf16.json", "gradient_checkpointing": True, "gradient_checkpointing_kwargs": {"use_reentrant": False}, "load_in_4bit": True, "adapter": "qlora", } | minimal_cfg ) with self._caplog.at_level("WARNING"): validate_config(test_cfg) assert ( "qlora + zero3 with use_reentrant: false may result in a CheckpointError about recomputed values" in self._caplog.records[0].message ) def test_deepspeed_empty(self, minimal_cfg): test_cfg = DictDefault( { "deepspeed": "", "gradient_checkpointing": True, "gradient_checkpointing_kwargs": {"use_reentrant": False}, "load_in_4bit": True, "adapter": "qlora", } | minimal_cfg ) _ = validate_config(test_cfg) def 
test_deepspeed_not_set(self, minimal_cfg): test_cfg = DictDefault( { "deepspeed": None, "gradient_checkpointing": True, "gradient_checkpointing_kwargs": {"use_reentrant": False}, "load_in_4bit": True, "adapter": "qlora", } | minimal_cfg ) _ = validate_config(test_cfg) def test_datasets_min_length(self): cfg = DictDefault( { "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6", "learning_rate": 0.000001, "datasets": [], "micro_batch_size": 1, "gradient_accumulation_steps": 1, } ) with pytest.raises( ValidationError, match=r".*List should have at least 1 item after validation*", ): validate_config(cfg) def test_datasets_min_length_empty(self): cfg = DictDefault( { "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6", "learning_rate": 0.000001, "micro_batch_size": 1, "gradient_accumulation_steps": 1, } ) with pytest.raises( ValueError, match=r".*either datasets or pretraining_dataset is required*" ): validate_config(cfg) def test_pretrain_dataset_min_length(self): cfg = DictDefault( { "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6", "learning_rate": 0.000001, "pretraining_dataset": [], "micro_batch_size": 1, "gradient_accumulation_steps": 1, "max_steps": 100, } ) with pytest.raises( ValidationError, match=r".*List should have at least 1 item after validation*", ): validate_config(cfg) def test_valid_pretrain_dataset(self): cfg = DictDefault( { "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6", "learning_rate": 0.000001, "pretraining_dataset": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", } ], "micro_batch_size": 1, "gradient_accumulation_steps": 1, "max_steps": 100, } ) validate_config(cfg) def test_valid_sft_dataset(self): cfg = DictDefault( { "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6", "learning_rate": 0.000001, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", } ], "micro_batch_size": 1, "gradient_accumulation_steps": 1, } ) validate_config(cfg) def test_batch_size_unused_warning(self): cfg = DictDefault( { "base_model": 
"TinyLlama/TinyLlama-1.1B-Chat-v0.6", "learning_rate": 0.000001, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", } ], "micro_batch_size": 4, "batch_size": 32, } ) with self._caplog.at_level("WARNING"): validate_config(cfg) assert "batch_size is not recommended" in self._caplog.records[0].message def test_batch_size_more_params(self): cfg = DictDefault( { "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6", "learning_rate": 0.000001, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", } ], "batch_size": 32, } ) with pytest.raises(ValueError, match=r".*At least two of*"): validate_config(cfg) def test_lr_as_float(self, minimal_cfg): cfg = ( DictDefault( { "learning_rate": "5e-5", } ) | minimal_cfg ) new_cfg = validate_config(cfg) assert new_cfg.learning_rate == 0.00005 def test_model_config_remap(self, minimal_cfg): cfg = ( DictDefault( { "model_config": {"model_type": "mistral"}, } ) | minimal_cfg ) new_cfg = validate_config(cfg) assert new_cfg.overrides_of_model_config["model_type"] == "mistral" def test_model_type_remap(self, minimal_cfg): cfg = ( DictDefault( { "model_type": "AutoModelForCausalLM", } ) | minimal_cfg ) new_cfg = validate_config(cfg) assert new_cfg.type_of_model == "AutoModelForCausalLM" def test_reward_model_defaults(self, minimal_cfg): cfg = ( DictDefault( { "reward_model": True, } ) | minimal_cfg ) new_cfg = validate_config(cfg) assert new_cfg.num_labels == 1 assert new_cfg.type_of_model == "AutoModelForSequenceClassification" def test_process_reward_model_defaults(self, minimal_cfg): cfg = ( DictDefault( { "process_reward_model": True, } ) | minimal_cfg ) new_cfg = validate_config(cfg) assert new_cfg.num_labels == 2 assert new_cfg.type_of_model == "AutoModelForTokenClassification" def test_model_revision_remap(self, minimal_cfg): cfg = ( DictDefault( { "model_revision": "main", } ) | minimal_cfg ) new_cfg = validate_config(cfg) assert new_cfg.revision_of_model == "main" def test_qlora(self, 
minimal_cfg): base_cfg = ( DictDefault( { "adapter": "qlora", } ) | minimal_cfg ) cfg = ( DictDefault( { "load_in_8bit": True, } ) | base_cfg ) with pytest.raises(ValueError, match=r".*8bit.*"): validate_config(cfg) cfg = ( DictDefault( { "gptq": True, } ) | base_cfg ) with pytest.raises(ValueError, match=r".*gptq.*"): validate_config(cfg) cfg = ( DictDefault( { "load_in_4bit": False, } ) | base_cfg ) with pytest.raises(ValueError, match=r".*4bit.*"): validate_config(cfg) cfg = ( DictDefault( { "load_in_4bit": True, } ) | base_cfg ) validate_config(cfg) def test_qlora_merge(self, minimal_cfg): base_cfg = ( DictDefault( { "adapter": "qlora", "merge_lora": True, } ) | minimal_cfg ) cfg = ( DictDefault( { "load_in_8bit": True, } ) | base_cfg ) with pytest.raises(ValueError, match=r".*8bit.*"): validate_config(cfg) cfg = ( DictDefault( { "gptq": True, } ) | base_cfg ) with pytest.raises(ValueError, match=r".*gptq.*"): validate_config(cfg) cfg = ( DictDefault( { "load_in_4bit": True, } ) | base_cfg ) with pytest.raises(ValueError, match=r".*4bit.*"): validate_config(cfg) def test_hf_use_auth_token(self, minimal_cfg): cfg = ( DictDefault( { "push_dataset_to_hub": "namespace/repo", } ) | minimal_cfg ) with pytest.raises(ValueError, match=r".*hf_use_auth_token.*"): validate_config(cfg) cfg = ( DictDefault( { "push_dataset_to_hub": "namespace/repo", "hf_use_auth_token": True, } ) | minimal_cfg ) validate_config(cfg) def test_gradient_accumulations_or_batch_size(self): cfg = DictDefault( { "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6", "learning_rate": 0.000001, "datasets": [ { "path": "mhenrichsen/alpaca_2k_test", "type": "alpaca", } ], "gradient_accumulation_steps": 1, "batch_size": 1, } ) with pytest.raises( ValueError, match=r".*gradient_accumulation_steps or batch_size.*" ): validate_config(cfg) def test_falcon_fsdp(self, minimal_cfg): regex_exp = r".*FSDP is not supported for falcon models.*" # Check for lower-case cfg = ( DictDefault( { "base_model": 
"tiiuae/falcon-7b", "fsdp": ["full_shard", "auto_wrap"], } ) | minimal_cfg ) with pytest.raises(ValueError, match=regex_exp): validate_config(cfg) # Check for upper-case cfg = ( DictDefault( { "base_model": "Falcon-7b", "fsdp": ["full_shard", "auto_wrap"], } ) | minimal_cfg ) with pytest.raises(ValueError, match=regex_exp): validate_config(cfg) cfg = ( DictDefault( { "base_model": "tiiuae/falcon-7b", } ) | minimal_cfg ) validate_config(cfg) def test_mpt_gradient_checkpointing(self, minimal_cfg): regex_exp = r".*gradient_checkpointing is not supported for MPT models*" # Check for lower-case cfg = ( DictDefault( { "base_model": "mosaicml/mpt-7b", "gradient_checkpointing": True, } ) | minimal_cfg ) with pytest.raises(ValueError, match=regex_exp): validate_config(cfg) def test_flash_optimum(self, minimal_cfg): cfg = ( DictDefault( { "flash_optimum": True, "adapter": "lora", "bf16": False, } ) | minimal_cfg ) with self._caplog.at_level("WARNING"): validate_config(cfg) assert any( "BetterTransformers probably doesn't work with PEFT adapters" in record.message for record in self._caplog.records ) cfg = ( DictDefault( { "flash_optimum": True, "bf16": False, } ) | minimal_cfg ) with self._caplog.at_level("WARNING"): validate_config(cfg) assert any( "probably set bfloat16 or float16" in record.message for record in self._caplog.records ) cfg = ( DictDefault( { "flash_optimum": True, "fp16": True, } ) | minimal_cfg ) regex_exp = r".*AMP is not supported.*" with pytest.raises(ValueError, match=regex_exp): validate_config(cfg) cfg = ( DictDefault( { "flash_optimum": True, "bf16": True, } ) | minimal_cfg ) regex_exp = r".*AMP is not supported.*" with pytest.raises(ValueError, match=regex_exp): validate_config(cfg) def test_adamw_hyperparams(self, minimal_cfg): cfg = ( DictDefault( { "optimizer": None, "adam_epsilon": 0.0001, } ) | minimal_cfg ) with self._caplog.at_level("WARNING"): validate_config(cfg) assert any( "adamw hyperparameters found, but no adamw optimizer set" in 
record.message for record in self._caplog.records ) cfg = ( DictDefault( { "optimizer": "adafactor", "adam_beta1": 0.0001, } ) | minimal_cfg ) with self._caplog.at_level("WARNING"): validate_config(cfg) assert any( "adamw hyperparameters found, but no adamw optimizer set" in record.message for record in self._caplog.records ) cfg = ( DictDefault( { "optimizer": "adamw_bnb_8bit", "adam_beta1": 0.9, "adam_beta2": 0.99, "adam_epsilon": 0.0001, } ) | minimal_cfg ) validate_config(cfg) cfg = ( DictDefault( { "optimizer": "adafactor", } ) | minimal_cfg ) validate_config(cfg) def test_deprecated_packing(self, minimal_cfg): cfg = ( DictDefault( { "max_packed_sequence_len": 1024, } ) | minimal_cfg ) with pytest.raises( DeprecationWarning, match=r"`max_packed_sequence_len` is no longer supported", ): validate_config(cfg) def test_packing(self, minimal_cfg): cfg = ( DictDefault( { "sample_packing": True, "pad_to_sequence_len": False, "flash_attention": True, } ) | minimal_cfg ) with self._caplog.at_level("WARNING"): validate_config(cfg) assert any( "`pad_to_sequence_len: true` is recommended when using sample_packing" in record.message for record in self._caplog.records ) def test_packing_autoset(self, minimal_cfg): cfg = ( DictDefault( { "sample_packing": True, "pad_to_sequence_len": None, "flash_attention": True, } ) | minimal_cfg ) with self._caplog.at_level("INFO"): cfg = validate_config(cfg) assert any( "Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing" in record.message for record in self._caplog.records ) assert cfg.pad_to_sequence_len is True def test_merge_lora_no_bf16_fail(self, minimal_cfg): """ This is assumed to be run on a CPU machine, so bf16 is not supported. 
""" cfg = ( DictDefault( { "bf16": True, "capabilities": {"bf16": False}, "env_capabilities": { "torch_version": "2.6.0", }, } ) | minimal_cfg ) with pytest.raises(ValueError, match=r".*AMP is not supported on this GPU*"): AxolotlConfigWCapabilities(**cfg.to_dict()) cfg = ( DictDefault( { "bf16": True, "merge_lora": True, "capabilities": {"bf16": False}, } ) | minimal_cfg ) validate_config(cfg) def test_no_conflict_save_strategy(self, minimal_cfg): cfg = ( DictDefault( { "save_strategy": "epoch", "save_steps": 10, } ) | minimal_cfg ) with pytest.raises( ValueError, match=r".*save_strategy and save_steps mismatch.*" ): validate_config(cfg) cfg = ( DictDefault( { "save_strategy": "no", "save_steps": 10, } ) | minimal_cfg ) with pytest.raises( ValueError, match=r".*save_strategy and save_steps mismatch.*" ): validate_config(cfg) cfg = ( DictDefault( { "save_strategy": "steps", } ) | minimal_cfg ) validate_config(cfg) cfg = ( DictDefault( { "save_strategy": "steps", "save_steps": 10, } ) | minimal_cfg ) validate_config(cfg) cfg = ( DictDefault( { "save_steps": 10, } ) | minimal_cfg ) validate_config(cfg) cfg = ( DictDefault( { "save_strategy": "no", } ) | minimal_cfg ) validate_config(cfg) def test_no_conflict_eval_strategy(self, minimal_cfg): cfg = ( DictDefault( { "eval_strategy": "epoch", "eval_steps": 10, } ) | minimal_cfg ) with pytest.raises( ValueError, match=r".*eval_strategy and eval_steps mismatch.*" ): validate_config(cfg) cfg = ( DictDefault( { "eval_strategy": "no", "eval_steps": 10, } ) | minimal_cfg ) with pytest.raises( ValueError, match=r".*eval_strategy and eval_steps mismatch.*" ): validate_config(cfg) cfg = ( DictDefault( { "eval_strategy": "steps", } ) | minimal_cfg ) validate_config(cfg) cfg = ( DictDefault( { "eval_strategy": "steps", "eval_steps": 10, } ) | minimal_cfg ) validate_config(cfg) cfg = ( DictDefault( { "eval_steps": 10, } ) | minimal_cfg ) validate_config(cfg) cfg = ( DictDefault( { "eval_strategy": "no", } ) | minimal_cfg ) 
validate_config(cfg) cfg = ( DictDefault( { "eval_strategy": "epoch", "val_set_size": 0, } ) | minimal_cfg ) with pytest.raises( ValueError, match=r".*eval_steps and eval_strategy are not supported with val_set_size == 0.*", ): validate_config(cfg) cfg = ( DictDefault( { "eval_steps": 10, "val_set_size": 0, } ) | minimal_cfg ) with pytest.raises( ValueError, match=r".*eval_steps and eval_strategy are not supported with val_set_size == 0.*", ): validate_config(cfg) cfg = ( DictDefault( { "val_set_size": 0, } ) | minimal_cfg ) validate_config(cfg) cfg = ( DictDefault( { "eval_steps": 10, "val_set_size": 0.01, } ) | minimal_cfg ) validate_config(cfg) cfg = ( DictDefault( { "eval_strategy": "epoch", "val_set_size": 0.01, } ) | minimal_cfg ) validate_config(cfg) def test_eval_table_size_conflict_eval_packing(self, minimal_cfg): cfg = ( DictDefault( { "sample_packing": True, "eval_table_size": 100, "flash_attention": True, } ) | minimal_cfg ) with pytest.raises( ValueError, match=r".*Please set 'eval_sample_packing' to false.*" ): validate_config(cfg) cfg = ( DictDefault( { "sample_packing": True, "eval_sample_packing": False, "flash_attention": True, } ) | minimal_cfg ) validate_config(cfg) cfg = ( DictDefault( { "sample_packing": False, "eval_table_size": 100, "flash_attention": True, } ) | minimal_cfg ) validate_config(cfg) cfg = ( DictDefault( { "sample_packing": True, "eval_table_size": 100, "eval_sample_packing": False, "flash_attention": True, } ) | minimal_cfg ) validate_config(cfg) def test_load_in_x_bit_without_adapter(self, minimal_cfg): cfg = ( DictDefault( { "load_in_4bit": True, } ) | minimal_cfg ) with pytest.raises( ValueError, match=r".*load_in_8bit and load_in_4bit are not supported without setting an adapter.*", ): validate_config(cfg) cfg = ( DictDefault( { "load_in_8bit": True, } ) | minimal_cfg ) with pytest.raises( ValueError, match=r".*load_in_8bit and load_in_4bit are not supported without setting an adapter.*", ): validate_config(cfg) cfg = ( 
DictDefault( { "load_in_4bit": True, "adapter": "qlora", } ) | minimal_cfg ) validate_config(cfg) cfg = ( DictDefault( { "load_in_8bit": True, "adapter": "lora", } ) | minimal_cfg ) validate_config(cfg) def test_warmup_step_no_conflict(self, minimal_cfg): cfg = ( DictDefault( { "warmup_steps": 10, "warmup_ratio": 0.1, } ) | minimal_cfg ) with pytest.raises( ValueError, match=r".*warmup_steps and warmup_ratio are mutually exclusive*", ): validate_config(cfg) cfg = ( DictDefault( { "warmup_steps": 10, } ) | minimal_cfg ) validate_config(cfg) cfg = ( DictDefault( { "warmup_ratio": 0.1, } ) | minimal_cfg ) validate_config(cfg) def test_unfrozen_parameters_w_peft_layers_to_transform(self, minimal_cfg): cfg = ( DictDefault( { "adapter": "lora", "unfrozen_parameters": [ "model.layers.2[0-9]+.block_sparse_moe.gate.*" ], "peft_layers_to_transform": [0, 1], } ) | minimal_cfg ) with pytest.raises( ValueError, match=r".*can have unexpected behavior*", ): validate_config(cfg) def test_hub_model_id_save_value_warns_save_stragey_no(self, minimal_cfg): cfg = DictDefault({"hub_model_id": "test", "save_strategy": "no"}) | minimal_cfg with self._caplog.at_level("WARNING"): validate_config(cfg) assert len(self._caplog.records) == 1 def test_hub_model_id_save_value_warns_random_value(self, minimal_cfg): cfg = ( DictDefault({"hub_model_id": "test", "save_strategy": "test"}) | minimal_cfg ) with self._caplog.at_level("WARNING"): validate_config(cfg) assert len(self._caplog.records) == 1 def test_hub_model_id_save_value_steps(self, minimal_cfg): cfg = ( DictDefault({"hub_model_id": "test", "save_strategy": "steps"}) | minimal_cfg ) with self._caplog.at_level("WARNING"): validate_config(cfg) assert len(self._caplog.records) == 0 def test_hub_model_id_save_value_epochs(self, minimal_cfg): cfg = ( DictDefault({"hub_model_id": "test", "save_strategy": "epoch"}) | minimal_cfg ) with self._caplog.at_level("WARNING"): validate_config(cfg) assert len(self._caplog.records) == 0 def 
test_hub_model_id_save_value_none(self, minimal_cfg): cfg = DictDefault({"hub_model_id": "test", "save_strategy": None}) | minimal_cfg with self._caplog.at_level("WARNING"): validate_config(cfg) assert len(self._caplog.records) == 0 def test_hub_model_id_save_value_no_set_save_strategy(self, minimal_cfg): cfg = DictDefault({"hub_model_id": "test"}) | minimal_cfg with self._caplog.at_level("WARNING"): validate_config(cfg) assert len(self._caplog.records) == 0 def test_dpo_beta_deprecation(self, minimal_cfg): cfg = DictDefault({"dpo_beta": 0.2}) | minimal_cfg with self._caplog.at_level("WARNING"): new_cfg = validate_config(cfg) assert new_cfg["rl_beta"] == 0.2 assert new_cfg["dpo_beta"] is None assert len(self._caplog.records) == 1 def test_eval_strategy_remap(self, minimal_cfg): cfg = ( DictDefault( { "evaluation_strategy": "steps", } ) | minimal_cfg ) with self._caplog.at_level("WARNING"): new_cfg = validate_config(cfg) assert new_cfg.eval_strategy == "steps" assert ( "evaluation_strategy is deprecated, use eval_strategy instead" in self._caplog.records[0].message ) def test_torch_version_adopt_req(self, minimal_cfg): cfg = ( DictDefault( { "optimizer": "adopt_adamw", } ) | minimal_cfg ) with pytest.raises( ValueError, match=r".*ADOPT optimizer is incompatible with torch version*", ): env_capabilities = {"torch_version": "2.3.0"} capabilities = {"bf16": False} _ = validate_config( cfg, capabilities=capabilities, env_capabilities=env_capabilities ) env_capabilities = {"torch_version": "2.6.0"} capabilities = {"bf16": False} _ = validate_config( cfg, capabilities=capabilities, env_capabilities=env_capabilities ) env_capabilities = {"torch_version": "2.5.2"} capabilities = {"bf16": False} _ = validate_config( cfg, capabilities=capabilities, env_capabilities=env_capabilities ) def test_cfg_throws_error_with_s2_attention_and_sample_packing(self, minimal_cfg): test_cfg = DictDefault( { "s2_attention": True, "sample_packing": True, } | minimal_cfg ) with pytest.raises( 
class TestTorchCompileValidation(BaseValidation):
    """
    checks how `torch_compile: auto` is resolved during validation
    """

    def test_torch_compile_auto(self, minimal_cfg):
        """'auto' resolves to True for torch 2.6.0, False for 2.4.1 or unknown."""
        requested = DictDefault({"torch_compile": "auto"}) | minimal_cfg

        # (env_capabilities, expected resolved value), in the original order.
        scenarios = (
            ({"torch_version": "2.6.0"}, True),
            ({"torch_version": "2.4.1"}, False),
            ({}, False),
        )
        for env_capabilities, expected in scenarios:
            updated_cfg = validate_config(
                requested,
                capabilities={"bf16": True},
                env_capabilities=env_capabilities,
            )
            assert updated_cfg.torch_compile is expected
( DictDefault( { "flash_attention": True, "sample_packing": True, "micro_batch_size": 2, "batch_flattening": "auto", } ) | minimal_cfg ) new_cfg = validate_config(cfg) assert new_cfg["batch_flattening"] is False class TestValidationCheckModelConfig(BaseValidation): """ Test the validation for the config when the model config is available """ def test_llama_add_tokens_adapter(self, minimal_cfg): cfg = ( DictDefault( {"adapter": "qlora", "load_in_4bit": True, "tokens": ["<|imstart|>"]} ) | minimal_cfg ) model_config = DictDefault({"model_type": "llama"}) with pytest.raises( ValueError, match=r".*`lora_modules_to_save` not properly set when adding new tokens*", ): check_model_config(cfg, model_config) cfg = ( DictDefault( { "adapter": "qlora", "load_in_4bit": True, "tokens": ["<|imstart|>"], "lora_modules_to_save": ["embed_tokens"], } ) | minimal_cfg ) with pytest.raises( ValueError, match=r".*`lora_modules_to_save` not properly set when adding new tokens*", ): check_model_config(cfg, model_config) cfg = ( DictDefault( { "adapter": "qlora", "load_in_4bit": True, "tokens": ["<|imstart|>"], "lora_modules_to_save": ["embed_tokens", "lm_head"], } ) | minimal_cfg ) check_model_config(cfg, model_config) def test_phi_add_tokens_adapter(self, minimal_cfg): cfg = ( DictDefault( {"adapter": "qlora", "load_in_4bit": True, "tokens": ["<|imstart|>"]} ) | minimal_cfg ) model_config = DictDefault({"model_type": "phi"}) with pytest.raises( ValueError, match=r".*`lora_modules_to_save` not properly set when adding new tokens*", ): check_model_config(cfg, model_config) cfg = ( DictDefault( { "adapter": "qlora", "load_in_4bit": True, "tokens": ["<|imstart|>"], "lora_modules_to_save": ["embd.wte", "lm_head.linear"], } ) | minimal_cfg ) with pytest.raises( ValueError, match=r".*`lora_modules_to_save` not properly set when adding new tokens*", ): check_model_config(cfg, model_config) cfg = ( DictDefault( { "adapter": "qlora", "load_in_4bit": True, "tokens": ["<|imstart|>"], 
"lora_modules_to_save": ["embed_tokens", "lm_head"], } ) | minimal_cfg ) check_model_config(cfg, model_config) class TestValidationWandb(BaseValidation): """ Validation test for wandb """ def test_wandb_set_run_id_to_name(self, minimal_cfg): cfg = ( DictDefault( { "wandb_run_id": "foo", } ) | minimal_cfg ) with self._caplog.at_level("WARNING"): new_cfg = validate_config(cfg) assert any( "wandb_run_id sets the ID of the run. If you would like to set the name, please use wandb_name instead." in record.message for record in self._caplog.records ) assert new_cfg.wandb_name == "foo" and new_cfg.wandb_run_id == "foo" cfg = ( DictDefault( { "wandb_name": "foo", } ) | minimal_cfg ) new_cfg = validate_config(cfg) assert new_cfg.wandb_name == "foo" and new_cfg.wandb_run_id is None def test_wandb_sets_env(self, minimal_cfg): cfg = ( DictDefault( { "wandb_project": "foo", "wandb_name": "bar", "wandb_run_id": "bat", "wandb_entity": "baz", "wandb_mode": "online", "wandb_watch": "false", "wandb_log_model": "checkpoint", } ) | minimal_cfg ) new_cfg = validate_config(cfg) setup_wandb_env_vars(new_cfg) assert os.environ.get("WANDB_PROJECT", "") == "foo" assert os.environ.get("WANDB_NAME", "") == "bar" assert os.environ.get("WANDB_RUN_ID", "") == "bat" assert os.environ.get("WANDB_ENTITY", "") == "baz" assert os.environ.get("WANDB_MODE", "") == "online" assert os.environ.get("WANDB_WATCH", "") == "false" assert os.environ.get("WANDB_LOG_MODEL", "") == "checkpoint" os.environ.pop("WANDB_PROJECT", None) os.environ.pop("WANDB_NAME", None) os.environ.pop("WANDB_RUN_ID", None) os.environ.pop("WANDB_ENTITY", None) os.environ.pop("WANDB_MODE", None) os.environ.pop("WANDB_WATCH", None) os.environ.pop("WANDB_LOG_MODEL", None) def test_wandb_set_disabled(self, minimal_cfg): cfg = DictDefault({}) | minimal_cfg new_cfg = validate_config(cfg) setup_wandb_env_vars(new_cfg) assert new_cfg.use_wandb is None cfg = ( DictDefault( { "wandb_project": "foo", } ) | minimal_cfg ) new_cfg = 
validate_config(cfg) setup_wandb_env_vars(new_cfg) assert new_cfg.use_wandb is True os.environ.pop("WANDB_PROJECT", None) @pytest.mark.skipif(is_comet_available() is False, reason="comet_ml is not installed") class TestValidationComet(BaseValidation): """ Validation test for comet """ def test_comet_sets_env(self, minimal_cfg): from axolotl.utils.comet_ import setup_comet_env_vars comet_config = { "comet_api_key": "foo", "comet_workspace": "some_workspace", "comet_project_name": "some_project", "comet_experiment_key": "some_experiment_key", "comet_mode": "get_or_create", "comet_online": False, "comet_experiment_config": { "auto_histogram_activation_logging": False, "auto_histogram_epoch_rate": 2, "auto_histogram_gradient_logging": True, "auto_histogram_tensorboard_logging": False, "auto_histogram_weight_logging": True, "auto_log_co2": False, "auto_metric_logging": True, "auto_metric_step_rate": 15, "auto_output_logging": False, "auto_param_logging": True, "comet_disabled": False, "display_summary_level": 2, "distributed_node_identifier": "some_distributed_node_identifier", "log_code": True, "log_env_cpu": False, "log_env_details": True, "log_env_disk": False, "log_env_gpu": True, "log_env_host": False, "log_env_network": True, "log_git_metadata": False, "log_git_patch": True, "log_graph": False, "name": "some_name", "offline_directory": "some_offline_directory", "parse_args": True, "tags": ["tag1", "tag2"], }, } cfg = DictDefault(comet_config) | minimal_cfg new_cfg = validate_config(cfg) setup_comet_env_vars(new_cfg) comet_env = { key: value for key, value in os.environ.items() if key.startswith("COMET_") } assert ( len(comet_env) == len(comet_config) + len(comet_config["comet_experiment_config"]) - 1 ) assert comet_env == { "COMET_API_KEY": "foo", "COMET_AUTO_LOG_CLI_ARGUMENTS": "true", "COMET_AUTO_LOG_CO2": "false", "COMET_AUTO_LOG_CODE": "true", "COMET_AUTO_LOG_DISABLE": "false", "COMET_AUTO_LOG_ENV_CPU": "false", "COMET_AUTO_LOG_ENV_DETAILS": "true", 
class TestValidationMLflow(BaseValidation):
    """
    Validation test for MLflow
    """

    def test_hf_mlflow_artifacts_config_sets_env(self, minimal_cfg):
        """``hf_mlflow_log_artifacts: true`` is exported as HF_MLFLOW_LOG_ARTIFACTS."""
        cfg = (
            DictDefault(
                {
                    "hf_mlflow_log_artifacts": True,
                }
            )
            | minimal_cfg
        )

        new_cfg = validate_config(cfg)

        assert new_cfg.hf_mlflow_log_artifacts is True

        # Check it's not already present in env
        assert "HF_MLFLOW_LOG_ARTIFACTS" not in os.environ

        try:
            setup_mlflow_env_vars(new_cfg)
            assert os.environ.get("HF_MLFLOW_LOG_ARTIFACTS") == "true"
        finally:
            # Clean up even if the assertion fails so the variable cannot
            # leak into later tests.
            os.environ.pop("HF_MLFLOW_LOG_ARTIFACTS", None)

    def test_mlflow_not_used_by_default(self, minimal_cfg):
        """``use_mlflow`` stays off by default and turns on with an experiment name."""
        try:
            cfg = DictDefault({}) | minimal_cfg
            new_cfg = validate_config(cfg)
            setup_mlflow_env_vars(new_cfg)

            # Assert on the validated config: the input ``cfg`` never sets
            # ``use_mlflow``, so checking it would pass no matter what
            # validation did.
            assert new_cfg.use_mlflow is not True

            cfg = (
                DictDefault(
                    {
                        "mlflow_experiment_name": "foo",
                    }
                )
                | minimal_cfg
            )
            new_cfg = validate_config(cfg)
            setup_mlflow_env_vars(new_cfg)

            assert new_cfg.use_mlflow is True
        finally:
            # Runs on failure too, keeping the environment clean for other tests.
            os.environ.pop("MLFLOW_EXPERIMENT_NAME", None)
@pytest.fixture(name="llama3_tokenizer", scope="session", autouse=True)
@enable_hf_offline
def fixture_llama3_tokenizer(
    download_llama3_8b_instruct_model_fixture,
):
    """Session-scoped tokenizer for NousResearch/Meta-Llama-3-8B-Instruct."""
    # The argument is not used in the body; it is presumably a download
    # fixture defined elsewhere that pytest resolves before this one runs.
    return AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B-Instruct")
@pytest.fixture(name="magistral_tokenizer")
def fixture_magistral_tokenizer():
    """Load the tokenizer for mistralai/Magistral-Small-2506."""
    # Imported inside the fixture so the import only happens when a test
    # actually requests this tokenizer.
    from axolotl.utils.mistral import HFMistralTokenizer

    return HFMistralTokenizer.from_pretrained("mistralai/Magistral-Small-2506")
message, conversation roles must alternate user/assistant/user/assistant/...") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if message["role"] == "user" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- "[AVAILABLE_TOOLS] [" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- \'{"type": "function", "function": {\' }}\n {%- for key, val in tool.items() if key != "return" %}\n {%- if val is string %}\n {{- \'"\' + key + \'": "\' + val + \'"\' }}\n {%- else %}\n {{- \'"\' + key + \'": \' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- ", " }}\n {%- endif %}\n {%- endfor %}\n {{- "}}" }}\n {%- if not loop.last %}\n {{- ", " }}\n {%- else %}\n {{- "]" }}\n {%- endif %}\n {%- endfor %}\n {{- "[/AVAILABLE_TOOLS]" }}\n {%- endif %}\n {%- if loop.first and system_message is defined %}\n {{- "[INST] " + system_message + "\\n\\n" + message["content"] + "[/INST]" }}\n {%- else %}\n {{- "[INST] " + message["content"] + "[/INST]" }}\n {%- endif %}\n {%- elif message.tool_calls is defined and message.tool_calls is not none %}\n {{- "[TOOL_CALLS] [" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }}\n {%- endif %}\n {{- \', "id": "\' + tool_call.id + \'"}\' }}\n {%- if not loop.last %}\n {{- ", " }}\n {%- else %}\n {{- "]" + eos_token }}\n {%- endif %}\n {%- endfor %}\n {%- elif message["role"] == "assistant" %}\n {{- " " + message["content"]|trim + eos_token}}\n {%- elif message["role"] == "tool_results" or message["role"] == "tool" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content 
@pytest.fixture(name="gemma2_tokenizer_chat_template_jinja")
def fixture_gemma2_chat_template_jinja_w_system() -> str:
    """Return a Gemma 2 style Jinja chat template.

    The template emits ``bos_token``, then one turn per message with the
    ``assistant`` role renamed to ``model``, and finally an open ``model``
    turn when ``add_generation_prompt`` is set.
    """
    # Each turn is wrapped in Gemma's turn markers.  Without them the template
    # would concatenate an empty string before the role and emit a bare
    # "model\n" generation prompt, which is not the Gemma turn format.
    return "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"
@pytest.fixture(name="mistral_jinja_template_analyzer")
def mistral_jinja_template_analyzer(mistralv03_tokenizer_chat_template_jinja):
    """Build a ``JinjaTemplateAnalyzer`` from the Mistral v0.3 chat template fixture."""
    template_source = mistralv03_tokenizer_chat_template_jinja
    analyzer = JinjaTemplateAnalyzer(template_source)
    return analyzer
def test_llama3_load(self, llama3_tokenizer, assistant_dataset):
    """Load the messages-chat strategy for llama3 and check the first wrapped row's ids."""
    LOG.info("Loading llama-3 tokenizer with assistant dataset")

    # Global training options first, then the per-dataset field mapping.
    train_cfg = DictDefault(
        {
            "train_on_inputs": False,
            "sequence_len": 512,
        }
    )
    dataset_cfg = DictDefault(
        {
            "chat_template": "llama3",
            "message_field_role": "role",
            "message_field_content": "content",
            "field_messages": "messages",
        }
    )
    strategy = load(llama3_tokenizer, train_cfg, dataset_cfg)

    wrapped = strategy.wrap_dataset(assistant_dataset)
    first_row = wrapped[0]
    input_ids = first_row["input_ids"]

    # fmt: off
    expected_input_ids = [
        128000,  # bos
        128006, 882, 128007,  # user header
        271, 15339, 128009,  # user prompt eot
        128006, 78191, 128007,  # assistant header
        271, 15339, 128009,  # assistant response eot
        128006, 882, 128007,
        271, 19045, 29474, 128009,
        128006, 78191, 128007,
        271, 19045, 29474, 128009,
    ]
    # fmt: on

    LOG.debug(f"Expected input_ids: {expected_input_ids}")
    LOG.debug(f"Actual input_ids: {input_ids}")
    assert input_ids == expected_input_ids, (
        f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
    )
def test_no_double_im_end(self, alpaca_dataset, tokenizer):
    """Tokenize the alpaca row with the ChatML prompt style and compare the exact input ids."""
    chatml_prompter = AlpacaPrompter(prompt_style=PromptStyle.CHATML.value)
    strategy = AlpacaPromptTokenizingStrategy(
        chatml_prompter,
        tokenizer,
        False,  # train_on_inputs
        2048,  # sequence_len
    )
    tokenized = TokenizedPromptDataset(strategy, alpaca_dataset, process_count=1)
    first_row = tokenized[0]

    # fmt: off
    expected_input_ids = [
        1,  # Bos
        32001, 1587, 13, 20548, 336, 349, 396, 13126, 369, 13966, 264, 3638, 28725, 5881, 1360, 395, 396, 2787, 369, 5312, 3629, 2758, 28723, 12018, 264, 2899, 369, 6582, 1999, 2691, 274, 272, 2159, 28723, 32000, 28705, 13,  # instruction
        32001, 2188, 13, 16627, 11931, 456, 12271, 354, 668, 3572, 304, 18756, 3479, 17179, 13, 2428, 854, 28711, 1497, 516, 11314, 304, 1749, 272, 1846, 324, 440, 32000, 28705, 13,  # input
        32001, 13892, 13, 650, 5967, 516, 11314, 304, 1749, 272, 9926, 28723, 32000,  # output
    ]
    # fmt: on
    assert first_row["input_ids"] == expected_input_ids
def test_w_train_on_input(self, alpaca_dataset, tokenizer):
    """With ``train_on_inputs`` enabled, the labels must equal the full token sequence."""
    chatml_prompter = AlpacaPrompter(prompt_style=PromptStyle.CHATML.value)
    strategy = AlpacaPromptTokenizingStrategy(
        chatml_prompter,
        tokenizer,
        True,  # train_on_inputs
        2048,  # sequence_len
    )
    tokenized = TokenizedPromptDataset(strategy, alpaca_dataset, process_count=1)
    first_row = tokenized[0]

    # fmt: off
    expected_labels = [
        1,  # Bos
        32001, 1587, 13, 20548, 336, 349, 396, 13126, 369, 13966, 264, 3638, 28725, 5881, 1360, 395, 396, 2787, 369, 5312, 3629, 2758, 28723, 12018, 264, 2899, 369, 6582, 1999, 2691, 274, 272, 2159, 28723, 32000, 28705, 13,  # instruction
        32001, 2188, 13, 16627, 11931, 456, 12271, 354, 668, 3572, 304, 18756, 3479, 17179, 13, 2428, 854, 28711, 1497, 516, 11314, 304, 1749, 272, 1846, 324, 440, 32000, 28705, 13,  # input
        32001, 13892, 13, 650, 5967, 516, 11314, 304, 1749, 272, 9926, 28723, 32000,  # output
    ]
    # fmt: on
    assert first_row["labels"] == expected_labels
are to the right"},"y":{"type":"number","description":"The y coordinate of the location, negative values are backward, positive values are forward"}},"required":["x","y"]}}},{"type":"function","function":{"name":"turn","description":"Turn the robot to a given direction","parameters":{"type":"object","properties":{"theta":{"type":"integer","description":"The angle to turn to, in degrees, positive values are counter-clockwise, negative values are clockwise"}},"required":["theta"]}}},{"type":"function","function":{"name":"invalid_prompt","description":"call when the user's prompt is invalid","parameters":{"type":"object","properties":{"message":{"type":"string","description":"why the prompt is invalid"}},"required":["message"]}}}],"add_generation_prompt":false} {"messages":[{"role":"user","content":"turn 270 degree"},{"role":"assistant","content":"","tool_calls":[{"function":{"name":"turn","arguments":{"theta": 270}}}]}],"tools":[{"type":"function","function":{"name":"move","description":"Move to a given location measured in meters","parameters":{"type":"object","properties":{"x":{"type":"number","description":"The x coordinate of the location, negative values are to the left, positive values are to the right"},"y":{"type":"number","description":"The y coordinate of the location, negative values are backward, positive values are forward"}},"required":["x","y"]}}},{"type":"function","function":{"name":"turn","description":"Turn the robot to a given direction","parameters":{"type":"object","properties":{"theta":{"type":"integer","description":"The angle to turn to, in degrees, positive values are counter-clockwise, negative values are clockwise"}},"required":["theta"]}}},{"type":"function","function":{"name":"invalid_prompt","description":"call when the user's prompt is invalid","parameters":{"type":"object","properties":{"message":{"type":"string","description":"why the prompt is invalid"}},"required":["message"]}}}],"add_generation_prompt":false} 
@pytest.fixture(name="qwen3_prompt_strategy")
def qwen3_chat_template_strategy(qwen3_tokenizer):
    """Build the ``chat_template`` prompt strategy for the qwen3 tokenizer fixture."""
    global_cfg = DictDefault(
        sequence_len=2048,
        chat_template="qwen3",
        eot_tokens=["<|im_end|>"],
    )
    dataset_cfg = DictDefault(type="chat_template")
    # StrategyLoader instances are callable: (tokenizer, cfg, ds_cfg) -> strategy.
    return StrategyLoader()(qwen3_tokenizer, global_cfg, dataset_cfg)
def test_invalid_chat_template(self):
    """An unknown template name must raise ``ValueError`` that names the template."""
    # ``match`` is applied to the message of the raised exception. The earlier
    # form asserted on ``str(exc)`` after the raising call: inside the ``with``
    # block that statement is unreachable, and ``str()`` of the ``ExceptionInfo``
    # wrapper is not the exception message in any case, so the message was
    # never actually verified.
    with pytest.raises(
        ValueError, match=r"Template 'invalid_template' not found\."
    ):
        get_chat_template("invalid_template")
def test_extract_chat_template_args(self):
    """Check the (choice, jinja) pair returned for global-only and dataset-level configs."""
    # Each entry: (keyword arguments for the call, expected choice, expected jinja).
    scenarios = [
        # No ds_cfg
        (
            {"cfg": {"chat_template": "chatml"}},
            "chatml",
            None,
        ),
        # ds_cfg provided
        (
            {
                "cfg": {
                    "chat_template": "jinja",
                    "chat_template_jinja": "global_jinja_template",
                },
                "ds_cfg": {"chat_template": "llama3", "chat_template_jinja": None},
            },
            "llama3",
            None,
        ),
        # ds_cfg provided with jinja template
        (
            {
                "cfg": {"chat_template": "chatml", "chat_template_jinja": None},
                "ds_cfg": {
                    "chat_template": "jinja",
                    "chat_template_jinja": "ds_jinja_template",
                },
            },
            "jinja",
            "ds_jinja_template",
        ),
        # ds_cfg provided with no chat_template
        (
            {
                "cfg": {
                    "chat_template": "jinja",
                    "chat_template_jinja": "global_jinja_template",
                },
                "ds_cfg": {
                    "chat_template": None,
                    "chat_template_jinja": "ds_jinja_template",
                },
            },
            "jinja",
            "global_jinja_template",
        ),
    ]

    for call_kwargs, expected_choice, expected_jinja in scenarios:
        chat_template_choice, chat_template_jinja = extract_chat_template_args(
            **call_kwargs
        )
        assert chat_template_choice == expected_choice
        if expected_jinja is None:
            assert chat_template_jinja is None
        else:
            assert chat_template_jinja == expected_jinja
def test_llama3_load(self, llama3_tokenizer, assistant_dataset):
    """Build the strategy through ``load`` and compare the first row's token ids."""
    LOG.info("Loading llama-3 tokenizer with assistant dataset")

    train_cfg = DictDefault(
        {
            "train_on_inputs": False,
            "sequence_len": 512,
        }
    )
    # Dataset-level options: template choice plus the message field/role mappings.
    dataset_cfg = DictDefault(
        {
            "chat_template": "llama3",
            "message_field_role": "role",
            "message_field_content": "content",
            "message_property_mappings": {
                "role": "role",
                "content": "content",
            },
            "roles": {
                "user": ["user"],
                "assistant": ["assistant"],
                "system": ["system"],
            },
            "field_messages": "messages",
        }
    )
    strategy = load(llama3_tokenizer, train_cfg, dataset_cfg)

    tokenized = strategy.tokenize_prompt(assistant_dataset[0])
    input_ids = tokenized["input_ids"]

    # fmt: off
    expected_input_ids = [
        128000,  # bos
        128006, 882, 128007,  # user header
        271, 15339, 128009,  # user prompt eot
        128006, 78191, 128007,  # assistant header
        271, 15339, 128009,  # assistant response eot
        128006, 882, 128007,
        271, 19045, 29474, 128009,
        128006, 78191, 128007,
        271, 19045, 29474, 128009,
    ]
    # fmt: on

    LOG.debug(f"Expected input_ids: {expected_input_ids}")
    LOG.debug(f"Actual input_ids: {input_ids}")
    assert input_ids == expected_input_ids, (
        f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
    )
def test_phi35(self, phi35_tokenizer, assistant_dataset):
    """Tokenize the assistant dataset with the phi-3.5 template and check ids and labels.

    The expected labels keep only the tokens of the final assistant turn;
    every other position is -100.
    """
    LOG.info("Testing phi-3.5 with assistant dataset")
    strategy = ChatTemplateStrategy(
        ChatTemplatePrompter(
            phi35_tokenizer,
            chat_template=get_chat_template("phi_35"),
            message_property_mappings={
                "role": "role",
                "content": "content",
            },
            roles={
                "user": ["user"],
                "assistant": ["assistant"],
                "system": ["system"],
            },
        ),
        tokenizer=phi35_tokenizer,
        train_on_inputs=False,
        sequence_len=512,
    )
    res = strategy.tokenize_prompt(assistant_dataset[0])
    input_ids = res["input_ids"]
    labels = res["labels"]
    # fmt: off
    expected_input_ids = [
        32010,  # user
        22172, 32007,  # user eot
        32001,  # assistant
        22172, 32007,  # assistant eot
        32010,  # user
        1781, 26966, 32007,  # user eot
        32001,  # assistant
        1781, 26966, 32007,  # assistant eot
    ]
    expected_labels = [
        -100,  # user
        -100, -100,  # user eot
        -100,  # assistant
        -100, -100,  # assistant eot,
        -100,  # user
        -100, -100, -100,  # user eot
        -100,  # assistant
        1781, 26966, 32007,  # assistant eot
    ]
    # fmt: on
    LOG.debug(f"Expected input_ids: {expected_input_ids}")
    LOG.debug(f"Actual input_ids: {input_ids}")
    assert input_ids == expected_input_ids, (
        f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
    )

    LOG.debug(f"Expected labels : {expected_labels}")
    LOG.debug(f"Actual labels : {labels}")
    # Report this failure as a labels mismatch; it previously reused the
    # "Input IDs mismatch" text, which pointed at the wrong field.
    assert labels == expected_labels, (
        f"Labels mismatch: {labels} != {expected_labels}"
    )
"system": ["system"], }, ), tokenizer=llama3_tokenizer, train_on_inputs=False, train_on_eos="none", sequence_len=512, roles_to_train=["assistant"], ) prompt_tokens = strategy.prompter.build_prompt( assistant_dataset[0]["messages"], False ) prompt = llama3_tokenizer.decode(prompt_tokens, skip_special_tokens=False) LOG.debug(f"Generated prompt: {prompt}") res = strategy.tokenize_prompt(assistant_dataset[0]) labels = res["labels"] input_ids = res["input_ids"] # fmt: off expected_labels = [ IGNORE_TOKEN_ID, # bos IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, # user header IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, # user prompt eot IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, # assistant header IGNORE_TOKEN_ID, 15339, IGNORE_TOKEN_ID, # assistant response eot IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, 19045, 29474, IGNORE_TOKEN_ID, ] # fmt: on LOG.debug(f"Expected labels: {expected_labels}") LOG.debug(f"Actual labels: {labels}") assert labels == expected_labels, ( f"Labels mismatch:\n" f"Expected: {expected_labels}\n" f"Actual: {labels}\n" f"Input IDs: {input_ids}\n" ) class TestSharegptChatTemplateLlama3: """ Test class for ShareGPT style datasets with llama-3 prompts using the chat_template strategy. 
def test_llama3_assistant(self, llama3_tokenizer, sharegpt_dataset):
    """Train only on the ``gpt`` role of a ShareGPT row; compare input ids and labels."""
    LOG.info("Testing ShareGPT style datasets with llama-3 assistant prompts")

    # ShareGPT rows store turns under "conversations" with "from"/"value" keys.
    prompter = ChatTemplatePrompter(
        llama3_tokenizer,
        chat_template=get_chat_template("llama3"),
        message_property_mappings={
            "role": "from",
            "content": "value",
        },
        field_messages="conversations",
    )
    strategy = ChatTemplateStrategy(
        prompter,
        tokenizer=llama3_tokenizer,
        train_on_inputs=False,
        train_on_eos="none",
        sequence_len=512,
        roles_to_train=["gpt"],
    )

    tokenized = strategy.tokenize_prompt(sharegpt_dataset[0])
    input_ids = tokenized["input_ids"]
    labels = tokenized["labels"]

    ign = IGNORE_TOKEN_ID  # short alias to keep the label table readable
    # fmt: off
    expected_input_ids = [
        128000,  # bos
        128006, 882, 128007,  # user header
        271, 15339, 128009,  # user prompt eot
        128006, 78191, 128007,  # assistant header
        271, 15339, 128009,  # assistant response eot
        128006, 882, 128007,
        271, 19045, 29474, 128009,
        128006, 78191, 128007,
        271, 19045, 29474, 128009,
    ]
    expected_labels = [
        ign,  # bos
        ign, ign, ign,  # user header
        ign, ign, ign,  # user prompt eot
        ign, ign, ign,  # assistant header
        ign, 15339, ign,  # assistant response eot
        ign, ign, ign,
        ign, ign, ign, ign,
        ign, ign, ign,
        ign, 19045, 29474, ign,
    ]
    # fmt: on

    LOG.debug(f"Expected input_ids: {expected_input_ids}")
    LOG.debug(f"Actual input_ids: {input_ids}")
    LOG.debug(f"Expected labels: {expected_labels}")
    LOG.debug(f"Actual labels: {labels}")

    assert input_ids == expected_input_ids, (
        f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
    )
    assert labels == expected_labels, (
        f"Labels mismatch: {labels} != {expected_labels}"
    )
def test_llama3_system_human(self, llama3_tokenizer, basic_dataset):
    """Train on the ``system`` and ``human`` roles; compare input ids and labels."""
    LOG.info("Testing ShareGPT style datasets with llama-3 system/human prompts")

    prompter = ChatTemplatePrompter(
        llama3_tokenizer,
        chat_template=get_chat_template("llama3"),
        message_property_mappings={
            "role": "from",
            "content": "value",
        },
        field_messages="conversations",
    )
    strategy = ChatTemplateStrategy(
        prompter,
        tokenizer=llama3_tokenizer,
        train_on_inputs=False,
        train_on_eos="none",
        sequence_len=512,
        roles_to_train=["system", "human"],
    )

    tokenized = strategy.tokenize_prompt(basic_dataset[0])
    input_ids = tokenized["input_ids"]
    labels = tokenized["labels"]

    ign = IGNORE_TOKEN_ID  # short alias to keep the label table readable
    # fmt: off
    expected_input_ids = [
        128000,  # bos
        128006, 9125, 128007,
        271, 2675, 527, 459, 15592, 18328, 13, 128009,
        128006, 882, 128007,  # user header
        271, 9906, 128009,  # user prompt eot
        128006, 78191, 128007,  # assistant header
        271, 13347, 1070, 0, 128009,  # assistant response eot
        128006, 882, 128007,
        271, 4438, 527, 499, 30, 128009,
        128006, 78191, 128007,
        271, 40, 2846, 3815, 1664, 11, 9901, 499, 0, 128009,
    ]
    expected_labels = [
        ign,  # bos
        ign, ign, ign,  # system header
        ign, 2675, 527, 459, 15592, 18328, 13, ign,  # system prompt eot
        ign, ign, ign,  # user header
        ign, 9906, ign,  # user prompt eot
        ign, ign, ign,  # assistant header
        ign, ign, ign, ign, ign,  # assistant response eot
        ign, ign, ign,
        ign, 4438, 527, 499, 30, ign,
        ign, ign, ign,
        ign, ign, ign, ign, ign, ign, ign, ign, ign, ign,
    ]
    # fmt: on

    LOG.debug(f"Expected input_ids: {expected_input_ids}")
    LOG.debug(f"Actual input_ids: {input_ids}")
    LOG.debug(f"Expected labels: {expected_labels}")
    LOG.debug(f"Actual labels: {labels}")

    assert input_ids == expected_input_ids, (
        f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
    )
    assert labels == expected_labels, (
        f"Labels mismatch: {labels} != {expected_labels}"
    )
def test_llama32vision_train_on_assistant(
    self, llama3_tokenizer, toolcalling_dataset, llama3_2_vision_chat_template_jinja
):
    """Tokenize a tool-calling conversation while training only on assistant turns.

    The strategy uses the llama-3.2-vision template fixture with
    ``roles_to_train=["assistant"]`` and ``train_on_eos="turn"``. The expected
    labels keep the tokens of both assistant messages, including each trailing
    128009, and are ``IGNORE_TOKEN_ID`` everywhere else (system, user, tool
    turns and all headers).
    """
    LOG.info(
        "Testing assistant style datasets with tool_calling with llama-32 chat template, training on assistant"
    )

    strategy = ChatTemplateStrategy(
        ChatTemplatePrompter(
            llama3_tokenizer,
            chat_template=get_chat_template(
                "jinja", jinja_template=llama3_2_vision_chat_template_jinja
            ),
            message_property_mappings={"role": "role", "content": "content"},
        ),
        tokenizer=llama3_tokenizer,
        train_on_inputs=False,
        train_on_eos="turn",
        sequence_len=512,
        roles_to_train=["assistant"],
    )

    res = strategy.tokenize_prompt(toolcalling_dataset[0])
    input_ids = res["input_ids"]
    labels = res["labels"]

    # The two tables below are aligned group-by-group: each commented group in
    # expected_labels has as many entries as the same group in expected_input_ids.
    # fmt: off
    expected_input_ids = [
        128000,  # bos
        128006, 9125, 128007, 271,  # system header
        38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 198, 15724, 2696, 25, 220, 1114, 3799, 220, 2366, 19, 271,  # system date prompt
        2675, 527, 264, 11164, 430, 31680, 311, 9282, 20126, 13, 1472, 1288, 10052, 449, 279, 5089, 1511, 304, 279, 79002, 3813, 13, 128009,  # system message
        128006, 882, 128007, 271,  # user header
        19182, 11, 1148, 596, 279, 9499, 304, 12366, 1314, 1457, 30, 128009,  # user message
        128006, 78191, 128007, 271,  # assistant header
        5018, 609, 794, 330, 456, 11327, 54625, 498, 330, 14105, 794, 5324, 2588, 794, 330, 60704, 11, 9822, 498, 330, 3928, 794, 330, 66, 41347, 32075, 128009,  # assistant message
        128006, 23799, 4690, 128007, 271,  # tool header
        1, 1313, 13, 15, 1, 128009,  # tool message
        128006, 78191, 128007, 271,  # assistant header
        791, 9499, 304, 12366, 374, 220, 1313, 13, 15, 12628, 62447, 13, 128009  # assistant message
    ]

    expected_labels = [
        IGNORE_TOKEN_ID,  # bos
        IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # system header
        IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,
        IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,
        IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,
        IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # system date prompt
        IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,
        IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,
        IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,
        IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,
        IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # system message
        IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # user header
        IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,
        IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # user message
        IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # assistant header
        5018, 609, 794, 330, 456, 11327, 54625, 498, 330, 14105, 794, 5324, 2588, 794, 330, 60704, 11, 9822, 498, 330, 3928, 794, 330, 66, 41347, 32075, 128009,  # assistant message
        IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # tool header
        IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # tool message
        IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # assistant header
        791, 9499, 304, 12366, 374, 220, 1313, 13, 15, 12628, 62447, 13, 128009  # assistant message
    ]
    # fmt: on

    assert input_ids == expected_input_ids, (
        f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
    )

    assert labels == expected_labels, (
        f"Labels mismatch: {labels} != {expected_labels}"
    )
strategy = ChatTemplateStrategy( ChatTemplatePrompter( llama3_tokenizer, chat_template=get_chat_template( "jinja", jinja_template=llama3_2_vision_chat_template_jinja ), message_property_mappings={"role": "role", "content": "content"}, ), tokenizer=llama3_tokenizer, train_on_inputs=False, train_on_eos="turn", sequence_len=512, roles_to_train=["assistant", "tool"], ) res = strategy.tokenize_prompt(toolcalling_dataset[0]) input_ids = res["input_ids"] labels = res["labels"] # fmt: off expected_input_ids = [ 128000, # bos 128006, 9125, 128007, 271, # system header 38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 198, 15724, 2696, 25, 220, 1114, 3799, 220, 2366, 19, 271, # system date prompt 2675, 527, 264, 11164, 430, 31680, 311, 9282, 20126, 13, 1472, 1288, 10052, 449, 279, 5089, 1511, 304, 279, 79002, 3813, 13, 128009, # system message 128006, 882, 128007, 271, # user header 19182, 11, 1148, 596, 279, 9499, 304, 12366, 1314, 1457, 30, 128009, # user message 128006, 78191, 128007, 271, # assistant header 5018, 609, 794, 330, 456, 11327, 54625, 498, 330, 14105, 794, 5324, 2588, 794, 330, 60704, 11, 9822, 498, 330, 3928, 794, 330, 66, 41347, 32075, 128009, # assistant message 128006, 23799, 4690, 128007, 271, # tool header 1, 1313, 13, 15, 1, 128009, # tool message 128006, 78191, 128007, 271, # assistant header 791, 9499, 304, 12366, 374, 220, 1313, 13, 15, 12628, 62447, 13, 128009 # assistant message ] expected_labels = [ IGNORE_TOKEN_ID, # bos IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, # system header IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, # system date prompt IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, 
IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, # system message IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, # user header IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, # user message IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, # assistant header 5018, 609, 794, 330, 456, 11327, 54625, 498, 330, 14105, 794, 5324, 2588, 794, 330, 60704, 11, 9822, 498, 330, 3928, 794, 330, 66, 41347, 32075, 128009, # assistant message IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, # tool header IGNORE_TOKEN_ID, 1313, 13, 15, IGNORE_TOKEN_ID, 128009, # tool message IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, # assistant header 791, 9499, 304, 12366, 374, 220, 1313, 13, 15, 12628, 62447, 13, 128009 # assistant message ] # fmt: on assert input_ids == expected_input_ids, ( f"Input IDs mismatch: {input_ids} != {expected_input_ids}" ) assert labels == expected_labels, ( f"Labels mismatch: {labels} != {expected_labels}" ) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/prompt_strategies/test_chat_templates_advanced.py ================================================ """ tests for chat_template prompt strategy """ from copy import deepcopy import pytest from datasets import Dataset from tokenizers import AddedToken from transformers import PreTrainedTokenizer from axolotl.prompt_strategies.chat_template import ( ChatTemplatePrompter, ChatTemplateStrategy, ) from axolotl.prompters import IGNORE_TOKEN_ID from 
axolotl.utils.chat_templates import get_chat_template from axolotl.utils.logging import get_logger from tests.hf_offline_utils import enable_hf_offline LOG = get_logger(__name__) PARAMETRIZE_KEYS = "tokenizer, chat_template, chat_template_jinja, eos_token" PARAMETRIZE_PARAMS = [ ("llama3_tokenizer", "llama3", None, None), ("llama3_tokenizer", "chatml", None, "<|im_end|>"), ( "mistralv03_tokenizer", "jinja", "mistralv03_tokenizer_chat_template_jinja", "[/INST]", ), ( "gemma2_tokenizer", "jinja", "gemma2_tokenizer_chat_template_jinja", "", ), # ("phi35_tokenizer", "phi_35", None, "<|end|>"), # seems to be broken w transformers v5 ("phi4_tokenizer", "phi_4", None, "<|im_end|>"), ] @pytest.mark.parametrize( PARAMETRIZE_KEYS, PARAMETRIZE_PARAMS, ) class TestChatTemplateConfigurations: """ Test class for various configurations of ChatTemplateStrategy. """ @staticmethod def setup_tokenizer( tokenizer_name, chat_template, chat_template_jinja=None, eos_token=None, request=None, eot_token=None, ) -> tuple[PreTrainedTokenizer, str]: """ Helper function to set up the tokenizer and chat template for the test. """ tokenizer = deepcopy(request.getfixturevalue(tokenizer_name)) if chat_template == "jinja": chat_template_jinja = request.getfixturevalue(chat_template_jinja) if eos_token: tokenizer.add_special_tokens( { "eos_token": AddedToken( eos_token, rstrip=False, lstrip=False, normalized=False ) } ) if tokenizer.__class__.__name__ in ( "LlamaTokenizerFast", "CodeLlamaTokenizerFast", ): tokenizer.update_post_processor() if eot_token: tokenizer.add_special_tokens({"additional_special_tokens": [eot_token]}) return tokenizer, chat_template_jinja def _should_skip_turn(self, tokenizer, turn, turn_idx, start_idx, end_idx): """Helper method to determine if a turn should be skipped in testing. This is used to skip system messages for Mistral as the template does not output them without more turns. 
""" if ( turn_idx == 0 and turn.get("from") in ["system", "context"] and ("mistral" in tokenizer.name_or_path.lower()) ): assert start_idx == -1 and end_idx == -1, ( "Expected system message to be skipped" ) return True return False @enable_hf_offline def test_train_on_inputs_true( self, tokenizer, chat_template, chat_template_jinja, eos_token, basic_dataset, request, ): LOG.info("Testing with train_on_inputs=True") tokenizer, chat_template_jinja = self.setup_tokenizer( tokenizer, chat_template, chat_template_jinja, eos_token, request ) strategy = ChatTemplateStrategy( ChatTemplatePrompter( tokenizer, chat_template=get_chat_template( chat_template, jinja_template=chat_template_jinja ), message_property_mappings={"role": "from", "content": "value"}, field_messages="conversations", ), tokenizer=tokenizer, train_on_inputs=True, sequence_len=512, roles_to_train=["assistant"], ) res = strategy.tokenize_prompt(basic_dataset[0]) turns = strategy.get_conversation_thread(basic_dataset[0]) labels = res["labels"] input_ids = res["input_ids"] # Verify assistant responses are labeled for i, turn in enumerate(basic_dataset[0]["conversations"]): start_idx, end_idx = strategy.find_turn(turns=turns, turn_idx=i) if self._should_skip_turn(tokenizer, turn, i, start_idx, end_idx): continue decoded_response = tokenizer.decode(input_ids[start_idx:end_idx]) response = turn["value"] assert response in decoded_response, ( f"Response {response} not found in index {start_idx}:{end_idx} " f"decoded:{decoded_response}" ) assert all( label != IGNORE_TOKEN_ID for label in labels[start_idx:end_idx] ), ( f"Expected labels for input '{response}' to be ignored, but got {labels[start_idx:end_idx]}" ) LOG.debug("Full labels: %s", labels) LOG.debug("Full input_ids: %s", input_ids) def test_train_on_inputs_false( self, tokenizer, chat_template, chat_template_jinja, eos_token, basic_dataset, request, ): LOG.info("Testing with train_on_inputs=False, on assistant only") tokenizer, chat_template_jinja = 
self.setup_tokenizer( tokenizer, chat_template, chat_template_jinja, eos_token, request ) strategy = ChatTemplateStrategy( ChatTemplatePrompter( tokenizer, chat_template=get_chat_template( chat_template, jinja_template=chat_template_jinja ), message_property_mappings={"role": "from", "content": "value"}, field_messages="conversations", ), tokenizer=tokenizer, train_on_inputs=False, sequence_len=512, roles_to_train=["assistant"], ) res = strategy.tokenize_prompt(basic_dataset[0]) turns = strategy.get_conversation_thread(basic_dataset[0]) labels = res["labels"] input_ids = res["input_ids"] # Process all turns and verify correct labeling based on role for i, turn in enumerate(basic_dataset[0]["conversations"]): start_idx, end_idx = strategy.find_turn(turns=turns, turn_idx=i) if self._should_skip_turn(tokenizer, turn, i, start_idx, end_idx): continue decoded_response = tokenizer.decode(input_ids[start_idx:end_idx]) response = turn["value"] assert response in decoded_response, ( f"Response {response} not found in index {start_idx}:{end_idx} " f"decoded:{decoded_response}" ) # Verify that assistant responses are labeled and other inputs are not is_assistant = turn["from"] == "assistant" if is_assistant: assert all( label != IGNORE_TOKEN_ID for label in labels[start_idx:end_idx] ), ( f"Expected labels for assistant response '{response}' to be set, but got {labels[start_idx:end_idx]}" ) else: assert all( label == IGNORE_TOKEN_ID for label in labels[start_idx:end_idx] ), ( f"Expected labels for human input '{response}' to be IGNORE_TOKEN_ID, but got {labels[start_idx:end_idx]}" ) def test_roles_to_train_human_assistant_only( self, tokenizer, chat_template, chat_template_jinja, eos_token, basic_dataset, request, ): LOG.info("Testing roles_to_train with human assistant only") tokenizer, chat_template_jinja = self.setup_tokenizer( tokenizer, chat_template, chat_template_jinja, eos_token, request ) strategy = ChatTemplateStrategy( ChatTemplatePrompter( tokenizer, 
chat_template=get_chat_template( chat_template, jinja_template=chat_template_jinja ), message_property_mappings={"role": "from", "content": "value"}, field_messages="conversations", ), tokenizer=tokenizer, train_on_inputs=False, sequence_len=512, roles_to_train=["assistant", "human"], ) res = strategy.tokenize_prompt(basic_dataset[0]) turns = strategy.get_conversation_thread(basic_dataset[0]) labels = res["labels"] input_ids = res["input_ids"] # Process all turns and verify correct labeling based on role for i, turn in enumerate(basic_dataset[0]["conversations"]): start_idx, end_idx = strategy.find_turn(turns=turns, turn_idx=i) if self._should_skip_turn(tokenizer, turn, i, start_idx, end_idx): continue decoded_response = tokenizer.decode(input_ids[start_idx:end_idx]) response = turn["value"] assert response in decoded_response, ( f"Response {response} not found in index {start_idx}:{end_idx} " f"decoded:{decoded_response}" ) # Verify that non-system responses are labeled and system are not should_be_labelled = turn["from"] != "system" if should_be_labelled: assert all( label != IGNORE_TOKEN_ID for label in labels[start_idx:end_idx] ), ( f"Expected labels for assistant response '{response}' to be set, but got {labels[start_idx:end_idx]}" ) else: assert all( label == IGNORE_TOKEN_ID for label in labels[start_idx:end_idx] ), ( f"Expected labels for human input '{response}' to be IGNORE_TOKEN_ID, but got {labels[start_idx:end_idx]}" ) def test_roles_to_train_all( self, tokenizer, chat_template, chat_template_jinja, eos_token, basic_dataset, request, ): LOG.info("Testing roles_to_train with all roles") tokenizer, chat_template_jinja = self.setup_tokenizer( tokenizer, chat_template, chat_template_jinja, eos_token, request ) strategy = ChatTemplateStrategy( ChatTemplatePrompter( tokenizer, chat_template=get_chat_template( chat_template, jinja_template=chat_template_jinja ), message_property_mappings={"role": "from", "content": "value"}, field_messages="conversations", ), 
tokenizer=tokenizer, train_on_inputs=True, sequence_len=512, roles_to_train=["human", "assistant"], ) res = strategy.tokenize_prompt(basic_dataset[0]) turns = strategy.get_conversation_thread(basic_dataset[0]) labels = res["labels"] input_ids = res["input_ids"] # Verify that all responses are labeled (except for special tokens) for i, turn in enumerate(basic_dataset[0]["conversations"]): response = turn["value"] start_idx, end_idx = strategy.find_turn(turns=turns, turn_idx=i) if self._should_skip_turn(tokenizer, turn, i, start_idx, end_idx): continue decoded_response = tokenizer.decode(input_ids[start_idx:end_idx]) assert response in decoded_response, ( f"Response {response} not found in index {start_idx}:{end_idx} decoded:{decoded_response}" ) assert all( label != IGNORE_TOKEN_ID for label in labels[start_idx:end_idx] ), ( f"Expected labels for response '{response}' to be set, but got {labels[start_idx:end_idx]}" ) def test_empty_roles_to_train( self, tokenizer, chat_template, chat_template_jinja, eos_token, basic_dataset, request, ): LOG.info("Testing with empty roles_to_train") tokenizer, chat_template_jinja = self.setup_tokenizer( tokenizer, chat_template, chat_template_jinja, eos_token, request ) strategy = ChatTemplateStrategy( ChatTemplatePrompter( tokenizer, chat_template=get_chat_template( chat_template, jinja_template=chat_template_jinja ), message_property_mappings={"role": "from", "content": "value"}, field_messages="conversations", ), tokenizer=tokenizer, train_on_inputs=False, sequence_len=512, roles_to_train=[], train_on_eos="none", # Add this line ) res = strategy.tokenize_prompt(basic_dataset[0]) labels = res["labels"] # Verify that no labels are set when roles_to_train is empty LOG.debug("Full labels: %s", labels) assert all(label == IGNORE_TOKEN_ID for label in labels), ( "Expected all labels to be IGNORE_TOKEN_ID when roles_to_train is empty" ) def test_train_on_eos_all( self, tokenizer, chat_template, chat_template_jinja, eos_token, 
basic_dataset, request, ): LOG.info("Testing with train_on_eos='all'") tokenizer, chat_template_jinja = self.setup_tokenizer( tokenizer, chat_template, chat_template_jinja, eos_token, request ) strategy = ChatTemplateStrategy( ChatTemplatePrompter( tokenizer, chat_template=get_chat_template( chat_template, jinja_template=chat_template_jinja ), message_property_mappings={"role": "from", "content": "value"}, field_messages="conversations", ), tokenizer=tokenizer, train_on_inputs=False, sequence_len=512, roles_to_train=["assistant"], train_on_eos="all", ) res = strategy.tokenize_prompt(basic_dataset[0]) labels = res["labels"] input_ids = res["input_ids"] eos_token_id = tokenizer.eos_token_id eos_indices = [ i for i, token_id in enumerate(input_ids) if token_id == eos_token_id ] assert len(eos_indices) > 0, "Expected at least one EOS token in the input" for eos_idx in eos_indices: assert labels[eos_idx] != IGNORE_TOKEN_ID, ( f"Expected EOS token at index {eos_idx} to be labeled" ) def test_train_on_eos_turn( self, tokenizer, chat_template, chat_template_jinja, eos_token, basic_dataset, request, ): LOG.info("Testing with train_on_eos='turn'") tokenizer, chat_template_jinja = self.setup_tokenizer( tokenizer, chat_template, chat_template_jinja, eos_token, request ) strategy = ChatTemplateStrategy( ChatTemplatePrompter( tokenizer, chat_template=get_chat_template( chat_template, jinja_template=chat_template_jinja ), message_property_mappings={"role": "from", "content": "value"}, field_messages="conversations", ), tokenizer=tokenizer, train_on_inputs=False, sequence_len=512, roles_to_train=["assistant"], train_on_eos="turn", ) res = strategy.tokenize_prompt(basic_dataset[0]) turns = strategy.get_conversation_thread(basic_dataset[0]) labels = res["labels"] input_ids = res["input_ids"] eos_token_id = tokenizer.eos_token_id # Process all turns and verify EOS token labeling for i, turn in enumerate(basic_dataset[0]["conversations"]): start_idx, end_idx = 
strategy.find_turn(turns=turns, turn_idx=i) if self._should_skip_turn(tokenizer, turn, i, start_idx, end_idx): continue decoded_response = tokenizer.decode(input_ids[start_idx:end_idx]) response = turn["value"] assert response in decoded_response, ( f"Response {response} not found in index {start_idx}:{end_idx} " f"decoded:{decoded_response}" ) # Find the EOS token after this turn eos_idx = end_idx while eos_idx < len(input_ids) and input_ids[eos_idx] != eos_token_id: eos_idx += 1 assert eos_idx < len(input_ids), ( f"Could not find EOS token after '{response}'" ) LOG.debug( f"Turn {i}: role={turn['from']}, content='{turn['value']}', start_idx={start_idx}, end_idx={end_idx}, eos_idx={eos_idx}" ) LOG.debug( f"Labels for turn {i}: {labels[start_idx:end_idx]}, EOS label: {labels[eos_idx]}" ) # Verify EOS token labeling based on role is_assistant = turn["from"] == "assistant" if is_assistant: assert labels[eos_idx] != IGNORE_TOKEN_ID, ( f"Expected EOS token after assistant response '{response}' to be labeled" ) else: assert labels[eos_idx] == IGNORE_TOKEN_ID, ( f"Expected EOS token after non-assistant input '{response}' to not be labeled" ) def test_train_on_eos_last( self, tokenizer, chat_template, chat_template_jinja, eos_token, basic_dataset, request, ): LOG.info("Testing with train_on_eos='last'") tokenizer, chat_template_jinja = self.setup_tokenizer( tokenizer, chat_template, chat_template_jinja, eos_token, request ) strategy = ChatTemplateStrategy( ChatTemplatePrompter( tokenizer, chat_template=get_chat_template( chat_template, jinja_template=chat_template_jinja ), message_property_mappings={"role": "from", "content": "value"}, field_messages="conversations", ), tokenizer=tokenizer, train_on_inputs=False, sequence_len=512, roles_to_train=["assistant"], train_on_eos="last", ) res = strategy.tokenize_prompt(basic_dataset[0]) labels = res["labels"] input_ids = res["input_ids"] eos_token_id = tokenizer.eos_token_id eos_indices = [ i for i, token_id in 
enumerate(input_ids) if token_id == eos_token_id ] assert len(eos_indices) > 0, "Expected at least one EOS token in the input" last_eos_idx = eos_indices[-1] # Check that only the last EOS token is labeled for idx in eos_indices[:-1]: assert labels[idx] == IGNORE_TOKEN_ID, ( f"Expected EOS token at index {idx} to not be labeled" ) assert labels[last_eos_idx] != IGNORE_TOKEN_ID, ( f"Expected last EOS token at index {last_eos_idx} to be labeled" ) def test_train_on_eos_none( self, tokenizer, chat_template, chat_template_jinja, eos_token, basic_dataset, request, ): LOG.info("Testing with train_on_eos='none'") tokenizer, chat_template_jinja = self.setup_tokenizer( tokenizer, chat_template, chat_template_jinja, eos_token, request ) strategy = ChatTemplateStrategy( ChatTemplatePrompter( tokenizer, chat_template=get_chat_template( chat_template, jinja_template=chat_template_jinja ), message_property_mappings={"role": "from", "content": "value"}, field_messages="conversations", ), tokenizer=tokenizer, train_on_inputs=False, sequence_len=512, roles_to_train=["assistant"], train_on_eos="none", ) res = strategy.tokenize_prompt(basic_dataset[0]) labels = res["labels"] input_ids = res["input_ids"] eos_token_id = tokenizer.eos_token_id eos_indices = [ i for i, token_id in enumerate(input_ids) if token_id == eos_token_id ] assert len(eos_indices) > 0, "Expected at least one EOS token in the input" for eos_idx in eos_indices: assert labels[eos_idx] == IGNORE_TOKEN_ID, ( f"Expected EOS token at index {eos_idx} to not be labeled" ) def test_drop_system_message( self, tokenizer, chat_template, chat_template_jinja, eos_token, basic_dataset, request, ): LOG.info("Testing with drop_system_message=True") tokenizer, chat_template_jinja = self.setup_tokenizer( tokenizer, chat_template, chat_template_jinja, eos_token, request ) strategy = ChatTemplateStrategy( ChatTemplatePrompter( tokenizer, chat_template=get_chat_template( chat_template, jinja_template=chat_template_jinja ), 
drop_system_message=True, message_property_mappings={"role": "from", "content": "value"}, field_messages="conversations", ), tokenizer=tokenizer, train_on_inputs=False, sequence_len=512, roles_to_train=["assistant"], ) res = strategy.tokenize_prompt(basic_dataset[0]) input_ids = res["input_ids"] # Check if system message is not present in input_ids system_message = "You are an AI assistant." decoded_message = tokenizer.decode(input_ids) assert system_message not in decoded_message, ( "Expected system message to be dropped" ) def test_custom_roles( self, tokenizer, chat_template, chat_template_jinja, eos_token, request, ): LOG.info("Testing with custom roles mapping") custom_roles = { "user": ["human", "user"], "assistant": ["ai", "assistant"], "system": ["context"], } tokenizer, chat_template_jinja = self.setup_tokenizer( tokenizer, chat_template, chat_template_jinja, eos_token, request ) strategy = ChatTemplateStrategy( ChatTemplatePrompter( tokenizer, chat_template=get_chat_template( chat_template, jinja_template=chat_template_jinja ), roles=custom_roles, message_property_mappings={"role": "from", "content": "value"}, ), tokenizer=tokenizer, train_on_inputs=False, sequence_len=512, roles_to_train=["ai"], ) # Create a new dataset with modified role names modified_conversations = [ {"from": "context", "value": "You are an AI assistant."}, {"from": "human", "value": "Hello"}, {"from": "ai", "value": "Hi there!"}, {"from": "human", "value": "How are you?"}, {"from": "ai", "value": "I'm doing well, thank you!"}, ] modified_dataset = Dataset.from_dict({"messages": [modified_conversations]}) res = strategy.tokenize_prompt(modified_dataset[0]) turns = strategy.get_conversation_thread(modified_dataset[0]) labels = res["labels"] input_ids = res["input_ids"] # Process all turns and verify labeling for i, turn in enumerate(modified_dataset[0]["messages"]): start_idx, end_idx = strategy.find_turn(turns=turns, turn_idx=i) if self._should_skip_turn(tokenizer, turn, i, 
start_idx, end_idx): continue decoded_response = tokenizer.decode(input_ids[start_idx:end_idx]) response = turn["value"] assert response in decoded_response, ( f"Response {response} not found in index {start_idx}:{end_idx} " f"decoded:{decoded_response}" ) # Check if responses are labeled correctly based on role is_ai = turn["from"] == "ai" if is_ai: assert all( label != IGNORE_TOKEN_ID for label in labels[start_idx:end_idx] ), f"Expected labels for AI response '{response}' to be set" else: assert all( label == IGNORE_TOKEN_ID for label in labels[start_idx:end_idx] ), ( f"Expected labels for non-AI message '{response}' to be IGNORE_TOKEN_ID" ) def test_message_field_training( self, tokenizer, chat_template, chat_template_jinja, eos_token, request, ): LOG.info("Testing with message_field_training") tokenizer, chat_template_jinja = self.setup_tokenizer( tokenizer, chat_template, chat_template_jinja, eos_token, request ) strategy = ChatTemplateStrategy( ChatTemplatePrompter( tokenizer, chat_template=get_chat_template( chat_template, jinja_template=chat_template_jinja ), message_field_training="train", message_field_training_detail="train_detail", message_property_mappings={"role": "from", "content": "value"}, ), tokenizer=tokenizer, train_on_inputs=False, sequence_len=512, roles_to_train=[], ) # Create a new dataset with the train and train_detail fields modified_conversation = [ {"from": "system", "value": "You are an AI assistant.", "train": False}, {"from": "human", "value": "Hello", "train": False}, {"from": "assistant", "value": "Hello", "train": True}, {"from": "human", "value": "How are you?", "train": True}, { "from": "assistant", "value": "I'm doing very well, thank you!", "train_detail": [ {"begin_offset": 0, "end_offset": 8, "train": False}, {"begin_offset": 9, "end_offset": 18, "train": True}, {"begin_offset": 19, "end_offset": 30, "train": False}, ], }, { "from": "human", "value": "I'm doing very well, thank you!", "train": False, }, {"from": "assistant", 
"value": "Hi there!", "train": True}, ] modified_dataset = Dataset.from_dict({"messages": [modified_conversation]}) res = strategy.tokenize_prompt(modified_dataset[0]) turns = strategy.get_conversation_thread(modified_dataset[0]) labels = res["labels"] input_ids = res["input_ids"] def verify_labels(labels_span, should_train, context_message): """Helper to verify if a span of labels matches expected training state""" if should_train: assert all(label != IGNORE_TOKEN_ID for label in labels_span), ( f"Expected all labels for {context_message} to be set, but got {labels_span}" ) else: assert all(label == IGNORE_TOKEN_ID for label in labels_span), ( f"Expected all labels for {context_message} to be {IGNORE_TOKEN_ID}, but got {labels_span}" ) # Process all turns and verify labeling for i, turn in enumerate(modified_dataset[0]["messages"]): start_idx, end_idx = strategy.find_turn(turns=turns, turn_idx=i) if self._should_skip_turn(tokenizer, turn, i, start_idx, end_idx): continue decoded_response = tokenizer.decode(input_ids[start_idx:end_idx]) response = turn["value"] assert response in decoded_response, ( f"Response {response} not found in index {start_idx}:{end_idx} " f"decoded:{decoded_response}" ) LOG.debug( f"Processing turn {i}: role={turn['from']}, content='{turn['value']}', " f"start_idx={start_idx}, end_idx={end_idx}" ) if turn.get("train_detail", None) is not None: # Handle detailed token-level training control tokenized_output = tokenizer( turn["value"], return_offsets_mapping=True, add_special_tokens=False ) assert tokenized_output["input_ids"] == input_ids[start_idx:end_idx], ( f"Tokenized input mismatch for turn: {turn['value']}\n" f"Expected: {input_ids[start_idx:end_idx]}\nActual: {tokenized_output['input_ids']}\n" f"This will likely be a mismatch between template content and encoded content" ) token_offsets = tokenized_output["offset_mapping"] # Adjust token offsets for j in range(len(token_offsets) - 1): token_offsets[j] = ( token_offsets[j][0], 
token_offsets[j + 1][0] - 1, ) token_offsets[-1] = (token_offsets[-1][0], len(turn["value"]) - 1) adjusted_train_details = strategy.prompter.adjust_train_details( turn["train_detail"], token_offsets ) LOG.debug(f"Original train_details: {turn['train_detail']}") LOG.debug(f"Adjusted train_details: {adjusted_train_details}") # Get and verify token offsets turn_tokens = input_ids[start_idx:end_idx] token_offsets_unmasked = strategy.prompter.get_offsets_for_train_detail( text=turn["value"], train_details=adjusted_train_details, mask_untrainable=False, ) for i, offset in enumerate(token_offsets_unmasked): assert token_offsets[i][0] == offset, ( f"Token start offsets mismatch for turn: {turn['value']}\n" f"Expected: {token_offsets[i][0]}\nActual: {offset}" ) token_offsets_masked = strategy.prompter.get_offsets_for_train_detail( text=turn["value"], train_details=adjusted_train_details, mask_untrainable=True, ) LOG.debug(f"Token offsets: {token_offsets_masked}") # Verify expected labels against actual labels expected_labels = [IGNORE_TOKEN_ID] * len(turn_tokens) for i, offset in enumerate(token_offsets_masked): if offset != IGNORE_TOKEN_ID: expected_labels[i] = turn_tokens[i] actual_labels = labels[ start_idx : start_idx + len(token_offsets_masked) ] assert actual_labels == expected_labels, ( f"Labels mismatch for turn: {turn['value']}\nExpected: {expected_labels}\nActual: {actual_labels}" ) # Verify each detail section for detail in adjusted_train_details: detail_start = start_idx + next( j for j, offset in enumerate(token_offsets_unmasked) if offset >= detail["begin_offset"] ) detail_end = start_idx + next( ( j for j, offset in enumerate(token_offsets_unmasked) if offset > detail["end_offset"] ), len(token_offsets), ) detail_text = turn["value"][ detail["begin_offset"] : detail["end_offset"] + 1 ] detail_labels = labels[detail_start:detail_end] context = ( f"detail (ind {detail_start}:{detail_end}): '{detail_text}'\n" f"decoded: 
'{tokenizer.decode(input_ids[detail_start:detail_end])}')" ) verify_labels(detail_labels, detail["train"], context) else: # Handle regular turn-level training control should_train = turn.get("train", False) turn_labels = labels[start_idx:end_idx] context = ( f"turn (ind {start_idx}:{end_idx}): '{turn['value']}'\n" f"decoded: '{decoded_response}')" ) verify_labels(turn_labels, should_train, context) LOG.debug(f"Final labels: {labels}") LOG.debug(f"Final input_ids: {input_ids}") def test_get_chat_template_variables( self, tokenizer, chat_template, chat_template_jinja, eos_token, request ): LOG.info("Testing get_chat_template_variables") actual_tokenizer, actual_jinja_template = self.setup_tokenizer( tokenizer, chat_template, chat_template_jinja, eos_token, request ) prompter = ChatTemplatePrompter( actual_tokenizer, chat_template=get_chat_template( chat_template, jinja_template=actual_jinja_template ), message_property_mappings={"from": "role", "value": "content"}, ) variables = prompter.get_chat_template_msg_variables( ( actual_jinja_template if actual_jinja_template else actual_tokenizer.get_chat_template() ), "messages", ) # Special case for Mistral with additional tool variables if chat_template == "jinja" and tokenizer == "mistralv03_tokenizer": expected_variables = {"role", "content", "tool_call_id", "tool_calls"} # Most chat templates use the standard role and content variables elif chat_template in ["llama3", "chatml", "phi_35", "phi_4"] or ( chat_template == "jinja" and tokenizer == "gemma2_tokenizer" ): expected_variables = {"role", "content"} else: LOG.warning( f"Unsupported chat template: {chat_template} with {chat_template_jinja}" ) raise ValueError( f"Unsupported chat template: {chat_template} with {chat_template_jinja}" ) assert variables == expected_variables, ( f"Expected variables: {expected_variables} from {tokenizer}/{chat_template}\n" f"Got: {variables}\n" f"Chat template: {actual_jinja_template}" ) def test_eot_tokens_conflict_with_eos_token( 
self, tokenizer, chat_template, chat_template_jinja, eos_token, basic_dataset, request, ): """Test that an error is raised when eot_tokens contains eos_token and train_on_eot/train_on_eos conflict""" LOG.info( "Testing conflict between eot_tokens containing eos_token and train_on_eot/train_on_eos mismatch" ) tokenizer, chat_template_jinja = self.setup_tokenizer( tokenizer, chat_template, chat_template_jinja, eos_token, request ) # Create a situation where eot_tokens contains eos_token eot_tokens = [ tokenizer.eos_token, "[/INST]", ] # Deliberately including eos_token # Create conflicting train_on_eos and train_on_eot settings with pytest.raises( ValueError, match=".*eos_token is in eot_tokens and train_on_eos != train_on_eot.*", ): ChatTemplateStrategy( ChatTemplatePrompter( tokenizer, chat_template=get_chat_template( chat_template, jinja_template=chat_template_jinja ), message_property_mappings={"role": "from", "content": "value"}, field_messages="conversations", ), tokenizer=tokenizer, train_on_inputs=False, sequence_len=512, roles_to_train=["assistant"], train_on_eos="none", # Setting to none train_on_eot="turn", # Different from train_on_eos eot_tokens=eot_tokens, ) def test_eot_token_backward_compatibility( self, tokenizer, chat_template, chat_template_jinja, eos_token, basic_dataset, request, ): """Test that eot_tokens inherits from eos_token when not specified""" LOG.info("Testing backward compatibility that eot_token inherits eos_token") tokenizer, chat_template_jinja = self.setup_tokenizer( tokenizer, chat_template, chat_template_jinja, eos_token, request ) strategy = ChatTemplateStrategy( ChatTemplatePrompter( tokenizer, chat_template=get_chat_template( chat_template, jinja_template=chat_template_jinja ), message_property_mappings={"role": "from", "content": "value"}, field_messages="conversations", ), tokenizer=tokenizer, train_on_inputs=False, sequence_len=512, roles_to_train=["assistant"], train_on_eos="turn", # Setting train_on_eos to "turn" ) # In 
backward compatibility mode, eot_tokens should be derived from eos_token assert strategy.eot_tokens == [tokenizer.eos_token], ( f"Expected eot_tokens to inherit from eos_token, got {strategy.eot_tokens}" ) assert strategy.train_on_eot == "turn", ( f"Expected train_on_eot to inherit from train_on_eos, got {strategy.train_on_eot}" ) def test_token_not_in_template( self, tokenizer, chat_template, chat_template_jinja, eos_token, basic_dataset, request, ): """Test runs even when tokens are not found in the template""" LOG.info("Testing runs even when tokens are not found in template") tokenizer, chat_template_jinja = self.setup_tokenizer( tokenizer, chat_template, chat_template_jinja, eos_token, request ) # Create a non-existent token that definitely won't be in the template non_existent_token = "[DEFINITELY_NOT_IN_TEMPLATE]" tokenizer.add_special_tokens( {"additional_special_tokens": [non_existent_token]} ) strategy = ChatTemplateStrategy( ChatTemplatePrompter( tokenizer, chat_template=get_chat_template( chat_template, jinja_template=chat_template_jinja ), message_property_mappings={"role": "from", "content": "value"}, field_messages="conversations", ), tokenizer=tokenizer, train_on_inputs=False, sequence_len=512, roles_to_train=["assistant"], eot_tokens=[non_existent_token], ) # Force template check by calling tokenize_prompt strategy.tokenize_prompt(basic_dataset[0]) # We can also check that a warning was logged, but there's # caplog conflicts when running with other tests # assert any( # "not found in chat_template" in record.message for record in self._caplog.records # ), "Expected warning about token not found in template was not logged" def test_custom_eot_tokens( self, tokenizer, chat_template, chat_template_jinja, eos_token, basic_dataset, request, ): """Test with custom EOT tokens to ensure proper masking and training""" LOG.info("Testing with custom EOT tokens") tokenizer, chat_template_jinja = self.setup_tokenizer( tokenizer, chat_template, 
chat_template_jinja, None, request ) # Add custom EOT tokens to the tokenizer custom_eot = "[EOT]" tokenizer.add_special_tokens({"additional_special_tokens": [custom_eot]}) # Create a custom chat template that uses our EOT token custom_template = """{% for message in messages %}{% if message['role'] == 'system' %}{{ message['content'] }}{% elif message['role'] == 'user' %}User: {{ message['content'] }}{% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }}[EOT]{% endif %}{% endfor %}""" strategy = ChatTemplateStrategy( ChatTemplatePrompter( tokenizer, chat_template=custom_template, message_property_mappings={"role": "from", "content": "value"}, field_messages="conversations", ), tokenizer=tokenizer, train_on_inputs=False, sequence_len=512, roles_to_train=["assistant"], train_on_eot="turn", # Train on EOT token after each turn eot_tokens=[custom_eot], ) res = strategy.tokenize_prompt(basic_dataset[0]) labels = res["labels"] input_ids = res["input_ids"] # Find indices of the EOT token eot_token_id = tokenizer.convert_tokens_to_ids(custom_eot) eot_indices = [ i for i, token_id in enumerate(input_ids) if token_id == eot_token_id ] assert len(eot_indices) > 0, "Expected at least one EOT token in the input" # Verify labeling for EOT tokens based on role turns = strategy.get_conversation_thread(basic_dataset[0]) assistant_turn_indices = [] non_assistant_turn_indices = [] for i, turn in enumerate(basic_dataset[0]["conversations"]): start_idx, end_idx = strategy.find_turn(turns=turns, turn_idx=i) if start_idx != -1 and end_idx != -1: # If turn is found if turn["from"] == "assistant": assistant_turn_indices.append((start_idx, end_idx)) else: non_assistant_turn_indices.append((start_idx, end_idx)) # Check EOT tokens after assistant turns are labeled for eot_idx in eot_indices: is_after_assistant = any( start_idx <= eot_idx <= end_idx + 1 # +1 to include the EOT token for start_idx, end_idx in assistant_turn_indices ) if is_after_assistant: assert 
labels[eot_idx] != IGNORE_TOKEN_ID, ( f"Expected EOT token after assistant turn at index {eot_idx} to be labeled" ) else: assert labels[eot_idx] == IGNORE_TOKEN_ID, ( f"Expected EOT token not after assistant turn at index {eot_idx} to not be labeled" ) def test_multiple_train_on_eot_settings( self, tokenizer, chat_template, chat_template_jinja, eos_token, basic_dataset, request, ): """Test different train_on_eot settings""" LOG.info("Testing different train_on_eot settings") tokenizer, chat_template_jinja = self.setup_tokenizer( tokenizer, chat_template, chat_template_jinja, eos_token, request ) # Create a list to test different train_on_eot settings test_settings = [ ("none", lambda idx, is_assistant: False), # Never train on EOT ("all", lambda idx, is_assistant: True), # Always train on EOT ( "turn", lambda idx, is_assistant: is_assistant, ), # Train on EOT after assistant turns ("last", lambda idx, is_last: is_last), # Only train on last EOT ] for setting, expected_train_func in test_settings: LOG.info(f"Testing train_on_eot='{setting}'") strategy = ChatTemplateStrategy( ChatTemplatePrompter( tokenizer, chat_template=get_chat_template( chat_template, jinja_template=chat_template_jinja ), message_property_mappings={"role": "from", "content": "value"}, field_messages="conversations", ), tokenizer=tokenizer, train_on_inputs=False, sequence_len=512, roles_to_train=["assistant"], train_on_eot=setting, eot_tokens=[ tokenizer.eos_token ], # Use eos_token as the EOT token for simplicity ) res = strategy.tokenize_prompt(basic_dataset[0]) turns = strategy.get_conversation_thread(basic_dataset[0]) labels = res["labels"] input_ids = res["input_ids"] eos_token_id = tokenizer.eos_token_id eos_indices = [ i for i, token_id in enumerate(input_ids) if token_id == eos_token_id ] assert len(eos_indices) > 0, ( "Expected at least one EOS/EOT token in the input" ) # Check labeling for each EOS/EOT token for idx, eos_idx in enumerate(eos_indices): # Find which turn this EOS token 
class TestChatTemplateToolCalling:
    """
    Tool-calling coverage for chat-template based prompt strategies.
    """

    def test_tool_calling_with_llama4_template(
        self,
        llama3_tokenizer,
    ):
        """Render a tool-call conversation with the llama4 template and check masking."""
        LOG.info("Testing tool calling with llama3 tokenizer and llama4 chat template")

        # Tool schemas offered to the model.
        xml_escape_tool = {
            "type": "function",
            "function": {
                "name": "xml_escape",
                "description": 'Replaces any "<", ">", or "&" characters in the input string with their corresponding XML entities.',
                "parameters": {
                    "type": "object",
                    "properties": {
                        "s": {
                            "type": "string",
                            "description": "The input string to be XML-escaped.",
                        }
                    },
                    "required": ["s"],
                },
            },
        }
        multiples_tool = {
            "type": "function",
            "function": {
                "name": "multiples",
                "description": "Generates a list of all the multiples of a number that are less than a given limit.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "number": {
                            "type": "integer",
                            "description": "The number to find multiples of.",
                        },
                        "limit": {
                            "type": "integer",
                            "description": "The upper limit for the multiples.",
                        },
                    },
                    "required": ["number", "limit"],
                },
            },
        }

        # user -> assistant tool call -> tool result -> assistant answer
        conversation = [
            {
                "role": "user",
                "content": "Can you help me find multiples of 5 that are less than 20?",
            },
            {
                "role": "assistant",
                "tool_calls": [
                    {
                        "type": "function",
                        "function": {
                            "name": "multiples",
                            "arguments": {
                                "number": 5,
                                "limit": 20,
                            },
                        },
                    }
                ],
            },
            {"role": "tool", "name": "multiples", "content": "5,10,15"},
            {
                "role": "assistant",
                "content": "The multiples of 5 less than 20 are: 5, 10, and 15.",
            },
        ]
        sample = {
            "tools": [xml_escape_tool, multiples_tool],
            "messages": conversation,
        }

        # Work on a copy so the shared tokenizer fixture is not mutated.
        tokenizer = deepcopy(llama3_tokenizer)

        # Register the end-of-turn (EOT) token as an additional special token.
        eot_token = "<|eot_id|>"
        tokenizer.add_special_tokens({"additional_special_tokens": [eot_token]})

        prompter = ChatTemplatePrompter(
            tokenizer,
            chat_template=get_chat_template("llama4"),
            message_property_mappings={"role": "role", "content": "content"},
            field_messages="messages",
            field_tools="tools",
        )
        strategy = ChatTemplateStrategy(
            prompter,
            tokenizer=tokenizer,
            train_on_inputs=False,
            sequence_len=512,
            roles_to_train=["assistant"],
            eot_tokens=[eot_token],
        )

        tokenized = strategy.tokenize_prompt(sample)
        input_ids = tokenized["input_ids"]
        labels = tokenized["labels"]

        # Sanity checks on the tokenized output.
        assert len(input_ids) > 0, "Input IDs should not be empty"
        assert len(labels) == len(input_ids), "Labels should match input_ids length"

        # The rendered conversation must contain each tool-calling fragment.
        decoded_conversation = tokenizer.decode(input_ids)
        expected_fragments = (
            (
                '"type": "function",',
                "Tool type function should be in conversation",
            ),
            (
                '"name": "multiples",',
                "Tool function name should be in conversation",
            ),
            (
                '<|python_start|><|python_end|>{"name": "multiples", "parameters": {"number": 5, "limit": 20}}<|eot|>',
                "Assistant tool call should be in conversation",
            ),
            (
                "<|header_start|>ipython<|header_end|>",
                "IPython header should be in conversation",
            ),
            (
                '"5,10,15"',
                "Tool response should be in conversation",
            ),
        )
        for fragment, failure_message in expected_fragments:
            assert fragment in decoded_conversation, failure_message

        # Every assistant turn must be located and fully unmasked.
        turns = strategy.get_conversation_thread(sample)
        tools = strategy._get_tools(sample)

        for turn_idx, message in enumerate(sample["messages"]):
            if message["role"] != "assistant":
                continue

            span_start, span_end = strategy.find_turn(
                turns=turns, turn_idx=turn_idx, tools=tools
            )
            assert span_start != -1 and span_end != -1, (
                f"Assistant turn {turn_idx} should be found"
            )

            assistant_labels = labels[span_start:span_end]
            assert all(label != IGNORE_TOKEN_ID for label in assistant_labels), (
                f"Assistant turn {turn_idx} should be unmasked"
            )
request: pytest.FixtureRequest, ): """Test chat template with the Magistral/Devstral tokenizer""" from axolotl.prompt_strategies.chat_template import MistralPrompter, MistralStrategy tokenizer: HFMistralTokenizer = request.getfixturevalue(tokenizer_str) # check bos, eos, pad, unk are accessible properties assert tokenizer.bos_token_id == 1 assert tokenizer.eos_token_id == 2 assert tokenizer.pad_token_id == 11 assert tokenizer.unk_token_id == 0 assert tokenizer.pad_token == "" assert tokenizer.eos_token == "" assert tokenizer.bos_token == "" assert tokenizer.unk_token == "" strategy = MistralStrategy( MistralPrompter( tokenizer, chat_template=None, message_property_mappings={"role": "role", "content": "content"}, ), tokenizer=tokenizer, train_on_inputs=False, train_on_eos="turn", sequence_len=512, roles_to_train=["assistant"], ) # test chat template masking without system prompt res = strategy.tokenize_prompt( { "messages": [ {"role": "user", "content": "Hello, how are you?"}, {"role": "assistant", "content": "I'm doing great, thank you!"}, ] } ) assert res["input_ids"] == [ 1, # bos 3, # [INST] 22177, # Hello 1044, # , 2606, # how 1584, # are 1636, # you 1063, # ? 4, # [/INST] 1073, # I 4525, # 'm 6965, # doing 4824, # great 1044, # , 15412, # thank 1636, # you 1033, # ! 2, # ] assert res["labels"] == [ -100, # bos -100, # [INST] -100, # Hello -100, # , -100, # how -100, # are -100, # you -100, # ? -100, # [/INST] 1073, # I 4525, # 'm 6965, # doing 4824, # great 1044, # , 15412, # thank 1636, # you 1033, # ! 2, # ] # test chat template masking with system prompt res = strategy.tokenize_prompt( { "messages": [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello, how are you?"}, {"role": "assistant", "content": "I'm doing great, thank you!"}, ] } ) assert res["input_ids"] == [ 1, # bos 17, # [SYSTEM_PROMPT] 4568, # You 1584, # are 1261, # a 20351, # helpful 27089, # assistant 1046, # . 
18, # [/SYSTEM_PROMPT] 3, # [INST] 22177, # Hello 1044, # , 2606, # how 1584, # are 1636, # you 1063, # ? 4, # [/INST] 1073, # I 4525, # 'm 6965, # doing 4824, # great 1044, # , 15412, # thank 1636, # you 1033, # ! 2, # ] assert res["labels"] == [ -100, # bos -100, # [SYSTEM_PROMPT] -100, # You -100, # are -100, # a -100, # helpful -100, # assistant -100, # . -100, # [/SYSTEM_PROMPT] -100, # [INST] -100, # Hello -100, # , -100, # how -100, # are -100, # you -100, # ? -100, # [/INST] 1073, # I 4525, # 'm 6965, # doing 4824, # great 1044, # , 15412, # thank 1636, # you 1033, # ! 2, # ] # test chat template with tools res = strategy.tokenize_prompt( { "tools": [ { "type": "function", "function": { "name": "multiples", "description": "Generates a list of all the multiples of a number that are less than a given limit.", "parameters": { "type": "object", "properties": { "number": { "type": "integer", "description": "The number to find multiples of.", }, "limit": { "type": "integer", "description": "The upper limit for the multiples.", }, }, "required": ["number", "limit"], }, }, }, ], "messages": [ { "role": "user", "content": "Hey, can you give me a breakdown of how to throw an awesome themed party? Like, what themes work best, and how can I set everything up to really wow my guests? 
I want some ideas on decorations, food, and activities that will make the party unforgettable!", }, { "role": "assistant", "tool_calls": [ { "id": "call12345", "type": "function", "function": { "name": "multiples", "arguments": { "number": 16, "limit": 2, }, }, } ], }, { "role": "tool", "tool_call_id": "call12345", "name": "multiples", "content": "1,2", }, {"role": "assistant", "content": "The multiples of 16 is 1 and 2."}, ], } ) # fmt: off assert res["input_ids"] == [ 1, # bos 5, 1091, 19227, 4994, 2811, 1429, 5165, 1897, 1429, 5165, 2811, 16753, 2391, 2811, 1429, 44627, 3684, 1897, 1429, 14653, 2811, 1429, 10639, 2130, 1261, 2951, 1307, 1747, 1278, 60092, 1307, 1261, 2782, 1455, 1584, 4289, 2224, 1261, 4265, 6139, 39249, 1429, 26204, 2811, 16753, 4994, 2811, 1429, 6371, 1897, 1429, 48649, 2811, 16753, 12856, 2811, 16753, 4994, 2811, 1429, 49039, 1897, 1429, 14653, 2811, 1429, 1784, 2782, 1317, 3081, 60092, 1307, 2613, 4179, 1429, 33319, 2811, 16753, 4994, 2811, 1429, 49039, 1897, 1429, 14653, 2811, 1429, 1784, 9229, 6139, 1394, 1278, 60092, 2613, 47579, 1429, 15760, 2811, 12161, 12856, 1897, 1429, 33319, 4964, 2821, 27028, 6, # tool prompt 3, 46634, 1044, 1710, 1636, 5628, 1639, 1261, 44433, 1307, 2606, 1317, 5388, 1420, 54191, 2424, 1286, 8967, 1063, 15621, 1044, 2549, 30305, 2196, 3560, 1044, 1321, 2606, 1710, 1362, 2016, 8605, 2015, 1317, 5524, 118931, 2036, 32951, 1063, 1362, 2933, 2269, 12106, 1408, 101987, 1044, 6939, 1044, 1321, 9216, 1455, 2084, 3180, 1278, 8967, 119141, 1689, 5935, 1033, 4, # user *assistant_toolcall_ids, # assistant tool calling *tool_result_ids, # tool result 1784, 60092, 1307, 1032, 1049, 1054, 1395, 1032, 1049, 1321, 1032, 1050, 1046, # assistant 2 # eos ] assert res["labels"] == [ -100, # bos -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, # tool prompt -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, # user prompt *assistant_toolcall_ids, # assistant tool calling *([-100] * len(tool_result_ids)), # tool result 1784, 60092, 1307, 1032, 1049, 1054, 1395, 1032, 1049, 1321, 1032, 1050, 1046, # assistant 2 # eos ] # fmt: on # test chat template with tokenize=False res = tokenizer.apply_chat_template( [ {"role": "user", "content": "Hello, how are you?"}, {"role": "assistant", "content": "I'm doing great, thank you!"}, ], tokenize=False, ) assert res == "[INST]Hello, how are you?[/INST]I'm doing great, thank you!" # test encode res = tokenizer.encode("Hello, how are you?", add_special_tokens=True) assert res == [ 1, # bos 22177, # Hello 1044, # , 2606, # how 1584, # are 1636, # you 1063, # ? 2, # eos ] # test decode no skip special tokens decoded_res = tokenizer.decode(res, skip_special_tokens=False) assert decoded_res == "Hello, how are you?" # test decode skip special tokens decoded_res = tokenizer.decode(res, skip_special_tokens=True) assert decoded_res == "Hello, how are you?" # test encode no special tokens res = tokenizer.encode("Hello, how are you?", add_special_tokens=False) assert res == [ 22177, # Hello 1044, # , 2606, # how 1584, # are 1636, # you 1063, # ? 
] # test convert ids to tokens res = tokenizer.convert_ids_to_tokens(res) # spacing are needed as we are converting without decoding assert res == ["Hello", ",", " how", " are", " you", "?"] @pytest.mark.skip(reason="TODO, fix for new HF wrapper call") def test_magistral_tokenizer_pad_method(magistral_tokenizer: "HFMistralTokenizer"): """Test the MistralTokenizer pad method""" from axolotl.utils.collators.core import IGNORE_INDEX magistral_pad_token_id = 11 # taken from tokenizer.pad_token_id # Test padding with input_ids and labels only features = [ {"input_ids": [1, 2, 3], "labels": [4, 5, 6]}, {"input_ids": [7, 8], "labels": [9, 10]}, ] result = magistral_tokenizer.pad(features, padding=True, return_tensors="pt") # Check that input_ids are padded correctly assert result["input_ids"].shape == (2, 3) assert result["input_ids"].tolist() == [[1, 2, 3], [7, 8, magistral_pad_token_id]] # Check that labels are padded correctly assert result["labels"].shape == (2, 3) assert result["labels"].tolist() == [[4, 5, 6], [9, 10, IGNORE_INDEX]] # Check that attention_mask and position_ids are NOT created assert "attention_mask" not in result assert "position_ids" not in result # Test padding with attention_mask features_with_attention = [ {"input_ids": [1, 2, 3], "labels": [4, 5, 6], "attention_mask": [1, 1, 1]}, {"input_ids": [7, 8], "labels": [9, 10], "attention_mask": [1, 1]}, ] result = magistral_tokenizer.pad( features_with_attention, padding=True, return_tensors="pt" ) # Check that attention_mask is padded correctly assert result["attention_mask"].shape == (2, 3) assert result["attention_mask"].tolist() == [[1, 1, 1], [1, 1, 0]] # Test padding with position_ids features_with_position = [ {"input_ids": [1, 2, 3], "labels": [4, 5, 6], "position_ids": [0, 1, 2]}, {"input_ids": [7, 8], "labels": [9, 10], "position_ids": [0, 1]}, ] result = magistral_tokenizer.pad( features_with_position, padding=True, return_tensors="pt" ) # Check that position_ids are padded correctly 
(continuing sequence) assert result["position_ids"].shape == (2, 3) assert result["position_ids"].tolist() == [[0, 1, 2], [0, 1, 2]] # Test padding with all fields features_all = [ { "input_ids": [1, 2, 3], "labels": [4, 5, 6], "attention_mask": [1, 1, 1], "position_ids": [0, 1, 2], }, { "input_ids": [7, 8], "labels": [9, 10], "attention_mask": [1, 1], "position_ids": [0, 1], }, ] result = magistral_tokenizer.pad(features_all, padding=True, return_tensors="pt") # All fields should be present and correctly padded assert "input_ids" in result assert "labels" in result assert "attention_mask" in result assert "position_ids" in result # Test padding with all sequences same length features_same_length = [ {"input_ids": [1, 2, 3], "labels": [4, 5, 6]}, {"input_ids": [7, 8, 9], "labels": [10, 11, 12]}, ] result = magistral_tokenizer.pad( features_same_length, padding=True, return_tensors="pt" ) # Check match when no padding is needed assert result["input_ids"][0].tolist() == features_same_length[0]["input_ids"] assert result["labels"][0].tolist() == features_same_length[0]["labels"] assert result["input_ids"][1].tolist() == features_same_length[1]["input_ids"] assert result["labels"][1].tolist() == features_same_length[1]["labels"] # Test padding with max_length parameter result = magistral_tokenizer.pad( features, padding="max_length", max_length=5, return_tensors="pt" ) # Should pad to max_length assert result["input_ids"].shape == (2, 5) assert result["labels"].shape == (2, 5) # Test numpy return type result = magistral_tokenizer.pad(features, padding=True, return_tensors="np") # Should return numpy arrays import numpy as np assert isinstance(result["input_ids"], np.ndarray) assert isinstance(result["labels"], np.ndarray) # Test unsupported field rejection features_unsupported = [ {"input_ids": [1, 2, 3], "labels": [4, 5, 6], "unsupported_field": [7, 8, 9]}, ] with pytest.raises(NotImplementedError, match="unsupported_field"): 
def test_magistral_tool_calling(magistral_tokenizer: "HFMistralTokenizer"):
    """Test tool calling with the Magistral tokenizer.

    Covers four scenarios with a single ``MistralStrategy``:

    1. one tool, one call / result round trip;
    2. two tools called one after another;
    3. a system prompt combined with tools;
    4. an invalid conversation where an assistant tool call is never
       answered by a ``tool`` message, which must raise
       ``InvalidMessageStructureException``.
    """
    from axolotl.prompt_strategies.chat_template import MistralPrompter, MistralStrategy

    strategy = MistralStrategy(
        MistralPrompter(
            magistral_tokenizer,
            chat_template=None,
            message_property_mappings={"role": "role", "content": "content"},
        ),
        tokenizer=magistral_tokenizer,
        train_on_inputs=False,
        train_on_eos="turn",
        sequence_len=512,
        roles_to_train=["assistant"],
    )

    # Test basic tool calling with single function
    basic_tool_calling = {
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "description": "Get the current weather for a location",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "location": {
                                "type": "string",
                                "description": "The city and state, e.g. San Francisco, CA",
                            },
                        },
                        "required": ["location"],
                    },
                },
            },
        ],
        "messages": [
            {
                "role": "user",
                "content": "What's the weather like in San Francisco?",
            },
            {
                "role": "assistant",
                "tool_calls": [
                    {
                        "id": "call12345",
                        "type": "function",
                        "function": {
                            "name": "get_weather",
                            "arguments": {
                                "location": "San Francisco, CA",
                            },
                        },
                    }
                ],
            },
            {
                "role": "tool",
                "tool_call_id": "call12345",
                "name": "get_weather",
                "content": "Sunny, 72°F",
            },
            {
                "role": "assistant",
                "content": "The weather in San Francisco is sunny and 72°F.",
            },
        ],
    }

    res = strategy.tokenize_prompt(basic_tool_calling)

    # Basic validation
    assert "input_ids" in res
    assert "labels" in res
    assert len(res["input_ids"]) > 0
    assert len(res["labels"]) == len(res["input_ids"])

    # Decode and verify structure
    decoded = magistral_tokenizer.decode(res["input_ids"])

    assert (
        '[AVAILABLE_TOOLS][{"type": "function", "function": {"name": "get_weather", "description": "Get the current weather for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city and state, e.g. San Francisco, CA"}}, "required": ["location"]}}}][/AVAILABLE_TOOLS]'
        in decoded
    )
    assert (
        '[TOOL_CALLS]get_weather[CALL_ID]call12345[ARGS]{"location": "San Francisco, CA"}'
        in decoded
    )
    assert "[TOOL_RESULTS]call12345[TOOL_CONTENT]Sunny, 72°F[/TOOL_RESULTS]" in decoded
    assert "The weather in San Francisco is sunny and 72°F." in decoded

    # Test multiple tool calls in sequence
    multi_tool_calling = {
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "add_numbers",
                    "description": "Add two numbers together",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "a": {"type": "number", "description": "First number"},
                            "b": {"type": "number", "description": "Second number"},
                        },
                        "required": ["a", "b"],
                    },
                },
            },
            {
                "type": "function",
                "function": {
                    "name": "multiply_numbers",
                    "description": "Multiply two numbers",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "x": {"type": "number", "description": "First number"},
                            "y": {"type": "number", "description": "Second number"},
                        },
                        "required": ["x", "y"],
                    },
                },
            },
        ],
        "messages": [
            {
                "role": "user",
                "content": "Add 5 and 3, then multiply the result by 2",
            },
            {
                "role": "assistant",
                "tool_calls": [
                    {
                        "id": "call12345",
                        "type": "function",
                        "function": {
                            "name": "add_numbers",
                            "arguments": {"a": 5, "b": 3},
                        },
                    }
                ],
            },
            {
                "role": "tool",
                "tool_call_id": "call12345",
                "name": "add_numbers",
                "content": "8",
            },
            {
                "role": "assistant",
                "tool_calls": [
                    {
                        "id": "call23456",
                        "type": "function",
                        "function": {
                            "name": "multiply_numbers",
                            "arguments": {"x": 8, "y": 2},
                        },
                    }
                ],
            },
            {
                "role": "tool",
                "tool_call_id": "call23456",
                "name": "multiply_numbers",
                "content": "16",
            },
            {
                "role": "assistant",
                "content": "The result is 16. I first added 5 and 3 to get 8, then multiplied 8 by 2 to get 16.",
            },
        ],
    }

    res = strategy.tokenize_prompt(multi_tool_calling)

    # Validation
    assert len(res["input_ids"]) > 0
    assert len(res["labels"]) == len(res["input_ids"])

    decoded = magistral_tokenizer.decode(res["input_ids"])

    assert (
        '[AVAILABLE_TOOLS][{"type": "function", "function": {"name": "add_numbers", "description": "Add two numbers together", "parameters": {"type": "object", "properties": {"a": {"type": "number", "description": "First number"}, "b": {"type": "number", "description": "Second number"}}, "required": ["a", "b"]}}}, {"type": "function", "function": {"name": "multiply_numbers", "description": "Multiply two numbers", "parameters": {"type": "object", "properties": {"x": {"type": "number", "description": "First number"}, "y": {"type": "number", "description": "Second number"}}, "required": ["x", "y"]}}}][/AVAILABLE_TOOLS]'
        in decoded
    )
    assert (
        '[TOOL_CALLS]add_numbers[CALL_ID]call12345[ARGS]{"a": 5, "b": 3}' in decoded
    )
    assert "[TOOL_RESULTS]call12345[TOOL_CONTENT]8[/TOOL_RESULTS]" in decoded
    assert (
        '[TOOL_CALLS]multiply_numbers[CALL_ID]call23456[ARGS]{"x": 8, "y": 2}'
        in decoded
    )
    assert "[TOOL_RESULTS]call23456[TOOL_CONTENT]16[/TOOL_RESULTS]" in decoded
    assert (
        "The result is 16. I first added 5 and 3 to get 8, then multiplied 8 by 2 to get 16."
        in decoded
    )

    # Test tool calling with system message
    system_tool_calling = {
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "search_database",
                    "description": "Search for information in database",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query": {"type": "string", "description": "Search query"},
                        },
                        "required": ["query"],
                    },
                },
            },
        ],
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant with access to a database.",
            },
            {
                "role": "user",
                "content": "Find information about Python programming",
            },
            {
                "role": "assistant",
                "tool_calls": [
                    {
                        "id": "search123",
                        "type": "function",
                        "function": {
                            "name": "search_database",
                            "arguments": {"query": "Python programming"},
                        },
                    }
                ],
            },
            {
                "role": "tool",
                "tool_call_id": "search123",
                "name": "search_database",
                "content": "Python is a high-level programming language known for its simplicity.",
            },
            {
                "role": "assistant",
                "content": "Based on the database search, Python is a high-level programming language known for its simplicity and readability.",
            },
        ],
    }

    res = strategy.tokenize_prompt(system_tool_calling)

    # Validation
    assert len(res["input_ids"]) > 0
    assert len(res["labels"]) == len(res["input_ids"])

    decoded = magistral_tokenizer.decode(res["input_ids"])

    assert (
        '[SYSTEM_PROMPT]You are a helpful assistant with access to a database.[/SYSTEM_PROMPT][AVAILABLE_TOOLS][{"type": "function", "function": {"name": "search_database", "description": "Search for information in database", "parameters": {"type": "object", "properties": {"query": {"type": "string", "description": "Search query"}}, "required": ["query"]}}}][/AVAILABLE_TOOLS]'
        in decoded
    )

    # Test error handling - missing tool response
    incomplete_tool_calling = {
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "get_time",
                    "description": "Get current time",
                    "parameters": {"type": "object", "properties": {}},
                },
            },
        ],
        "messages": [
            {
                "role": "user",
                "content": "What time is it?",
            },
            {
                "role": "assistant",
                "tool_calls": [
                    {
                        "id": "time12345",
                        "type": "function",
                        "function": {
                            "name": "get_time",
                            "arguments": {},
                        },
                    }
                ],
            },
            {
                "role": "assistant",
                "content": "The current time is 12:00 PM.",
            },
        ],
    }

    from mistral_common.exceptions import InvalidMessageStructureException

    # pytest.raises fails the test when nothing is raised; a bare
    # try/except around the call would pass silently in that case and
    # never actually exercise the error path.
    with pytest.raises(
        InvalidMessageStructureException,
        match="Not the same number of function calls and responses",
    ):
        strategy.tokenize_prompt(incomplete_tool_calling)
match (don't compare token dimension) assert len(hf_result_pt["input_ids"].shape) == len( mistral_result_pt["input_ids"].shape ) assert hf_result_pt["input_ids"].shape[0] == mistral_result_pt["input_ids"].shape[0] assert ( mistral_result_pt["attention_mask"].shape == mistral_result_pt["input_ids"].shape ) assert torch.all(mistral_result_pt["attention_mask"] == 1) # Test batch input with padding hf_batch: dict[str, torch.Tensor] = hf_tokenizer( batch_texts, return_tensors="pt", padding=True ) mistral_batch: dict[str, torch.Tensor] = magistral_tokenizer( batch_texts, return_tensors="pt", padding=True ) # Check batch behavior assert len(hf_batch["input_ids"].shape) == len(mistral_batch["input_ids"].shape) assert hf_batch["input_ids"].shape[0] == mistral_batch["input_ids"].shape[0] assert mistral_batch["attention_mask"].shape == mistral_batch["input_ids"].shape assert torch.any( mistral_batch["attention_mask"][0] == 0 ) # padding in shorter sequence assert torch.all( mistral_batch["attention_mask"][1] == 1 ) # no padding in longer sequence # Test numpy tensors mistral_result_np: dict[str, np.ndarray] = magistral_tokenizer( test_text, return_tensors="np" ) assert isinstance(mistral_result_np["input_ids"], np.ndarray) assert isinstance(mistral_result_np["attention_mask"], np.ndarray) # Test consistency with encode() encoded: list[int] = magistral_tokenizer.encode(test_text, add_special_tokens=True) called: dict[str, torch.Tensor] = magistral_tokenizer( test_text, return_tensors="pt" ) assert encoded == called["input_ids"][0].tolist() # Test Error handling with pytest.raises(ValueError, match="Unsupported kwargs"): magistral_tokenizer(test_text, unsupported_param=True) with pytest.raises( ValueError, match="return_tensors='pt' or 'np' requires padding or truncation" ): magistral_tokenizer(batch_texts, return_tensors="pt") if __name__ == "__main__": unittest.main() ================================================ FILE: 
@pytest.fixture(name="messages_w_reasoning")
def messages_w_reasoning_fixture():
    """Dataset of three conversations whose assistant reply embeds reasoning.

    Every sample carries the reasoning text ``lorem`` and the answer
    ``welcome``; only the delimiter style differs (think tags, the
    begin_of_thought / begin_of_solution markers, reasoning tags). The
    consumer of this fixture expects each sample to split into
    ``reasoning_content == "lorem"`` and ``content == "welcome"``, so each
    reply must actually contain a delimiter pair around the reasoning.
    """

    def _tagged(tag, text):
        # Wrap *text* in an XML-style open/close pair for *tag*.
        return f"<{tag}>{text}</{tag}>"

    def _conversation(assistant_content):
        # Single user greeting followed by the assistant reply under test.
        return {
            "messages": [
                {
                    "role": "user",
                    "content": "hello",
                },
                {
                    "role": "assistant",
                    "content": assistant_content,
                },
            ]
        }

    return Dataset.from_list(
        [
            # think-tag delimited reasoning
            _conversation(_tagged("think", "lorem") + "\nwelcome"),
            # thought / solution marker delimited reasoning
            _conversation(
                "<|begin_of_thought|>lorem<|end_of_thought|>\n<|begin_of_solution|>welcome\n<|end_of_solution|>"
            ),
            # reasoning-tag delimited reasoning
            _conversation(_tagged("reasoning", "lorem") + "\nwelcome"),
        ]
    )
@pytest.fixture(name="qwen3_instruct_prompt_strategy")
def qwen3_instruct_chat_template_strategy(qwen3_tokenizer):
    """
    Build the chat_template prompt strategy used by the Qwen3 tool-call tests.

    Args:
        qwen3_tokenizer: tokenizer fixture handed straight to `load`.

    Returns:
        Whatever `load` returns for the "qwen3" chat template with
        `train_on_inputs` disabled and a 512-token sequence length.
    """
    training_cfg = DictDefault(
        {
            "train_on_inputs": False,
            "sequence_len": 512,
        }
    )
    dataset_cfg = DictDefault(
        {
            "chat_template": "qwen3",
            "message_field_role": "role",
            "message_field_content": "content",
            "message_property_mappings": {
                "role": "role",
                "content": "content",
            },
            "roles": {
                "user": ["user"],
                "assistant": ["assistant"],
                "system": ["system"],
            },
            "field_messages": "messages",
        }
    )
    return load(qwen3_tokenizer, training_cfg, dataset_cfg)
@pytest.fixture(name="conversation_mixed_time_types_dataset")
def fixture_conversation_mixed_time_types_dataset(self):
    """
    Provides a one-row dataset whose assistant turn makes two tool calls with
    a 'time' argument of different types (string for func1, number for func2),
    each serialized to a JSON string.
    """

    def _tool_call(name, time_value):
        # Arguments are JSON-encoded so both calls carry a plain string.
        return {
            "function": {
                "name": name,
                "arguments": json.dumps({"time": time_value}),
            }
        }

    user_turn = {
        "role": "user",
        "content": "Get weather information at different times",
    }
    assistant_turn = {
        "role": "assistant",
        "content": "",
        "tool_calls": [
            _tool_call("func1", "2025-08-01"),  # string type
            _tool_call("func2", 1690876800),  # number type
        ],
    }
    return Dataset.from_list([{"messages": [user_turn, assistant_turn]}])
""" processed_dict_args = conversation_dict_args_dataset.map( qwen3_instruct_prompt_strategy.tokenize_prompt, batched=True, remove_columns=["messages"], ) processed_str_args = conversation_str_args_dataset.map( qwen3_instruct_prompt_strategy.tokenize_prompt, batched=True, remove_columns=["messages"], ) decoded_prompt_from_dict = qwen3_tokenizer.decode( processed_dict_args[0]["input_ids"] ) decoded_prompt_from_str = qwen3_tokenizer.decode( processed_str_args[0]["input_ids"] ) assert decoded_prompt_from_dict == decoded_prompt_from_str, ( f"Dict format output:\n{decoded_prompt_from_dict}\n" f"String format output:\n{decoded_prompt_from_str}" ) assert ( processed_dict_args[0]["input_ids"] == processed_str_args[0]["input_ids"] ), "The tokenized input_ids should be identical for dict and str arguments" def test_str_args_with_mixed_time_types_no_error( self, conversation_mixed_time_types_dataset, qwen3_instruct_prompt_strategy, qwen3_tokenizer, ): """ Tests that when 'time' field has different types (string vs number) in different tool calls, str format arguments don't cause errors. """ processed = conversation_mixed_time_types_dataset.map( qwen3_instruct_prompt_strategy.tokenize_prompt, batched=True, remove_columns=["messages"], ) assert len(processed) == 1 assert "input_ids" in processed[0] assert len(processed[0]["input_ids"]) > 0 decoded = qwen3_tokenizer.decode(processed[0]["input_ids"]) assert "2025-08-01" in decoded, "String time value should be present" assert "1690876800" in decoded, "Number time value should be present" class TestQwen3IdenticalToolsParameters: """ Test Qwen3 tools parameters handling is identical between JSON string and dict """ @pytest.fixture(name="tools_dict_params_dataset") def fixture_tools_dict_params_dataset(self): """ Provides a dataset with tools where parameters is a dict. 
""" tools = [ { "type": "function", "function": { "name": "get_weather", "description": "Get weather information", "parameters": { "type": "object", "properties": { "location": { "type": "string", "description": "The city and state", }, "unit": { "type": "string", "enum": ["celsius", "fahrenheit"], }, }, "required": ["location"], }, }, } ] data = [ { "tools": tools, "messages": [ {"role": "user", "content": "What's the weather?"}, { "role": "assistant", "content": "", "tool_calls": [ { "type": "function", "function": { "name": "get_weather", "arguments": {"location": "Boston, MA"}, }, } ], }, { "role": "tool", "name": "get_weather", "content": "72°F and sunny", }, ], } ] return Dataset.from_list(data) @pytest.fixture(name="tools_str_params_dataset") def fixture_tools_str_params_dataset(self): """ Provides a dataset with tools where parameters is a JSON string. """ parameters_dict = { "type": "object", "properties": { "location": {"type": "string", "description": "The city and state"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, }, "required": ["location"], } tools = [ { "type": "function", "function": { "name": "get_weather", "description": "Get weather information", "parameters": json.dumps(parameters_dict), }, } ] data = [ { "tools": tools, "messages": [ {"role": "user", "content": "What's the weather?"}, { "role": "assistant", "content": "", "tool_calls": [ { "type": "function", "function": { "name": "get_weather", "arguments": {"location": "Boston, MA"}, }, } ], }, { "role": "tool", "name": "get_weather", "content": "72°F and sunny", }, ], } ] return Dataset.from_list(data) @pytest.fixture(name="tools_mixed_type_params_dataset") def fixture_tools_mixed_type_params_dataset(self): """ Provides a dataset where different tools have the same parameter name with different types. This tests that JSON string format prevents casting issues. 
def test_dict_and_str_params_produce_equivalent_output(
    self,
    tools_dict_params_dataset,
    tools_str_params_dataset,
    qwen3_instruct_prompt_strategy,
    qwen3_tokenizer,
):
    """
    Tests that after tokenization and decoding, the outputs for both dict and
    string `parameters` in tools are semantically equivalent.

    The tool definitions are compared as parsed JSON (key order may differ),
    and the remainder of the prompt is compared verbatim with the tool
    section masked out.
    """
    import re

    # NOTE(review): assumes the rendered prompt wraps the tool definitions as
    # "<tools>\n{json}\n</tools>" — confirm against the tokenizer's template.
    # The previous patterns had lost these anchors: r"\n(.*?)\n" captured the
    # first arbitrary line, and r".*?" matched the empty string everywhere.
    tools_json_pattern = re.compile(r"<tools>\n(.*?)\n</tools>", re.DOTALL)
    tools_section_pattern = re.compile(r"<tools>.*?</tools>", re.DOTALL)

    processed_dict_params = tools_dict_params_dataset.map(
        qwen3_instruct_prompt_strategy.tokenize_prompt,
        batched=True,
        remove_columns=["messages", "tools"],
    )
    processed_str_params = tools_str_params_dataset.map(
        qwen3_instruct_prompt_strategy.tokenize_prompt,
        batched=True,
        remove_columns=["messages", "tools"],
    )

    decoded_dict = qwen3_tokenizer.decode(processed_dict_params[0]["input_ids"])
    decoded_str = qwen3_tokenizer.decode(processed_str_params[0]["input_ids"])

    # Extract the tool JSON from both outputs
    dict_tools_match = tools_json_pattern.search(decoded_dict)
    str_tools_match = tools_json_pattern.search(decoded_str)

    assert dict_tools_match and str_tools_match, (
        "Could not find tools section in output"
    )

    # Parse the JSON and compare as objects (order-independent)
    dict_tools_json = json.loads(dict_tools_match.group(1))
    str_tools_json = json.loads(str_tools_match.group(1))

    # Deep comparison of the tool definitions
    assert dict_tools_json == str_tools_json, (
        f"Tool definitions are not equivalent:\n"
        f"Dict format: {json.dumps(dict_tools_json, indent=2)}\n"
        f"String format: {json.dumps(str_tools_json, indent=2)}"
    )

    # Verify the rest of the structure is the same (excluding the tools JSON part).
    # The tools JSON can have different key order, so it is masked out here.
    dict_normalized = tools_section_pattern.sub("TOOLS_PLACEHOLDER", decoded_dict)
    str_normalized = tools_section_pattern.sub("TOOLS_PLACEHOLDER", decoded_str)

    assert dict_normalized == str_normalized, (
        "The overall structure differs between dict and string parameter formats"
    )
""" processed = tools_mixed_type_params_dataset.map( qwen3_instruct_prompt_strategy.tokenize_prompt, batched=True, remove_columns=["messages", "tools"], ) assert len(processed) == 1 assert "input_ids" in processed[0] assert len(processed[0]["input_ids"]) > 0 decoded = qwen3_tokenizer.decode(processed[0]["input_ids"]) # Check that both tools are present assert "tool_with_string_arg" in decoded assert "tool_with_number_arg" in decoded # Check that both argument values are present assert "hello" in decoded assert "42" in decoded ================================================ FILE: tests/prompt_strategies/test_dpo_chat_templates.py ================================================ """ tests for chat_template prompt strategy """ import unittest import pytest from datasets import Dataset from transformers import AutoTokenizer from axolotl.prompt_strategies.dpo.chat_template import argilla_chat, default from axolotl.utils.dict import DictDefault from tests.hf_offline_utils import enable_hf_offline @pytest.fixture(name="assistant_dataset") def fixture_assistant_dataset(): return Dataset.from_list( [ { "messages": [ { "role": "user", "content": "hello", }, { "role": "assistant", "content": "hello", }, { "role": "user", "content": "goodbye", }, ], "chosen": { "role": "assistant", "content": "goodbye", }, "rejected": { "role": "assistant", "content": "party on", }, } ] ) @pytest.fixture(name="custom_assistant_dataset") def fixture_custom_assistant_dataset(): return Dataset.from_list( [ { "conversation": [ { "speaker": "human", "text": "hello", }, { "speaker": "agent", "text": "hello", }, { "speaker": "human", "text": "goodbye", }, ], "better": { "speaker": "agent", "text": "goodbye", }, "worse": { "speaker": "agent", "text": "party on", }, } ] ) @pytest.fixture(name="argilla_chat_dataset") def fixture_argilla_chat_dataset(): return Dataset.from_list( [ { "chosen": [ { "role": "user", "content": "hello", }, { "role": "assistant", "content": "goodbye", }, ], "rejected": [ { 
"role": "user", "content": "hello", }, { "role": "assistant", "content": "party on", }, ], } ] ) @pytest.fixture(name="phi3_tokenizer") @enable_hf_offline def fixture_phi3_tokenizer(): tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-medium-128k-instruct") return tokenizer @pytest.fixture(name="gemma_tokenizer") @enable_hf_offline def fixture_gemma_tokenizer(): tokenizer = AutoTokenizer.from_pretrained("unsloth/gemma-2b-it", revision="703fb4a") return tokenizer class TestAssistantDPOChatTemplateLlama3: """ Test class for assistant style datasets with llama-3 prompts using the chat_template strategy. """ def test_llama3_defaults(self, llama3_tokenizer, assistant_dataset): transform_fn, _ = default( DictDefault( { "chat_template": "llama3", "datasets": [ { "type": "chat_template", } ], } ) ) result = transform_fn(assistant_dataset[0], tokenizer=llama3_tokenizer) assert result["prompt"] == ( "<|begin_of_text|>" + "<|start_header_id|>user<|end_header_id|>\n\nhello<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>\n\nhello<|eot_id|>" + "<|start_header_id|>user<|end_header_id|>\n\ngoodbye<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>\n\n" ) assert result["chosen"] == "goodbye<|eot_id|>" assert result["rejected"] == "party on<|eot_id|>" def test_llama3_configured(self, llama3_tokenizer, custom_assistant_dataset): transform_fn, _ = default( DictDefault( { "chat_template": "llama3", "datasets": [ { "type": "chat_template", "field_messages": "conversation", "field_chosen": "better", "field_rejected": "worse", "message_field_role": "speaker", "message_field_content": "text", "roles": { "user": ["human"], "assistant": ["agent"], "system": ["sys"], }, } ], } ) ) result = transform_fn(custom_assistant_dataset[0], tokenizer=llama3_tokenizer) assert result["prompt"] == ( "<|begin_of_text|>" + "<|start_header_id|>user<|end_header_id|>\n\nhello<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>\n\nhello<|eot_id|>" + 
class TestAssistantDPOChatTemplateGemma:
    """
    Test class for assistant style datasets with gemma prompts using the tokenizer's chat_template strategy.
    """

    def test_gemma_defaults(self, gemma_tokenizer, assistant_dataset):
        """
        Check prompt/chosen/rejected produced by the default DPO transform
        when the tokenizer's own chat template is used.
        """
        transform_fn, _ = default(
            DictDefault(
                {
                    "chat_template": "tokenizer_default",
                    "datasets": [
                        {
                            "type": "chat_template",
                        }
                    ],
                }
            )
        )
        result = transform_fn(assistant_dataset[0], tokenizer=gemma_tokenizer)

        # NOTE(review): the expected strings previously contained no turn
        # markers at all, unlike the llama3/phi3 tests, which keep theirs.
        # The markers below follow the Gemma chat format
        # (<bos>, <start_of_turn>, <end_of_turn>) — confirm against the
        # pinned tokenizer revision.
        assert result["prompt"] == (
            "<bos><start_of_turn>user\nhello<end_of_turn>\n"
            + "<start_of_turn>model\nhello<end_of_turn>\n"
            + "<start_of_turn>user\ngoodbye<end_of_turn>\n"
            + "<start_of_turn>model\n"
        )
        assert result["chosen"] == "goodbye<end_of_turn>"
        assert result["rejected"] == "party on<end_of_turn>"
""" def test_llama3_argilla_chat(self, llama3_tokenizer, argilla_chat_dataset): transform_fn, _ = argilla_chat( DictDefault( { "chat_template": "llama3", "datasets": [ { "type": "chat_template.argilla_chat", } ], } ) ) result = transform_fn(argilla_chat_dataset[0], tokenizer=llama3_tokenizer) assert result["prompt"] == ( "<|begin_of_text|>" + "<|start_header_id|>user<|end_header_id|>\n\nhello<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>\n\n" ) assert result["chosen"] == "goodbye<|eot_id|>" assert result["rejected"] == "party on<|eot_id|>" def test_phi3_argilla_chat(self, phi3_tokenizer, argilla_chat_dataset): transform_fn, _ = argilla_chat( DictDefault( { "chat_template": "tokenizer_default", "datasets": [ { "type": "chat_template.argilla_chat", } ], } ) ) result = transform_fn(argilla_chat_dataset[0], tokenizer=phi3_tokenizer) assert result["prompt"] == "<|user|>\nhello<|end|>\n" + "<|assistant|>\n" assert result["chosen"] == "goodbye<|end|>" assert result["rejected"] == "party on<|end|>" if __name__ == "__main__": unittest.main() ================================================ FILE: tests/prompt_strategies/test_dpo_chatml.py ================================================ """ Tests for loading DPO preference datasets with chatml formatting """ import unittest import pytest from axolotl.loaders.tokenizer import load_tokenizer from axolotl.prompt_strategies.dpo import load as load_dpo from axolotl.utils.data.rl import prepare_preference_datasets from axolotl.utils.dict import DictDefault from tests.hf_offline_utils import enable_hf_offline @pytest.fixture(name="minimal_dpo_cfg") def fixture_cfg(): return DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "tokenizer_config": "HuggingFaceTB/SmolLM2-135M", "rl": "dpo", "learning_rate": 0.000001, "micro_batch_size": 1, "gradient_accumulation_steps": 1, "special_tokens": { "pad_token": "<|endoftext|>", }, "sequence_len": 2048, } ) class TestDPOChatml: """ Test loading DPO preference datasets 
def test_basic_variable_extraction(self, basic_jinja_template_analyzer):
    """Test that all top-level variables are correctly extracted."""
    # The earlier message ("Testing with train_on_inputs=True") was copied
    # from an unrelated test and did not describe this check.
    LOG.info("Testing basic variable extraction")

    variables = basic_jinja_template_analyzer.get_template_variables()
    expected_vars = {"messages", "add_generation_prompt", "eos_token", "message"}
    # Only the set of variable names is compared here, not their properties.
    assert set(variables.keys()) == expected_vars
def test_nested_property_access(self):
    """Check that first-level properties of a nested access chain are recorded on the root variable."""
    LOG.info("Testing nested property access")

    source = """{{ user.profile.name }}{{ user.settings['preference'] }}"""
    found = JinjaTemplateAnalyzer(source).get_template_variables()

    assert "user" in found
    user_props = found["user"]
    # Both attribute access (.profile) and the attribute before a subscript
    # (.settings[...]) must show up.
    assert "profile" in user_props
    assert "settings" in user_props
def test_basic_msg_vars(self, basic_jinja_template_analyzer):
    """Check that the basic template reports exactly `role` and `content` as message variables."""
    LOG.info("Testing basic message variables")

    message_vars = basic_jinja_template_analyzer.get_message_vars()
    expected = {"role", "content"}
    assert message_vars == expected
@pytest.fixture(name="segments_dataset")
def fixture_sharegpt_dataset():
    """
    Provide a one-row dataset of raw input/output segments.

    Each segment pairs a `label` flag with its `text`; the row alternates
    unlabelled and labelled segments.
    """
    labelled_text = (
        (False, "hello "),
        (True, "hi there."),
        (False, "goodbye "),
        (True, "farewell"),
    )
    segments = [{"label": flag, "text": text} for flag, text in labelled_text]
    return Dataset.from_list([{"segments": segments}])
32000, # -100, # good -100, # bye -100, # 19111, # fare 5458, # well 32000, # ] # fmt: on ================================================ FILE: tests/prompt_strategies/test_stepwise.py ================================================ """ tests for chat_template prompt strategy """ import datasets import pytest from datasets import Dataset from transformers import AutoTokenizer from axolotl.datasets import TokenizedPromptDataset from axolotl.prompt_strategies.stepwise_supervised import ( StepwiseSupervisedPromptTokenizingStrategy, ) class TestStepWiseSupervisedPromptTokenizingStrategy: """ Test class for stepwise supervised prompt strategy """ @pytest.fixture() def stepwise_supervised_dataset(self): return Dataset.from_list( [ { "prompt": "Which number is larger, 9.8 or 9.11?", "completions": [ "The fractional part of 9.8 is 0.8, while the fractional part of 9.11 is 0.11.", "Since 0.11 is greater than 0.8, the number 9.11 is larger than 9.8.", "Actually, this is incorrect. In decimal numbers, 0.8 is equal to 0.80, which is larger than 0.11. 
def calc_expected_metrics(step, last_step, current_time, last_time, start_time=900.0):
    """
    Compute the progress metrics a test expects to see in a telemetry report.

    Args:
        step: current global step.
        last_step: step at the previous report.
        current_time: timestamp of this report.
        last_time: timestamp of the previous report.
        start_time: timestamp training started at (defaults to 900.0).

    Returns:
        Dict with "steps_per_second", "time_since_last_report" and
        "elapsed_time". The rate is 0 unless both the time delta and the
        step delta are strictly positive.
    """
    seconds_since_report = current_time - last_time
    steps_since_report = step - last_step

    if seconds_since_report > 0 and steps_since_report > 0:
        rate = steps_since_report / seconds_since_report
    else:
        # Avoids division by zero and negative/zero progress.
        rate = 0

    return {
        "steps_per_second": rate,
        "time_since_last_report": seconds_since_report,
        "elapsed_time": current_time - start_time,
    }
def test_on_train_end(
    self,
    callback,
    mock_telemetry_manager,
    training_args,
    trainer_state,
    trainer_control,
):
    """Check that on_train_end emits a single "train-end" event with the expected properties."""
    callback.on_train_end(training_args, trainer_state, trainer_control)

    send_event = mock_telemetry_manager.send_event
    send_event.assert_called_once()

    sent_kwargs = send_event.call_args[1]
    assert sent_kwargs["event_type"] == "train-end"

    properties = sent_kwargs["properties"]
    # The first two entries are the values set in the trainer_state fixture's
    # log_history; the last two are what the mocked tracker's
    # metrics.to_dict() returns.
    expected_properties = (
        ("loss", 2.5),
        ("learning_rate", 5e-5),
        ("total_steps", 100),
        ("peak_cpu_memory_bytes", 1024),
    )
    for key, value in expected_properties:
        assert key in properties
        assert properties[key] == value
mock_runtime_metrics_tracker, training_args, trainer_state, trainer_control, ): """Test on_epoch_end calls tracker.""" # Set current epoch callback.current_epoch = 2 callback.on_epoch_end(training_args, trainer_state, trainer_control) mock_runtime_metrics_tracker.end_epoch.assert_called_once_with(2) def test_on_step_end_no_report( self, callback, mock_telemetry_manager, mock_runtime_metrics_tracker, training_args, trainer_state, trainer_control, ): """Test on_step_end updates tracker but doesn't report if criteria not met.""" # Set up state to avoid reporting trainer_state.global_step = 42 # Not divisible by report_interval_steps callback.last_report_step = 41 # Just 1 step since last report callback.last_report_time = time.time() # Just now callback.on_step_end(training_args, trainer_state, trainer_control) # Should update tracker mock_runtime_metrics_tracker.update_step.assert_called_once_with(42) # Should not send telemetry mock_telemetry_manager.send_event.assert_not_called() # Should not update last report time/step assert callback.last_report_step == 41 def test_on_step_end_report_interval_steps( self, callback, mock_telemetry_manager, mock_runtime_metrics_tracker, mock_time, training_args, trainer_state, trainer_control, ): """Test on_step_end reports when step interval is reached.""" # Set up state with clear values current_step = 100 # Exactly matches report_interval_steps last_step = 0 start_time = 900.0 current_time = 1000.0 time_diff = current_time - start_time # 100 seconds # Configure state and callback trainer_state.global_step = current_step callback.report_interval_steps = 100 callback.last_report_step = last_step callback.start_time = start_time callback.last_report_time = start_time # Mock time.time() to return consistent values mock_time.time.return_value = current_time callback.on_step_end(training_args, trainer_state, trainer_control) # Should update tracker mock_runtime_metrics_tracker.update_step.assert_called_once_with(current_step) 
mock_runtime_metrics_tracker.update_memory_metrics.assert_called_once() # Should send telemetry mock_telemetry_manager.send_event.assert_called_once() call_args = mock_telemetry_manager.send_event.call_args[1] assert call_args["event_type"] == "train-progress" # Properties should include expected values props = call_args["properties"] assert props["step"] == current_step assert props["elapsed_time"] == time_diff # 1000 - 900 = 100 assert props["time_since_last_report"] == time_diff # 1000 - 900 = 100 assert props["steps_per_second"] == 1.0 # 100 steps / 100 seconds # Should update last report time/step assert callback.last_report_step == current_step assert callback.last_report_time == current_time def test_on_step_end_report_time_elapsed( self, callback, mock_telemetry_manager, mock_runtime_metrics_tracker, # pylint: disable=unused-argument mock_time, training_args, trainer_state, trainer_control, ): """Test on_step_end reports when enough time has elapsed.""" # Set up state with clear values current_step = 120 last_step = 10 start_time = 900.0 current_time = 1000.0 time_diff = TIME_SINCE_LAST + 1 # Just over the threshold # Configure state and callback trainer_state.global_step = current_step callback.report_interval_steps = 100 callback.last_report_step = last_step callback.start_time = start_time callback.last_report_time = current_time - time_diff # Mock time.time() to return consistent values mock_time.time.return_value = current_time callback.on_step_end(training_args, trainer_state, trainer_control) # Should send telemetry mock_telemetry_manager.send_event.assert_called_once() # Properties should include expected values props = mock_telemetry_manager.send_event.call_args[1]["properties"] expected_metrics = calc_expected_metrics( current_step, last_step, current_time, current_time - time_diff, start_time ) assert props["steps_per_second"] == expected_metrics["steps_per_second"] assert ( props["time_since_last_report"] == 
expected_metrics["time_since_last_report"] ) def test_on_step_end_first_step( self, callback, mock_telemetry_manager, mock_runtime_metrics_tracker, # pylint: disable=unused-argument mock_time, training_args, trainer_state, trainer_control, ): """Test on_step_end always reports on first step.""" # Set up state with clear values current_step = 1 # First step last_step = 0 start_time = 900.0 current_time = 1000.0 last_report_time = 999.0 # Just 1 second ago # Configure state and callback trainer_state.global_step = current_step callback.report_interval_steps = 100 callback.last_report_step = last_step callback.start_time = start_time callback.last_report_time = last_report_time # Mock time.time() to return consistent values mock_time.time.return_value = current_time callback.on_step_end(training_args, trainer_state, trainer_control) # Should send telemetry even though not much time has passed mock_telemetry_manager.send_event.assert_called_once() # Properties should include expected values for first step props = mock_telemetry_manager.send_event.call_args[1]["properties"] assert props["step"] == current_step expected_metrics = calc_expected_metrics( current_step, last_step, current_time, last_report_time, start_time ) assert props["steps_per_second"] == expected_metrics["steps_per_second"] def test_log_history_empty( self, callback, mock_telemetry_manager, mock_runtime_metrics_tracker, # pylint: disable=unused-argument mock_time, training_args, trainer_state, trainer_control, ): """Test handling of empty log history.""" # Set up state with clear values current_step = 1 start_time = 900.0 current_time = 1000.0 # Configure state and callback trainer_state.global_step = current_step trainer_state.log_history = [] callback.start_time = start_time # Mock time.time() to return consistent values mock_time.time.return_value = current_time callback.on_step_end(training_args, trainer_state, trainer_control) # Should still send telemetry 
mock_telemetry_manager.send_event.assert_called_once() # Properties should have default values for missing log data props = mock_telemetry_manager.send_event.call_args[1]["properties"] assert props["loss"] == 0 assert props["learning_rate"] == 0 ================================================ FILE: tests/telemetry/test_errors.py ================================================ """Tests for telemetry error utilities""" # pylint: disable=redefined-outer-name from unittest.mock import MagicMock, patch import pytest from axolotl.telemetry.errors import sanitize_stack_trace, send_errors @pytest.fixture(autouse=True) def reset_error_flag(monkeypatch): """Reset ERROR_HANDLED flag using monkeypatch""" import axolotl.telemetry.errors monkeypatch.setattr(axolotl.telemetry.errors, "ERROR_HANDLED", False) yield monkeypatch.setattr(axolotl.telemetry.errors, "ERROR_HANDLED", False) @pytest.fixture def example_stack_trace(): """Provide a sample stack trace with mixed paths""" return """Traceback (most recent call last): File "/home/user/.local/lib/python3.9/site-packages/axolotl/cli/train.py", line 83, in main trainer = get_trainer(cfg) File "/home/user/.local/lib/python3.9/site-packages/axolotl/train.py", line 214, in get_trainer model = get_model(cfg, tokenizer) File "/home/user/.local/lib/python3.9/site-packages/axolotl/utils/models.py", line 120, in get_model raise ValueError("Model path not found") ValueError: Model path not found """ @pytest.fixture def windows_stack_trace(): """Provide a sample stack trace with Windows paths""" return """Traceback (most recent call last): File "C:\\Users\\name\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\axolotl\\cli\\train.py", line 83, in main trainer = get_trainer(cfg) File "C:\\Users\\name\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\axolotl\\train.py", line 214, in get_trainer model = get_model(cfg, tokenizer) File 
"C:\\Users\\name\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\transformers\\models\\auto\\modeling_auto.py", line 482, in from_pretrained raise ValueError(f"Unrecognized configuration class {config.__class__}") ValueError: Unrecognized configuration class """ @pytest.fixture def mixed_stack_trace(): """Provide a sample stack trace with both axolotl and non-axolotl paths""" return """Traceback (most recent call last): File "/home/user/.local/lib/python3.9/site-packages/axolotl/cli/train.py", line 83, in main trainer = get_trainer(cfg) File "/home/user/.local/lib/python3.9/site-packages/transformers/trainer.py", line 520, in train self._inner_training_loop() File "/home/user/.local/lib/python3.9/site-packages/axolotl/utils/trainer.py", line 75, in _inner_training_loop super()._inner_training_loop() File "/home/user/.local/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 631, in __next__ data = self._next_data() RuntimeError: CUDA out of memory """ @pytest.fixture def venv_stack_trace(): """Provide a sample stack trace with virtual environment paths""" return """Traceback (most recent call last): File "/home/user/venv/lib/python3.9/site-packages/transformers/trainer.py", line 1729, in train self._inner_training_loop() File "/home/user/venv/lib/python3.9/site-packages/transformers/trainer.py", line 2013, in _inner_training_loop self.accelerator.backward(loss) File "/home/user/venv/lib/python3.9/site-packages/accelerate/accelerator.py", line 1851, in backward self.scaler.scale(loss).backward(**kwargs) File "/home/user/venv/lib/python3.9/site-packages/torch/_tensor.py", line 487, in backward torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs) RuntimeError: CUDA out of memory """ @pytest.fixture def dist_packages_stack_trace(): """Provide a sample stack trace with dist-packages paths""" return """Traceback (most recent call last): File 
"/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataloader.py", line 631, in __next__ data = self._next_data() File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataloader.py", line 675, in _next_data data = self._dataset_fetcher.fetch(index) File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch data = [self.dataset[idx] for idx in possibly_batched_index] File "/usr/local/lib/python3.8/dist-packages/datasets/arrow_dataset.py", line 2808, in __getitem__ raise IndexError(f"Index {key} out of range for dataset of length {len(self)}.") IndexError: Index 10000 out of range for dataset of length 9832. """ @pytest.fixture def project_stack_trace(): """Provide a sample stack trace from a project directory (not a virtual env)""" return """Traceback (most recent call last): File "/home/user/projects/myproject/run.py", line 25, in main() File "/home/user/projects/myproject/src/cli.py", line 45, in main app.run() File "/home/user/projects/myproject/src/app.py", line 102, in run raise ValueError("Configuration missing") ValueError: Configuration missing """ def test_sanitize_stack_trace(example_stack_trace): """Test that sanitize_stack_trace properly preserves axolotl paths""" sanitized = sanitize_stack_trace(example_stack_trace) # Check that personal paths are removed assert "/home/user" not in sanitized assert ".local/lib/python3.9" not in sanitized # Check that site-packages is preserved assert "site-packages/axolotl/cli/train.py" in sanitized assert "site-packages/axolotl/train.py" in sanitized assert "site-packages/axolotl/utils/models.py" in sanitized # Check that error message is preserved assert "ValueError: Model path not found" in sanitized def test_sanitize_windows_paths(windows_stack_trace): """Test that sanitize_stack_trace handles Windows paths""" sanitized = sanitize_stack_trace(windows_stack_trace) # Check that personal paths are removed assert "C:\\Users\\name" not in sanitized assert 
"AppData\\Local\\Programs\\Python" not in sanitized # Check that both axolotl and transformers packages are preserved assert ( "site-packages\\axolotl\\cli\\train.py" in sanitized or "site-packages/axolotl/cli/train.py" in sanitized ) assert ( "site-packages\\axolotl\\train.py" in sanitized or "site-packages/axolotl/train.py" in sanitized ) assert ( "site-packages\\transformers\\models\\auto\\modeling_auto.py" in sanitized or "site-packages/transformers/models/auto/modeling_auto.py" in sanitized ) # Check that error message is preserved assert "ValueError: Unrecognized configuration class" in sanitized def test_sanitize_mixed_paths(mixed_stack_trace): """Test that sanitize_stack_trace preserves all package paths""" sanitized = sanitize_stack_trace(mixed_stack_trace) # Check that all package paths are preserved assert "site-packages/axolotl/cli/train.py" in sanitized assert "site-packages/transformers/trainer.py" in sanitized assert "site-packages/axolotl/utils/trainer.py" in sanitized assert "site-packages/torch/utils/data/dataloader.py" in sanitized # Check that error message is preserved assert "RuntimeError: CUDA out of memory" in sanitized def test_sanitize_venv_paths(venv_stack_trace): """Test that sanitize_stack_trace preserves virtual environment package paths""" sanitized = sanitize_stack_trace(venv_stack_trace) # Check that personal paths are removed assert "/home/user/venv" not in sanitized # Check that all package paths are preserved assert "site-packages/transformers/trainer.py" in sanitized assert "site-packages/accelerate/accelerator.py" in sanitized assert "site-packages/torch/_tensor.py" in sanitized # Check that error message is preserved assert "RuntimeError: CUDA out of memory" in sanitized def test_sanitize_dist_packages(dist_packages_stack_trace): """Test that sanitize_stack_trace preserves dist-packages paths""" sanitized = sanitize_stack_trace(dist_packages_stack_trace) # Check that system paths are removed assert "/usr/local/lib/python3.8" 
def test_sanitize_project_paths(project_stack_trace):
    """Project (non-virtual-env) paths lose the home prefix but keep their filenames."""
    cleaned = sanitize_stack_trace(project_stack_trace)

    # Personal path prefix must be gone
    assert "/home/user/projects" not in cleaned

    # For non-package paths, at least the filename survives
    for filename in ("run.py", "cli.py", "app.py"):
        assert filename in cleaned

    # Error message is untouched
    assert "ValueError: Configuration missing" in cleaned


@pytest.fixture
def mock_telemetry_manager():
    """Patch TelemetryManager in the errors module; yield an enabled mock instance."""
    with patch("axolotl.telemetry.errors.TelemetryManager") as manager_cls:
        fake_manager = MagicMock()
        fake_manager.enabled = True
        manager_cls.get_instance.return_value = fake_manager
        yield fake_manager


def test_send_errors_successful_execution(mock_telemetry_manager):
    """A decorated function that returns normally sends no telemetry."""

    @send_errors
    def test_func():
        return "success"

    assert test_func() == "success"
    mock_telemetry_manager.send_event.assert_not_called()
def test_send_errors_nested_calls(mock_telemetry_manager):
    """Test that send_errors only sends telemetry once for nested decorated functions"""

    @send_errors
    def inner_func():
        raise ValueError("Inner error")

    @send_errors
    def outer_func():
        return inner_func()

    with pytest.raises(ValueError):
        outer_func()

    # Telemetry should be sent only once, and it should name the inner function
    assert mock_telemetry_manager.send_event.call_count == 1
    call_args = mock_telemetry_manager.send_event.call_args[1]
    assert "inner_func-error" in call_args["event_type"]


def test_send_errors_telemetry_disable():
    """Test that send_errors doesn't attempt to send telemetry when disabled"""
    with patch("axolotl.telemetry.errors.TelemetryManager") as mock_manager_class:
        mock_manager = MagicMock()
        mock_manager.enabled = False
        mock_manager_class.get_instance.return_value = mock_manager

        @send_errors
        def test_func():
            raise ValueError("Test error")

        with pytest.raises(ValueError):
            test_func()

        mock_manager.send_event.assert_not_called()


def test_error_handled_reset():
    """Test that ERROR_HANDLED starts out False and is set once an error is reported.

    The flag is read through the module object on every check. A
    ``from ... import ERROR_HANDLED`` binds a snapshot of the value at import
    time, so the previous version had to repeat the import to observe the change.
    """
    # Function-scope import so the module global is always read live
    import axolotl.telemetry.errors as telemetry_errors

    with patch("axolotl.telemetry.errors.TelemetryManager") as mock_manager_class:
        # Create and configure the mock manager
        mock_manager = MagicMock()
        mock_manager.enabled = True
        mock_manager_class.get_instance.return_value = mock_manager

        @send_errors
        def test_func():
            raise ValueError("Test error")

        assert not telemetry_errors.ERROR_HANDLED

        with pytest.raises(ValueError):
            test_func()

        assert telemetry_errors.ERROR_HANDLED
"""Tests for TelemetryManager class and utilities"""

# pylint: disable=redefined-outer-name,protected-access

import os
from unittest.mock import patch

import pytest
import yaml

from axolotl.telemetry.manager import TelemetryManager


@pytest.fixture
def mock_whitelist(tmp_path):
    """Write a temporary whitelist YAML file and return its path as a string."""
    whitelist_content = {
        "organizations": ["meta-llama", "mistralai"],
    }
    whitelist_file = tmp_path / "whitelist.yaml"
    with open(whitelist_file, "w", encoding="utf-8") as f:
        yaml.dump(whitelist_content, f)

    return str(whitelist_file)


@pytest.fixture
def telemetry_manager_class():
    """Reset the TelemetryManager singleton for one test, then restore it.

    Yields the TelemetryManager class with ``_instance`` cleared and
    ``_initialized`` set to False; both are put back after the test.
    """
    original_instance = TelemetryManager._instance
    original_initialized = TelemetryManager._initialized
    TelemetryManager._instance = None
    TelemetryManager._initialized = False
    yield TelemetryManager
    TelemetryManager._instance = original_instance
    TelemetryManager._initialized = original_initialized


@pytest.fixture
def manager(telemetry_manager_class, mock_whitelist):
    """Create an enabled TelemetryManager whose dependencies stay mocked during the test.

    The fixture yields from inside the ``with`` block. Returning from there
    would exit every patch before the test body ran, leaving ``posthog`` and
    ``time.sleep`` unpatched for the test itself.
    """
    with (
        patch("posthog.capture"),
        patch("posthog.flush"),
        patch("time.sleep"),
        patch("axolotl.telemetry.manager.WHITELIST_PATH", mock_whitelist),
        patch.dict(os.environ, {"RANK": "0"}),
    ):
        manager = telemetry_manager_class()
        # Manually enable for most tests
        manager.enabled = True
        yield manager
def test_telemetry_enabled_by_default(telemetry_manager_class):
    """Test that telemetry is enabled by default (opt-out)"""
    with (
        patch.dict(os.environ, {"RANK": "0"}, clear=True),
        patch("time.sleep"),
        patch("logging.Logger.info"),
    ):
        manager = telemetry_manager_class()
        assert manager.enabled


def test_telemetry_enabled_with_explicit_opt_in(telemetry_manager_class):
    """Test that telemetry is enabled when AXOLOTL_DO_NOT_TRACK=0"""
    # clear=True keeps an ambient DO_NOT_TRACK / RANK from deciding the outcome
    with (
        patch.dict(
            os.environ, {"AXOLOTL_DO_NOT_TRACK": "0", "RANK": "0"}, clear=True
        ),
        patch("time.sleep"),
    ):
        manager = telemetry_manager_class()
        assert manager.enabled


def test_telemetry_disabled_with_axolotl_do_not_track(telemetry_manager_class):
    """Test that telemetry is disabled when AXOLOTL_DO_NOT_TRACK=1"""
    # clear=True so only the variables set here are visible to the manager
    with (
        patch.dict(
            os.environ, {"AXOLOTL_DO_NOT_TRACK": "1", "RANK": "0"}, clear=True
        ),
        patch("time.sleep"),
    ):
        manager = telemetry_manager_class()
        assert not manager.enabled


def test_telemetry_disabled_with_do_not_track(telemetry_manager_class):
    """Test that telemetry is disabled when DO_NOT_TRACK=1"""
    # clear=True so only the variables set here are visible to the manager
    with (
        patch.dict(
            os.environ,
            {"AXOLOTL_DO_NOT_TRACK": "0", "DO_NOT_TRACK": "1", "RANK": "0"},
            clear=True,
        ),
        patch("time.sleep"),
    ):
        manager = telemetry_manager_class()
        assert not manager.enabled


def test_telemetry_disabled_for_non_main_process(telemetry_manager_class):
    """Test that telemetry is disabled for non-main processes"""
    # clear=True so the result reflects RANK=1 and not an ambient opt-out variable
    with (
        patch.dict(
            os.environ, {"AXOLOTL_DO_NOT_TRACK": "0", "RANK": "1"}, clear=True
        ),
        patch("time.sleep"),
    ):
        manager = telemetry_manager_class()
        assert not manager.enabled
def test_system_info_collection(manager):
    """The collected system info exposes every essential key."""
    collected = manager._get_system_info()

    essential_keys = (
        "os",
        "python_version",
        "cpu_count",
        "memory_total",
        "accelerator_count",
    )
    for key in essential_keys:
        assert key in collected


def test_send_event(telemetry_manager_class):
    """send_event forwards the event name, properties, and run id to posthog.capture."""
    with (
        patch("posthog.capture") as mock_capture,
        patch.dict(os.environ, {"AXOLOTL_DO_NOT_TRACK": "0"}),
    ):
        manager = telemetry_manager_class()

        # Explicit, PII-free properties
        manager.send_event("test_event", {"key": "value"})
        assert mock_capture.called
        sent = mock_capture.call_args[1]
        assert sent["event"] == "test_event"
        assert sent["properties"] == {"key": "value"}
        assert sent["distinct_id"] == manager.run_id

        # Omitted properties (None default)
        mock_capture.reset_mock()
        manager.send_event("simple_event")
        assert mock_capture.called
        assert mock_capture.call_args[1]["properties"] == {}


def test_send_system_info(telemetry_manager_class):
    """send_system_info emits a "system-info" event carrying manager.system_info."""
    with (
        patch("posthog.capture") as mock_capture,
        patch.dict(os.environ, {"AXOLOTL_DO_NOT_TRACK": "0"}),
    ):
        manager = telemetry_manager_class()
        manager.send_system_info()

        assert mock_capture.called
        sent = mock_capture.call_args[1]
        assert sent["event"] == "system-info"
        assert sent["properties"] == manager.system_info
def test_disable_telemetry(manager):
    """Test that disabled telemetry doesn't send events"""
    with patch("posthog.capture") as mock_capture:
        manager.enabled = False
        manager.send_event("test_event")
        assert not mock_capture.called


def test_exception_handling_during_send(manager):
    """Test that exceptions in PostHog are handled gracefully

    ``posthog.capture`` is made to raise; ``send_event`` must not propagate
    the error and must log a warning mentioning the failed send.
    """
    with (
        patch("posthog.capture", side_effect=Exception("Test error")),
        patch("logging.Logger.warning") as mock_warning,
    ):
        manager.send_event("test_event")

        # Any warning call whose repr mentions the failure counts
        assert any(
            "Failed to send telemetry event" in str(call)
            for call in mock_warning.call_args_list
        )


def test_shutdown(manager):
    """Test shutdown behavior"""
    with patch("posthog.shutdown") as mock_shutdown:
        manager.shutdown()
        assert mock_shutdown.called
class TestRuntimeMetrics:
    """Checks for the RuntimeMetrics container and its derived values."""

    def test_initialization(self):
        """A new RuntimeMetrics keeps its start time and has empty/zero defaults."""
        metrics = RuntimeMetrics(start_time=1000.0)

        assert metrics.start_time == 1000.0
        for mapping_attr in ("epoch_start_times", "epoch_end_times", "peak_gpu_memory"):
            assert getattr(metrics, mapping_attr) == {}
        for counter_attr in (
            "total_steps",
            "current_epoch",
            "current_step",
            "peak_cpu_memory",
        ):
            assert getattr(metrics, counter_attr) == 0

    def test_elapsed_time(self, mock_time):
        """elapsed_time is the patched current time minus start_time."""
        metrics = RuntimeMetrics(start_time=1000.0)

        # Next time.time() call returns 1050.0
        mock_time.side_effect = [1050.0]

        assert metrics.elapsed_time == 50.0

    def test_epoch_time(self):
        """epoch_time is None until the epoch has both a start and an end."""
        metrics = RuntimeMetrics(start_time=1000.0)

        # Nothing recorded yet
        assert metrics.epoch_time(0) is None

        # Start only
        metrics.epoch_start_times[0] = 1000.0
        assert metrics.epoch_time(0) is None

        # Start and end
        metrics.epoch_end_times[0] = 1060.0
        assert metrics.epoch_time(0) == 60.0

    def test_average_epoch_time(self):
        """average_epoch_time averages completed epochs and ignores unfinished ones."""
        metrics = RuntimeMetrics(start_time=1000.0)

        # No completed epochs
        assert metrics.average_epoch_time() is None

        # One completed epoch: 60 seconds
        metrics.epoch_start_times[0] = 1000.0
        metrics.epoch_end_times[0] = 1060.0
        assert metrics.average_epoch_time() == 60.0

        # Second completed epoch: 80 seconds
        metrics.epoch_start_times[1] = 1060.0
        metrics.epoch_end_times[1] = 1140.0
        assert metrics.average_epoch_time() == 70.0  # Average of 60 and 80

        # Unfinished epoch leaves the average unchanged
        metrics.epoch_start_times[2] = 1140.0
        assert metrics.average_epoch_time() == 70.0

    def test_steps_per_second(self, mock_time):
        """steps_per_second is None with no steps, else steps over elapsed time."""
        metrics = RuntimeMetrics(start_time=1000.0)

        # Switch the patched clock to a constant value
        mock_time.side_effect = None
        mock_time.return_value = 1050.0
        assert metrics.steps_per_second() is None

        # Record steps; clock stays at the same instant for a stable result
        metrics.total_steps = 100
        mock_time.return_value = 1050.0
        assert metrics.steps_per_second() == 2.0  # 100 steps / 50 seconds

    def test_to_dict_basic(self, mock_time):
        """to_dict reports the basic counters and omits epoch/GPU sections when empty."""
        two_gib = 2 * 1024 * 1024 * 1024  # 2GB

        metrics = RuntimeMetrics(start_time=1000.0)
        metrics.total_steps = 100
        metrics.peak_cpu_memory = two_gib

        # Constant clock for elapsed_time
        mock_time.side_effect = None
        mock_time.return_value = 1050.0

        result = metrics.to_dict()

        expected = {
            "total_time_seconds": 50.0,
            "total_steps": 100,
            "steps_per_second": 2.0,
            "epochs_completed": 0,
            "peak_cpu_memory_bytes": two_gib,
        }
        for key, value in expected.items():
            assert result[key] == value
        for absent_key in ("epoch_times", "gpu_memory"):
            assert absent_key not in result

    def test_to_dict_with_epochs(self, mock_time):
        """to_dict includes per-epoch durations and their average."""
        metrics = RuntimeMetrics(start_time=1000.0)
        metrics.total_steps = 100

        # Two completed epochs: 60 and 80 seconds
        metrics.epoch_start_times[0] = 1000.0
        metrics.epoch_end_times[0] = 1060.0
        metrics.epoch_start_times[1] = 1060.0
        metrics.epoch_end_times[1] = 1140.0

        # Constant clock for elapsed_time
        mock_time.side_effect = None
        mock_time.return_value = 1150.0

        result = metrics.to_dict()

        assert "epoch_times" in result
        epoch_times = result["epoch_times"]
        assert epoch_times["epoch_0_seconds"] == 60.0
        assert epoch_times["epoch_1_seconds"] == 80.0
        assert result["average_epoch_time_seconds"] == 70.0

    def test_to_dict_with_gpu_memory(self, mock_time):
        """to_dict includes per-device peak GPU memory."""
        one_gib = 1 * 1024 * 1024 * 1024  # 1GB
        two_gib = 2 * 1024 * 1024 * 1024  # 2GB

        metrics = RuntimeMetrics(start_time=1000.0)
        metrics.peak_gpu_memory = {0: one_gib, 1: two_gib}

        # Next time.time() call returns 1050.0
        mock_time.side_effect = [1050.0]

        result = metrics.to_dict()

        assert "gpu_memory" in result
        gpu_memory = result["gpu_memory"]
        assert gpu_memory["gpu_0_peak_memory_bytes"] == one_gib
        assert gpu_memory["gpu_1_peak_memory_bytes"] == two_gib
class.""" # pylint: disable=unused-argument def test_initialization(self, mock_time, mock_telemetry_manager): """Test RuntimeMetricsTracker initialization.""" tracker = RuntimeMetricsTracker() assert isinstance(tracker.metrics, RuntimeMetrics) assert tracker.metrics.start_time == 1000.0 # First value from mock_time # pylint: disable=unused-argument def test_start_epoch( self, mock_time, mock_psutil, mock_torch, mock_telemetry_manager ): """Test start_epoch method.""" tracker = RuntimeMetricsTracker() # Reset mock_time to control next value mock_time.side_effect = [1010.0] tracker.start_epoch(0) assert tracker.metrics.current_epoch == 0 assert tracker.metrics.epoch_start_times[0] == 1010.0 # Verify memory metrics were updated assert tracker.metrics.peak_cpu_memory == 1 * 1024 * 1024 * 1024 assert 0 in tracker.metrics.peak_gpu_memory assert 1 in tracker.metrics.peak_gpu_memory # pylint: disable=unused-argument def test_end_epoch(self, mock_time, mock_telemetry_manager): """Test end_epoch method.""" tracker = RuntimeMetricsTracker() # Start epoch 0 mock_time.side_effect = [1010.0] tracker.start_epoch(0) # End epoch 0 mock_time.side_effect = [1060.0] tracker.end_epoch(0) assert 0 in tracker.metrics.epoch_end_times assert tracker.metrics.epoch_end_times[0] == 1060.0 # pylint: disable=unused-argument def test_update_step( self, mock_time, mock_psutil, mock_torch, mock_telemetry_manager ): """Test update_step method.""" tracker = RuntimeMetricsTracker() # Update step to a non-multiple of 100 tracker.update_step(42) assert tracker.metrics.current_step == 42 assert tracker.metrics.total_steps == 1 # Memory metrics should not be updated for non-multiple of 100 assert tracker.metrics.peak_cpu_memory == 0 # Update step to a multiple of 100 tracker.update_step(100) assert tracker.metrics.current_step == 100 assert tracker.metrics.total_steps == 2 # Memory metrics should be updated for multiple of 100 assert tracker.metrics.peak_cpu_memory == 1 * 1024 * 1024 * 1024 # pylint: 
# pylint: disable=unused-argument
def test_update_memory_metrics(self, mock_psutil, mock_torch, mock_telemetry_manager):
    """Check that peak CPU/GPU memory only ever moves upwards.

    The mocked readings are sampled three times: at their fixture defaults,
    after being lowered, and after being raised above the recorded peaks.
    """
    gib = 1024 * 1024 * 1024
    tracker = RuntimeMetricsTracker()

    # Nothing has been sampled yet
    assert tracker.metrics.peak_cpu_memory == 0
    assert tracker.metrics.peak_gpu_memory == {}

    # First sample: fixture defaults become the peaks
    tracker.update_memory_metrics()
    assert tracker.metrics.peak_cpu_memory == 1 * gib
    assert tracker.metrics.peak_gpu_memory[0] == 1 * gib
    assert tracker.metrics.peak_gpu_memory[1] == 2 * gib

    rss_holder = mock_psutil.Process.return_value.memory_info.return_value

    # Second sample: readings drop below the recorded peaks
    def lower_allocation(device):
        return (device + 0.5) * gib

    rss_holder.rss = 0.5 * gib  # 0.5GB
    mock_torch.cuda.memory_allocated.side_effect = lower_allocation
    tracker.update_memory_metrics()

    # Peaks must be left untouched
    assert tracker.metrics.peak_cpu_memory == 1 * gib
    assert tracker.metrics.peak_gpu_memory[0] == 1 * gib
    assert tracker.metrics.peak_gpu_memory[1] == 2 * gib

    # Third sample: readings rise above the recorded peaks
    def higher_allocation(device):
        return (device + 2) * gib

    rss_holder.rss = 2 * gib  # 2GB
    mock_torch.cuda.memory_allocated.side_effect = higher_allocation
    tracker.update_memory_metrics()

    # Peaks must follow the new maxima
    assert tracker.metrics.peak_cpu_memory == 2 * gib
    assert tracker.metrics.peak_gpu_memory[0] == 2 * gib
    assert tracker.metrics.peak_gpu_memory[1] == 3 * gib
def test_chunked_forward(chunked_fixtures):
    """Compare the chunked causal-LM loss with plain mean cross entropy."""
    head, hidden, targets, vocab = chunked_fixtures

    chunked_loss_fn = get_causal_lm_loss()
    logits = head(hidden)
    chunked_value = chunked_loss_fn(logits, targets)

    # Reference: flatten to (tokens, vocab) and let torch reduce with "mean"
    reference_value = nn.functional.cross_entropy(
        logits.view(-1, vocab).float(),
        targets.view(-1),
        reduction="mean",
    )

    assert torch.allclose(chunked_value, reference_value, atol=1e-2, rtol=1e-2)
class TestContextParallelBatchSize:
    """Check that batch_size scales with the data-parallel share of the world size."""

    @pytest.mark.parametrize(
        "world_size, context_parallel_size, expected_batch_size",
        [
            # micro_batch_size=2 and gradient_accumulation_steps=4 throughout
            (4, 1, 32),  # no CP: 2*4*4 = 32
            (4, 2, 16),  # CP=2: 2*4*(4//2) = 16
            (4, 4, 8),  # CP=4: 2*4*(4//4) = 8
            (2, 2, 8),  # CP=ws: 2*4*(2//2) = 8 (no scaling)
        ],
    )
    def test_batch_size_with_context_parallelism(
        self,
        cp_base_cfg,
        monkeypatch,
        world_size,
        context_parallel_size,
        expected_batch_size,
    ):
        """Validate and normalize the config, then compare the derived batch_size."""
        monkeypatch.setenv("WORLD_SIZE", f"{world_size}")

        # ring_flash_attn cannot be installed on CPU, yet schema validation
        # needs it to be importable when context_parallel_size > 1, so register
        # an empty stand-in module unless the real one is already loaded.
        if "ring_flash_attn" not in sys.modules:
            stub_module = types.ModuleType("ring_flash_attn")
            monkeypatch.setitem(sys.modules, "ring_flash_attn", stub_module)

        cp_base_cfg["context_parallel_size"] = context_parallel_size
        validated = validate_config(cp_base_cfg)
        normalize_config(validated)

        assert validated.batch_size == expected_batch_size
class TestJsonToJsonlConverter:
    """End-to-end check of the JSON -> JSONL conversion pipeline."""

    def test_convert_json_to_jsonl(self, tmp_path):
        """Convert a two-record JSON array and read the JSONL output back."""
        records = [{"name": "Alice"}, {"name": "Bob"}]
        source = tmp_path / "input.json"
        target = tmp_path / "output.jsonl"
        source.write_text(json.dumps(records), encoding="utf-8")

        reader = FileReader()
        writer = FileWriter(str(target))
        JsonToJsonlConverter(reader, writer, JsonParser(), JsonlSerializer()).convert(
            str(source)
        )

        # One JSON document per line, no trailing newline
        emitted = target.read_text(encoding="utf-8").split("\n")
        assert len(emitted) == 2
        assert [json.loads(line) for line in emitted] == records
def test_excess_length_strategy(self):
    """Test that excess_length_strategy results in a value error when set to 'raise'.

    ``filter_sequences_by_length`` is exercised with a single sequence and with
    a batch. The value it returns is a *keep* flag (or a list of keep flags for
    a batch): True means the sample fits and is retained.
    """
    # -- single sequence (16 tokens) --
    data = {"input_ids": list(range(1, 17))}

    # Fits into 32 tokens, so nothing is raised
    filter_sequences_by_length(data, 32, raise_on_drop=True)

    # Fits, so the sample is kept
    keep = filter_sequences_by_length(data, 32)
    self.assertTrue(keep)

    # Too long for 15 tokens: raises when asked to
    with self.assertRaises(ValueError):
        filter_sequences_by_length(data, 15, raise_on_drop=True)

    # Too long for 15 tokens: otherwise the sample is not kept
    keep = filter_sequences_by_length(data, 15)
    self.assertFalse(keep)

    # -- batch of sequences (15 and 16 tokens) --
    data = {
        "input_ids": [
            list(range(1, 16)),
            list(range(1, 17)),
        ]
    }

    # Both fit into 32 tokens
    filter_sequences_by_length(data, 32, raise_on_drop=True)

    # The second entry exceeds 15 tokens
    with self.assertRaises(ValueError):
        filter_sequences_by_length(data, 15, raise_on_drop=True)

    # The first entry is kept, the second is not
    keep = filter_sequences_by_length(data, 15)
    self.assertEqual(keep, [True, False])
@pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
@enable_hf_offline
def test_load_hub(self, tokenizer):
    """Core use case: a dataset referenced by its hub id is loaded and tokenized."""
    hub_source = {
        "path": "mhenrichsen/alpaca_2k_test",
        "type": "alpaca",
    }
    with tempfile.TemporaryDirectory() as scratch_dir:
        prepared_dir = Path(scratch_dir) / "prepared"
        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "sequence_len": 1024,
                "datasets": [hub_source],
            }
        )

        # Redirect the prepared-dataset location into the scratch directory
        with patch(
            "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_dir)
        ):
            dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg)

        assert len(dataset) == 2000
        for column in ("input_ids", "attention_mask", "labels"):
            assert column in dataset.features
@enable_hf_offline
def test_load_from_save_to_disk(self, tokenizer, dataset_fixture):
    """Usual use case: a dataset written with `save_to_disk` can be loaded back."""
    with tempfile.TemporaryDirectory() as scratch_dir:
        scratch = Path(scratch_dir)
        saved_dataset_dir = str(scratch / "tmp_dataset")
        dataset_fixture.save_to_disk(saved_dataset_dir)

        local_source = {
            "path": saved_dataset_dir,
            "type": "alpaca",
        }
        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "sequence_len": 256,
                "datasets": [local_source],
                "dataset_num_proc": 4,
            }
        )

        # Keep the prepared copy inside the scratch directory as well
        with patch(
            "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH",
            str(scratch / "prepared"),
        ):
            dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg)

        # The fixture holds exactly one alpaca record
        assert len(dataset) == 1
        for column in ("input_ids", "attention_mask", "labels"):
            assert column in dataset.features
@enable_hf_offline
def test_load_from_dir_of_json(self, tokenizer, dataset_fixture):
    """Standard use case: a directory holding json shards can be loaded."""
    with tempfile.TemporaryDirectory() as scratch_dir:
        shard_dir = Path(scratch_dir) / "tmp_dataset"
        shard_dir.mkdir()
        shard_file = shard_dir / "shard1.json"
        dataset_fixture.to_json(shard_file)

        json_source = {
            "path": str(shard_dir),
            "ds_type": "json",
            "name": "test_data",
            "data_files": [
                str(shard_file),
            ],
            "type": "alpaca",
        }
        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "sequence_len": 256,
                "datasets": [json_source],
                "dataset_num_proc": 4,
            }
        )

        prepared_dir: Path = Path(scratch_dir) / "prepared"
        with patch(
            "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_dir)
        ):
            dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg)

        # One record in the shard, tokenized into the usual three columns
        assert len(dataset) == 1
        for column in ("input_ids", "attention_mask", "labels"):
            assert column in dataset.features
@enable_hf_offline
def test_load_from_single_json(self, tokenizer, dataset_fixture):
    """Standard use case: a dataset given as one json file can be loaded."""
    with tempfile.TemporaryDirectory() as scratch_dir:
        json_file = Path(scratch_dir) / "tmp_dataset.json"
        dataset_fixture.to_json(json_file)

        file_source = {
            "path": str(json_file),
            "name": "test_data",
            "type": "alpaca",
        }
        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "sequence_len": 256,
                "datasets": [file_source],
                "dataset_num_proc": 4,
            }
        )

        prepared_dir: Path = Path(scratch_dir) / "prepared"
        with patch(
            "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_dir)
        ):
            dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg)

        # Single record in, single tokenized row out
        assert len(dataset) == 1
        for column in ("input_ids", "attention_mask", "labels"):
            assert column in dataset.features
@pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
@enable_hf_offline
def test_load_hub_with_revision(self, tokenizer):
    """A hub dataset pinned to a specific revision is loaded and tokenized."""
    pinned_source = {
        "path": "mhenrichsen/alpaca_2k_test",
        "type": "alpaca",
        "revision": "d05c1cb",
    }
    with tempfile.TemporaryDirectory() as scratch_dir:
        prepared_dir = Path(scratch_dir) / "prepared"

        # Start from a clean slate; a missing directory is not an error
        shutil.rmtree(prepared_dir, ignore_errors=True)

        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "sequence_len": 1024,
                "datasets": [pinned_source],
            }
        )

        with patch(
            "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_dir)
        ):
            dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg)

        assert len(dataset) == 2000
        for column in ("input_ids", "attention_mask", "labels"):
            assert column in dataset.features
@enable_hf_offline
@pytest.mark.skip("datasets bug with local datasets when offline")
def test_load_local_hub_with_revision(
    self, dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff, tokenizer
):
    """Verify that a local copy of a hub dataset can be loaded with a specific revision.

    The snapshot of revision ``d05c1cb`` is copied into a temporary directory,
    referenced through ``data_files``, and pushed through
    ``_load_tokenized_prepared_datasets`` while the dataset loader is mocked.
    The test is currently skipped.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
        tmp_ds_path.mkdir(parents=True, exist_ok=True)
        snapshot_path = snapshot_download(
            repo_id="mhenrichsen/alpaca_2k_test",
            repo_type="dataset",
            local_dir=tmp_ds_path,
            revision="d05c1cb",
        )
        # Copy the snapshot contents into the local directory explicitly
        shutil.copytree(snapshot_path, tmp_ds_path, dirs_exist_ok=True)

        prepared_path = Path(tmp_dir) / "prepared"
        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "sequence_len": 1024,
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "ds_type": "parquet",
                        "type": "alpaca",
                        "data_files": [
                            f"{tmp_ds_path}/alpaca_2000.parquet",
                        ],
                        "revision": "d05c1cb",
                    },
                ],
            }
        )

        with patch(
            "axolotl.utils.data.shared.load_dataset_with_config"
        ) as mock_load_dataset:
            # return_value makes the mock hand back the same fixture dataset on
            # every call (it does not vary between successive calls).
            # NOTE(review): the fixture name suggests a DPO dataset while the
            # assertions below expect 2000 tokenized alpaca rows -- confirm
            # before re-enabling this skipped test.
            mock_load_dataset.return_value = (
                dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff
            )

            # Redirect the prepared-dataset location into the temp directory
            with patch(
                "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH",
                str(prepared_path),
            ):
                dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg)

            assert len(dataset) == 2000
            assert "input_ids" in dataset.features
            assert "attention_mask" in dataset.features
            assert "labels" in dataset.features
            # Explicit removal of the local copy; the enclosing
            # TemporaryDirectory also deletes its contents on exit.
            shutil.rmtree(tmp_ds_path)
class DictDefaultTest(unittest.TestCase):
    """
    Behavioural checks for DictDefault: attribute access, merging with ``|``,
    and the None default for missing keys
    """

    def test_dict_default(self):
        conf = DictDefault(
            {
                "key_a": {"key_b": "value_a"},
                "key_c": "value_c",
                "key_d": ["value_d", "value_e"],
            }
        )

        # nested mapping, scalar, and list values are all reachable as attributes
        assert conf.key_a.key_b == "value_a", (
            "DictDefault should return value for existing nested keys"
        )
        assert conf.key_c == "value_c", (
            "DictDefault should return value for existing keys"
        )
        first_item = conf.key_d[0]
        assert first_item == "value_d", (
            "DictDefault should return value for existing keys in list"
        )
        assert "value_e" in conf.key_d, (
            "DictDefault should support in operator for existing keys in list"
        )

    def test_dict_or_operator(self):
        left = DictDefault({"key_a": {"key_b": "value_b"}, "key_f": "value_g"})
        right = DictDefault(
            {
                "key_a": {"key_b": "value_a"},
                "key_c": "value_c",
                "key_d": ["value_d", "value_e"],
                "key_f": "value_f",
            }
        )
        merged = left | right

        # keys present on the left win; keys only on the right are added
        assert merged.key_a.key_b == "value_b", (
            "DictDefault should support OR operator for existing nested keys"
        )
        assert merged.key_c == "value_c", "DictDefault should not delete existing key"
        assert merged.key_d == [
            "value_d",
            "value_e",
        ], "DictDefault should not overwrite existing keys in list"
        assert merged.key_f == "value_g", (
            "DictDefault should support OR operator for existing key"
        )

    def test_dict_missingkey(self):
        empty = DictDefault({})
        assert empty.random_key is None, "DictDefault should return None for missing keys"

    def test_dict_or(self):
        merged = DictDefault({}) | DictDefault({})
        assert merged.random_key is None, (
            "DictDefault should return None for missing keys after | operation"
        )

    def test_dict_nested_missingparentkey(self):
        """
        Because a missing key yields None, assigning through a parent key that
        does not exist fails with AttributeError on NoneType.
        """
        empty = DictDefault({})
        expected_message = r"'NoneType' object has no attribute 'another_random_key'"
        with pytest.raises(AttributeError, match=expected_message):
            empty.random_key.another_random_key = "value"

    def test_dict_shorthand_assignment(self):
        """
        Shorthand assignment is documented as unsupported for subclasses,
        although the upstream example raises an error rather than returning
        None. This test pins that it works with the current implementation.

        Ref: https://github.com/mewwts/addict#default-values
        """
        conf = DictDefault({"key_a": {"key_b": "value_a"}})
        conf.key_a.key_b = "value_b"

        assert conf.key_a.key_b == "value_b", "Shorthand assignment should be supported"
def verify_deduplication(actual_dataset, expected_dataset, dataset_name):
    """Assert that a deduplicated dataset matches the expected rows.

    Rows are compared as unordered sets of value tuples, so row order is
    irrelevant.

    Parameters:
    - actual_dataset: Deduplicated dataset (sized iterable of mapping rows).
    - expected_dataset: Dataset holding the rows that should remain.
    - dataset_name: Label used in the failure messages (e.g., 'train' or 'eval').

    Raises:
    - AssertionError: if the row sets differ, or if the actual dataset still
      contains repeated rows.
    """

    def _distinct_rows(dataset):
        # One hashable tuple per row, collapsed into a set
        return {tuple(record.values()) for record in dataset}

    seen = _distinct_rows(actual_dataset)
    wanted = _distinct_rows(expected_dataset)

    # Same content, regardless of order
    assert seen == wanted, f"Mismatch in {dataset_name} dataset"

    # No duplicates left: as many rows as distinct rows
    assert len(seen) == len(actual_dataset), (
        f"Size mismatch in {dataset_name} dataset after deduplication"
    )
def test_exact_duplicates(self):
    # Every row is identical, so deduplication must leave exactly one row
    repeated = Dataset.from_dict(
        {
            "column1": ["apple"] * 3,
            "column2": [1] * 3,
            "column3": ["red"] * 3,
        }
    )
    single_row = Dataset.from_dict(
        {"column1": ["apple"], "column2": [1], "column3": ["red"]}
    )

    # Deduplicate once under the default name and once labelled as "eval"
    deduped_train, _ = deduplicate_and_log_datasets(dataset=repeated)
    deduped_eval, _ = deduplicate_and_log_datasets(
        dataset=repeated, dataset_name="eval"
    )

    verify_deduplication(deduped_train, single_row, "train_dataset")
    verify_deduplication(deduped_eval, single_row, "eval_dataset")
def test_combined_duplicates_one(self):
    # Train and eval each contain an internal duplicate and also share the
    # "apple" row; after deduplication eval keeps only its unique "orange" row
    raw_train = Dataset.from_dict(
        {
            "column1": ["apple", "banana", "apple"],
            "column2": [1, 2, 1],
            "column3": ["red", "yellow", "red"],
        }
    )
    raw_eval = Dataset.from_dict(
        {
            "column1": ["apple", "orange", "apple"],
            "column2": [1, 2, 1],
            "column3": ["red", "orange", "red"],
        }
    )
    wanted_train = Dataset.from_dict(
        {
            "column1": ["apple", "banana"],
            "column2": [1, 2],
            "column3": ["red", "yellow"],
        }
    )
    wanted_eval = Dataset.from_dict(
        {
            "column1": ["orange"],
            "column2": [2],
            "column3": ["orange"],
        }
    )

    # Deduplicate both splits in a single call
    deduped_train, deduped_eval = deduplicate_and_log_datasets(
        dataset=raw_train, other_dataset=raw_eval
    )

    verify_deduplication(deduped_train, wanted_train, "train_dataset")
    verify_deduplication(deduped_eval, wanted_eval, "eval_dataset")
class TestDeduplicateNonRL(unittest.TestCase):
    """Test `prepare_datasets` with and without exact deduplication.

    The config lists the same alpaca dataset twice, so the tests expect half as
    many samples with deduplication enabled as without it.
    """

    @enable_hf_offline
    def setUp(self) -> None:
        self.cfg_1 = DictDefault(
            {
                "base_model": "huggyllama/llama-7b",
                "tokenizer_config": "huggyllama/llama-7b",
                "sequence_len": 1024,
                "dataset_exact_deduplication": True,
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "val_set_size": 0.0,
                "gradient_accumulation_steps": 2,
                "batch_size": 10,
                "micro_batch_size": 10,
                "num_epochs": 1,
            }
        )
        self.cfg_1 = validate_config(self.cfg_1)
        normalize_config(self.cfg_1)

    def _prepare_datasets(self):
        """Load the tokenizer (and processor, if configured) and prepare datasets.

        Returns the 4-tuple produced by `prepare_datasets` for the current
        state of `self.cfg_1`; the tests read the train and eval datasets from
        its first two positions.
        """
        tokenizer = load_tokenizer(self.cfg_1)
        processor = (
            load_processor(self.cfg_1, tokenizer=tokenizer)
            if self.cfg_1.processor_type
            else None
        )
        return prepare_datasets(
            self.cfg_1,
            tokenizer,
            processor=processor,
        )

    @pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
    @enable_hf_offline
    def test_prepare_dataset_with_deduplication_train(self):
        """Verify the train split size when deduplication is enabled."""
        self.cfg_1.dataset_exact_deduplication = True

        train_dataset, _, _, _ = self._prepare_datasets()

        self.assertEqual(
            len(train_dataset),
            2000,
            "Train dataset should have 2000 samples after deduplication.",
        )

    @pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
    @enable_hf_offline
    def test_prepare_dataset_with_deduplication_eval(self):
        """Verify the eval split size when deduplication is enabled."""
        self.cfg_1.dataset_exact_deduplication = True
        self.cfg_1.val_set_size = 0.5

        _, eval_dataset, _, _ = self._prepare_datasets()

        # The message previously claimed 2000 while the assertion checks 1000.
        self.assertEqual(
            len(eval_dataset),
            1000,
            "Eval dataset should have 1000 samples after deduplication.",
        )

    @pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
    @enable_hf_offline
    def test_prepare_dataset_without_deduplication(self):
        """Verify both split sizes when deduplication is disabled."""
        self.cfg_1.dataset_exact_deduplication = False
        self.cfg_1.val_set_size = 0.1

        train_dataset, eval_dataset, _, _ = self._prepare_datasets()

        # Verify that the dataset has been prepared correctly
        self.assertEqual(
            len(train_dataset),
            1800 * 2,
            "Train dataset should have 3600 samples without deduplication.",
        )
        # The message previously described this as the train split "after
        # deduplication"; it checks the eval split with deduplication off.
        self.assertEqual(
            len(eval_dataset),
            200 * 2,
            "Eval dataset should have 400 samples without deduplication.",
        )
class TestFreezeLayersExcept(unittest.TestCase):
    """
    A test case class for the `freeze_layers_except` function.
    """

    def setUp(self):
        self.model = _TestModel()

    def _assert_trainable(self, *, features: bool, classifier: bool) -> None:
        """Assert the `requires_grad` flag of both weight tensors.

        Args:
            features: expected `requires_grad` of `model.features.layer.weight`.
            classifier: expected `requires_grad` of `model.classifier.weight`.

        The failure message is derived from the expected state, so it cannot
        disagree with the assertion it belongs to.
        """
        checks = (
            ("model.features.layer", self.model.features.layer.weight, features),
            ("model.classifier", self.model.classifier.weight, classifier),
        )
        for name, weight, should_train in checks:
            state = "trainable" if should_train else "frozen"
            self.assertEqual(
                weight.requires_grad, should_train, f"{name} should be {state}."
            )

    def test_freeze_layers_with_dots_in_name(self):
        freeze_layers_except(self.model, ["features.layer"])
        self._assert_trainable(features=True, classifier=False)

    def test_freeze_layers_without_dots_in_name(self):
        freeze_layers_except(self.model, ["classifier"])
        # The two failure messages used to be swapped relative to the checks.
        self._assert_trainable(features=False, classifier=True)

    def test_freeze_layers_regex_patterns(self):
        # The second pattern cannot match because only characters 'a' to 'c' are allowed after the word 'class', whereas it should be matching the character 'i'.
        freeze_layers_except(self.model, [r"^features.[a-z]+.weight$", r"class[a-c]+"])
        self._assert_trainable(features=True, classifier=False)

    def test_all_layers_frozen(self):
        freeze_layers_except(self.model, [])
        self._assert_trainable(features=False, classifier=False)

    def test_all_layers_unfrozen(self):
        freeze_layers_except(self.model, ["features.layer", "classifier"])
        self._assert_trainable(features=True, classifier=True)

    def test_freeze_layers_with_range_pattern_start_end(self):
        freeze_layers_except(self.model, ["features.layer[1:5]"])
        self._assert_trainable(features=True, classifier=False)
        self._assert_gradient_output([ZERO] + [ONE_TO_TEN] * 4 + [ZERO] * 5)

    def test_freeze_layers_with_range_pattern_single_index(self):
        freeze_layers_except(self.model, ["features.layer[5]"])
        self._assert_trainable(features=True, classifier=False)
        self._assert_gradient_output([ZERO] * 5 + [ONE_TO_TEN] + [ZERO] * 4)

    def test_freeze_layers_with_range_pattern_start_omitted(self):
        freeze_layers_except(self.model, ["features.layer[:5]"])
        self._assert_trainable(features=True, classifier=False)
        self._assert_gradient_output([ONE_TO_TEN] * 5 + [ZERO] * 5)

    def test_freeze_layers_with_range_pattern_end_omitted(self):
        freeze_layers_except(self.model, ["features.layer[4:]"])
        self._assert_trainable(features=True, classifier=False)
        self._assert_gradient_output([ZERO] * 4 + [ONE_TO_TEN] * 6)

    def test_freeze_layers_with_range_pattern_merge_included(self):
        # The second range lies inside the first, so the result matches [4:].
        freeze_layers_except(self.model, ["features.layer[4:]", "features.layer[5:6]"])
        self._assert_trainable(features=True, classifier=False)
        self._assert_gradient_output([ZERO] * 4 + [ONE_TO_TEN] * 6)

    def test_freeze_layers_with_range_pattern_merge_intersect(self):
        # Overlapping ranges [4:7] and [6:8] are expected to cover rows 4..7.
        freeze_layers_except(self.model, ["features.layer[4:7]", "features.layer[6:8]"])
        self._assert_trainable(features=True, classifier=False)
        self._assert_gradient_output([ZERO] * 4 + [ONE_TO_TEN] * 4 + [ZERO] * 2)

    def test_freeze_layers_with_range_pattern_merge_separate(self):
        freeze_layers_except(
            self.model,
            ["features.layer[1:2]", "features.layer[3:4]", "features.layer[5:6]"],
        )
        self._assert_trainable(features=True, classifier=False)
        self._assert_gradient_output(
            [ZERO, ONE_TO_TEN, ZERO, ONE_TO_TEN, ZERO, ONE_TO_TEN] + [ZERO] * 4
        )

    def _assert_gradient_output(self, expected):
        """Backpropagate through `features.layer` and compare its weight gradient.

        Feeds `ONE_TO_TEN` as a single-sample batch, uses the sum of the
        outputs as the loss, and checks the resulting weight gradient against
        `expected` (a 10x10 nested list of floats) with
        `torch.testing.assert_close`.
        """
        input_tensor = torch.tensor([ONE_TO_TEN], dtype=torch.float32)

        self.model.features.layer.weight.grad = None  # Reset gradients
        output = self.model.features.layer(input_tensor)
        loss = output.sum()
        loss.backward()

        expected_grads = torch.tensor(expected)
        torch.testing.assert_close(
            self.model.features.layer.weight.grad, expected_grads
        )
@pytest.mark.parametrize("adapter", ["lora", "qlora", None])
@pytest.mark.parametrize("load_in_8bit", [True, False])
@pytest.mark.parametrize("load_in_4bit", [True, False])
@pytest.mark.parametrize("gptq", [True, False])
def test_set_quantization_config(
    self,
    adapter,
    load_in_8bit,
    load_in_4bit,
    gptq,
):
    """Check `model_kwargs` after `_set_quantization_config` for every flag combination.

    Covers all combinations of adapter, 8-bit, 4-bit and GPTQ settings and
    verifies that a `BitsAndBytesConfig` with the matching bit width is set
    for qlora + 4-bit and for lora + 8-bit.
    """
    # init cfg as args
    self.cfg.load_in_8bit = load_in_8bit
    self.cfg.load_in_4bit = load_in_4bit
    self.cfg.gptq = gptq
    self.cfg.adapter = adapter

    self.model_loader._set_quantization_config()
    model_kwargs = self.model_loader.model_kwargs

    if "quantization_config" in model_kwargs or self.cfg.gptq:
        # model_kwargs is accessed as a mapping everywhere else in this test,
        # so the load_in_* flags are keys. hasattr() looks at attributes and
        # never sees keys, which made the previous assertion always pass.
        assert not (
            "load_in_8bit" in model_kwargs and "load_in_4bit" in model_kwargs
        )

    if self.cfg.adapter == "qlora" and load_in_4bit:
        assert isinstance(
            model_kwargs.get("quantization_config"),
            BitsAndBytesConfig,
        )
        assert model_kwargs["quantization_config"]._load_in_4bit is True

    if self.cfg.adapter == "lora" and load_in_8bit:
        assert isinstance(
            model_kwargs.get("quantization_config"),
            BitsAndBytesConfig,
        )
        assert model_kwargs["quantization_config"]._load_in_8bit is True
@pytest.mark.parametrize(
    "world_size, tensor_parallel_size, context_parallel_size, dp_shard_size, dp_replicate_size, is_fsdp, expected",
    [
        (16, 2, 2, 2, 2, True, (2, 2, 2, 2)),
        (16, 1, 1, None, None, True, (0, 0, 16, 1)),
        (16, 2, 2, 2, None, True, (2, 2, 2, 2)),
        (16, 2, 2, None, 2, True, (2, 2, 2, 2)),
        (16, 1, 1, None, 2, True, (0, 0, 8, 2)),
        (2, 1, 1, None, None, True, (0, 0, 2, 1)),
    ],
)
def test_get_parallel_config_kwargs(
    self,
    world_size,
    tensor_parallel_size,
    context_parallel_size,
    dp_shard_size,
    dp_replicate_size,
    is_fsdp,
    expected,
):
    """Check the parallelism sizes returned by `_get_parallel_config_kwargs`.

    `expected` is ordered as (tp, cp, dp_shard, dp_replicate). Only entries
    greater than 1 are compared with the result; smaller entries are not
    checked at all.
    """
    parallel_kwargs = _get_parallel_config_kwargs(
        world_size,
        tensor_parallel_size,
        context_parallel_size,
        dp_shard_size,
        dp_replicate_size,
        is_fsdp,
    )

    result_keys = ("tp_size", "cp_size", "dp_shard_size", "dp_replicate_size")
    for key, size in zip(result_keys, expected):
        if size > 1:
            assert parallel_kwargs[key] == size
def test_prepare_debug_log_idempotent_and_no_duplicate(monkeypatch):
    """Calling `prepare_debug_log` twice yields one path and one copy of each record.

    The debug log is closed in a `finally` block so that a failing assertion
    does not leave the module-level tee log open while the temporary directory
    is being removed.
    """
    from axolotl.logging_config import configure_logging
    from axolotl.utils import tee
    from axolotl.utils.logging import get_logger

    with tempfile.TemporaryDirectory() as td:
        monkeypatch.setenv("AXOLOTL_TEE_STDOUT", "0")
        configure_logging()
        # Minimal stand-in for the config: only `output_dir` and `get` are set.
        cfg = type("Cfg", (), {"output_dir": td, "get": lambda *_: False})
        p1 = tee.prepare_debug_log(cfg)
        try:
            p2 = tee.prepare_debug_log(cfg)
            assert p1 == p2

            log = get_logger("axolotl.test")
            marker = "UNIQUE-MARKER-12345"
            log.info(marker)
            tee.file_only_stream.flush()

            data = read(p1)
            # Ensure the marker appears once (not duplicated via propagation)
            assert data.count(marker) == 1
        finally:
            tee.close_debug_log()
class TestLoRALoad:
    """
    Tests that a LoRA-configured model loads from a validated config
    """

    @staticmethod
    def _validated_cfg(lora_dropout):
        """Build, validate and normalize a LoRA config with the given dropout.

        The LoRA settings are merged with the module-level `minimal_config`
        (right-hand operand of `|`) before validation.
        """
        lora_settings = {
            "base_model": "HuggingFaceTB/SmolLM2-135M",
            "adapter": "lora",
            "lora_r": 8,
            "lora_alpha": 16,
            "lora_dropout": lora_dropout,
            "lora_target_linear": True,
            "micro_batch_size": 1,
            "gradient_accumulation_steps": 1,
            "sequence_len": 1024,
        }
        validated = validate_config(DictDefault(lora_settings | minimal_config))
        normalize_config(validated)
        return validated

    def test_load_lora_weights(self):
        lora_cfg = self._validated_cfg(0.0)
        ModelLoader(lora_cfg, load_tokenizer(lora_cfg)).load()

    def test_load_lora_weights_empty_dropout(self):
        lora_cfg = self._validated_cfg(None)
        # A dropout of None is expected to come back as 0.0 after validation.
        assert lora_cfg.lora_dropout == 0.0
        ModelLoader(lora_cfg, load_tokenizer(lora_cfg)).load()
def test_migrate_fsdp_config(self):
    """Migrate `fsdp_`-prefixed keys, with and without an `fsdp_version` entry."""
    # Case 1: fsdp_version is supplied inside fsdp_config.
    versioned = self._get_base_cfg() | DictDefault(
        {
            "fsdp_config": {
                "fsdp_version": 2,
                "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                "fsdp_offload_params": False,
                "fsdp_cpu_ram_efficient_loading": True,
            }
        }
    )
    versioned = validate_config(versioned)

    self.assertEqual(versioned.fsdp_version, 2)
    self.assertEqual(
        versioned.fsdp_config.auto_wrap_policy, "TRANSFORMER_BASED_WRAP"
    )
    self.assertEqual(versioned.fsdp_config.offload_params, False)
    self.assertEqual(versioned.fsdp_config.cpu_ram_efficient_loading, True)
    for legacy_key in (
        "fsdp_auto_wrap_policy",
        "fsdp_offload_params",
        "fsdp_cpu_ram_efficient_loading",
    ):
        self.assertNotIn(legacy_key, versioned.fsdp_config)
    # fsdp_version is the one prefixed key expected to stay in fsdp_config.
    self.assertIn("fsdp_version", versioned.fsdp_config)

    # Case 2: no fsdp_version anywhere in the input.
    unversioned = self._get_base_cfg() | DictDefault(
        {
            "fsdp_config": {
                "fsdp_auto_wrap_policy": "SIZE_BASED_WRAP",
                "fsdp_offload_params": True,
            }
        }
    )
    unversioned = validate_config(unversioned)

    self.assertNotIn("fsdp_version", unversioned)
    self.assertEqual(unversioned.fsdp_config.auto_wrap_policy, "SIZE_BASED_WRAP")
    self.assertEqual(unversioned.fsdp_config.offload_params, True)
    for legacy_key in ("fsdp_auto_wrap_policy", "fsdp_offload_params"):
        self.assertNotIn(legacy_key, unversioned.fsdp_config)
@pytest.fixture
def mock_training_args(tmp_path):
    """Mock training arguments for callback testing.

    `output_dir` points at pytest's per-test `tmp_path` rather than a fixed
    `/tmp/test`, so tests do not share a directory and the path is valid on
    platforms without `/tmp`.
    """
    from transformers import TrainingArguments

    return TrainingArguments(output_dir=str(tmp_path))
minimal config with defaults minimal_config = {"use_otel_metrics": True} otel_config = OpenTelemetryConfig(**minimal_config) assert otel_config.use_otel_metrics is True assert otel_config.otel_metrics_host == "localhost" # default assert otel_config.otel_metrics_port == 8000 # default def test_config_disabled_by_default(self): """Test that OpenTelemetry is disabled by default.""" from axolotl.utils.schemas.integrations import OpenTelemetryConfig # Test default config default_config = OpenTelemetryConfig() assert default_config.use_otel_metrics is False class TestOpenTelemetryCallback: """Test OpenTelemetry callback functionality.""" def test_callback_import(self): """Test that OpenTelemetry callback can be imported.""" from axolotl.utils.callbacks.opentelemetry import OpenTelemetryMetricsCallback assert OpenTelemetryMetricsCallback is not None def test_callback_graceful_fallback(self, mock_otel_config): """Test callback gracefully handles missing dependencies.""" from axolotl.utils.callbacks.opentelemetry import OpenTelemetryMetricsCallback # This should not raise an exception even if dependencies are missing callback = OpenTelemetryMetricsCallback(mock_otel_config) # Callback should exist but may have metrics disabled assert callback is not None assert hasattr(callback, "metrics_enabled") def test_callback_initialization_enabled(self, mock_otel_config): """Test callback initialization when OpenTelemetry is available.""" from axolotl.utils.callbacks.opentelemetry import ( OPENTELEMETRY_AVAILABLE, OpenTelemetryMetricsCallback, ) callback = OpenTelemetryMetricsCallback(mock_otel_config) if OPENTELEMETRY_AVAILABLE: assert callback.metrics_enabled is True assert callback.cfg == mock_otel_config assert callback.metrics_host == "localhost" assert callback.metrics_port == 8003 else: assert callback.metrics_enabled is False def test_metrics_server_lifecycle( self, mock_otel_config, mock_trainer_state, mock_training_args, mock_trainer_control, ): """Test metrics server 
starts and stops correctly.""" from axolotl.utils.callbacks.opentelemetry import ( OPENTELEMETRY_AVAILABLE, OpenTelemetryMetricsCallback, ) if not OPENTELEMETRY_AVAILABLE: pytest.skip("OpenTelemetry dependencies not available") callback = OpenTelemetryMetricsCallback(mock_otel_config) # Start server callback.on_train_begin( mock_training_args, mock_trainer_state, mock_trainer_control ) assert callback.server_started is True # End training callback.on_train_end( mock_training_args, mock_trainer_state, mock_trainer_control ) def test_metrics_recording( self, mock_otel_config, mock_trainer_state, mock_training_args, mock_trainer_control, ): """Test that metrics are recorded during training.""" from axolotl.utils.callbacks.opentelemetry import ( OPENTELEMETRY_AVAILABLE, OpenTelemetryMetricsCallback, ) if not OPENTELEMETRY_AVAILABLE: pytest.skip("OpenTelemetry dependencies not available") callback = OpenTelemetryMetricsCallback(mock_otel_config) callback.on_train_begin( mock_training_args, mock_trainer_state, mock_trainer_control ) # Test logging metrics test_logs = { "loss": 0.5, "learning_rate": 1e-4, "grad_norm": 0.8, } # This should not raise an exception callback.on_log( mock_training_args, mock_trainer_state, mock_trainer_control, logs=test_logs ) assert callback.metrics_enabled is True def test_evaluation_metrics( self, mock_otel_config, mock_trainer_state, mock_training_args, mock_trainer_control, ): """Test evaluation metrics recording.""" from axolotl.utils.callbacks.opentelemetry import ( OPENTELEMETRY_AVAILABLE, OpenTelemetryMetricsCallback, ) if not OPENTELEMETRY_AVAILABLE: pytest.skip("OpenTelemetry dependencies not available") callback = OpenTelemetryMetricsCallback(mock_otel_config) callback.on_train_begin( mock_training_args, mock_trainer_state, mock_trainer_control ) # Test evaluation metrics eval_logs = { "eval_loss": 0.3, "eval_accuracy": 0.95, } # This should not raise an exception callback.on_evaluate( mock_training_args, mock_trainer_state, 
def test_thread_safety(self, mock_otel_config):
    """Verify the callback exposes a lock usable as a context manager.

    Only the presence of ``metrics_lock`` and of its ``__enter__`` /
    ``__exit__`` attributes is checked; the lock is never acquired here.
    """
    from axolotl.utils.callbacks.opentelemetry import (
        OPENTELEMETRY_AVAILABLE,
        OpenTelemetryMetricsCallback,
    )

    if not OPENTELEMETRY_AVAILABLE:
        pytest.skip("OpenTelemetry dependencies not available")

    otel_callback = OpenTelemetryMetricsCallback(mock_otel_config)
    assert hasattr(otel_callback, "metrics_lock")

    # Check it's a lock-like object
    for dunder in ("__enter__", "__exit__"):
        assert hasattr(otel_callback.metrics_lock, dunder)
class TestOpenTelemetryCallbackMethods:
    """Test specific callback methods.

    Each test starts the callback with ``on_train_begin`` and always closes
    it again with ``on_train_end`` so nothing started by the callback leaks
    into later tests.
    """

    def test_step_end_callback(
        self,
        mock_otel_config,
        mock_trainer_state,
        mock_training_args,
        mock_trainer_control,
    ):
        """Test step end callback method."""
        from axolotl.utils.callbacks.opentelemetry import (
            OPENTELEMETRY_AVAILABLE,
            OpenTelemetryMetricsCallback,
        )

        if not OPENTELEMETRY_AVAILABLE:
            pytest.skip("OpenTelemetry dependencies not available")

        callback = OpenTelemetryMetricsCallback(mock_otel_config)
        callback.on_train_begin(
            mock_training_args, mock_trainer_state, mock_trainer_control
        )

        try:
            # Should not raise an exception
            callback.on_step_end(
                mock_training_args, mock_trainer_state, mock_trainer_control
            )
        finally:
            # Mirror the lifecycle test: end training so a started metrics
            # server does not keep running after this test.
            callback.on_train_end(
                mock_training_args, mock_trainer_state, mock_trainer_control
            )

    def test_epoch_end_callback(
        self,
        mock_otel_config,
        mock_trainer_state,
        mock_training_args,
        mock_trainer_control,
    ):
        """Test epoch end callback method."""
        from axolotl.utils.callbacks.opentelemetry import (
            OPENTELEMETRY_AVAILABLE,
            OpenTelemetryMetricsCallback,
        )

        if not OPENTELEMETRY_AVAILABLE:
            pytest.skip("OpenTelemetry dependencies not available")

        callback = OpenTelemetryMetricsCallback(mock_otel_config)
        callback.on_train_begin(
            mock_training_args, mock_trainer_state, mock_trainer_control
        )

        try:
            # Should not raise an exception
            callback.on_epoch_end(
                mock_training_args, mock_trainer_state, mock_trainer_control
            )
        finally:
            # Mirror the lifecycle test: end training so a started metrics
            # server does not keep running after this test.
            callback.on_train_end(
                mock_training_args, mock_trainer_state, mock_trainer_control
            )
@pytest.mark.parametrize(
    "batch_size, num_workers",
    [
        (1, 0),
        (2, 0),
        (1, 2),
        (2, 2),
    ],
)
@pytest.mark.parametrize("max_seq_length", [4096, 512])
@pytest.mark.parametrize("sequential", [True, False])
@enable_hf_offline
def test_packing(
    self,
    dataset_winglian_tiny_shakespeare,
    batch_size,
    num_workers,
    tokenizer,
    max_seq_length,
    sequential,
):
    """Check batch shapes and index coverage of ``MultipackBatchSampler``.

    Every batch produced through the ``DataLoader`` must hold at most
    ``batch_size * max_seq_length`` tokens and be padded to exactly
    ``max_seq_length`` columns, and iterating the sampler must yield every
    dataset index exactly once.

    The multipack dataloader patch is installed for the duration of the test
    and removed in a ``finally`` that also covers the setup code, so a
    failure while building the dataset or loader cannot leave the patch
    active for other tests.
    """
    from axolotl.monkeypatch.data.batch_dataset_fetcher import (
        apply_multipack_dataloader_patch,
        remove_multipack_dataloader_patch,
    )

    # Apply the patch for multipack handling
    apply_multipack_dataloader_patch()

    try:
        dataset = dataset_winglian_tiny_shakespeare["train"]

        cfg = DictDefault(
            {
                "train_on_inputs": True,
                "sequence_len": max_seq_length,
            }
        )
        ds_cfg = DictDefault(
            {
                "field": "text",
            }
        )
        completion_strategy = load(tokenizer, cfg, ds_cfg)
        dataset_wrapper = TokenizedPromptDataset(
            completion_strategy,
            dataset,
        )
        train_dataset = concatenate_datasets([dataset_wrapper])
        train_dataset = handle_long_seq_in_dataset(
            train_dataset, cfg.sequence_len, cfg
        )

        lengths = get_dataset_lengths(train_dataset)
        batch_sampler = MultipackBatchSampler(
            sampler=RandomSampler(train_dataset),
            lengths=lengths,
            batch_size=batch_size,
            batch_max_len=max_seq_length,
            group_size=100000,
            bin_size=200,
            sequential=sequential,
            drop_last=False,
        )

        loader = DataLoader(
            train_dataset,
            batch_sampler=batch_sampler,
            collate_fn=V2BatchSamplerDataCollatorForSeq2Seq(
                tokenizer=tokenizer,
                padding=True,
                pad_to_multiple_of=max_seq_length,
                return_tensors="pt",
            ),
            num_workers=num_workers,
        )

        # Flatten sampler output (batches of packs of indices) into one list.
        batch_idxs = []
        for batch in batch_sampler:
            for pack in batch:
                batch_idxs.extend(pack)

        for batch in loader:
            assert batch["input_ids"].numel() <= batch_size * max_seq_length
            assert batch["input_ids"].shape[1] == max_seq_length

        # Every index appears, and none appears twice.
        original_idxs = set(range(len(train_dataset)))
        assert original_idxs == set(batch_idxs)
        assert len(batch_idxs) == len(set(batch_idxs))
    finally:
        # Clean up: remove the patch after the test
        remove_multipack_dataloader_patch()
@pytest.fixture
def random_text(self):
    """Build a small streaming dataset of reproducible random text rows.

    Returns an ``IterableDataset`` whose items are ``{"text": <row>}`` dicts.
    The rows are pre-generated from the global ``random`` module after
    seeding it with 0, so the order of the RNG calls below determines the
    data and must not be rearranged.
    """
    # seed the global RNG for reproducibility
    random.seed(0)

    # generate a row of random text: 50 to 200 space-separated "words", each
    # made of 2 to 10 lowercase ASCII letters
    def rand_txt():
        return " ".join(
            [
                "".join(
                    random.choices(string.ascii_lowercase, k=random.randint(2, 10))
                )
                for _ in range(random.randint(50, 200))
            ]
        )

    # Create a list of 500 random texts up front rather than generating them
    # inside the generator so the test runs faster
    data = [rand_txt() for _ in range(500)]

    # Create an IterableDataset
    def generator():
        for row in data:
            yield {"text": row}

    return IterableDataset.from_generator(generator)
@fixture()
def tokenizer():
    """Load the tokenizer for ``MODEL_NAME`` and register its pad token."""
    pad_token_spec = {"pad_token": "<|endoftext|>"}
    loaded_tokenizer = AutoTokenizer.from_pretrained(
        MODEL_NAME, trust_remote_code=True
    )
    loaded_tokenizer.add_special_tokens(pad_token_spec)
    return loaded_tokenizer
def test_perplexity_short(model, metric):
    """Perplexity of a short two-sentence sample must round to 10.33."""
    # taken from https://huggingface.co/datasets/roneneldan/TinyStories
    sample_text = "Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun."
    # compute() takes a list of texts and returns a dict carrying "score"
    score = metric.compute(model, [sample_text])["score"]
    assert round(score, 2) == 10.33
class TestPromptTokenizationStrategies:
    """
    Test class for prompt tokenization strategies.
    """

    @enable_hf_offline
    def test_no_sys_prompt(self, tokenizer_huggyllama_w_special_tokens):
        """
        Checks the prompt/response boundary without a system prompt: the
        first response token keeps its id as label and the token right
        before it is labelled -100.
        """
        # presumably the id of "world" for this tokenizer — confirm if the
        # tokenizer fixture changes
        response_token_id = 3186
        strategy = AlpacaPromptTokenizingStrategy(
            NoSystemPrompter(),
            tokenizer_huggyllama_w_special_tokens,
            False,
            2048,
        )
        tokenized = strategy.tokenize_prompt(
            {
                "instruction": "hello cruel. lorem ipsum dolor sit amet.",
                "output": "world!",
            }
        )
        boundary = tokenized["input_ids"].index(response_token_id)
        labels = tokenized["labels"]
        assert labels[boundary] == response_token_id
        assert labels[boundary - 1] == -100

    @enable_hf_offline
    def test_alpaca(self, tokenizer_huggyllama_w_special_tokens):
        """
        Same boundary check for the standard Alpaca prompter: the first
        response token keeps its id as label and the token right before it
        is labelled -100.
        """
        # presumably the id of the leading "Hi" of the response — confirm if
        # the tokenizer fixture changes
        response_token_id = 6324
        strategy = AlpacaPromptTokenizingStrategy(
            AlpacaPrompter(),
            tokenizer_huggyllama_w_special_tokens,
            False,
            2048,
        )
        tokenized = strategy.tokenize_prompt(
            {"instruction": "hello!", "output": "Hi! How can I help?"}
        )
        boundary = tokenized["input_ids"].index(response_token_id)
        labels = tokenized["labels"]
        assert labels[boundary] == response_token_id
        assert labels[boundary - 1] == -100
@enable_hf_offline
def test_llama2_chat_integration(self, tokenizer_llama2_7b):
    """Tokenize the fixture conversation and compare with the stored result.

    The expected ``input_ids``, ``attention_mask`` and ``labels`` are read
    from ``fixtures/conversation.tokenized_llama2chat.json``.
    """
    # NOTE(review): the enclosing class is named Llama2ChatTokenizationTest;
    # with pytest's default ``Test*`` class pattern it would not be collected
    # — confirm against the project's pytest configuration.
    tests_dir = Path(__file__).parent

    with open(tests_dir / "fixtures/conversation.json", encoding="utf-8") as fin:
        conversation = json.load(fin)
    with open(
        tests_dir / "fixtures/conversation.tokenized_llama2chat.json",
        encoding="utf-8",
    ) as fin:
        expected = json.load(fin)

    strategy = LLama2ChatTokenizingStrategy(
        Llama2ChatPrompter(),
        tokenizer_llama2_7b,
        False,
        4096,
    )
    actual = strategy.tokenize_prompt(conversation)

    for field in ("input_ids", "attention_mask", "labels"):
        # length first so a size mismatch is reported before the full diff
        assert len(actual[field]) == len(expected[field])
        assert actual[field] == expected[field]
class OrpoTokenizationTest:
    """test case for the ORPO tokenization"""

    # NOTE(review): this class name does not start with "Test"; with pytest's
    # default class pattern it would not be collected, so the test below may
    # never run — confirm against the project's pytest configuration.

    @enable_hf_offline
    def test_orpo_integration(
        self,
        tokenizer_mistral_7b_instruct_chatml,
        dataset_argilla_ultrafeedback_binarized_preferences_cleaned,
    ):
        """Tokenize one preference row and check the output's structure."""
        first_row_only = (
            dataset_argilla_ultrafeedback_binarized_preferences_cleaned.select([0])
        )
        train_cfg = DictDefault({"train_on_inputs": False})
        dataset_cfg = DictDefault({"chat_template": "chatml"})
        strategy = load(tokenizer_mistral_7b_instruct_chatml, train_cfg, dataset_cfg)

        tokenized = strategy.tokenize_prompt(first_row_only[0])

        # all expected fields are present
        for key in (
            "rejected_input_ids",
            "rejected_labels",
            "input_ids",
            "labels",
            "prompt_attention_mask",
        ):
            assert key in tokenized

        # paired sequences have matching lengths
        assert len(tokenized["rejected_input_ids"]) == len(
            tokenized["rejected_labels"]
        )
        assert len(tokenized["input_ids"]) == len(tokenized["labels"])
        assert len(tokenized["input_ids"]) == len(tokenized["prompt_attention_mask"])

        # rejected sequence: first label is -100, last label equals last token
        assert tokenized["rejected_labels"][0] == -100
        assert tokenized["rejected_input_ids"][-1] == tokenized["rejected_labels"][-1]

        # chosen sequence: same pattern
        assert tokenized["labels"][0] == -100
        assert tokenized["input_ids"][-1] == tokenized["labels"][-1]

        # prompt mask is 1 at the start and 0 at the end
        assert tokenized["prompt_attention_mask"][0] == 1
        assert tokenized["prompt_attention_mask"][-1] == 0
def test_prompt_style_w_instruct(self):
    """Instruct style emits the ``###`` section headers and no chat roles.

    Checked once with an input ("alpacas") and once without; the
    ``### Input:`` header must appear only in the first case.
    """
    prompter = AlpacaPrompter(prompt_style=PromptStyle.INSTRUCT.value)
    instruction = "tell me a joke about the following"

    # (fragment, expected to be present) in the order they are checked
    with_input_checks = (
        ("Below is an instruction", True),
        ("### Instruction:", True),
        ("### Input:", True),
        ("alpacas", True),
        ("### Response:", True),
        ("USER:", False),
        ("ASSISTANT:", False),
    )
    without_input_checks = (
        ("Below is an instruction", True),
        ("### Instruction:", True),
        ("### Input:", False),
        ("### Response:", True),
        ("USER:", False),
        ("ASSISTANT:", False),
    )

    prompt = next(prompter.build_prompt(instruction, "alpacas"))
    for fragment, should_appear in with_input_checks:
        assert (fragment in prompt) is should_appear

    prompt = next(prompter.build_prompt(instruction))
    for fragment, should_appear in without_input_checks:
        assert (fragment in prompt) is should_appear
def test_system_prompt(self):
    """Chat style with a dataset-provided system prompt.

    The prompt must begin with ``SYSTEM:``, contain the system text and the
    input, use the chat role markers and none of the ``###`` headers.
    """
    prompter = SystemDataPrompter(prompt_style=PromptStyle.CHAT.value)
    prompt_iter = prompter.build_prompt_w_system(
        "use cot", "tell me a joke about the following", "alpacas"
    )
    prompt = next(prompt_iter)

    assert "use cot" in prompt
    assert prompt.startswith("SYSTEM:")

    # (fragment, expected to be present) in the order they are checked
    for fragment, should_appear in (
        ("### Instruction:", False),
        ("### Input:", False),
        ("alpacas", True),
        ("### Response:", False),
        ("USER:", True),
        ("ASSISTANT:", True),
    ):
        assert (fragment in prompt) is should_appear
class MultipleChoiceExplainPrompterTest(unittest.TestCase):
    """
    Test class for MultipleChoiceExplainPrompter
    """

    def test_prompt_style_w_chat(self):
        """Chat-style prompt contains the role marker, question and choices."""
        question = "choose one"
        choices = "- A\n- B\n- C"
        prompter = MultipleChoiceExplainPrompter(prompt_style=PromptStyle.CHAT.value)

        prompt = next(prompter.build_prompt(question, choices, "C"))

        for fragment in (
            "USER:",
            question,
            "Choose the answer that best answers the question.",
            choices,
        ):
            assert fragment in prompt
@patch("axolotl.loaders.tokenizer.AutoTokenizer")
@patch("axolotl.loaders.tokenizer.is_local_main_process", return_value=True)
@patch("axolotl.loaders.tokenizer.barrier")
def test_modify_tokenizer_files_passes_revision(
    self, _mock_barrier, _mock_main, mock_auto_tokenizer, temp_dir
):
    """An explicit ``revision`` must reach ``AutoTokenizer.from_pretrained``."""
    from axolotl.loaders.tokenizer import modify_tokenizer_files

    requested_revision = "abc123"
    from_pretrained = mock_auto_tokenizer.from_pretrained
    from_pretrained.return_value = MagicMock()

    modify_tokenizer_files(
        "some-model", {}, output_dir=temp_dir, revision=requested_revision
    )

    # call_args holds the most recent call; only its keyword args matter here
    forwarded_kwargs = from_pretrained.call_args.kwargs
    assert forwarded_kwargs.get("revision") == requested_revision
@patch("axolotl.loaders.processor.AutoProcessor")
def test_load_processor_omits_revision_when_unset(self, mock_auto_processor):
    """No ``revision`` keyword is forwarded when the config does not set one."""
    from axolotl.loaders.processor import load_processor

    from_pretrained = mock_auto_processor.from_pretrained
    # the stand-in processor only needs a ``size`` attribute set to an empty dict
    from_pretrained.return_value = MagicMock(size={})

    cfg_without_revision = DictDefault(
        {
            "processor_config": "some-model",
            "trust_remote_code": False,
        }
    )
    fake_tokenizer = MagicMock(spec=PreTrainedTokenizerBase)

    load_processor(cfg_without_revision, fake_tokenizer)

    forwarded_kwargs = from_pretrained.call_args.kwargs
    assert "revision" not in forwarded_kwargs
""" from unittest.mock import MagicMock, patch from datasets import Dataset from axolotl.utils.dict import DictDefault class TestSFTSaveDeduplicatedBeforeSave: """Verify that in SFT data loading, deduplication occurs before saving.""" @patch("axolotl.utils.data.sft.save_preprocessed_dataset") @patch("axolotl.utils.data.sft.generate_dataset_hash_from_config") @patch("axolotl.utils.data.sft.deduplicate_and_log_datasets") @patch("axolotl.utils.data.sft.merge_datasets") @patch("axolotl.utils.data.sft._load_and_process_single_dataset") @patch("axolotl.utils.data.sft.datasets_with_name_generator") def test_dedup_called_before_save_sft( self, mock_datasets_gen, mock_load_single, mock_merge, mock_dedup, mock_gen_hash, mock_save, ): """Deduplication should be called before save_preprocessed_dataset in SFT.""" from axolotl.utils.data.sft import _load_raw_datasets # Set up mock data dataset = Dataset.from_dict({"text": ["a", "b", "a"], "label": [1, 2, 1]}) deduped_dataset = Dataset.from_dict({"text": ["a", "b"], "label": [1, 2]}) mock_datasets_gen.return_value = [ DictDefault({"path": "test", "type": "alpaca"}) ] mock_load_single.return_value = (dataset, None) mock_merge.return_value = dataset mock_dedup.return_value = (deduped_dataset, None) mock_gen_hash.return_value = "testhash" cfg = DictDefault( { "skip_prepare_dataset": False, "dataset_exact_deduplication": True, "sequence_len": 1024, "eval_sequence_len": None, "sample_packing": False, "is_preprocess": False, "seed": 42, "datasets": [{"path": "test", "type": "alpaca"}], } ) tokenizer = MagicMock() tokenizer.name_or_path = "test-tokenizer" # Track call order call_order = [] mock_dedup.side_effect = lambda **kwargs: ( call_order.append("dedup") or (deduped_dataset, None) ) mock_save.side_effect = lambda *args, **kwargs: call_order.append("save") _load_raw_datasets( cfg=cfg, datasets_configs=cfg.datasets, tokenizer=tokenizer, split="train", ) # Verify dedup was called assert "dedup" in call_order, "Deduplication should 
@patch.object(Dataset, "filter", lambda self, *args, **kwargs: self)
@patch("axolotl.utils.data.rl.save_preprocessed_dataset")
@patch("axolotl.utils.data.rl.generate_dataset_hash_from_config")
@patch("axolotl.utils.data.rl.deduplicate_and_log_datasets")
@patch("axolotl.utils.data.rl.merge_datasets")
@patch("axolotl.utils.data.rl.load_dataset_with_config")
@patch("axolotl.utils.data.rl.datasets_with_name_generator")
@patch("axolotl.utils.data.rl.load_tokenizer")
def test_dedup_called_before_save_rl(
    self,
    mock_load_tokenizer,
    mock_datasets_gen,
    mock_load_dataset,
    mock_merge,
    mock_dedup,
    mock_gen_hash,
    mock_save,
):
    """Deduplication should be called before save_preprocessed_dataset in RL.

    All loading, merging, hashing and saving helpers of
    ``axolotl.utils.data.rl`` are patched, and ``Dataset.filter`` is replaced
    by an identity function. The dedup and save mocks record their
    invocation in ``call_order`` so the relative order can be asserted.
    """
    from axolotl.utils.data.rl import _load_split

    dataset = Dataset.from_dict(
        {
            "prompt": ["hi", "bye", "hi"],
            "chosen": ["a", "b", "a"],
            "rejected": ["c", "d", "c"],
        }
    )
    deduped_dataset = Dataset.from_dict(
        {
            "prompt": ["hi", "bye"],
            "chosen": ["a", "b"],
            "rejected": ["c", "d"],
        }
    )

    mock_datasets_gen.return_value = [DictDefault({"path": "test", "type": None})]
    mock_load_dataset.return_value = dataset
    mock_merge.return_value = dataset
    mock_gen_hash.return_value = "testhash"

    tokenizer = MagicMock()
    tokenizer.name_or_path = "test-tokenizer"
    mock_load_tokenizer.return_value = tokenizer

    cfg = DictDefault(
        {
            "skip_prepare_dataset": False,
            "dataset_exact_deduplication": True,
            "sequence_len": 1024,
            "rl": "dpo",
            "datasets": [{"path": "test", "type": None}],
            "hf_use_auth_token": False,
            "dataset_num_proc": 1,
            "is_preprocess": False,
        }
    )

    # Track call order
    call_order = []

    def record_dedup(**kwargs):
        # A function side_effect supplies the mock's return value, so no
        # separate return_value needs to be configured for mock_dedup.
        call_order.append("dedup")
        return deduped_dataset, None

    def record_save(*args, **kwargs):
        call_order.append("save")

    mock_dedup.side_effect = record_dedup
    mock_save.side_effect = record_save

    _load_split(cfg, split="train")

    assert "dedup" in call_order, "Deduplication should have been called"
    assert "save" in call_order, "Save should have been called"
    assert call_order.index("dedup") < call_order.index("save"), (
        "Deduplication must occur before saving the dataset"
    )
import torch
from torch.optim import SGD

from axolotl.utils.schedulers import get_cosine_schedule_with_warmup_decay_constant


class TestCosineConstantLr(unittest.TestCase):
    """
    test class for get_cosine_schedule_with_warmup_decay_constant
    """

    def setUp(self):
        self.train_steps = 1000
        self.warmup_steps = 10
        self.min_lr_ratio = 0.1
        self.constant_lr_ratio = 0.8
        self._lr = 0.01
        # A single dummy tensor is enough: only the learning rate is inspected.
        self.optimizer = SGD([torch.tensor(1)], lr=self._lr)
        self.lr_scheduler = get_cosine_schedule_with_warmup_decay_constant(
            self.optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.train_steps,
            min_lr_ratio=self.min_lr_ratio,
            constant_lr_ratio=self.constant_lr_ratio,
        )

    def test_schedulers(self):
        """Check the LR at step 0, after warmup, and at/after the constant phase."""
        # Before any step the learning rate is expected to be 0.
        self.assertEqual(self.lr_scheduler.get_last_lr()[0], 0)
        for _ in range(self.warmup_steps):
            self.optimizer.step()
            self.lr_scheduler.step()
        # After the warmup steps the full base learning rate is expected.
        self.assertEqual(self.lr_scheduler.get_last_lr()[0], self._lr)
        constant_step = int(self.train_steps * self.constant_lr_ratio)
        remaining_step = self.train_steps - constant_step
        for _ in range(constant_step):
            self.optimizer.step()
            self.lr_scheduler.step()
        # By now the LR is expected to have reached lr * min_lr_ratio ...
        self.assertEqual(
            self.lr_scheduler.get_last_lr()[0], self._lr * self.min_lr_ratio
        )
        for _ in range(remaining_step):
            self.optimizer.step()
            self.lr_scheduler.step()
        # ... and to stay there for the remaining steps.
        self.assertEqual(
            self.lr_scheduler.get_last_lr()[0], self._lr * self.min_lr_ratio
        )


if __name__ == "__main__":
    unittest.main()

================================================
FILE: tests/test_streaming.py
================================================
"""Test streaming configuration and data loading functionality."""

import unittest
from unittest.mock import Mock, patch

from datasets import IterableDataset

from axolotl.utils.config import validate_config
from axolotl.utils.data.sft import (
    _prepare_streaming_dataset,
    prepare_datasets,
)
from axolotl.utils.dict import DictDefault


class TestStreamingConfig(unittest.TestCase):
    """Test streaming configuration and deprecation handling."""

    def test_streaming_multipack_buffer_size_deprecation(self):
        """Test
that pretrain_multipack_buffer_size is properly deprecated."""
        # Test with old config name
        cfg_old = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "pretrain_multipack_buffer_size": 5000,
                "datasets": [{"path": "test/dataset", "type": "alpaca"}],
                "sequence_len": 256,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "learning_rate": 0.0001,
            }
        )

        # The deprecation is expected to be reported as a WARNING on the
        # schema-validation logger.
        with self.assertLogs("axolotl.utils.schemas.validation", level="WARNING") as cm:
            validated_cfg = validate_config(cfg_old)
        self.assertIn("pretrain_multipack_buffer_size` is deprecated", cm.output[0])

        # The old value must be carried over to the new key and the old key dropped.
        self.assertEqual(validated_cfg.streaming_multipack_buffer_size, 5000)
        self.assertIsNone(
            getattr(validated_cfg, "pretrain_multipack_buffer_size", None)
        )

    def test_streaming_multipack_buffer_size_new(self):
        """Test that new streaming_multipack_buffer_size works correctly."""
        cfg_new = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "streaming_multipack_buffer_size": 7000,
                "datasets": [{"path": "test/dataset", "type": "alpaca"}],
                "sequence_len": 256,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "learning_rate": 0.0001,
            }
        )

        validated_cfg = validate_config(cfg_new)
        self.assertEqual(validated_cfg.streaming_multipack_buffer_size, 7000)

    def test_both_buffer_sizes_raises_error(self):
        """Test that having both old and new buffer size configs raises an error."""
        cfg_both = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "pretrain_multipack_buffer_size": 5000,
                "streaming_multipack_buffer_size": 7000,
                "datasets": [{"path": "test/dataset", "type": "alpaca"}],
                "sequence_len": 256,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "learning_rate": 0.0001,
            }
        )

        with self.assertRaises(ValueError) as cm:
            validate_config(cfg_both)
        # Inspect the message only after the context manager captured the error.
        self.assertIn("both are set", str(cm.exception))


class TestStreamingDatasetPreparation(unittest.TestCase):
    """Test dataset preparation with streaming configuration."""

    def setUp(self):
        self.tokenizer = Mock()
        self.tokenizer.pad_token_id = 0
self.tokenizer.eos_token_id = 1

    # In each test below the preparation helper is mocked, so only the routing
    # decision made by prepare_datasets is checked.
    @patch("axolotl.utils.data.sft._prepare_streaming_dataset")
    def test_prepare_datasets_with_streaming_true(self, mock_prepare_streaming):
        """Test that streaming=True triggers streaming dataset preparation."""
        cfg = DictDefault(
            {
                "streaming": True,
                "datasets": [{"path": "test/dataset", "type": "alpaca"}],
            }
        )

        mock_prepare_streaming.return_value = (Mock(), None, 100, [])

        prepare_datasets(cfg, self.tokenizer)

        mock_prepare_streaming.assert_called_once_with(cfg, self.tokenizer, None)

    @patch("axolotl.utils.data.sft._prepare_streaming_dataset")
    def test_prepare_datasets_with_pretraining_dataset(self, mock_prepare_streaming):
        """Test that pretraining_dataset triggers streaming dataset preparation."""
        cfg = DictDefault(
            {
                "pretraining_dataset": "test/dataset",
            }
        )

        mock_prepare_streaming.return_value = (Mock(), None, 100, [])

        prepare_datasets(cfg, self.tokenizer)

        mock_prepare_streaming.assert_called_once_with(cfg, self.tokenizer, None)

    @patch("axolotl.utils.data.sft._prepare_standard_dataset")
    def test_prepare_datasets_without_streaming(self, mock_prepare_standard):
        """Test that without streaming, standard dataset preparation is used."""
        cfg = DictDefault(
            {
                "datasets": [{"path": "test/dataset", "type": "alpaca"}],
            }
        )

        mock_prepare_standard.return_value = (Mock(), None, 100, [])

        prepare_datasets(cfg, self.tokenizer)

        mock_prepare_standard.assert_called_once_with(cfg, self.tokenizer, None)


class TestStreamingWithSamplePacking(unittest.TestCase):
    """Test streaming dataset preparation with sample packing."""

    def setUp(self):
        # Minimal tokenizer stand-in exposing only pad/eos token ids.
        self.tokenizer = Mock()
        self.tokenizer.pad_token_id = 0
        self.tokenizer.eos_token_id = 1

    @patch("axolotl.utils.data.sft._load_streaming_dataset")
    def test_streaming_sft_with_sample_packing_sets_split(self, mock_load_streaming):
        """Test that streaming SFT with sample_packing sets default split."""
        cfg = DictDefault(
            {
                "streaming": True,
                "sample_packing": True,
                "datasets": [{"path": "test/dataset", "type": "alpaca"}],
"sequence_len": 256,
                "micro_batch_size": 1,
            }
        )

        mock_load_streaming.return_value = Mock(spec=IterableDataset)

        with patch("axolotl.utils.data.sft._load_and_prepare_datasets"):
            _prepare_streaming_dataset(cfg, self.tokenizer, None)

            # Check that the dataset config has split set to 'train'
            call_args = mock_load_streaming.call_args
            # First positional argument of the recorded call.
            dataset_config = call_args[0][0]
            self.assertEqual(dataset_config.split, "train")

    def test_multipack_attn_forced_true_for_sft(self):
        """Test that multipack_attn is forced to True for SFT with sample packing."""
        from axolotl.utils.data.streaming import wrap_streaming_dataset

        cfg = DictDefault(
            {
                "sample_packing": True,
                "pretrain_multipack_attn": False,  # Should be overridden for SFT
                "pretraining_dataset": None,  # This makes it SFT
                "sequence_len": 256,
                "micro_batch_size": 1,
                "streaming_multipack_buffer_size": 1000,
                "seed": 42,
            }
        )

        mock_dataset = Mock()
        mock_dataset.features = None  # For streaming datasets
        mock_dataset.__iter__ = Mock(return_value=iter([]))  # Empty iterator
        mock_dataset.map = Mock(return_value=mock_dataset)
        mock_ds_wrapper = Mock()

        # The collator class is mocked so its constructor kwargs can be inspected.
        with patch(
            "axolotl.utils.data.streaming.PretrainingBatchSamplerDataCollatorForSeq2Seq"
        ) as mock_collator:
            with patch("axolotl.utils.data.streaming.encode_packed_streaming"):
                wrap_streaming_dataset(
                    mock_dataset, self.tokenizer, cfg, mock_ds_wrapper
                )

            # Check that multipack_attn=True was used in the collator
            mock_collator.assert_called_once()
            call_kwargs = mock_collator.call_args[1]
            self.assertTrue(call_kwargs["multipack_attn"])

    def test_multipack_attn_respects_config_for_pretraining(self):
        """Test that multipack_attn respects config for pretraining datasets."""
        from axolotl.utils.data.streaming import wrap_streaming_dataset

        cfg = DictDefault(
            {
                "sample_packing": True,
                "pretrain_multipack_attn": False,  # Should be respected for pretraining
                "pretraining_dataset": "test/dataset",  # This makes it pretraining
                "sequence_len": 256,
                "micro_batch_size": 1,
                "streaming_multipack_buffer_size": 1000,
                "seed": 42,
}
        )

        mock_dataset = Mock()
        mock_dataset.features = None  # For streaming datasets
        mock_dataset.__iter__ = Mock(return_value=iter([]))  # Empty iterator
        mock_dataset.map = Mock(return_value=mock_dataset)
        mock_ds_wrapper = Mock()

        # The collator class is mocked so its constructor kwargs can be inspected.
        with patch(
            "axolotl.utils.data.streaming.PretrainingBatchSamplerDataCollatorForSeq2Seq"
        ) as mock_collator:
            with patch("axolotl.utils.data.streaming.encode_packed_streaming"):
                wrap_streaming_dataset(
                    mock_dataset, self.tokenizer, cfg, mock_ds_wrapper
                )

            # Check that multipack_attn=False was used (respecting config)
            mock_collator.assert_called_once()
            call_kwargs = mock_collator.call_args[1]
            self.assertFalse(call_kwargs["multipack_attn"])


if __name__ == "__main__":
    unittest.main()

================================================
FILE: tests/test_tensor_parallel_batch_size.py
================================================
"""Tests for batch_size calculation with tensor parallelism."""

from unittest.mock import patch

import addict
import pytest

from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault


@pytest.fixture(name="tp_base_cfg")
def fixture_tp_base_cfg(min_base_cfg):
    """Merge fixed batch-size settings with the min_base_cfg fixture."""
    return (
        DictDefault(
            micro_batch_size=2,
            gradient_accumulation_steps=4,
            sequence_len=2048,
            num_epochs=1,
        )
        | min_base_cfg
    )


class TestTensorParallelBatchSize:
    """Verify batch_size scales by effective dp world_size when using tensor parallelism."""

    @pytest.mark.parametrize(
        "world_size, tensor_parallel_size, expected_batch_size",
        [
            (4, 1, 32),  # no TP: 2*4*4 = 32
            (4, 2, 16),  # TP=2: 2*4*(4//2) = 16
            (4, 4, 8),  # TP=4: 2*4*(4//4) = 8
            (2, 2, 8),  # TP=ws: 2*4*(2//2) = 8 (no scaling)
        ],
    )
    def test_batch_size_with_tensor_parallelism(
        self,
        tp_base_cfg,
        monkeypatch,
        world_size,
        tensor_parallel_size,
        expected_batch_size,
    ):
        # The world size is supplied through the WORLD_SIZE environment variable.
        monkeypatch.setenv("WORLD_SIZE", str(world_size))
        tp_base_cfg["tensor_parallel_size"] = tensor_parallel_size
        cfg = validate_config(tp_base_cfg)
        # Mock load_model_config to avoid downloading the model and to
bypass # the tie_word_embeddings validation that blocks TP > 1. with patch( "axolotl.utils.config.load_model_config", return_value=addict.Dict({"model_type": "llama"}), ): normalize_config(cfg) assert cfg.batch_size == expected_batch_size ================================================ FILE: tests/test_tokenizers.py ================================================ """ Test cases for the tokenizer loading """ import unittest import pytest from axolotl.loaders import load_tokenizer from axolotl.utils.dict import DictDefault from tests.hf_offline_utils import enable_hf_offline class TestTokenizers: """ test class for the load_tokenizer fn """ @pytest.mark.skip("LlamaTokenizer no longer has a Fast/Slow tokenizer") @enable_hf_offline def test_default_use_fast(self): cfg = DictDefault( { "tokenizer_config": "huggyllama/llama-7b", } ) tokenizer = load_tokenizer(cfg) assert "Fast" in tokenizer.__class__.__name__ @pytest.mark.skip("LlamaTokenizer no longer has a Fast/Slow tokenizer") @enable_hf_offline def test_dont_use_fast(self): cfg = DictDefault( { "tokenizer_config": "huggyllama/llama-7b", "tokenizer_use_fast": False, } ) tokenizer = load_tokenizer(cfg) assert "Fast" not in tokenizer.__class__.__name__ @enable_hf_offline def test_special_tokens_modules_to_save(self): # setting special_tokens to new token cfg = DictDefault( { "tokenizer_config": "huggyllama/llama-7b", "adapter": "lora", "special_tokens": {"bos_token": "[INST]"}, } ) with pytest.raises( ValueError, match=r".*Please set lora_modules_to_save*", ): load_tokenizer(cfg) # setting special_tokens but not changing from default cfg = DictDefault( { "tokenizer_config": "huggyllama/llama-7b", "adapter": "lora", "special_tokens": {"bos_token": ""}, } ) load_tokenizer(cfg) # non-adapter setting special_tokens cfg = DictDefault( { "tokenizer_config": "huggyllama/llama-7b", "special_tokens": {"bos_token": "[INST]"}, } ) load_tokenizer(cfg) @enable_hf_offline def test_add_additional_special_tokens(self): cfg = 
DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "special_tokens": {"additional_special_tokens": ["<|im_start|>"]},
            }
        )
        tokenizer = load_tokenizer(cfg)
        assert "LlamaTokenizer" in tokenizer.__class__.__name__
        # The added token is expected to take id 32000, growing the vocab to 32001.
        assert tokenizer("<|im_start|>user")["input_ids"] == [1, 32000, 1792]
        assert len(tokenizer) == 32001

        # ensure reloading the tokenizer again from cfg results in same vocab length
        tokenizer = load_tokenizer(cfg)
        assert len(tokenizer) == 32001

    @enable_hf_offline
    def test_added_tokens_overrides(self, temp_dir):
        """Overridden added-token ids should encode from and decode to the new text."""
        cfg = DictDefault(
            {
                # use with tokenizer that has reserved_tokens in added_tokens
                "tokenizer_config": "NousResearch/Llama-3.2-1B",
                "added_tokens_overrides": {
                    128041: "RANDOM_OVERRIDE_1",
                    128042: "RANDOM_OVERRIDE_2",
                },
                "output_dir": temp_dir,
            }
        )

        tokenizer = load_tokenizer(cfg)
        assert tokenizer.encode("RANDOM_OVERRIDE_1", add_special_tokens=False) == [
            128041
        ]
        assert tokenizer.encode("RANDOM_OVERRIDE_2", add_special_tokens=False) == [
            128042
        ]
        assert (
            tokenizer.decode([128041, 128042]) == "RANDOM_OVERRIDE_1RANDOM_OVERRIDE_2"
        )

    @pytest.mark.skip("FIXME slow test sdist py3.11 + torch2.8.0")
    @enable_hf_offline
    def test_added_tokens_overrides_gemma3(self, temp_dir):
        """Same override round-trip check, using a Gemma 3 tokenizer and its ids."""
        cfg = DictDefault(
            {
                # use with tokenizer that has reserved_tokens in added_tokens
                "tokenizer_config": "mlx-community/gemma-3-4b-it-8bit",
                "added_tokens_overrides": {
                    256001: "RANDOM_OVERRIDE_1",
                    256002: "RANDOM_OVERRIDE_2",
                },
                "output_dir": temp_dir,
            }
        )

        tokenizer = load_tokenizer(cfg)
        assert tokenizer.encode("RANDOM_OVERRIDE_1", add_special_tokens=False) == [
            256001
        ]
        assert tokenizer.encode("RANDOM_OVERRIDE_2", add_special_tokens=False) == [
            256002
        ]
        assert (
            tokenizer.decode([256001, 256002]) == "RANDOM_OVERRIDE_1RANDOM_OVERRIDE_2"
        )

    @enable_hf_offline
    def test_added_tokens_overrides_with_toolargeid(self, temp_dir):
        """An override for an id absent from added_tokens should raise ValueError."""
        cfg = DictDefault(
            {
                # use with tokenizer that has reserved_tokens in added_tokens
                "tokenizer_config": "HuggingFaceTB/SmolLM2-135M",
                "added_tokens_overrides": {1000000:
"BROKEN_RANDOM_OVERRIDE_1"},
                "output_dir": temp_dir,
            }
        )

        with pytest.raises(
            ValueError, match=r".*Token ID 1000000 not found in added_tokens.*"
        ):
            load_tokenizer(cfg)


if __name__ == "__main__":
    unittest.main()

================================================
FILE: tests/test_train.py
================================================
"""Test for batch size calculation for multi-gpu training."""

import pytest

from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault


@pytest.fixture(name="train_base_cfg")
def fixture_train_base_cfg(min_base_cfg):
    """Merge fixed batch-size/packing settings with the min_base_cfg fixture."""
    return (
        DictDefault(
            micro_batch_size=2,
            gradient_accumulation_steps=4,
            sequence_len=2048,
            sample_packing=True,
            num_epochs=1,
        )
        | min_base_cfg
    )


class TestTrain:
    """test class for train related tests"""

    # Expected values equal micro_batch_size (2) * gradient_accumulation_steps (4)
    # * world_size for both parametrized cases.
    @pytest.mark.parametrize(
        "world_size, expected_batch_size",
        [
            (1, 8),
            (4, 32),
        ],
    )
    def test_batch_size_ddp(
        self, train_base_cfg, monkeypatch, world_size, expected_batch_size
    ):
        monkeypatch.setenv("WORLD_SIZE", str(world_size))
        cfg = validate_config(train_base_cfg)
        normalize_config(cfg)
        assert cfg.batch_size == expected_batch_size

================================================
FILE: tests/test_triton_kernels.py
================================================
# Copyright 2026 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

"""Unit tests for Triton kernels: entropy_from_logits and selective_log_softmax.

Adapted from harness/test_entropy.py and harness/test_selective_logsoftmax.py
into proper pytest tests, plus new OOB index safety tests.
"""

import math

import pytest
import torch
import torch.nn.functional as F

# Module-level mark: every test in this file is skipped when CUDA is unavailable.
pytestmark = pytest.mark.skipif(
    not torch.cuda.is_available(), reason="CUDA required for Triton kernels"
)

# ---------------------------------------------------------------------------
# Reference implementations
# ---------------------------------------------------------------------------


def _ref_entropy(logits):
    """Reference entropy via log_softmax (numerically stable).

    The input is upcast to float32 and the entropy -sum(p * log p) is taken
    over the last dimension, which is removed from the result.
    """
    logp = F.log_softmax(logits.float(), dim=-1)
    return -(logp.exp() * logp).sum(dim=-1)


def _ref_selective_log_softmax(logits, index):
    """Reference selective log softmax via PyTorch gather.

    ``index`` may either have the same number of dimensions as ``logits`` or
    one fewer; in the latter case a trailing dimension is added for the gather
    and removed again from the result.
    """
    squeeze = index.ndim == logits.ndim - 1
    if squeeze:
        index = index.unsqueeze(-1)
    # Log-probabilities are computed in float32 regardless of the input dtype.
    log_probs = F.log_softmax(logits.float(), dim=-1)
    result = torch.gather(log_probs, dim=-1, index=index)
    if squeeze:
        result = result.squeeze(-1)
    return result


# ---------------------------------------------------------------------------
# entropy_from_logits
# ---------------------------------------------------------------------------


class TestEntropyFromLogits:
    # Compares entropy_from_logits against _ref_entropy on CUDA tensors.
    @pytest.mark.parametrize(
        "B,L",
        [
            (1, 128),
            (1, 2048),
            (4, 512),
            (8, 256),
            (1, 1),
        ],
    )
    def test_correctness_various_shapes(self, B, L):
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        V = 1024
        torch.manual_seed(42)
        logits = torch.randn(B, L, V, device="cuda", dtype=torch.float32)

        result = entropy_from_logits(logits)
        expected = _ref_entropy(logits)

        assert result.shape == (B, L)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_2d_input(self):
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        logits = torch.randn(16, 256, device="cuda", dtype=torch.float32)
        result = entropy_from_logits(logits)
        expected = _ref_entropy(logits)
        assert result.shape == (16,)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_large_vocab(self):
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        V = 32000
        logits =
torch.randn(2, V, device="cuda", dtype=torch.float32)
        result = entropy_from_logits(logits)
        expected = _ref_entropy(logits)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_uniform_distribution(self):
        """Uniform logits -> entropy = log(V)."""
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        V = 1024
        # All-zero logits give a uniform distribution over the V entries.
        logits = torch.zeros(2, V, device="cuda", dtype=torch.float32)
        result = entropy_from_logits(logits)
        expected_val = math.log(V)
        torch.testing.assert_close(
            result,
            torch.full((2,), expected_val, device="cuda", dtype=torch.float32),
            atol=1e-4,
            rtol=1e-4,
        )

    def test_peaked_distribution(self):
        """One-hot-like logits -> entropy near 0."""
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        logits = torch.full((2, 128), -100.0, device="cuda", dtype=torch.float32)
        logits[:, 0] = 100.0
        result = entropy_from_logits(logits)
        assert (result < 1e-3).all()

    def test_bfloat16(self):
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        logits = torch.randn(4, 256, device="cuda", dtype=torch.bfloat16)
        result = entropy_from_logits(logits)
        expected = _ref_entropy(logits.float())
        # Output dtype must follow the input; values are compared with a
        # looser tolerance than the float32 tests.
        assert result.dtype == torch.bfloat16
        torch.testing.assert_close(result.float(), expected, atol=5e-2, rtol=5e-2)

    def test_float16(self):
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        logits = torch.randn(4, 256, device="cuda", dtype=torch.float16)
        result = entropy_from_logits(logits)
        expected = _ref_entropy(logits.float())
        assert result.dtype == torch.float16
        torch.testing.assert_close(result.float(), expected, atol=5e-2, rtol=5e-2)

    def test_non_contiguous_3d_transpose(self):
        """Non-contiguous 3D tensor via transpose(0,1)."""
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        V = 256
        raw = torch.randn(32, 4, V, device="cuda", dtype=torch.float32)
        logits = raw.transpose(0, 1)  # (4, 32, V) non-contiguous
        assert not logits.is_contiguous()
        result = entropy_from_logits(logits)
        expected = _ref_entropy(logits)
torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_non_contiguous_3d_slice(self):
        """Non-contiguous 3D tensor via batch slicing."""
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        V = 256
        raw = torch.randn(8, 32, V, device="cuda", dtype=torch.float32)
        logits = raw[::2]  # (4, 32, V) non-contiguous
        assert not logits.is_contiguous()
        result = entropy_from_logits(logits)
        expected = _ref_entropy(logits)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_many_rows_beyond_max_grid(self):
        """More rows than MAX_GRID (8192) to test chunked dispatch."""
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        logits = torch.randn(10000, 128, device="cuda", dtype=torch.float32)
        result = entropy_from_logits(logits)
        expected = _ref_entropy(logits)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_entropy_non_negative(self):
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        logits = torch.randn(32, 512, device="cuda", dtype=torch.float32)
        result = entropy_from_logits(logits)
        # A small negative slack (-1e-5) is tolerated for rounding error.
        assert (result >= -1e-5).all(), f"Negative entropy: {result.min()}"


# ---------------------------------------------------------------------------
# selective_log_softmax — forward correctness
# ---------------------------------------------------------------------------


class TestSelectiveLogSoftmax:
    # Compares selective_log_softmax against _ref_selective_log_softmax.
    @pytest.mark.parametrize(
        "B,L,K",
        [
            (1, 128, 1),
            (4, 512, 1),
            (8, 256, 1),
            (4, 256, 4),
            (4, 256, 7),
            (15, 129, 1),  # non-power-of-2
        ],
    )
    def test_correctness_various_shapes(self, B, L, K):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 1024
        torch.manual_seed(42)
        logits = torch.randn(B, L, V, device="cuda", dtype=torch.float32)
        # K == 1 uses an index without the trailing dimension; K > 1 keeps it.
        if K == 1:
            index = torch.randint(0, V, (B, L), device="cuda")
        else:
            index = torch.randint(0, V, (B, L, K), device="cuda")

        result = selective_log_softmax(logits, index)
        expected = _ref_selective_log_softmax(logits, index)
torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_squeezed_index(self):
        """Index with ndim == logits.ndim - 1 triggers squeeze path."""
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 256
        logits = torch.randn(8, V, device="cuda", dtype=torch.float32)
        index = torch.randint(0, V, (8,), device="cuda")

        result = selective_log_softmax(logits, index)
        expected = _ref_selective_log_softmax(logits, index)

        assert result.shape == (8,)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_large_vocab(self):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 32000
        logits = torch.randn(2, V, device="cuda", dtype=torch.float32)
        index = torch.randint(0, V, (2, 1), device="cuda")

        result = selective_log_softmax(logits, index)
        expected = _ref_selective_log_softmax(logits, index)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_bfloat16(self):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 1024
        torch.manual_seed(42)
        logits = torch.randn(4, 128, V, device="cuda", dtype=torch.bfloat16)
        index = torch.randint(0, V, (4, 128), device="cuda")

        result = selective_log_softmax(logits, index)
        expected = _ref_selective_log_softmax(logits.float(), index)

        # Output dtype must follow the input; values use a loose tolerance.
        assert result.dtype == torch.bfloat16
        torch.testing.assert_close(result.float(), expected, atol=0.1, rtol=0.1)

    def test_fp32_tight_tolerance(self):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 1024
        torch.manual_seed(42)
        logits = torch.randn(2, 256, V, device="cuda", dtype=torch.float32)
        index = torch.randint(0, V, (2, 256), device="cuda")

        result = selective_log_softmax(logits, index)
        expected = _ref_selective_log_softmax(logits, index)
        # Tighter than the 1e-4 tolerance used by the other float32 tests.
        torch.testing.assert_close(result, expected, atol=1e-5, rtol=1e-5)

    def test_all_same_index(self):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 128
        logits = torch.randn(8, V, device="cuda", dtype=torch.float32)
        index =
torch.zeros(8, 1, device="cuda", dtype=torch.long)

        result = selective_log_softmax(logits, index)
        expected = _ref_selective_log_softmax(logits, index)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_last_index(self):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 128
        logits = torch.randn(8, V, device="cuda", dtype=torch.float32)
        # Every row selects the final vocabulary entry (V - 1).
        index = torch.full((8, 1), V - 1, device="cuda", dtype=torch.long)

        result = selective_log_softmax(logits, index)
        expected = _ref_selective_log_softmax(logits, index)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_output_always_nonpositive(self):
        """Log softmax values should always be <= 0."""
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 256
        logits = torch.randn(32, V, device="cuda", dtype=torch.float32)
        index = torch.randint(0, V, (32, 1), device="cuda")

        result = selective_log_softmax(logits, index)
        # A small positive slack (1e-5) is tolerated for rounding error.
        assert (result <= 1e-5).all(), f"Positive log-prob: {result.max()}"

    def test_many_rows_beyond_max_grid(self):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 128
        logits = torch.randn(10000, V, device="cuda", dtype=torch.float32)
        index = torch.randint(0, V, (10000, 1), device="cuda")

        result = selective_log_softmax(logits, index)
        expected = _ref_selective_log_softmax(logits, index)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)


# ---------------------------------------------------------------------------
# selective_log_softmax — backward / gradient correctness
# ---------------------------------------------------------------------------


class TestSelectiveLogSoftmaxBackward:
    # Compares gradients w.r.t. the logits against the PyTorch reference.
    @pytest.mark.parametrize(
        "B,L,V,K",
        [
            (2, 16, 64, 1),
            (2, 16, 64, 4),
            (1, 8, 128, 1),
            (2, 8, 128, 7),
        ],
    )
    def test_gradient_matches_reference(self, B, L, V, K):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        torch.manual_seed(42)
        logits_ref = torch.randn(
            B, L, V, device="cuda", dtype=torch.float32,
requires_grad=True
        )
        # Independent leaf copy so each implementation accumulates its own grad.
        logits_tri = logits_ref.detach().clone().requires_grad_(True)

        if K == 1:
            index = torch.randint(0, V, (B, L), device="cuda")
        else:
            index = torch.randint(0, V, (B, L, K), device="cuda")

        ref_out = _ref_selective_log_softmax(logits_ref, index)
        tri_out = selective_log_softmax(logits_tri, index)

        ref_out.sum().backward()
        tri_out.sum().backward()

        torch.testing.assert_close(
            logits_tri.grad, logits_ref.grad, atol=1e-5, rtol=1e-5
        )

    def test_gradient_bfloat16_full_vocab(self):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 4096
        torch.manual_seed(42)
        logits_ref = torch.randn(
            2, 64, V, device="cuda", dtype=torch.bfloat16, requires_grad=True
        )
        logits_tri = logits_ref.detach().clone().requires_grad_(True)
        index = torch.randint(0, V, (2, 64), device="cuda")

        _ref_selective_log_softmax(logits_ref, index).sum().backward()
        selective_log_softmax(logits_tri, index).sum().backward()

        # Gradients are upcast to float32 and compared with a loose tolerance.
        torch.testing.assert_close(
            logits_tri.grad.float(), logits_ref.grad.float(), atol=0.1, rtol=0.1
        )

    def test_gradient_k1_squeezed(self):
        """Gradient with squeezed (1D) index."""
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 256
        logits = torch.randn(
            8, V, device="cuda", dtype=torch.float32, requires_grad=True
        )
        index = torch.randint(0, V, (8,), device="cuda")

        result = selective_log_softmax(logits, index)
        result.sum().backward()
        triton_grad = logits.grad.clone()

        # Reset the accumulated gradient before running the reference backward
        # on the same leaf tensor.
        logits.grad = None
        ref = torch.gather(
            F.log_softmax(logits, dim=-1), dim=-1, index=index.unsqueeze(-1)
        ).squeeze(-1)
        ref.sum().backward()

        torch.testing.assert_close(triton_grad, logits.grad, atol=1e-4, rtol=1e-4)


# ---------------------------------------------------------------------------
# selective_log_softmax — out-of-bounds index safety
# ---------------------------------------------------------------------------


class TestSelectiveLogSoftmaxOOBSafety:
    """Verify that out-of-range indices don't crash or corrupt valid results."""

    def test_negative_indices_no_crash(self):
        from
axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 128
        logits = torch.randn(4, V, device="cuda", dtype=torch.float32)
        # Rows 0 and 3 carry negative indices; rows 1 and 2 are in range.
        index = torch.tensor(
            [[-1], [0], [V - 1], [-5]], device="cuda", dtype=torch.long
        )

        result = selective_log_softmax(logits, index)
        assert result.shape == (4, 1)
        # Valid rows should be finite and match reference
        valid_idx = torch.tensor([[0], [V - 1]], device="cuda", dtype=torch.long)
        valid_logits = logits[1:3]
        expected = _ref_selective_log_softmax(valid_logits, valid_idx)
        torch.testing.assert_close(result[1:3], expected, atol=1e-4, rtol=1e-4)

    def test_index_exceeds_vocab_no_crash(self):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 128
        logits = torch.randn(4, V, device="cuda", dtype=torch.float32)
        # Rows 1 and 2 carry indices >= V; rows 0 and 3 are in range.
        index = torch.tensor(
            [[0], [V], [V + 100], [V - 1]], device="cuda", dtype=torch.long
        )

        result = selective_log_softmax(logits, index)
        assert result.shape == (4, 1)
        # Valid rows (0 and 3) should match reference
        for row_idx, idx_val in [(0, 0), (3, V - 1)]:
            ref = _ref_selective_log_softmax(
                logits[row_idx : row_idx + 1],
                torch.tensor([[idx_val]], device="cuda", dtype=torch.long),
            )
            torch.testing.assert_close(
                result[row_idx : row_idx + 1], ref, atol=1e-4, rtol=1e-4
            )

    def test_mixed_valid_invalid_multi_index(self):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 256
        K = 3
        logits = torch.randn(4, V, device="cuda", dtype=torch.float32)
        index = torch.tensor(
            [
                [0, 10, -1],  # last invalid
                [V, 5, 100],  # first invalid
                [50, 60, 70],  # all valid
                [-1, V + 1, -100],  # all invalid
            ],
            device="cuda",
            dtype=torch.long,
        )

        result = selective_log_softmax(logits, index)
        assert result.shape == (4, K)
        # Row 2 (all valid) must match reference exactly
        valid_index = torch.tensor([[50, 60, 70]], device="cuda", dtype=torch.long)
        expected = _ref_selective_log_softmax(logits[2:3], valid_index)
        torch.testing.assert_close(result[2:3], expected, atol=1e-4, rtol=1e-4)

    def test_oob_backward_no_crash(self):
        """Backward with
OOB indices should not crash and grads should be finite."""
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 128
        logits = torch.randn(
            4, V, device="cuda", dtype=torch.float32, requires_grad=True
        )
        index = torch.tensor(
            [[-1], [0], [V + 10], [V - 1]], device="cuda", dtype=torch.long
        )

        result = selective_log_softmax(logits, index)
        result.sum().backward()

        assert logits.grad is not None
        assert torch.isfinite(logits.grad).all()

    def test_oob_backward_valid_rows_correct(self):
        """Gradients for valid-index rows should match reference even when other rows have OOB."""
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 128
        logits = torch.randn(
            4, V, device="cuda", dtype=torch.float32, requires_grad=True
        )
        # Row 0: invalid, Row 1: valid, Row 2: invalid, Row 3: valid
        index = torch.tensor(
            [[-1], [42], [V + 5], [100]], device="cuda", dtype=torch.long
        )

        result = selective_log_softmax(logits, index)
        result.sum().backward()

        # Compute reference gradient for valid rows only
        logits_ref = logits.detach().clone().requires_grad_(True)
        valid_rows = [1, 3]
        valid_indices = [42, 100]
        # One backward pass per valid row; the gradients accumulate in
        # logits_ref.grad.
        for r, idx in zip(valid_rows, valid_indices, strict=True):
            ref_lp = F.log_softmax(logits_ref[r : r + 1], dim=-1)
            ref_val = ref_lp[0, idx]
            ref_val.backward(retain_graph=True)

        for r in valid_rows:
            torch.testing.assert_close(
                logits.grad[r], logits_ref.grad[r], atol=1e-4, rtol=1e-4
            )

================================================
FILE: tests/test_utils_tee.py
================================================
import os
import tempfile


def _dummy_cfg(output_dir: str, append: bool = False):
    """Build a minimal config stand-in for the tee tests.

    The returned object exposes ``output_dir`` and a ``get`` method that
    returns ``append`` for the two resume-related keys and ``default`` for
    every other key.
    """

    # Minimal object with attributes used by prepare_debug_log
    class Cfg:
        def __init__(self, out, append):
            self.output_dir = out
            self._append = append

        def get(self, key, default=None):
            if key in {"resume_from_checkpoint", "auto_resume_from_checkpoints"}:
                return self._append
            return default

    return Cfg(output_dir, append)


def read(path: str) -> str:
    with open(path, "r", encoding="utf-8") as
def test_file_only_stream_writes_after_prepare(monkeypatch):
    """Check that ``tee.file_only_stream`` only reaches disk after prepare.

    Writing before ``prepare_debug_log`` must not create ``debug.log``; writing
    afterwards must land in the file returned by ``prepare_debug_log``.
    """
    from axolotl.utils import tee

    with tempfile.TemporaryDirectory() as td:
        # Avoid stdout tee in this test
        monkeypatch.setenv("AXOLOTL_TEE_STDOUT", "0")
        cfg = _dummy_cfg(td, append=False)

        # Before prepare: writing to file_only_stream creates no file.
        tee.file_only_stream.write("before\n")
        tee.file_only_stream.flush()
        assert not os.path.exists(os.path.join(td, "debug.log"))

        # Prepare and write. The log is closed in ``finally`` so a failing
        # assertion cannot leave it open for later tests or block removal of
        # the temporary directory.
        path = tee.prepare_debug_log(cfg)
        try:
            assert os.path.basename(path) == "debug.log"
            tee.file_only_stream.write("hello\n")
            tee.file_only_stream.flush()

            content = read(path)
            assert "hello" in content
        finally:
            tee.close_debug_log()
def test_dataset_default_chat_template_no_drop_param(self, minimal_cfg):
    """Dataset fields survive validation when no chat template is configured.

    The config is validated twice, once bare and once with explicit
    capabilities. Each time the top-level ``chat_template`` must be ``None``,
    the dataset-level one must be ``ChatTemplate.tokenizer_default``, and the
    remaining dataset fields must equal what was passed in.
    """
    puffin_dataset = {
        "path": "LDJnr/Puffin",
        "type": "chat_template",
        "field_messages": "conversations",
        "shards": 10,
        "message_field_role": "from",
        "message_field_content": "value",
    }
    cfg = DictDefault(minimal_cfg | {"datasets": [puffin_dataset]})

    passthrough_fields = (
        "path",
        "type",
        "field_messages",
        "shards",
        "message_field_role",
        "message_field_content",
    )
    capability_kwargs = {
        "capabilities": {
            "bf16": "false",
            "n_gpu": 1,
            "compute_capability": "8.0",
        },
        "env_capabilities": {
            "torch_version": "2.6.0",
        },
    }

    for extra_kwargs in ({}, capability_kwargs):
        validated = validate_config(cfg, **extra_kwargs)
        validated_ds = validated.datasets[0]

        assert validated.chat_template is None
        assert validated_ds.chat_template == ChatTemplate.tokenizer_default
        for field in passthrough_fields:
            assert getattr(validated_ds, field) == getattr(cfg.datasets[0], field)
def test_dataset_chatml_chat_template_no_drop_param(self, minimal_cfg):
    """Explicit top-level and dataset-level chat templates are both preserved.

    The top-level template is ``chatml`` and the dataset overrides it with
    ``gemma``. Validation runs with and without explicit capabilities, and all
    compared fields must equal the input config.
    """
    cfg = DictDefault(
        minimal_cfg
        | {
            "chat_template": "chatml",
            "datasets": [
                {
                    "path": "LDJnr/Puffin",
                    "type": "chat_template",
                    "chat_template": "gemma",
                    "field_messages": "conversations",
                    "shards": 10,
                    "message_field_role": "from",
                    "message_field_content": "value",
                }
            ],
        }
    )

    def assert_nothing_dropped(validated):
        expected_ds = cfg.datasets[0]
        actual_ds = validated.datasets[0]
        assert actual_ds.path == expected_ds.path
        assert actual_ds.type == expected_ds.type
        assert validated.chat_template == cfg.chat_template
        assert actual_ds.chat_template == expected_ds.chat_template
        assert actual_ds.field_messages == expected_ds.field_messages
        assert actual_ds.shards == expected_ds.shards
        assert actual_ds.message_field_role == expected_ds.message_field_role
        assert actual_ds.message_field_content == expected_ds.message_field_content

    assert_nothing_dropped(validate_config(cfg))
    assert_nothing_dropped(
        validate_config(
            cfg,
            capabilities={
                "bf16": "false",
                "n_gpu": 1,
                "compute_capability": "8.0",
            },
            env_capabilities={
                "torch_version": "2.6.0",
            },
        )
    )
class TestOptimizerValidation(BaseValidation):
    """
    Test muon optimizer validation

    Both cases extend the ``minimal_cfg`` fixture with ``optimizer: muon`` and
    expect ``validate_config`` to raise ``ValueError``.
    """

    def test_muon_deepspeed(self, minimal_cfg):
        """Muon combined with a DeepSpeed config must be rejected."""
        cfg = DictDefault(
            minimal_cfg
            | {
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    }
                ],
                "optimizer": "muon",
                "deepspeed": "deepspeed_configs/zero3.json",
            }
        )

        # The trailing wildcard has to be ``.*``; a bare ``*`` would only
        # quantify the final "h" of "with".
        with pytest.raises(ValueError, match=r".*is currently incompatible with.*"):
            validate_config(cfg)

    def test_muon_fsdp(self, minimal_cfg):
        """Muon combined with this ``fsdp``/``fsdp_config`` setup must be rejected."""
        cfg = DictDefault(
            minimal_cfg
            | {
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    }
                ],
                "optimizer": "muon",
                "fsdp": ["full_shard"],
                "fsdp_config": {
                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                },
            }
        )

        with pytest.raises(ValueError, match=r".*only compatible with FSDP2.*"):
            validate_config(cfg)
def test_callback_enabled_when_configured(self):
    """An enabled ``dynamic_checkpoint`` section switches the callback on.

    The configured ``check_interval`` must also show up on the callback.
    """
    checkpoint_settings = {"enabled": True, "check_interval": 10}
    with tempfile.TemporaryDirectory() as tmpdir:
        callback = DynamicCheckpointCallback(
            DictDefault(
                {
                    "dynamic_checkpoint": checkpoint_settings,
                    "output_dir": tmpdir,
                }
            )
        )

        assert callback.enabled is True
        assert callback.check_interval == 10
def test_no_file_no_trigger(self):
    """Without a trigger file, ``on_step_end`` must leave ``should_save`` False.

    The callback is enabled with ``check_interval=1`` and run for one step as
    the main process of a non-distributed job.
    """
    as_main_process = patch(
        "axolotl.utils.callbacks.dynamic_checkpoint.is_main_process",
        return_value=True,
    )
    not_distributed = patch(
        "axolotl.utils.callbacks.dynamic_checkpoint.is_distributed",
        return_value=False,
    )

    with tempfile.TemporaryDirectory() as tmpdir:
        callback = DynamicCheckpointCallback(
            DictDefault(
                {
                    "dynamic_checkpoint": {"enabled": True, "check_interval": 1},
                    "output_dir": tmpdir,
                }
            )
        )
        step_args = (
            Mock(output_dir=tmpdir),
            Mock(global_step=1),
            Mock(should_save=False),
        )

        with as_main_process, not_distributed:
            outcome = callback.on_step_end(*step_args)

        assert outcome.should_save is False
def test_broadcast_synchronization(self):
    """A trigger file seen by the main process leads to a save after broadcast.

    ``torch.tensor`` is replaced by a mock whose ``item()`` returns 1, and the
    distributed helpers are patched. The test asserts that broadcast and
    barrier were both called and that ``should_save`` is ``True``.
    """
    flag_tensor = MagicMock()
    flag_tensor.item.return_value = 1

    as_main_process = patch(
        "axolotl.utils.callbacks.dynamic_checkpoint.is_main_process",
        return_value=True,
    )
    as_distributed = patch(
        "axolotl.utils.callbacks.dynamic_checkpoint.is_distributed",
        return_value=True,
    )
    broadcast_patch = patch("torch.distributed.broadcast")
    barrier_patch = patch("axolotl.utils.callbacks.dynamic_checkpoint.barrier")
    tensor_patch = patch("torch.tensor", return_value=flag_tensor)
    device_patch = patch("torch.cuda.current_device", return_value=0)

    with tempfile.TemporaryDirectory() as tmpdir:
        callback = DynamicCheckpointCallback(
            DictDefault(
                {
                    "dynamic_checkpoint": {"enabled": True, "check_interval": 1},
                    "output_dir": tmpdir,
                }
            )
        )
        (Path(tmpdir) / DEFAULT_TRIGGER_FILENAME).touch()

        args = Mock(output_dir=tmpdir)
        state = Mock(global_step=1)
        control = Mock(should_save=False)

        # Main process sees the file; every patched helper is active for the call.
        with as_main_process, as_distributed, tensor_patch, device_patch:
            with broadcast_patch as mock_broadcast, barrier_patch as mock_barrier:
                result = callback.on_step_end(args, state, control)

        assert mock_broadcast.called
        assert mock_barrier.called
        # All ranks should trigger
        assert result.should_save is True
class TestDynamicCheckpointDisabled:
    """Checks for a callback built with ``enabled: False``"""

    def test_disabled_callback_does_nothing(self):
        """A disabled callback leaves the trigger file and ``should_save`` alone."""
        with tempfile.TemporaryDirectory() as tmpdir:
            disabled_cfg = DictDefault(
                {
                    "dynamic_checkpoint": {"enabled": False},
                    "output_dir": tmpdir,
                }
            )
            callback = DynamicCheckpointCallback(disabled_cfg)

            trigger_path = Path(tmpdir) / DEFAULT_TRIGGER_FILENAME
            trigger_path.touch()

            outcome = callback.on_step_end(
                Mock(output_dir=tmpdir),
                Mock(global_step=1),
                Mock(should_save=False),
            )

            # The file must still be present and no save requested.
            assert trigger_path.exists()
            assert outcome.should_save is False
def test_drop_strategy_removes_long_sequences(self):
    """The 'drop' strategy discards rows longer than ``sequence_len``.

    Rows of length 3, 5, 11 and 2 are processed with ``sequence_len=10``; only
    the 11-token row should disappear and the others keep their order.
    """
    token_rows = [
        [1, 2, 3],
        [1, 2, 3, 4, 5],
        list(range(1, 12)),  # 11 tokens, one more than sequence_len
        [1, 2],
    ]
    dataset = Dataset.from_dict({"input_ids": token_rows})
    cfg = DictDefault(
        {
            "excess_length_strategy": "drop",
            "min_sample_len": 2,
            "dataset_num_proc": None,
            "is_preprocess": False,
        }
    )

    result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

    # Three rows survive, in their original order.
    self.assertEqual(len(result), 3)
    for position, expected_length in enumerate((3, 5, 2)):
        self.assertEqual(len(result[position]["input_ids"]), expected_length)
def test_truncate_strategy_truncates_all_auxiliary_fields(self):
    """Truncation must shorten every per-token column, not just ``input_ids``.

    A single 12-token row with ``attention_mask``, ``labels`` and
    ``position_ids`` is truncated to ``sequence_len=10``; every column must end
    up holding its first ten values.
    """
    full_columns = {
        "input_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
        "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        "labels": [-100, -100, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
        "position_ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
    }
    dataset = Dataset.from_dict(
        {name: [values] for name, values in full_columns.items()}
    )
    cfg = DictDefault(
        {
            "excess_length_strategy": "truncate",
            "min_sample_len": 2,
            "dataset_num_proc": None,
            "is_preprocess": False,
        }
    )

    result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

    # Lengths first, for every column ...
    for name in full_columns:
        self.assertEqual(len(result[0][name]), 10)

    # ... then the exact content: the leading ten values of each column.
    for name, values in full_columns.items():
        self.assertEqual(result[0][name], values[:10])
def test_truncate_filters_short_before_truncating(self):
    """The 'truncate' strategy drops too-short rows and trims too-long ones.

    Rows of length 1, 3 and 12 are processed with ``min_sample_len=2`` and
    ``sequence_len=10``. The 1-token row must be gone, the 3-token row must be
    untouched, and the 12-token row must be cut to 10. Filtering short rows
    first avoids truncating rows that would be discarded anyway.
    """
    short_row = [1]
    normal_row = [1, 2, 3]
    long_row = list(range(1, 13))  # 12 tokens
    dataset = Dataset.from_dict({"input_ids": [short_row, normal_row, long_row]})

    cfg = DictDefault(
        {
            "excess_length_strategy": "truncate",
            "min_sample_len": 2,
            "dataset_num_proc": None,
            "is_preprocess": False,
        }
    )

    result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

    # Only the short row is filtered out.
    self.assertEqual(len(result), 2)
    surviving_lengths = [len(result[i]["input_ids"]) for i in range(2)]
    # 3-token row unchanged.
    self.assertEqual(surviving_lengths[0], 3)
    # 12-token row truncated to 10.
    self.assertEqual(surviving_lengths[1], 10)
def test_truncate_boundary_sequence_equal_to_sequence_len(self):
    """A row whose length equals ``sequence_len`` passes 'truncate' untouched."""
    boundary_row = list(range(1, 11))  # exactly 10 tokens == sequence_len
    dataset = Dataset.from_dict({"input_ids": [boundary_row]})
    cfg = DictDefault(
        {
            "excess_length_strategy": "truncate",
            "min_sample_len": 2,
            "dataset_num_proc": None,
            "is_preprocess": False,
        }
    )

    result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

    # The row is kept and its content is unchanged.
    self.assertEqual(len(result), 1)
    self.assertEqual(result[0]["input_ids"], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
def test_truncate_with_partial_auxiliary_fields(self):
    """Truncation works when only ``input_ids`` and ``labels`` are present.

    The dataset has no ``attention_mask`` or ``position_ids``. Both existing
    columns must be cut to ten values and no column may be added.
    """
    input_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    labels = [-100, -100, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    dataset = Dataset.from_dict({"input_ids": [input_ids], "labels": [labels]})

    cfg = DictDefault(
        {
            "excess_length_strategy": "truncate",
            "min_sample_len": 2,
            "dataset_num_proc": None,
            "is_preprocess": False,
        }
    )

    result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

    for column in ("input_ids", "labels"):
        self.assertEqual(len(result[0][column]), 10)

    self.assertEqual(result[0]["input_ids"], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    self.assertEqual(result[0]["labels"], [-100, -100, 3, 4, 5, 6, 7, 8, 9, 10])

    # Confirm no extra columns were introduced
    self.assertListEqual(sorted(result.column_names), ["input_ids", "labels"])
def test_invalid_strategy_falls_through_to_drop(self):
    """An unknown ``excess_length_strategy`` value is expected to act like 'drop'.

    With ``sequence_len=10`` the 11-token row must be removed and the 3-token
    row kept.
    """
    short_row = [1, 2, 3]
    oversized_row = list(range(1, 12))  # 11 tokens, should be dropped
    dataset = Dataset.from_dict({"input_ids": [short_row, oversized_row]})

    cfg = DictDefault(
        {
            "excess_length_strategy": "not_a_real_strategy",
            "min_sample_len": 2,
            "dataset_num_proc": None,
            "is_preprocess": False,
        }
    )

    result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

    # Same outcome as the 'drop' strategy: only the short row remains.
    self.assertEqual(len(result), 1)
    self.assertEqual(len(result[0]["input_ids"]), 3)
@pytest.mark.parametrize(
    "kernel_field", ["lora_mlp_kernel", "lora_qkv_kernel", "lora_o_kernel"]
)
def test_lora_kernels_trust_remote_code_incompatible(self, kernel_field):
    """Test that lora kernels are incompatible with trust_remote_code.

    Runs once per kernel flag: a LoRA config that sets ``kernel_field`` and
    ``trust_remote_code`` to ``True`` is expected to make ``validate_config``
    raise a ``ValueError`` mentioning the incompatibility.
    """
    # Construct the config outside of ``pytest.raises`` so the expectation
    # can only be met by ``validate_config`` itself.
    invalid_config = DictDefault(
        {
            "adapter": "lora",
            kernel_field: True,
            "trust_remote_code": True,
            "datasets": [{"path": "dummy_dataset", "type": "alpaca"}],
            "micro_batch_size": 1,
            "gradient_accumulation_steps": 1,
            "learning_rate": 1e-5,
            "base_model": "dummy_model",
        }
    )
    with pytest.raises(ValueError, match="not compatible with trust_remote_code"):
        validate_config(invalid_config)
def create_mock_lora_layer(
    self, has_adapters=True, adapters_disabled=False, merged=False
):
    """Create a mock LoRA layer for testing.

    Args:
        has_adapters: When true, return a wrapper exposing ``base_layer``,
            ``disable_adapters``, ``merged``, ``active_adapters``, ``lora_A``,
            ``lora_B`` and ``scaling`` for a single ``"default"`` adapter.
            When false, return a plain layer exposing only ``weight`` and
            ``bias``.
        adapters_disabled: Value stored in ``disable_adapters``
            (only used when ``has_adapters`` is true).
        merged: Value stored in ``merged`` (only used when ``has_adapters``
            is true).

    Returns:
        The mock layer. Base weight is ``(512, 256)``, bias ``(512,)``,
        LoRA A ``(16, 256)`` and LoRA B ``(512, 16)``, all in ``self.dtype``.
    """
    weight = torch.randn(512, 256, dtype=self.dtype)
    bias = torch.randn(512, dtype=self.dtype)

    if not has_adapters:
        # An unrestricted ``Mock`` fabricates every attribute on access, so
        # ``hasattr(layer, "base_layer")`` / ``"disable_adapters"`` would be
        # true and the fixture would not model an adapter-free layer at all.
        # Restricting the spec leaves exactly the two real attributes.
        plain_layer = Mock(spec=["weight", "bias"])
        plain_layer.weight = weight
        plain_layer.bias = bias
        return plain_layer

    base_layer = Mock()
    base_layer.weight = weight
    base_layer.bias = bias

    mock_layer = Mock()
    mock_layer.base_layer = base_layer
    mock_layer.disable_adapters = adapters_disabled
    mock_layer.merged = merged
    mock_layer.active_adapters = ["default"]
    mock_layer.lora_A = {"default": Mock()}
    mock_layer.lora_B = {"default": Mock()}
    mock_layer.scaling = {"default": 0.1}
    mock_layer.lora_A["default"].weight = torch.randn(16, 256, dtype=self.dtype)
    mock_layer.lora_B["default"].weight = torch.randn(512, 16, dtype=self.dtype)
    return mock_layer
def test_multiple_adapters_active_adapter_selection(self):
    """Check that the adapter named in ``active_adapters`` is the one used.

    A second adapter ("adapter2") with its own scaling and weights is added
    next to "default" and made the only active adapter; the scaling and the
    A/B weights returned by ``get_lora_parameters`` must be adapter2's.
    """
    second_adapter = "adapter2"
    layer = self.create_mock_lora_layer(
        has_adapters=True, adapters_disabled=False, merged=False
    )

    # Register the extra adapter alongside the existing "default" one.
    layer.lora_A[second_adapter] = Mock()
    layer.lora_B[second_adapter] = Mock()
    layer.scaling[second_adapter] = 0.2
    layer.lora_A[second_adapter].weight = torch.randn(16, 256, dtype=self.dtype)
    layer.lora_B[second_adapter].weight = torch.randn(512, 16, dtype=self.dtype)
    layer.active_adapters = [second_adapter]

    # Only the LoRA-specific outputs matter for this check.
    _, _, _, lora_a, lora_b, scale = get_lora_parameters(layer)

    assert scale == 0.2
    expected_a = layer.lora_A["adapter2"].weight
    expected_b = layer.lora_B["adapter2"].weight
    assert torch.equal(lora_a, expected_a)
    assert torch.equal(lora_b, expected_b)
@pytest.mark.skipif(
    not PEFT_AVAILABLE, reason="PEFT not available for integration tests"
)
def test_parameter_freezing_gradient_behavior(self):
    """Test that frozen parameters don't receive gradients.

    With adapters enabled, a backward pass must leave a gradient on at least
    one ``lora_`` parameter of the wrapped linear layer. After the adapters
    are disabled, a backward pass (run only if some parameter still requires
    grad) must leave no gradient on any ``lora_`` parameter.
    """
    from peft import LoraConfig, get_peft_model

    class SimpleModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = nn.Linear(256, 512)

        def forward(self, x):
            return self.linear(x)

    base_model = SimpleModel()
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["linear"],
        lora_dropout=0.1,
    )
    model = get_peft_model(base_model, lora_config)

    x = torch.randn(1, 256)
    target = torch.randn(1, 512)

    model.enable_adapter_layers()
    output = model(x)
    loss = nn.MSELoss()(output, target)
    loss.backward()

    lora_layer = model.base_model.model.linear

    def lora_params_have_grad():
        # True when any parameter whose name contains "lora_" holds a grad.
        return any(
            param.grad is not None
            for name, param in lora_layer.named_parameters()
            if "lora_" in name
        )

    assert lora_params_have_grad(), (
        "LoRA parameters should have gradients when adapters are enabled"
    )

    model.zero_grad()
    model.disable_adapter_layers()
    output = model(x)
    loss = nn.MSELoss()(output, target)

    # backward() is only attempted while something still requires grad;
    # otherwise there is no graph to differentiate.
    any_requires_grad = any(param.requires_grad for param in model.parameters())
    if any_requires_grad:
        loss.backward()
        assert not lora_params_have_grad(), (
            "LoRA parameters should not have gradients when adapters are disabled"
        )

    model.zero_grad()
    # Drop every reference before asking CUDA to release cached blocks.
    del lora_params_have_grad
    del model, base_model, lora_layer, x, target, output, loss
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
def test_basic_lora_merge_unmerge_cycle(self):
    """Merge then manually unmerge; the weights must return to the originals.

    After ``merge_and_unload`` the q/v projection weights must differ from
    the snapshots taken before merging. Subtracting the same LoRA delta
    ``(B @ A) * scaling`` again must reproduce the snapshots within 1e-6.
    """
    base = self.create_mock_base_model()
    wrapped = self.create_mock_lora_model(base)

    # Snapshots of the pre-merge weights recorded by the helper.
    snapshots = {
        "q_proj": self.original_q_weight.clone(),
        "v_proj": self.original_v_weight.clone(),
    }

    merged = wrapped.merge_and_unload()

    # Merging must actually have changed both projections.
    for proj_name, before in snapshots.items():
        assert not torch.equal(getattr(merged, proj_name).weight, before)

    deltas = {
        "q_proj": (self.lora_B_q @ self.lora_A_q) * self.scaling,
        "v_proj": (self.lora_B_v @ self.lora_A_v) * self.scaling,
    }

    # Undo the merge by hand and compare against the snapshots.
    for proj_name, before in snapshots.items():
        restored = getattr(merged, proj_name).weight - deltas[proj_name]
        assert torch.allclose(restored, before, atol=1e-6)
@patch("axolotl.cli.merge_lora.load_model_and_tokenizer")
def test_cli_do_merge_functionality(self, mock_load_model, tmp_path):
    """Run ``do_merge_lora`` against mocked loaders and check both saves.

    ``load_model_and_tokenizer`` is patched to return the mock LoRA model, a
    mock tokenizer and no processor. The merged model's and the tokenizer's
    ``save_pretrained`` must each be called exactly once.
    """
    merged_target = self.create_mock_base_model()
    adapter_model = self.create_mock_lora_model(merged_target)
    tok = Mock()
    mock_load_model.return_value = (adapter_model, tok, None)

    merge_cfg = DictDefault(
        {
            "save_safetensors": True,
            "torch_dtype": torch.float32,
            "local_rank": 0,
            "output_dir": str(tmp_path),
        }
    )

    # Directory creation is patched out; the two save hooks are captured.
    with patch("pathlib.Path.mkdir"):
        with patch.object(merged_target, "save_pretrained") as model_save:
            with patch.object(tok, "save_pretrained") as tokenizer_save:
                do_merge_lora(cfg=merge_cfg)

    model_save.assert_called_once()
    tokenizer_save.assert_called_once()
class TestActivationOffloading:
    """Schema-validation tests for the activation offloading options."""

    def test_gc_converts_offload_wo_lora(self, min_base_cfg):
        """``gradient_checkpointing="offload"`` must enable both flags."""
        overrides = DictDefault(gradient_checkpointing="offload")
        validated = validate_config(overrides | min_base_cfg)

        assert validated.gradient_checkpointing is True
        assert validated.activation_offloading is True

    def test_ac_offload_impl_noop_wo_adapter(self, min_base_cfg):
        """Explicitly enabled flags must survive validation unchanged."""
        overrides = DictDefault(
            gradient_checkpointing=True,
            activation_offloading=True,
        )
        validated = validate_config(overrides | min_base_cfg)

        assert validated.gradient_checkpointing is True
        assert validated.activation_offloading is True
def test_fsdp1_cpu_offload_pin_memory_not_supported(self, min_base_cfg):
    """FSDP version 1 must reject ``cpu_offload_pin_memory=False``.

    Even with ``offload_params`` enabled, validation is expected to raise a
    ``ValueError`` pointing the user to ``fsdp_version`` 2.
    """
    fsdp_settings = {
        "cpu_offload_pin_memory": False,
        "offload_params": True,
    }
    candidate = min_base_cfg | DictDefault(
        fsdp_config=fsdp_settings,
        fsdp_version=1,
    )

    expected_message = "FSDP1 does not support disabling cpu_offload_pin_memory, please set `fsdp_version` to 2"
    with pytest.raises(ValueError, match=expected_message):
        validate_config(candidate)
def test_requires_quantization(self, min_base_cfg, gpu_caps, env_caps):
    """quantize_moe_experts with an adapter but no 4bit/8bit load must fail."""
    overrides = DictDefault(
        quantize_moe_experts=True,
        adapter="lora",
    )
    candidate = overrides | min_base_cfg

    expected_message = "requires load_in_4bit or load_in_8bit"
    with pytest.raises(ValueError, match=expected_message):
        validate_config(
            candidate, capabilities=gpu_caps, env_capabilities=env_caps
        )
class TestPeftPatchIdempotency:
    """Test that patch_peft_target_parameters_matching is idempotent."""

    def test_double_call_does_not_stack_wrappers(self):
        """Calling patch twice should not double-wrap _inject_parameters.

        The function installed on ``BaseTuner._inject_parameters`` after the
        first call must be the very same object after the second call.
        """
        from peft.tuners.tuners_utils import BaseTuner

        from axolotl.monkeypatch.moe_quant import (
            patch_peft_target_parameters_matching,
        )

        original = BaseTuner._inject_parameters
        # Remember the marker as it was on entry. Unconditionally resetting it
        # to False while restoring ``original`` would, if the patch had
        # already been applied earlier in the session, leave a patched
        # function flagged as unpatched and invite a double wrap later on.
        previous_flag = getattr(
            patch_peft_target_parameters_matching, "_axolotl_patched", False
        )

        try:
            patch_peft_target_parameters_matching()
            first_patched = BaseTuner._inject_parameters

            patch_peft_target_parameters_matching()
            second_patched = BaseTuner._inject_parameters

            # Should be same function, not double-wrapped
            assert first_patched is second_patched
        finally:
            # Put both the method and its marker back exactly as found.
            BaseTuner._inject_parameters = original
            patch_peft_target_parameters_matching._axolotl_patched = previous_flag
@staticmethod
def _make_quantized_model():
    """Build the training-side model with parametrized expert weights.

    The expert parameter names are recorded in ``_moe_load_state`` in their
    definition order, and a pass-through parametrization is then registered
    on ``down_proj`` before ``gate_up_proj`` (alphabetical order) so the two
    orders disagree.
    """
    import torch.nn as nn
    import torch.nn.utils.parametrize as P

    from axolotl.monkeypatch.moe_quant import _moe_load_state

    fake_model_cls = TestMoeAdapterTrainMergeRoundtrip._make_classes()[1]

    class PassthroughParametrization(nn.Module):
        def forward(self, x):
            return x

    model = fake_model_cls()
    experts = model.experts

    # Record definition order before parametrization (mirrors real loading).
    definition_order = list(experts._parameters)
    _moe_load_state["expert_param_order"]["experts"] = definition_order

    # Register in alphabetical order to expose the ordering mismatch.
    for param_name in ("down_proj", "gate_up_proj"):
        P.register_parametrization(
            experts, param_name, PassthroughParametrization(), unsafe=True
        )

    return model
def test_get_rollout_func_loads_successfully():
    """A dotted path to an importable callable must resolve to that callable."""
    dotted_path = "os.path.join"
    expected = os.path.join

    resolved = GRPOStrategy.get_rollout_func(dotted_path)

    assert callable(resolved)
    assert resolved == expected
class TestMistral3ProcessorInit:
    """Attribute checks on a freshly constructed processor fixture."""

    def test_tokenizer_is_set(self, processor, mock_tokenizer):
        """The processor must hold the exact tokenizer object it was given."""
        stored_tokenizer = processor.tokenizer
        assert stored_tokenizer is mock_tokenizer

    def test_chat_template_is_none(self, processor):
        """``chat_template`` is expected to be ``None`` on the fixture."""
        template = processor.chat_template
        assert template is None

    def test_audio_tokenizer_is_none(self, processor):
        """``audio_tokenizer`` is expected to be ``None`` on the fixture."""
        audio_tok = processor.audio_tokenizer
        assert audio_tok is None
return_dict=True ) assert isinstance(result, BatchFeature) assert "pixel_values" in result assert "image_sizes" in result assert result["pixel_values"].dtype == torch.float32 assert result["image_sizes"].shape == (2, 2) assert result["image_sizes"][0].tolist() == [224, 224] def test_returns_batch_feature_without_pixel_values( self, processor, mock_tokenizer, batched_conversations ): mock_tokenizer.apply_chat_template.return_value = { "input_ids": torch.tensor([[1, 2, 3], [4, 5, 6]]), "attention_mask": torch.tensor([[1, 1, 1], [1, 1, 1]]), } result = processor.apply_chat_template( batched_conversations, tokenize=True, return_dict=True ) assert isinstance(result, BatchFeature) assert "input_ids" in result assert "image_sizes" not in result class TestApplyChatTemplateNotTokenized: def test_single_conversation_returns_unwrapped(self, processor, mock_tokenizer): """Single conversation (not batched) should return unwrapped result.""" single_conversation = [ {"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi"}, ] mock_tokenizer.apply_chat_template.return_value = [ "[INST]Hello[/INST]Hi" ] result = processor.apply_chat_template( single_conversation, tokenize=False, return_dict=False ) assert result == "[INST]Hello[/INST]Hi" def test_batched_conversations_returns_list(self, processor, mock_tokenizer): batched = [ [ {"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi"}, ], [ {"role": "user", "content": "Bye"}, {"role": "assistant", "content": "Bye"}, ], ] mock_tokenizer.apply_chat_template.return_value = ["text1", "text2"] result = processor.apply_chat_template( batched, tokenize=False, return_dict=False ) assert result == ["text1", "text2"] class TestCall: def test_delegates_to_tokenizer(self, processor, mock_tokenizer): mock_tokenizer.return_value = { "input_ids": [1, 2, 3], "attention_mask": [1, 1, 1], } result = processor("Hello world") mock_tokenizer.assert_called_once() assert isinstance(result, BatchFeature) class 
TestReturnTensorsValidation: def test_rejects_non_pt_return_tensors(self, processor): conversation = [ {"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi"}, ] with pytest.raises(ValueError, match=r"only supports.*return_tensors='pt'"): processor.apply_chat_template( conversation, tokenize=True, return_dict=True, return_tensors="np" ) ================================================ FILE: tests/utils/test_train.py ================================================ """test for train checkpoint utils""" import os from axolotl.utils.dict import DictDefault from axolotl.utils.train import determine_last_checkpoint def test_determine_last_checkpoint(temp_dir): cfg = DictDefault( output_dir=temp_dir, ) for cpt_idx in [1, 9, 10, 20]: os.makedirs( os.path.join(cfg.output_dir, f"checkpoint-{cpt_idx}"), exist_ok=True ) last_checkpoint = determine_last_checkpoint(cfg, update=False) assert last_checkpoint == os.path.join(cfg.output_dir, "checkpoint-20") cfg.resume_from_checkpoint = None cfg.auto_resume_from_checkpoints = True determine_last_checkpoint(cfg, update=True) assert cfg.resume_from_checkpoint == os.path.join(cfg.output_dir, "checkpoint-20")