gitextract_1sp7sr39/

├── .axolotl-complete.bash
├── .bandit
├── .coderabbit.yaml
├── .coveragerc
├── .editorconfig
├── .gitattributes
├── .github/
│   ├── CODE_OF_CONDUCT.md
│   ├── CONTRIBUTING.md
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug-report.yaml
│   │   ├── config.yml
│   │   ├── docs.yml
│   │   └── feature-request.yaml
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── SECURITY.md
│   ├── SUPPORT.md
│   ├── release-drafter.yml
│   └── workflows/
│       ├── base.yml
│       ├── docs.yml
│       ├── lint.yml
│       ├── main.yml
│       ├── multi-gpu-e2e.yml
│       ├── nightlies.yml
│       ├── precommit-autoupdate.yml
│       ├── preview-docs.yml
│       ├── pypi.yml
│       ├── tests-nightly.yml
│       └── tests.yml
├── .gitignore
├── .mypy.ini
├── .pre-commit-config.yaml
├── .runpod/
│   ├── .gitignore
│   ├── Dockerfile
│   ├── README.md
│   ├── hub.json
│   ├── requirements.txt
│   ├── src/
│   │   ├── config/
│   │   │   └── config.yaml
│   │   ├── handler.py
│   │   ├── test_input.json
│   │   ├── train.py
│   │   └── utils.py
│   ├── test-input.json
│   └── tests.json
├── CITATION.cff
├── CNAME
├── FAQS.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── VERSION
├── _quarto.yml
├── benchmarks/
│   ├── bench_entropy.py
│   ├── bench_scattermoe_lora.py
│   └── bench_selective_logsoftmax.py
├── cicd/
│   ├── Dockerfile-uv.jinja
│   ├── Dockerfile.jinja
│   ├── __init__.py
│   ├── cicd.sh
│   ├── cleanup.py
│   ├── cleanup.sh
│   ├── e2e_tests.py
│   ├── multigpu.py
│   ├── multigpu.sh
│   └── single_gpu.py
├── codecov.yml
├── deepspeed_configs/
│   ├── zero1.json
│   ├── zero1_torch_compile.json
│   ├── zero2.json
│   ├── zero2_torch_compile.json
│   ├── zero3.json
│   ├── zero3_bf16.json
│   ├── zero3_bf16_cpuoffload_all.json
│   └── zero3_bf16_cpuoffload_params.json
├── devtools/
│   ├── README.md
│   └── dev_chat_template.yml
├── docker/
│   ├── Dockerfile
│   ├── Dockerfile-base
│   ├── Dockerfile-base-next
│   ├── Dockerfile-base-nightly
│   ├── Dockerfile-cloud
│   ├── Dockerfile-cloud-no-tmux
│   ├── Dockerfile-cloud-uv
│   ├── Dockerfile-tests
│   ├── Dockerfile-uv
│   └── Dockerfile-uv-base
├── docker-compose.yaml
├── docs/
│   ├── .gitignore
│   ├── amd_hpc.qmd
│   ├── attention.qmd
│   ├── batch_vs_grad.qmd
│   ├── checkpoint_saving.qmd
│   ├── cli.qmd
│   ├── custom_integrations.qmd
│   ├── dataset-formats/
│   │   ├── conversation.qmd
│   │   ├── index.qmd
│   │   ├── inst_tune.qmd
│   │   ├── pretraining.qmd
│   │   ├── stepwise_supervised.qmd
│   │   ├── template_free.qmd
│   │   └── tokenized.qmd
│   ├── dataset_loading.qmd
│   ├── dataset_preprocessing.qmd
│   ├── debugging.qmd
│   ├── docker.qmd
│   ├── expert_quantization.qmd
│   ├── faq.qmd
│   ├── fsdp_qlora.qmd
│   ├── getting-started.qmd
│   ├── gradient_checkpointing.qmd
│   ├── inference.qmd
│   ├── input_output.qmd
│   ├── installation.qmd
│   ├── lora_optims.qmd
│   ├── lr_groups.qmd
│   ├── mac.qmd
│   ├── mixed_precision.qmd
│   ├── multi-gpu.qmd
│   ├── multi-node.qmd
│   ├── multimodal.qmd
│   ├── multipack.qmd
│   ├── nccl.qmd
│   ├── nd_parallelism.qmd
│   ├── optimizations.qmd
│   ├── optimizers.qmd
│   ├── qat.qmd
│   ├── quantize.qmd
│   ├── ray-integration.qmd
│   ├── reward_modelling.qmd
│   ├── rlhf.qmd
│   ├── scripts/
│   │   ├── examples-allowlist.yml
│   │   ├── generate_config_docs.py
│   │   └── generate_examples_docs.py
│   ├── sequence_parallelism.qmd
│   ├── streaming.qmd
│   ├── telemetry.qmd
│   ├── torchao.qmd
│   └── unsloth.qmd
├── examples/
│   ├── LiquidAI/
│   │   ├── README.md
│   │   ├── lfm2-350m-fft.yaml
│   │   ├── lfm2-8b-a1b-lora.yaml
│   │   └── lfm2-vl-lora.yaml
│   ├── alst/
│   │   ├── README.md
│   │   ├── llama3-8b-deepspeed-alst.yaml
│   │   └── llama3-8b-fsdp2-alst.yaml
│   ├── apertus/
│   │   ├── README.md
│   │   └── apertus-8b-qlora.yaml
│   ├── arcee/
│   │   ├── README.md
│   │   └── afm-4.5b-qlora.yaml
│   ├── archived/
│   │   ├── README.md
│   │   ├── cerebras/
│   │   │   ├── btlm-ft.yml
│   │   │   └── qlora.yml
│   │   ├── code-llama/
│   │   │   ├── 13b/
│   │   │   │   ├── lora.yml
│   │   │   │   └── qlora.yml
│   │   │   ├── 34b/
│   │   │   │   ├── lora.yml
│   │   │   │   └── qlora.yml
│   │   │   ├── 7b/
│   │   │   │   ├── lora.yml
│   │   │   │   └── qlora.yml
│   │   │   └── README.md
│   │   ├── dbrx/
│   │   │   ├── 16bit-lora.yaml
│   │   │   ├── 8bit-lora.yaml
│   │   │   ├── README.md
│   │   │   └── fft-ds-zero3.yaml
│   │   ├── deepcoder/
│   │   │   └── deepcoder-14B-preview-lora.yml
│   │   ├── falcon/
│   │   │   ├── config-7b-lora.yml
│   │   │   ├── config-7b-qlora.yml
│   │   │   └── config-7b.yml
│   │   ├── gemma/
│   │   │   └── qlora.yml
│   │   ├── gptj/
│   │   │   └── qlora.yml
│   │   ├── jeopardy-bot/
│   │   │   └── config.yml
│   │   ├── mpt-7b/
│   │   │   ├── README.md
│   │   │   └── config.yml
│   │   ├── openllama-3b/
│   │   │   ├── README.md
│   │   │   ├── config.yml
│   │   │   ├── lora.yml
│   │   │   └── qlora.yml
│   │   ├── pythia/
│   │   │   └── lora.yml
│   │   ├── pythia-12b/
│   │   │   ├── README.md
│   │   │   └── config.yml
│   │   ├── qwen/
│   │   │   ├── README.md
│   │   │   ├── lora.yml
│   │   │   ├── qlora.yml
│   │   │   ├── qwen2-moe-lora.yaml
│   │   │   └── qwen2-moe-qlora.yaml
│   │   ├── redpajama/
│   │   │   ├── README.md
│   │   │   └── config-3b.yml
│   │   ├── replit-3b/
│   │   │   └── config-lora.yml
│   │   ├── stablelm-2/
│   │   │   ├── 1.6b/
│   │   │   │   ├── fft.yml
│   │   │   │   └── lora.yml
│   │   │   └── README.md
│   │   ├── starcoder2/
│   │   │   └── qlora.yml
│   │   ├── tiny-llama/
│   │   │   ├── README.md
│   │   │   ├── lora-mps.yml
│   │   │   ├── lora.yml
│   │   │   ├── pretrain.yml
│   │   │   └── qlora.yml
│   │   ├── xgen-7b/
│   │   │   └── xgen-7b-8k-qlora.yml
│   │   └── yi-34B-chat/
│   │       ├── README.md
│   │       └── qlora.yml
│   ├── cloud/
│   │   ├── baseten.yaml
│   │   └── modal.yaml
│   ├── cohere/
│   │   └── command-r-7b-qlora.yml
│   ├── colab-notebooks/
│   │   └── colab-axolotl-example.ipynb
│   ├── deepcogito/
│   │   ├── cogito-v1-preview-llama-3B-lora.yml
│   │   └── cogito-v1-preview-qwen-14B-lora.yml
│   ├── deepseek-v2/
│   │   ├── fft-fsdp-16b.yaml
│   │   └── qlora-fsdp-2_5.yaml
│   ├── devstral/
│   │   ├── README.md
│   │   └── devstral-small-qlora.yml
│   ├── distributed-parallel/
│   │   ├── README.md
│   │   ├── llama-3_1-8b-hsdp-tp.yaml
│   │   └── qwen3-8b-fsdp-tp-cp.yaml
│   ├── eaft/
│   │   └── eaft-example.yml
│   ├── falcon-h1/
│   │   ├── falcon-h1-1b-deep-qlora.yaml
│   │   ├── falcon-h1-1b-qlora.yaml
│   │   ├── falcon-h1-34b-qlora.yaml
│   │   ├── falcon-h1-3b-qlora.yaml
│   │   ├── falcon-h1-500m-qlora.yaml
│   │   └── falcon-h1-7b-qlora.yaml
│   ├── gemma2/
│   │   ├── qlora.yml
│   │   └── reward-model.yaml
│   ├── gemma3/
│   │   ├── gemma-3-1b-qlora.yml
│   │   ├── gemma-3-270m-qlora.yml
│   │   ├── gemma-3-4b-qlora.yml
│   │   └── gemma-3-4b-vision-qlora.yml
│   ├── gemma3n/
│   │   ├── README.md
│   │   ├── gemma-3n-e2b-qlora.yml
│   │   ├── gemma-3n-e2b-vision-audio-qlora.yml
│   │   └── gemma-3n-e2b-vision-qlora.yml
│   ├── glm4/
│   │   └── qlora-32b.yaml
│   ├── glm45/
│   │   ├── README.md
│   │   └── glm-45-air-qlora.yaml
│   ├── glm46v/
│   │   ├── README.md
│   │   ├── glm-4-6v-flash-ddp.yaml
│   │   └── glm-4-6v-flash-qlora.yaml
│   ├── glm47-flash/
│   │   ├── README.md
│   │   ├── lora.yaml
│   │   ├── lora_fsdp.yaml
│   │   ├── qlora.yaml
│   │   └── qlora_fsdp.yaml
│   ├── gpt-oss/
│   │   ├── README.md
│   │   ├── gpt-oss-120b-fft-fsdp2-offload.yaml
│   │   ├── gpt-oss-20b-fft-deepspeed-zero3.yaml
│   │   ├── gpt-oss-20b-fft-fsdp2-offload.yaml
│   │   ├── gpt-oss-20b-fft-fsdp2.yaml
│   │   ├── gpt-oss-20b-sft-lora-singlegpu.yaml
│   │   └── gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml
│   ├── granite4/
│   │   ├── README.md
│   │   └── granite-4.0-tiny-fft.yaml
│   ├── hunyuan/
│   │   ├── README.md
│   │   └── hunyuan-v1-dense-qlora.yaml
│   ├── internvl3_5/
│   │   ├── README.md
│   │   └── internvl3_5-8b-qlora.yml
│   ├── jamba/
│   │   ├── README.md
│   │   ├── qlora.yaml
│   │   ├── qlora_deepspeed.yaml
│   │   └── qlora_fsdp_large.yaml
│   ├── kimi-linear/
│   │   ├── README.md
│   │   └── kimi-48b-lora.yaml
│   ├── llama-2/
│   │   ├── README.md
│   │   ├── fft_optimized.yml
│   │   ├── gptq-lora.yml
│   │   ├── lisa.yml
│   │   ├── loftq.yml
│   │   ├── lora.yml
│   │   ├── qlora-fsdp.yml
│   │   ├── qlora.yml
│   │   └── relora.yml
│   ├── llama-3/
│   │   ├── 3b-fp8-fsdp2.yaml
│   │   ├── 3b-qat-fsdp2.yaml
│   │   ├── 3b-qat-mxfp4.yaml
│   │   ├── 3b-qat-nvfp4.yaml
│   │   ├── README.md
│   │   ├── diffusion/
│   │   │   ├── pretrain-1b.yaml
│   │   │   └── sft-1b.yaml
│   │   ├── fft-8b-liger-fsdp.yaml
│   │   ├── fft-8b.yaml
│   │   ├── instruct-dpo-lora-8b.yml
│   │   ├── instruct-lora-8b.yml
│   │   ├── lora-1b-deduplicate-dpo.yml
│   │   ├── lora-1b-deduplicate-sft.yml
│   │   ├── lora-1b-kernels.yml
│   │   ├── lora-1b-ray.yml
│   │   ├── lora-1b-sample-packing-sequentially.yml
│   │   ├── lora-1b.yml
│   │   ├── lora-8b.yml
│   │   ├── opentelemetry-qlora.yml
│   │   ├── qlora-1b-gdpo.yaml
│   │   ├── qlora-1b-kto.yaml
│   │   ├── qlora-1b.yml
│   │   ├── qlora-fsdp-405b.yaml
│   │   ├── qlora-fsdp-70b.yaml
│   │   ├── qlora.yml
│   │   └── sparse-finetuning.yaml
│   ├── llama-3-vision/
│   │   └── lora-11b.yaml
│   ├── llama-4/
│   │   ├── README.md
│   │   ├── do-no-use-fa2/
│   │   │   ├── maverick-qlora-fsdp1.yaml
│   │   │   ├── scout-qlora-fsdp1.yaml
│   │   │   ├── scout-qlora-single-h100.yaml
│   │   │   └── scout-vision-qlora-fsdp.yaml
│   │   ├── scout-qlora-flexattn-fsdp2.yaml
│   │   ├── scout-qlora-single-h100-flex.yaml
│   │   └── scout-vision-qlora-fsdp2-flex.yaml
│   ├── llava/
│   │   └── lora-7b.yaml
│   ├── magistral/
│   │   ├── README.md
│   │   ├── magistral-small-fsdp-qlora.yaml
│   │   ├── magistral-small-qlora.yaml
│   │   ├── think/
│   │   │   ├── README.md
│   │   │   └── magistral-small-think-qlora.yaml
│   │   └── vision/
│   │       ├── README.md
│   │       └── magistral-small-vision-24B-qlora.yml
│   ├── mamba/
│   │   └── config.yml
│   ├── mimo/
│   │   ├── README.md
│   │   └── mimo-7b-qlora.yaml
│   ├── ministral/
│   │   ├── README.md
│   │   └── ministral-small-qlora.yaml
│   ├── ministral3/
│   │   ├── README.md
│   │   ├── ministral3-3b-qlora.yaml
│   │   ├── think/
│   │   │   ├── README.md
│   │   │   └── ministral3-3b-think-qlora.yaml
│   │   └── vision/
│   │       ├── README.md
│   │       └── ministral3-3b-vision-qlora.yml
│   ├── mistral/
│   │   ├── README.md
│   │   ├── bigstral/
│   │   │   └── bigstral-ds-zero3.yaml
│   │   ├── config.yml
│   │   ├── dpo/
│   │   │   └── mistral-dpo-qlora.yml
│   │   ├── lora.yml
│   │   ├── mistral-qlora-fsdp.yml
│   │   ├── mixtral/
│   │   │   ├── mixtral-8x22b-qlora-fsdp.yml
│   │   │   ├── mixtral-qlora-fsdp.yml
│   │   │   ├── mixtral.yml
│   │   │   └── mixtral_22.yml
│   │   ├── mps/
│   │   │   └── lora-mps.yml
│   │   ├── orpo/
│   │   │   └── mistral-qlora-orpo.yml
│   │   └── qlora.yml
│   ├── mistral-small/
│   │   ├── README.md
│   │   └── mistral-small-3.1-24B-lora.yml
│   ├── mistral4/
│   │   ├── README.md
│   │   ├── fft-text.yml
│   │   ├── fft-vision.yml
│   │   ├── qlora-text.yml
│   │   └── qlora-vision.yml
│   ├── nemotron/
│   │   └── nemotron-mini-4b-qlora.yaml
│   ├── olmo3/
│   │   ├── README.md
│   │   └── olmo3-7b-qlora.yaml
│   ├── orpheus/
│   │   ├── README.md
│   │   └── finetune.yml
│   ├── phi/
│   │   ├── README.md
│   │   ├── lora-3.5.yaml
│   │   ├── phi-ft.yml
│   │   ├── phi-qlora.yml
│   │   ├── phi2-ft.yml
│   │   ├── phi3-ft-fsdp.yml
│   │   └── phi3-ft.yml
│   ├── pixtral/
│   │   └── lora-12b.yml
│   ├── plano/
│   │   ├── README.md
│   │   └── plano-4b-qlora.yaml
│   ├── qat_nvfp4/
│   │   ├── Gemma3-12B_baseline.yml
│   │   ├── Gemma3-12B_qat.yml
│   │   ├── Math-Gemma3-12B_baseline.yml
│   │   ├── Math-Gemma3-12B_qat.yml
│   │   ├── Math-Gemma3-27B_baseline.yml
│   │   ├── Math-Gemma3-27B_qat.yml
│   │   ├── Math-Qwen2.5-72B_baseline.yml
│   │   ├── Math-Qwen2.5-72B_qat.yml
│   │   ├── Qwen2.5-72B_baseline.yml
│   │   └── Qwen2.5-72B_qat.yml
│   ├── qwen2/
│   │   ├── adamw-pretrain-fsdp2.yaml
│   │   ├── dpo.yaml
│   │   ├── muon-pretrain-fsdp2.yaml
│   │   ├── prm.yaml
│   │   ├── qlora-fsdp.yaml
│   │   └── reward-model.yaml
│   ├── qwen2-vl/
│   │   └── lora-7b.yaml
│   ├── qwen2_5-vl/
│   │   └── lora-7b.yaml
│   ├── qwen3/
│   │   ├── 32b-qlora.yaml
│   │   ├── 8b-qat-fsdp2.yml
│   │   ├── README.md
│   │   ├── qlora-fsdp.yaml
│   │   └── reward-model.yaml
│   ├── qwen3-next/
│   │   ├── README.md
│   │   └── qwen3-next-80b-a3b-qlora.yaml
│   ├── qwen3.5/
│   │   ├── 122b-a10b-moe-qlora-fsdp.yaml
│   │   ├── 122b-a10b-moe-qlora.yaml
│   │   ├── 27b-fft.yaml
│   │   ├── 27b-qlora-fsdp.yaml
│   │   ├── 27b-qlora.yaml
│   │   ├── 35b-a3b-moe-qlora-fsdp.yaml
│   │   ├── 35b-a3b-moe-qlora.yaml
│   │   ├── 9b-fft-vision.yaml
│   │   ├── 9b-lora-vision.yaml
│   │   └── README.md
│   ├── seed-oss/
│   │   ├── README.md
│   │   └── seed-oss-36b-qlora.yaml
│   ├── slurm/
│   │   ├── README.md
│   │   └── axolotl.slurm
│   ├── smolvlm2/
│   │   ├── README.md
│   │   └── smolvlm2-2B-lora.yaml
│   ├── streaming/
│   │   ├── README.md
│   │   ├── pretrain.yaml
│   │   └── sft.yaml
│   ├── swanlab/
│   │   ├── README.md
│   │   ├── custom_trainer_profiling.py
│   │   ├── dpo-swanlab-completions.yml
│   │   ├── dpo-swanlab-full-featured.yml
│   │   └── lora-swanlab-profiling.yml
│   ├── trinity/
│   │   ├── README.md
│   │   └── trinity-nano-preview-qlora.yaml
│   └── voxtral/
│       ├── README.md
│       ├── voxtral-mini-audio-qlora.yml
│       └── voxtral-mini-qlora.yml
├── index.qmd
├── pyproject.toml
├── requirements-dev.txt
├── requirements-tests.txt
├── requirements.txt
├── scripts/
│   ├── chat_datasets.py
│   ├── cloud-entrypoint-term.sh
│   ├── cloud-entrypoint.sh
│   ├── cutcrossentropy_install.py
│   ├── motd
│   └── unsloth_install.py
├── setup.py
├── src/
│   ├── axolotl/
│   │   ├── __init__.py
│   │   ├── cli/
│   │   │   ├── __init__.py
│   │   │   ├── args.py
│   │   │   ├── art.py
│   │   │   ├── checks.py
│   │   │   ├── cloud/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   ├── baseten/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── template/
│   │   │   │   │       ├── run.sh
│   │   │   │   │       └── train_sft.py
│   │   │   │   └── modal_.py
│   │   │   ├── config.py
│   │   │   ├── delinearize_llama4.py
│   │   │   ├── evaluate.py
│   │   │   ├── inference.py
│   │   │   ├── main.py
│   │   │   ├── merge_lora.py
│   │   │   ├── merge_sharded_fsdp_weights.py
│   │   │   ├── preprocess.py
│   │   │   ├── quantize.py
│   │   │   ├── train.py
│   │   │   ├── utils/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   ├── diffusion.py
│   │   │   │   ├── fetch.py
│   │   │   │   ├── load.py
│   │   │   │   ├── sweeps.py
│   │   │   │   └── train.py
│   │   │   └── vllm_serve.py
│   │   ├── common/
│   │   │   ├── __init__.py
│   │   │   ├── architectures.py
│   │   │   ├── const.py
│   │   │   └── datasets.py
│   │   ├── convert.py
│   │   ├── core/
│   │   │   ├── __init__.py
│   │   │   ├── attention/
│   │   │   │   └── __init__.py
│   │   │   ├── builders/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   ├── causal.py
│   │   │   │   └── rl.py
│   │   │   ├── chat/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── format/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── chatml.py
│   │   │   │   │   ├── llama3x.py
│   │   │   │   │   └── shared.py
│   │   │   │   └── messages.py
│   │   │   ├── datasets/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── chat.py
│   │   │   │   └── transforms/
│   │   │   │       ├── __init__.py
│   │   │   │       └── chat_builder.py
│   │   │   ├── trainers/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   ├── dpo/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── args.py
│   │   │   │   │   └── trainer.py
│   │   │   │   ├── grpo/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── args.py
│   │   │   │   │   ├── async_trainer.py
│   │   │   │   │   ├── fast_async_trainer.py
│   │   │   │   │   ├── replay_buffer.py
│   │   │   │   │   ├── sampler.py
│   │   │   │   │   └── trainer.py
│   │   │   │   ├── mamba.py
│   │   │   │   ├── mixins/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── activation_checkpointing.py
│   │   │   │   │   ├── checkpoints.py
│   │   │   │   │   ├── distributed_parallel.py
│   │   │   │   │   ├── optimizer.py
│   │   │   │   │   ├── packing.py
│   │   │   │   │   ├── rng_state_loader.py
│   │   │   │   │   └── scheduler.py
│   │   │   │   ├── trl.py
│   │   │   │   └── utils.py
│   │   │   ├── training_args.py
│   │   │   └── training_args_base.py
│   │   ├── datasets.py
│   │   ├── evaluate.py
│   │   ├── integrations/
│   │   │   ├── LICENSE.md
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── config.py
│   │   │   ├── cut_cross_entropy/
│   │   │   │   ├── ACKNOWLEDGEMENTS.md
│   │   │   │   ├── LICENSE
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   └── args.py
│   │   │   ├── densemixer/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   └── plugin.py
│   │   │   ├── diffusion/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   ├── callbacks.py
│   │   │   │   ├── generation.py
│   │   │   │   ├── plugin.py
│   │   │   │   ├── trainer.py
│   │   │   │   └── utils.py
│   │   │   ├── grokfast/
│   │   │   │   ├── LICENSE
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   └── optimizer.py
│   │   │   ├── kd/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   ├── callbacks.py
│   │   │   │   ├── chat_template.py
│   │   │   │   ├── collator.py
│   │   │   │   ├── collator_online_teacher.py
│   │   │   │   ├── kernels/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── liger.py
│   │   │   │   │   └── models.py
│   │   │   │   ├── topk_logprob/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── forward_kl.py
│   │   │   │   ├── trainer.py
│   │   │   │   └── utils.py
│   │   │   ├── kernels/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   ├── autotune_callback.py
│   │   │   │   ├── autotune_collector.py
│   │   │   │   ├── constants.py
│   │   │   │   ├── libs/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── scattermoe_lora/
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── kernels/
│   │   │   │   │       │   ├── __init__.py
│   │   │   │   │       │   ├── lora_ops.py
│   │   │   │   │       │   ├── ops.py
│   │   │   │   │       │   └── single.py
│   │   │   │   │       ├── layers.py
│   │   │   │   │       ├── lora_ops.py
│   │   │   │   │       ├── parallel_experts.py
│   │   │   │   │       ├── parallel_linear_lora.py
│   │   │   │   │       ├── selective_dequant.py
│   │   │   │   │       └── selective_dequant_kernel.py
│   │   │   │   ├── plugin.py
│   │   │   │   └── sonicmoe/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── patch.py
│   │   │   │       ├── routing.py
│   │   │   │       └── weight_converter.py
│   │   │   ├── liger/
│   │   │   │   ├── LICENSE
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   ├── models/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── deepseekv2.py
│   │   │   │   │   ├── jamba.py
│   │   │   │   │   ├── llama4.py
│   │   │   │   │   ├── qwen3.py
│   │   │   │   │   └── qwen3_moe.py
│   │   │   │   ├── plugin.py
│   │   │   │   └── utils.py
│   │   │   ├── llm_compressor/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   ├── plugin.py
│   │   │   │   └── utils.py
│   │   │   ├── lm_eval/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   └── cli.py
│   │   │   ├── spectrum/
│   │   │   │   ├── LICENSE
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   └── model_snr_results/
│   │   │   │       ├── snr_results_Qwen-Qwen2.5-1.5B-Instruct.json
│   │   │   │       ├── snr_results_Qwen-Qwen2.5-1.5B.json
│   │   │   │       ├── snr_results_Qwen-Qwen2.5-3B-Instruct.json
│   │   │   │       ├── snr_results_Qwen-Qwen2.5-3B.json
│   │   │   │       ├── snr_results_Qwen-Qwen2.5-7B-Instruct.json
│   │   │   │       ├── snr_results_Qwen-Qwen2.5-7B.json
│   │   │   │       ├── snr_results_google-gemma-2-2b.json
│   │   │   │       ├── snr_results_meta-llama-Llama-3.2-1B-Instruct.json
│   │   │   │       ├── snr_results_meta-llama-Llama-3.2-1B.json
│   │   │   │       ├── snr_results_meta-llama-Llama-3.2-3B-Instruct.json
│   │   │   │       └── snr_results_meta-llama-Llama-3.2-3B.json
│   │   │   └── swanlab/
│   │   │       ├── README.md
│   │   │       ├── __init__.py
│   │   │       ├── args.py
│   │   │       ├── callbacks.py
│   │   │       ├── completion_logger.py
│   │   │       ├── plugins.py
│   │   │       └── profiling.py
│   │   ├── kernels/
│   │   │   ├── __init__.py
│   │   │   ├── geglu.py
│   │   │   ├── lora.py
│   │   │   ├── quantize.py
│   │   │   ├── swiglu.py
│   │   │   └── utils.py
│   │   ├── loaders/
│   │   │   ├── __init__.py
│   │   │   ├── adapter.py
│   │   │   ├── adapters/
│   │   │   │   └── __init__.py
│   │   │   ├── constants.py
│   │   │   ├── model.py
│   │   │   ├── patch_manager.py
│   │   │   ├── processor.py
│   │   │   ├── tokenizer.py
│   │   │   └── utils.py
│   │   ├── logging_config.py
│   │   ├── models/
│   │   │   ├── __init__.py
│   │   │   └── mamba/
│   │   │       ├── __init__.py
│   │   │       ├── configuration_mamba.py
│   │   │       └── modeling_mamba.py
│   │   ├── monkeypatch/
│   │   │   ├── __init__.py
│   │   │   ├── accelerate/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── fsdp2.py
│   │   │   │   └── parallelism_config.py
│   │   │   ├── attention/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── flash_attn_4.py
│   │   │   │   ├── flex_attn.py
│   │   │   │   ├── sage_attn.py
│   │   │   │   └── xformers.py
│   │   │   ├── btlm_attn_hijack_flash.py
│   │   │   ├── data/
│   │   │   │   ├── __init__.py
│   │   │   │   └── batch_dataset_fetcher.py
│   │   │   ├── deepspeed_utils.py
│   │   │   ├── fsdp2_qlora.py
│   │   │   ├── gradient_checkpointing/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── offload_cpu.py
│   │   │   │   └── offload_disk.py
│   │   │   ├── llama_attn_hijack_flash.py
│   │   │   ├── llama_attn_hijack_xformers.py
│   │   │   ├── lora_kernels.py
│   │   │   ├── loss/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── chunked.py
│   │   │   │   └── eaft.py
│   │   │   ├── mistral_attn_hijack_flash.py
│   │   │   ├── mixtral/
│   │   │   │   └── __init__.py
│   │   │   ├── models/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── apertus/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── activation.py
│   │   │   │   ├── kimi_linear/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── configuration_kimi.py
│   │   │   │   │   ├── modeling_kimi.py
│   │   │   │   │   ├── patch_kimi_linear.py
│   │   │   │   │   └── tokenization_kimi.py
│   │   │   │   ├── llama4/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── modeling.py
│   │   │   │   ├── mistral3/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── mistral_common_tokenizer.py
│   │   │   │   ├── pixtral/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── modeling_flash_attention_utils.py
│   │   │   │   ├── qwen3_5/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── modeling.py
│   │   │   │   ├── qwen3_next/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── modeling.py
│   │   │   │   └── voxtral/
│   │   │   │       ├── __init__.py
│   │   │   │       └── modeling.py
│   │   │   ├── moe_quant.py
│   │   │   ├── multipack.py
│   │   │   ├── peft/
│   │   │   │   ├── __init__.py
│   │   │   │   └── utils.py
│   │   │   ├── relora.py
│   │   │   ├── ring_attn/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── adapters/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── batch.py
│   │   │   │   └── patch.py
│   │   │   ├── scaled_softmax_attn.py
│   │   │   ├── stablelm_attn_hijack_flash.py
│   │   │   ├── tiled_mlp/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   └── patch.py
│   │   │   ├── trainer/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── lr.py
│   │   │   │   ├── trl.py
│   │   │   │   ├── trl_vllm.py
│   │   │   │   └── utils.py
│   │   │   ├── trainer_accelerator_args.py
│   │   │   ├── trainer_fsdp_optim.py
│   │   │   ├── transformers/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── trainer_context_parallel.py
│   │   │   │   └── trainer_loss_calc.py
│   │   │   ├── transformers_fa_utils.py
│   │   │   ├── unsloth_.py
│   │   │   ├── utils.py
│   │   │   └── xformers_/
│   │   │       └── __init__.py
│   │   ├── processing_strategies.py
│   │   ├── prompt_strategies/
│   │   │   ├── __init__.py
│   │   │   ├── alpaca_chat.py
│   │   │   ├── alpaca_instruct.py
│   │   │   ├── alpaca_w_system.py
│   │   │   ├── base.py
│   │   │   ├── bradley_terry/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── chat_template.py
│   │   │   │   └── llama3.py
│   │   │   ├── chat_template.py
│   │   │   ├── completion.py
│   │   │   ├── context_qa.py
│   │   │   ├── creative_acr.py
│   │   │   ├── dpo/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── chat_template.py
│   │   │   │   ├── chatml.py
│   │   │   │   ├── llama3.py
│   │   │   │   ├── passthrough.py
│   │   │   │   ├── user_defined.py
│   │   │   │   └── zephyr.py
│   │   │   ├── input_output.py
│   │   │   ├── jinja_template_analyzer.py
│   │   │   ├── kto/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── chatml.py
│   │   │   │   ├── llama3.py
│   │   │   │   └── user_defined.py
│   │   │   ├── llama2_chat.py
│   │   │   ├── messages/
│   │   │   │   ├── __init__.py
│   │   │   │   └── chat.py
│   │   │   ├── metharme.py
│   │   │   ├── orcamini.py
│   │   │   ├── orpo/
│   │   │   │   ├── __init__.py
│   │   │   │   └── chat_template.py
│   │   │   ├── pretrain.py
│   │   │   ├── pygmalion.py
│   │   │   ├── stepwise_supervised.py
│   │   │   └── user_defined.py
│   │   ├── prompt_tokenizers.py
│   │   ├── prompters.py
│   │   ├── scripts/
│   │   │   ├── __init__.py
│   │   │   ├── vllm_serve_lora.py
│   │   │   └── vllm_worker_ext.py
│   │   ├── telemetry/
│   │   │   ├── __init__.py
│   │   │   ├── callbacks.py
│   │   │   ├── errors.py
│   │   │   ├── manager.py
│   │   │   ├── runtime_metrics.py
│   │   │   └── whitelist.yaml
│   │   ├── train.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── bench.py
│   │       ├── callbacks/
│   │       │   ├── __init__.py
│   │       │   ├── comet_.py
│   │       │   ├── dynamic_checkpoint.py
│   │       │   ├── generation.py
│   │       │   ├── lisa.py
│   │       │   ├── mlflow_.py
│   │       │   ├── models.py
│   │       │   ├── opentelemetry.py
│   │       │   ├── perplexity.py
│   │       │   ├── profiler.py
│   │       │   ├── qat.py
│   │       │   ├── swanlab.py
│   │       │   ├── tokens_per_second.py
│   │       │   └── trackio_.py
│   │       ├── chat_templates/
│   │       │   ├── __init__.py
│   │       │   ├── base.py
│   │       │   └── templates/
│   │       │       ├── alpaca.jinja
│   │       │       ├── aya.jinja
│   │       │       ├── chatml.jinja
│   │       │       ├── cohere.jinja
│   │       │       ├── command_a.jinja
│   │       │       ├── command_a_rag.jinja
│   │       │       ├── command_a_tool_use.jinja
│   │       │       ├── deepseek_v2.jinja
│   │       │       ├── deepseek_v3.jinja
│   │       │       ├── exaone.jinja
│   │       │       ├── exaone4.jinja
│   │       │       ├── falcon_h1.jinja
│   │       │       ├── gemma.jinja
│   │       │       ├── gemma3.jinja
│   │       │       ├── gemma3n.jinja
│   │       │       ├── jamba.jinja
│   │       │       ├── llama3.jinja
│   │       │       ├── llama3_2_vision.jinja
│   │       │       ├── llama4.jinja
│   │       │       ├── llava.jinja
│   │       │       ├── metharme.jinja
│   │       │       ├── mistral_v1.jinja
│   │       │       ├── mistral_v2v3.jinja
│   │       │       ├── mistral_v3_tekken.jinja
│   │       │       ├── mistral_v7_tekken.jinja
│   │       │       ├── phi_3.jinja
│   │       │       ├── phi_35.jinja
│   │       │       ├── phi_4.jinja
│   │       │       ├── pixtral.jinja
│   │       │       ├── qwen2_vl.jinja
│   │       │       ├── qwen3.jinja
│   │       │       ├── qwen3_5.jinja
│   │       │       └── qwen_25.jinja
│   │       ├── collators/
│   │       │   ├── __init__.py
│   │       │   ├── batching.py
│   │       │   ├── core.py
│   │       │   ├── mamba.py
│   │       │   └── mm_chat.py
│   │       ├── comet_.py
│   │       ├── config/
│   │       │   ├── __init__.py
│   │       │   └── models/
│   │       │       └── __init__.py
│   │       ├── ctx_managers/
│   │       │   ├── __init__.py
│   │       │   └── sequence_parallel.py
│   │       ├── data/
│   │       │   ├── __init__.py
│   │       │   ├── lock.py
│   │       │   ├── rl.py
│   │       │   ├── sft.py
│   │       │   ├── shared.py
│   │       │   ├── streaming.py
│   │       │   ├── utils.py
│   │       │   └── wrappers.py
│   │       ├── datasets.py
│   │       ├── dict.py
│   │       ├── distributed.py
│   │       ├── environment.py
│   │       ├── freeze.py
│   │       ├── generation/
│   │       │   ├── __init__.py
│   │       │   └── sft.py
│   │       ├── import_helper.py
│   │       ├── logging.py
│   │       ├── lora.py
│   │       ├── mistral/
│   │       │   ├── __init__.py
│   │       │   ├── mistral3_processor.py
│   │       │   └── mistral_tokenizer.py
│   │       ├── mlflow_.py
│   │       ├── model_shard_quant.py
│   │       ├── optimizers/
│   │       │   ├── __init__.py
│   │       │   └── adopt.py
│   │       ├── quantization.py
│   │       ├── samplers/
│   │       │   ├── __init__.py
│   │       │   ├── multipack.py
│   │       │   └── utils.py
│   │       ├── schedulers.py
│   │       ├── schemas/
│   │       │   ├── __init__.py
│   │       │   ├── config.py
│   │       │   ├── datasets.py
│   │       │   ├── deprecated.py
│   │       │   ├── dynamic_checkpoint.py
│   │       │   ├── enums.py
│   │       │   ├── fsdp.py
│   │       │   ├── integrations.py
│   │       │   ├── internal/
│   │       │   │   └── __init__.py
│   │       │   ├── model.py
│   │       │   ├── multimodal.py
│   │       │   ├── peft.py
│   │       │   ├── quantization.py
│   │       │   ├── training.py
│   │       │   ├── trl.py
│   │       │   ├── utils.py
│   │       │   ├── validation.py
│   │       │   └── vllm.py
│   │       ├── tee.py
│   │       ├── tokenization.py
│   │       ├── trackio_.py
│   │       ├── train.py
│   │       ├── trainer.py
│   │       └── wandb_.py
│   └── setuptools_axolotl_dynamic_dependencies.py
├── styles.css
└── tests/
    ├── __init__.py
    ├── cli/
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── test_cli_base.py
    │   ├── test_cli_evaluate.py
    │   ├── test_cli_fetch.py
    │   ├── test_cli_inference.py
    │   ├── test_cli_interface.py
    │   ├── test_cli_merge_lora.py
    │   ├── test_cli_merge_sharded_fsdp_weights.py
    │   ├── test_cli_preprocess.py
    │   ├── test_cli_sweeps.py
    │   ├── test_cli_train.py
    │   ├── test_cli_version.py
    │   ├── test_nested_options.py
    │   └── test_utils.py
    ├── conftest.py
    ├── constants.py
    ├── core/
    │   ├── chat/
    │   │   ├── __init__.py
    │   │   ├── format/
    │   │   │   └── __init__.py
    │   │   └── test_messages.py
    │   ├── test_async_grpo.py
    │   └── test_builders.py
    ├── e2e/
    │   ├── .gitignore
    │   ├── __init__.py
    │   ├── integrations/
    │   │   ├── test_cut_cross_entropy.py
    │   │   ├── test_fp8.py
    │   │   ├── test_hooks.py
    │   │   ├── test_kd.py
    │   │   ├── test_liger.py
    │   │   ├── test_llm_compressor.py
    │   │   ├── test_scattermoe_lora_kernels.py
    │   │   ├── test_scattermoe_lora_olmoe.py
    │   │   └── test_sonicmoe.py
    │   ├── kernels/
    │   │   ├── test_geglu.py
    │   │   ├── test_lora.py
    │   │   ├── test_quantize.py
    │   │   └── test_swiglu.py
    │   ├── multigpu/
    │   │   ├── __init__.py
    │   │   ├── patched/
    │   │   │   ├── __init__.py
    │   │   │   └── test_sp.py
    │   │   ├── solo/
    │   │   │   ├── __init__.py
    │   │   │   ├── test_flex.py
    │   │   │   ├── test_gdpo.py
    │   │   │   └── test_grpo.py
    │   │   ├── test_dist_muon_fsdp2.py
    │   │   ├── test_eval.py
    │   │   ├── test_fp8_fsdp2.py
    │   │   ├── test_fsdp1.py
    │   │   ├── test_fsdp2.py
    │   │   ├── test_gemma3.py
    │   │   ├── test_llama.py
    │   │   ├── test_locking.py
    │   │   ├── test_ray.py
    │   │   └── test_tp.py
    │   ├── patched/
    │   │   ├── __init__.py
    │   │   ├── lora_kernels/
    │   │   │   ├── __init__.py
    │   │   │   └── test_lora_kernel_patching.py
    │   │   ├── test_4d_multipack_llama.py
    │   │   ├── test_activation_checkpointing.py
    │   │   ├── test_cli_integrations.py
    │   │   ├── test_fa_xentropy.py
    │   │   ├── test_falcon_samplepack.py
    │   │   ├── test_flattening.py
    │   │   ├── test_fsdp2_qlora.py
    │   │   ├── test_fused_llama.py
    │   │   ├── test_llama_s2_attention.py
    │   │   ├── test_lora_llama_multipack.py
    │   │   ├── test_mistral_samplepack.py
    │   │   ├── test_mixtral_samplepack.py
    │   │   ├── test_model_patches.py
    │   │   ├── test_peft_embeddings.py
    │   │   ├── test_phi_multipack.py
    │   │   ├── test_resume.py
    │   │   ├── test_unsloth_integration.py
    │   │   └── test_unsloth_qlora.py
    │   ├── solo/
    │   │   ├── __init__.py
    │   │   ├── test_flex.py
    │   │   └── test_relora_llama.py
    │   ├── test_activation_offloading.py
    │   ├── test_deepseekv3.py
    │   ├── test_diffusion.py
    │   ├── test_dpo.py
    │   ├── test_embeddings_lr.py
    │   ├── test_evaluate.py
    │   ├── test_falcon.py
    │   ├── test_gemma2.py
    │   ├── test_gemma3_text.py
    │   ├── test_imports.py
    │   ├── test_llama.py
    │   ├── test_llama_pretrain.py
    │   ├── test_llama_vision.py
    │   ├── test_load_model.py
    │   ├── test_lora_llama.py
    │   ├── test_mamba.py
    │   ├── test_mistral.py
    │   ├── test_mixtral.py
    │   ├── test_optimizers.py
    │   ├── test_packing_loss.py
    │   ├── test_phi.py
    │   ├── test_preprocess.py
    │   ├── test_process_reward_model_smollm2.py
    │   ├── test_profiler.py
    │   ├── test_qat.py
    │   ├── test_quantization.py
    │   ├── test_qwen.py
    │   ├── test_reward_model_smollm2.py
    │   ├── test_save_first_step.py
    │   ├── test_schedulers.py
    │   ├── test_streaming.py
    │   ├── test_tokenizer.py
    │   └── utils.py
    ├── fixtures/
    │   ├── alpaca/
    │   │   └── alpaca.json
    │   ├── conversation.json
    │   ├── conversation.missingturns.json
    │   ├── conversation.tokenized.json
    │   └── conversation.tokenized_llama2chat.json
    ├── hf_offline_utils.py
    ├── integrations/
    │   ├── __init__.py
    │   ├── test_diffusion.py
    │   ├── test_diffusion_callback.py
    │   ├── test_kd_chat_template.py
    │   ├── test_liger.py
    │   ├── test_routing_parity.py
    │   ├── test_scattermoe_autotune_telemetry.py
    │   ├── test_scattermoe_lora.py
    │   ├── test_scattermoe_lora_kernels.py
    │   ├── test_sonicmoe.py
    │   ├── test_sonicmoe_gradients.py
    │   └── test_swanlab.py
    ├── monkeypatch/
    │   ├── test_llama_attn_hijack_flash.py
    │   ├── test_pixtral_flash_attention_patch.py
    │   ├── test_qwen3_next_modeling_patch.py
    │   ├── test_trainer_accelerator_args.py
    │   ├── test_trainer_context_parallel_patch.py
    │   ├── test_trainer_loss_calc.py
    │   ├── test_trl_vllm.py
    │   └── test_voxtral_modeling_patch.py
    ├── patched/
    │   └── test_validation.py
    ├── prompt_strategies/
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── messages/
    │   │   ├── __init__.py
    │   │   └── test_chat.py
    │   ├── test_alpaca.py
    │   ├── test_chat_template_ds_schema_unification.py
    │   ├── test_chat_template_utils.py
    │   ├── test_chat_templates.py
    │   ├── test_chat_templates_advanced.py
    │   ├── test_chat_templates_mistral.py
    │   ├── test_chat_templates_thinking.py
    │   ├── test_chat_templates_tool_call_string_arguments.py
    │   ├── test_dpo_chat_templates.py
    │   ├── test_dpo_chatml.py
    │   ├── test_jinja_template_analyzer.py
    │   ├── test_raw_io.py
    │   └── test_stepwise.py
    ├── telemetry/
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── test_callbacks.py
    │   ├── test_errors.py
    │   ├── test_manager.py
    │   └── test_runtime_metrics.py
    ├── test_chunked_xentropy.py
    ├── test_context_parallel_batch_size.py
    ├── test_convert.py
    ├── test_data.py
    ├── test_datasets.py
    ├── test_dict.py
    ├── test_exact_deduplication.py
    ├── test_freeze.py
    ├── test_loaders.py
    ├── test_logging_config_file_capture.py
    ├── test_lora.py
    ├── test_normalize_config.py
    ├── test_opentelemetry_callback.py
    ├── test_packed_batch_sampler.py
    ├── test_packed_dataset.py
    ├── test_packed_pretraining.py
    ├── test_perplexity.py
    ├── test_prompt_tokenizers.py
    ├── test_prompters.py
    ├── test_revision_parameter.py
    ├── test_save_deduplicated.py
    ├── test_schedulers.py
    ├── test_streaming.py
    ├── test_tensor_parallel_batch_size.py
    ├── test_tokenizers.py
    ├── test_train.py
    ├── test_triton_kernels.py
    ├── test_utils_tee.py
    ├── test_validation_dataset.py
    └── utils/
        ├── callbacks/
        │   └── test_dynamic_checkpoint.py
        ├── data/
        │   └── test_utils.py
        ├── lora/
        │   ├── test_config_validation_lora.py
        │   ├── test_freeze_lora.py
        │   └── test_merge_lora.py
        ├── schemas/
        │   └── validation/
        │       ├── test_activation_offloading.py
        │       ├── test_default_values.py
        │       ├── test_fsdp.py
        │       └── test_moe_quant.py
        ├── test_grpo_rw_fnc.py
        ├── test_import_helper.py
        ├── test_mistral3_processor.py
        └── test_train.py