Showing preview only (7,759K chars total). Download the full file or copy to clipboard to get everything.
Repository: modelscope/ms-swift
Branch: main
Commit: fe928a9f1464
Files: 1228
Total size: 7.2 MB
Directory structure:
gitextract_9q3kx9l7/
├── .dev_scripts/
│ ├── build_docs.sh
│ ├── ci_container_test.sh
│ ├── dockerci.sh
│ └── dockerci_npu.sh
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── 1-bug-report.yml
│ │ ├── 2-feature-request.yml
│ │ ├── 3-question-discussion.yml
│ │ └── config.yml
│ ├── PULL_REQUEST_TEMPLATE.md
│ ├── SECURITY.md
│ └── workflows/
│ ├── citest.yaml
│ ├── citest_npu.yaml
│ ├── close_tale_issue.yaml
│ ├── lint.yaml
│ └── publish.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── .pre-commit-config_local.yaml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── CONTRIBUTING_CN.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── README_CN.md
├── docs/
│ ├── Makefile
│ ├── README.md
│ ├── make.bat
│ ├── source/
│ │ ├── .readthedocs.yaml
│ │ ├── BestPractices/
│ │ │ ├── Elastic.md
│ │ │ ├── Embedding.md
│ │ │ ├── GRPO-Code-Training.md
│ │ │ ├── GRPO-Multi-Modal-Training.md
│ │ │ ├── GRPO.md
│ │ │ ├── MLLM-Registration.md
│ │ │ ├── Metax-support.md
│ │ │ ├── More-Best-Practices.md
│ │ │ ├── NPU-support.md
│ │ │ ├── Qwen3-Best-Practice.md
│ │ │ ├── Qwen3-VL-Best-Practice.md
│ │ │ ├── Qwen3_5-Best-Practice.md
│ │ │ ├── Rapidly-Training-VL-model.md
│ │ │ └── Reranker.md
│ │ ├── Customization/
│ │ │ ├── Architecture.md
│ │ │ ├── Custom-dataset.md
│ │ │ └── Custom-model.md
│ │ ├── GetStarted/
│ │ │ ├── Quick-start.md
│ │ │ ├── SWIFT-installation.md
│ │ │ └── Web-UI.md
│ │ ├── Instruction/
│ │ │ ├── Agent-support.md
│ │ │ ├── Command-line-parameters.md
│ │ │ ├── Evaluation.md
│ │ │ ├── Export-and-push.md
│ │ │ ├── Frequently-asked-questions.md
│ │ │ ├── GKD.md
│ │ │ ├── GRPO/
│ │ │ │ ├── AdvancedResearch/
│ │ │ │ │ ├── CHORD.md
│ │ │ │ │ ├── CISPO.md
│ │ │ │ │ ├── DAPO.md
│ │ │ │ │ ├── GSPO.md
│ │ │ │ │ ├── REINFORCEPP.md
│ │ │ │ │ ├── RLOO.md
│ │ │ │ │ ├── SAPO.md
│ │ │ │ │ ├── deepeyes.md
│ │ │ │ │ ├── entropy_mask.md
│ │ │ │ │ ├── index.rst
│ │ │ │ │ ├── training_inference_mismatch.md
│ │ │ │ │ └── treepo.md
│ │ │ │ ├── DeveloperGuide/
│ │ │ │ │ ├── gym_env.md
│ │ │ │ │ ├── index.rst
│ │ │ │ │ ├── loss_types.md
│ │ │ │ │ ├── multi_task.md
│ │ │ │ │ ├── multi_turn.md
│ │ │ │ │ ├── reward_function.md
│ │ │ │ │ └── reward_model.md
│ │ │ │ ├── GetStarted/
│ │ │ │ │ ├── GRPO.md
│ │ │ │ │ └── index.rst
│ │ │ │ └── index.rst
│ │ │ ├── Inference-and-deployment.md
│ │ │ ├── Pre-training-and-Fine-tuning.md
│ │ │ ├── RLHF.md
│ │ │ ├── Ray.md
│ │ │ ├── Reinforced-Fine-tuning.md
│ │ │ ├── Sample.md
│ │ │ ├── Supported-models-and-datasets.md
│ │ │ └── Use-tuners.md
│ │ ├── Megatron-SWIFT/
│ │ │ ├── Ascend.md
│ │ │ ├── Command-line-parameters.md
│ │ │ ├── GKD.md
│ │ │ ├── GRPO.md
│ │ │ ├── LoRA-Training.md
│ │ │ ├── Mcore-Bridge.md
│ │ │ ├── Multimodal-Model.md
│ │ │ └── Quick-start.md
│ │ ├── _templates/
│ │ │ ├── autosummary/
│ │ │ │ └── class.rst
│ │ │ ├── classtemplate.rst
│ │ │ └── sobolengine.rst
│ │ ├── conf.py
│ │ └── index.rst
│ └── source_en/
│ ├── .readthedocs.yaml
│ ├── BestPractices/
│ │ ├── Elastic.md
│ │ ├── Embedding.md
│ │ ├── GRPO-Code-Training.md
│ │ ├── GRPO-Multi-Modal-Training.md
│ │ ├── GRPO.md
│ │ ├── MLLM-Registration.md
│ │ ├── Metax-support.md
│ │ ├── More-Best-Practices.md
│ │ ├── NPU-support.md
│ │ ├── Qwen3-Best-Practice.md
│ │ ├── Qwen3-VL-Best-Practice.md
│ │ ├── Qwen3_5-Best-Practice.md
│ │ ├── Rapidly-Training-VL-model.md
│ │ └── Reranker.md
│ ├── Customization/
│ │ ├── Architecture.md
│ │ ├── Custom-dataset.md
│ │ └── Custom-model.md
│ ├── GetStarted/
│ │ ├── Quick-start.md
│ │ ├── SWIFT-installation.md
│ │ └── Web-UI.md
│ ├── Instruction/
│ │ ├── Agent-support.md
│ │ ├── Command-line-parameters.md
│ │ ├── Evaluation.md
│ │ ├── Export-and-push.md
│ │ ├── Frequently-asked-questions.md
│ │ ├── GKD.md
│ │ ├── GRPO/
│ │ │ ├── AdvancedResearch/
│ │ │ │ ├── CHORD.md
│ │ │ │ ├── CISPO.md
│ │ │ │ ├── DAPO.md
│ │ │ │ ├── GSPO.md
│ │ │ │ ├── REINFORCEPP.md
│ │ │ │ ├── RLOO.md
│ │ │ │ ├── SAPO.md
│ │ │ │ ├── deepeyes.md
│ │ │ │ ├── entropy_mask.md
│ │ │ │ ├── index.rst
│ │ │ │ ├── training_inference_mismatch.md
│ │ │ │ └── treepo.md
│ │ │ ├── DeveloperGuide/
│ │ │ │ ├── gym_env.md
│ │ │ │ ├── index.rst
│ │ │ │ ├── loss_types.md
│ │ │ │ ├── multi_task.md
│ │ │ │ ├── multi_turn.md
│ │ │ │ ├── reward_function.md
│ │ │ │ └── reward_model.md
│ │ │ ├── GetStarted/
│ │ │ │ ├── GRPO.md
│ │ │ │ └── index.rst
│ │ │ └── index.rst
│ │ ├── Inference-and-deployment.md
│ │ ├── Pre-training-and-Fine-tuning.md
│ │ ├── RLHF.md
│ │ ├── Ray.md
│ │ ├── Reinforced-Fine-tuning.md
│ │ ├── Sample.md
│ │ ├── Supported-models-and-datasets.md
│ │ └── Use-tuners.md
│ ├── Megatron-SWIFT/
│ │ ├── Ascend.md
│ │ ├── Command-line-parameters.md
│ │ ├── GKD.md
│ │ ├── GRPO.md
│ │ ├── LoRA-Training.md
│ │ ├── Mcore-Bridge.md
│ │ ├── Multimodal-Model.md
│ │ └── Quick-start.md
│ ├── _templates/
│ │ ├── autosummary/
│ │ │ └── class.rst
│ │ ├── classtemplate.rst
│ │ └── sobolengine.rst
│ ├── conf.py
│ └── index.rst
├── examples/
│ ├── README.md
│ ├── app/
│ │ ├── base_url/
│ │ │ ├── demo.py
│ │ │ └── demo.sh
│ │ ├── llm/
│ │ │ ├── sglang.sh
│ │ │ └── vllm.sh
│ │ └── mllm.sh
│ ├── ascend/
│ │ ├── activation_cpu_offload/
│ │ │ ├── fsdp2.json
│ │ │ └── train.sh
│ │ ├── deploy/
│ │ │ └── vllm.sh
│ │ ├── infer/
│ │ │ └── vllm/
│ │ │ └── dp_tp.sh
│ │ ├── megatron/
│ │ │ └── train_sft_full.sh
│ │ ├── multi-node/
│ │ │ └── megatron/
│ │ │ ├── node1.sh
│ │ │ └── node2.sh
│ │ └── train/
│ │ ├── qwen3/
│ │ │ ├── qwen3_lora_deepspeed.sh
│ │ │ ├── qwen3_lora_fsdp/
│ │ │ │ ├── fsdp.json
│ │ │ │ └── train.sh
│ │ │ └── qwen3_lora_megatron.sh
│ │ ├── qwen3_next/
│ │ │ └── qwen3_next_megatron.sh
│ │ ├── qwen3_omni/
│ │ │ └── qwen3_omni_full_mindspeed.sh
│ │ └── qwen3_vl/
│ │ └── moe_full_mindspeed.sh
│ ├── custom/
│ │ ├── dataset.py
│ │ ├── infer.sh
│ │ ├── model.py
│ │ ├── model_hf.py
│ │ ├── my_qwen2_5_omni/
│ │ │ ├── my_register.py
│ │ │ ├── test_register.py
│ │ │ └── train.py
│ │ └── sft.sh
│ ├── deploy/
│ │ ├── README.md
│ │ ├── agent/
│ │ │ ├── client.py
│ │ │ └── server.sh
│ │ ├── bert/
│ │ │ ├── client.py
│ │ │ └── server.sh
│ │ ├── client/
│ │ │ ├── llm/
│ │ │ │ ├── base/
│ │ │ │ │ ├── openai_client.py
│ │ │ │ │ └── swift_client.py
│ │ │ │ └── chat/
│ │ │ │ ├── openai_client.py
│ │ │ │ └── swift_client.py
│ │ │ └── mllm/
│ │ │ ├── openai_client.py
│ │ │ └── swift_client.py
│ │ ├── embedding/
│ │ │ ├── client.py
│ │ │ └── server.sh
│ │ ├── lora/
│ │ │ ├── client.py
│ │ │ └── server.sh
│ │ ├── reranker/
│ │ │ ├── client.py
│ │ │ ├── client_generative.py
│ │ │ └── server.sh
│ │ ├── reward_model/
│ │ │ ├── client.py
│ │ │ └── server.sh
│ │ ├── seq_cls/
│ │ │ ├── client.py
│ │ │ └── server.sh
│ │ ├── sglang.sh
│ │ ├── vllm.sh
│ │ └── vllm_dp.sh
│ ├── eval/
│ │ ├── eval_url/
│ │ │ ├── demo.py
│ │ │ └── eval.sh
│ │ ├── llm/
│ │ │ ├── sglang.sh
│ │ │ └── vllm.sh
│ │ ├── train_eval/
│ │ │ └── train.sh
│ │ └── vlm/
│ │ └── eval.sh
│ ├── export/
│ │ ├── merge_lora.sh
│ │ ├── ollama.sh
│ │ ├── push_to_hub.sh
│ │ └── quantize/
│ │ ├── awq.sh
│ │ ├── bert/
│ │ │ ├── bnb.sh
│ │ │ └── gptq.sh
│ │ ├── bnb.sh
│ │ ├── fp8.sh
│ │ ├── gptq.sh
│ │ ├── gptq_v2.sh
│ │ ├── mllm/
│ │ │ ├── awq.sh
│ │ │ ├── bnb.sh
│ │ │ ├── fp8.sh
│ │ │ └── gptq.sh
│ │ ├── moe/
│ │ │ ├── awq.sh
│ │ │ ├── bnb.sh
│ │ │ ├── fp8.sh
│ │ │ └── gptq.sh
│ │ ├── omni/
│ │ │ └── gptq.sh
│ │ └── reward_model/
│ │ ├── bnb.sh
│ │ └── gptq.sh
│ ├── infer/
│ │ ├── cli_demo.sh
│ │ ├── demo.py
│ │ ├── demo_agent.py
│ │ ├── demo_bert.py
│ │ ├── demo_embedding.py
│ │ ├── demo_grounding.py
│ │ ├── demo_hf.py
│ │ ├── demo_lora.py
│ │ ├── demo_mllm.py
│ │ ├── demo_reranker.py
│ │ ├── demo_reward_model.py
│ │ ├── demo_vllm_reasoning_parser.py
│ │ ├── lmdeploy/
│ │ │ ├── batch_ddp.sh
│ │ │ └── mllm_tp.sh
│ │ ├── sglang/
│ │ │ ├── demo.sh
│ │ │ ├── distill_qwen3_235b.sh
│ │ │ ├── mtp.sh
│ │ │ └── tp.sh
│ │ ├── transformers/
│ │ │ ├── batch_ddp.sh
│ │ │ ├── bert.sh
│ │ │ ├── lora.sh
│ │ │ ├── mllm_device_map.sh
│ │ │ ├── prm.sh
│ │ │ └── reward_model.sh
│ │ └── vllm/
│ │ ├── dp_tp.sh
│ │ ├── mllm_ddp.sh
│ │ ├── mllm_tp.sh
│ │ └── mtp.sh
│ ├── megatron/
│ │ ├── base_to_chat.sh
│ │ ├── benchmark/
│ │ │ └── deepspeed.sh
│ │ ├── dense/
│ │ │ ├── 72b_offload.sh
│ │ │ └── qwen3_32b.sh
│ │ ├── embedding/
│ │ │ ├── qwen3_emb.sh
│ │ │ └── qwen3_vl_emb.sh
│ │ ├── export/
│ │ │ ├── full.sh
│ │ │ └── lora.sh
│ │ ├── fp8/
│ │ │ ├── benchmark.sh
│ │ │ ├── llm.sh
│ │ │ └── vlm.sh
│ │ ├── grpo/
│ │ │ ├── dense_colocate.sh
│ │ │ ├── dense_server.sh
│ │ │ ├── moe_colocate_full.sh
│ │ │ ├── moe_colocate_lora.sh
│ │ │ └── sapo.sh
│ │ ├── long_text.sh
│ │ ├── lora/
│ │ │ ├── dense.sh
│ │ │ ├── dpo.sh
│ │ │ ├── loss_scale.sh
│ │ │ ├── moe.sh
│ │ │ ├── mtp.sh
│ │ │ ├── new_special_tokens.sh
│ │ │ └── qwen3_235b.sh
│ │ ├── mcore_bridge/
│ │ │ ├── full/
│ │ │ │ ├── dense.sh
│ │ │ │ └── moe.sh
│ │ │ └── lora/
│ │ │ ├── moe.sh
│ │ │ ├── new_special_tokens.sh
│ │ │ └── seq_cls.sh
│ │ ├── moe/
│ │ │ ├── deepseek_v3.sh
│ │ │ ├── moe.sh
│ │ │ ├── qwen3_moe.sh
│ │ │ └── qwen3_moe_offload.sh
│ │ ├── multi-node/
│ │ │ ├── node1.sh
│ │ │ └── node2.sh
│ │ ├── multimodal/
│ │ │ ├── dense/
│ │ │ │ ├── dpo.sh
│ │ │ │ ├── full.sh
│ │ │ │ └── lora.sh
│ │ │ ├── lora_llm_vit_full/
│ │ │ │ └── sft.sh
│ │ │ ├── moe/
│ │ │ │ ├── full_dpo_offload.sh
│ │ │ │ └── lora.sh
│ │ │ └── omni/
│ │ │ ├── dense.sh
│ │ │ └── moe.sh
│ │ ├── pretrain.sh
│ │ ├── reranker/
│ │ │ ├── qwen3_reranker.sh
│ │ │ └── qwen3_vl_reranker.sh
│ │ ├── rlhf/
│ │ │ ├── dpo/
│ │ │ │ ├── dense.sh
│ │ │ │ ├── group_by_length.sh
│ │ │ │ ├── moe.sh
│ │ │ │ └── packing.sh
│ │ │ ├── gkd/
│ │ │ │ ├── dense.sh
│ │ │ │ ├── opsd.sh
│ │ │ │ └── teacher_server.sh
│ │ │ ├── kto/
│ │ │ │ ├── dense.sh
│ │ │ │ └── moe.sh
│ │ │ └── rm/
│ │ │ ├── dense.sh
│ │ │ └── moe.sh
│ │ ├── seq_cls/
│ │ │ ├── full.sh
│ │ │ └── lora/
│ │ │ ├── infer.sh
│ │ │ └── train.sh
│ │ └── sft.sh
│ ├── models/
│ │ ├── deepseek_ocr/
│ │ │ ├── infer.py
│ │ │ └── train.sh
│ │ ├── deepseek_vl2/
│ │ │ └── train.sh
│ │ ├── glm-4.6v/
│ │ │ ├── flash.sh
│ │ │ └── mcore.sh
│ │ ├── gpt_oss/
│ │ │ ├── internvl3_5_gpt.sh
│ │ │ ├── mcore.sh
│ │ │ └── train.sh
│ │ ├── hunyuan_ocr/
│ │ │ └── train.sh
│ │ ├── internvl3/
│ │ │ └── train.sh
│ │ ├── keye/
│ │ │ └── train.sh
│ │ ├── llama4/
│ │ │ └── mcore.sh
│ │ ├── minicpmv/
│ │ │ └── train.sh
│ │ ├── ovis2/
│ │ │ └── train.sh
│ │ ├── qwen3_5/
│ │ │ ├── mcore.sh
│ │ │ ├── mcore_full.sh
│ │ │ ├── mcore_grpo_moe.sh
│ │ │ ├── packing.sh
│ │ │ └── transformers.sh
│ │ ├── qwen3_next/
│ │ │ ├── mcore.sh
│ │ │ ├── mtp.sh
│ │ │ ├── non_padding_free.sh
│ │ │ └── transformers.sh
│ │ ├── qwen3_omni/
│ │ │ ├── transformers.sh
│ │ │ └── zero3.sh
│ │ └── qwen3_vl/
│ │ ├── mcore.sh
│ │ ├── mcore_full.sh
│ │ ├── mixed.sh
│ │ ├── transformers.sh
│ │ └── zero3.sh
│ ├── notebook/
│ │ ├── qwen2_5-self-cognition/
│ │ │ ├── infer.ipynb
│ │ │ ├── infer.sh
│ │ │ ├── self-cognition-sft.ipynb
│ │ │ └── sft.sh
│ │ ├── qwen2_5-vl-grounding/
│ │ │ └── zh.ipynb
│ │ └── qwen2vl-ocr/
│ │ ├── infer.ipynb
│ │ └── ocr-sft.ipynb
│ ├── sampler/
│ │ ├── distill/
│ │ │ ├── distill.sh
│ │ │ └── distill.yaml
│ │ └── sample/
│ │ ├── sample.sh
│ │ └── sampling.yaml
│ ├── train/
│ │ ├── agent/
│ │ │ ├── deepseek_r1.sh
│ │ │ ├── glm4.sh
│ │ │ ├── loss_scale/
│ │ │ │ ├── infer_lora.py
│ │ │ │ └── train.sh
│ │ │ └── qwen2_5.sh
│ │ ├── all_to_all/
│ │ │ ├── infer.sh
│ │ │ └── train.sh
│ │ ├── base_to_chat/
│ │ │ ├── full.sh
│ │ │ ├── lora.sh
│ │ │ └── lora2.sh
│ │ ├── cached_dataset/
│ │ │ ├── dpo.sh
│ │ │ ├── mcore.sh
│ │ │ ├── pretrained.sh
│ │ │ ├── reranker.sh
│ │ │ ├── seq_cls.sh
│ │ │ ├── sft.sh
│ │ │ └── vlm.sh
│ │ ├── early_stop/
│ │ │ └── lora_sft.sh
│ │ ├── embedding/
│ │ │ ├── qwen3/
│ │ │ │ ├── infer.py
│ │ │ │ ├── qwen3_emb.sh
│ │ │ │ └── qwen3_vl_emb.sh
│ │ │ └── train_gme.sh
│ │ ├── flash_attention_3/
│ │ │ ├── mcore.sh
│ │ │ └── transformers.sh
│ │ ├── full/
│ │ │ ├── dft.sh
│ │ │ ├── infer.sh
│ │ │ ├── qwen2_5_32b.sh
│ │ │ └── train.sh
│ │ ├── grpo/
│ │ │ ├── external/
│ │ │ │ ├── README.md
│ │ │ │ ├── agent.sh
│ │ │ │ ├── grpo_32b_full.sh
│ │ │ │ ├── grpo_7b.sh
│ │ │ │ ├── moe_full.sh
│ │ │ │ ├── moe_lora.sh
│ │ │ │ ├── vllm_gym.sh
│ │ │ │ └── vllm_multi_turn.sh
│ │ │ ├── internal/
│ │ │ │ ├── README.md
│ │ │ │ ├── chord.sh
│ │ │ │ ├── full_lmdeploy.sh
│ │ │ │ ├── gspo.sh
│ │ │ │ ├── moe_full.sh
│ │ │ │ ├── moe_lora.sh
│ │ │ │ ├── qlora.sh
│ │ │ │ ├── reinforce_plus_plus.sh
│ │ │ │ ├── rloo.sh
│ │ │ │ ├── sapo.sh
│ │ │ │ ├── transformers.sh
│ │ │ │ ├── vllm_72b_4gpu.sh
│ │ │ │ ├── vllm_lora_qwenvl72b.sh
│ │ │ │ ├── vllm_multi_turn.sh
│ │ │ │ └── vllm_vl7b.sh
│ │ │ ├── multi_node/
│ │ │ │ ├── Qwen2_5_32B_full.sh
│ │ │ │ ├── colocate_multi_node1.sh
│ │ │ │ ├── colocate_multi_node2.sh
│ │ │ │ ├── server_multi_node.sh
│ │ │ │ └── train_dlc.sh
│ │ │ ├── plugin/
│ │ │ │ ├── deepeyes/
│ │ │ │ │ ├── deepeyes.sh
│ │ │ │ │ └── deepeyes_plugin.py
│ │ │ │ ├── gsm8k/
│ │ │ │ │ ├── gsm8k.sh
│ │ │ │ │ └── gsm8k_plugin.py
│ │ │ │ ├── plugin.py
│ │ │ │ ├── run_external_reward_func.sh
│ │ │ │ ├── run_external_reward_model.sh
│ │ │ │ ├── run_external_scheduler.sh
│ │ │ │ └── treepo/
│ │ │ │ ├── tree_rollout.py
│ │ │ │ ├── tree_rollout.sh
│ │ │ │ └── tree_rollout_plugin.py
│ │ │ ├── prompt.txt
│ │ │ └── qwen2_5_omni/
│ │ │ ├── grpo.sh
│ │ │ └── infer.sh
│ │ ├── infer.sh
│ │ ├── liger/
│ │ │ └── sft.sh
│ │ ├── lora_sft.sh
│ │ ├── moe/
│ │ │ ├── llama4.sh
│ │ │ └── qwen3_moe.sh
│ │ ├── multi-gpu/
│ │ │ ├── ddp/
│ │ │ │ └── train.sh
│ │ │ ├── ddp_device_map/
│ │ │ │ └── train.sh
│ │ │ ├── deepspeed/
│ │ │ │ ├── train_zero2.sh
│ │ │ │ └── train_zero3.sh
│ │ │ ├── device_map/
│ │ │ │ └── train.sh
│ │ │ ├── fsdp2_lora/
│ │ │ │ ├── fsdp2.json
│ │ │ │ └── train.sh
│ │ │ └── fsdp_qlora/
│ │ │ ├── fsdp_offload.json
│ │ │ └── train.sh
│ │ ├── multi-node/
│ │ │ ├── accelerate/
│ │ │ │ ├── multi_node.yaml
│ │ │ │ ├── train_node1.sh
│ │ │ │ └── train_node2.sh
│ │ │ ├── deepspeed/
│ │ │ │ ├── README.md
│ │ │ │ ├── host.txt
│ │ │ │ └── train.sh
│ │ │ ├── dlc/
│ │ │ │ └── train.sh
│ │ │ ├── ray/
│ │ │ │ ├── sft.sh
│ │ │ │ └── sft.yaml
│ │ │ ├── swift/
│ │ │ │ ├── train_node1.sh
│ │ │ │ └── train_node2.sh
│ │ │ └── torchrun/
│ │ │ ├── train_node1.sh
│ │ │ └── train_node2.sh
│ │ ├── multimodal/
│ │ │ ├── audio.sh
│ │ │ ├── caption.sh
│ │ │ ├── grounding.sh
│ │ │ ├── infer.sh
│ │ │ ├── lora_llm_full_vit/
│ │ │ │ ├── infer.sh
│ │ │ │ ├── merge_lora.sh
│ │ │ │ ├── seq_cls.sh
│ │ │ │ └── sft.sh
│ │ │ ├── ocr.sh
│ │ │ ├── omni/
│ │ │ │ ├── infer.sh
│ │ │ │ └── sft.sh
│ │ │ ├── rlhf/
│ │ │ │ ├── dpo/
│ │ │ │ │ ├── full.sh
│ │ │ │ │ └── lora.sh
│ │ │ │ ├── gkd/
│ │ │ │ │ ├── fast.sh
│ │ │ │ │ └── full.sh
│ │ │ │ └── kto.sh
│ │ │ ├── video.sh
│ │ │ └── vit_gradient_checkpointing.sh
│ │ ├── new_special_tokens/
│ │ │ ├── infer.sh
│ │ │ ├── merge_lora.sh
│ │ │ ├── tokens.txt
│ │ │ └── train.sh
│ │ ├── on_policy_distillation.sh
│ │ ├── optimizer/
│ │ │ ├── muon.sh
│ │ │ └── muonclip.sh
│ │ ├── packing/
│ │ │ ├── dpo.sh
│ │ │ ├── dpo_vlm.sh
│ │ │ ├── liger_kernel.sh
│ │ │ ├── llm.sh
│ │ │ ├── qwen2_5_omni.sh
│ │ │ ├── qwen2_5_vl.sh
│ │ │ └── streaming.sh
│ │ ├── padding_free/
│ │ │ ├── dpo_vlm.sh
│ │ │ └── sft.sh
│ │ ├── plugins/
│ │ │ ├── loss_scale.sh
│ │ │ └── tuner_phi4_mm.sh
│ │ ├── predict_with_generate/
│ │ │ └── train.sh
│ │ ├── pretrain/
│ │ │ └── train.sh
│ │ ├── qlora/
│ │ │ ├── awq/
│ │ │ │ ├── merge_lora.sh
│ │ │ │ └── train.sh
│ │ │ ├── bnb/
│ │ │ │ ├── merge_lora.sh
│ │ │ │ └── train.sh
│ │ │ ├── gptq.sh
│ │ │ └── hqq.sh
│ │ ├── reranker/
│ │ │ ├── qwen3/
│ │ │ │ ├── infer.py
│ │ │ │ ├── qwen3_reranker.sh
│ │ │ │ └── qwen3_vl_reranker.sh
│ │ │ ├── train_generative_reranker.sh
│ │ │ ├── train_generative_reranker_listwise.sh
│ │ │ ├── train_reranker.sh
│ │ │ ├── train_reranker_auto_patch.sh
│ │ │ ├── train_reranker_listwise.sh
│ │ │ └── train_reranker_mm.sh
│ │ ├── rft/
│ │ │ ├── math.json
│ │ │ └── rft.py
│ │ ├── rlhf/
│ │ │ ├── README.md
│ │ │ ├── cpo.sh
│ │ │ ├── dpo/
│ │ │ │ ├── full.sh
│ │ │ │ └── lora.sh
│ │ │ ├── gkd/
│ │ │ │ ├── fast.sh
│ │ │ │ ├── full.sh
│ │ │ │ ├── teacher_server.sh
│ │ │ │ ├── think_model.sh
│ │ │ │ ├── vllm_colocate.sh
│ │ │ │ └── vllm_server.sh
│ │ │ ├── kto.sh
│ │ │ ├── mpo.sh
│ │ │ ├── opsd/
│ │ │ │ ├── opsd.sh
│ │ │ │ └── opsd_plugin.py
│ │ │ ├── orpo.sh
│ │ │ ├── ppo/
│ │ │ │ ├── full.sh
│ │ │ │ └── lora.sh
│ │ │ ├── rm.sh
│ │ │ └── simpo.sh
│ │ ├── seq_cls/
│ │ │ ├── bert/
│ │ │ │ ├── deploy.sh
│ │ │ │ ├── infer.sh
│ │ │ │ └── sft.sh
│ │ │ ├── multi_label/
│ │ │ │ ├── infer.py
│ │ │ │ ├── infer.sh
│ │ │ │ ├── sft.sh
│ │ │ │ └── vlm.sh
│ │ │ ├── qwen2_5/
│ │ │ │ ├── deploy.sh
│ │ │ │ ├── infer.sh
│ │ │ │ └── sft.sh
│ │ │ ├── qwen2_5_omni/
│ │ │ │ ├── infer.py
│ │ │ │ ├── infer.sh
│ │ │ │ └── sft.sh
│ │ │ └── regression/
│ │ │ ├── deploy.sh
│ │ │ ├── infer.sh
│ │ │ └── sft.sh
│ │ ├── sequence_parallel/
│ │ │ ├── sequence_parallel.sh
│ │ │ ├── sequence_parallel_512k.sh
│ │ │ ├── sequence_parallel_dpo.sh
│ │ │ ├── sequence_parallel_emb.sh
│ │ │ ├── sequence_parallel_grpo.sh
│ │ │ ├── sequence_parallel_reranker.sh
│ │ │ └── sequence_parallel_seq_cls.sh
│ │ ├── streaming/
│ │ │ ├── lazy_tokenize.sh
│ │ │ └── streaming.sh
│ │ ├── think_model/
│ │ │ ├── deepseek_r1.sh
│ │ │ ├── qwen3_demo1.sh
│ │ │ └── qwen3_demo2.sh
│ │ └── tuners/
│ │ ├── adalora/
│ │ │ └── train.sh
│ │ ├── adapter/
│ │ │ └── train.sh
│ │ ├── boft/
│ │ │ └── train.sh
│ │ ├── bone/
│ │ │ └── train.sh
│ │ ├── dora/
│ │ │ └── train.sh
│ │ ├── galore/
│ │ │ ├── train_galore.sh
│ │ │ └── train_qgalore.sh
│ │ ├── lisa/
│ │ │ └── train.sh
│ │ ├── llamapro/
│ │ │ └── train.sh
│ │ ├── longlora/
│ │ │ └── train.sh
│ │ ├── lora/
│ │ │ └── train.sh
│ │ ├── lora-ga/
│ │ │ └── train.sh
│ │ ├── neftune/
│ │ │ └── train.sh
│ │ ├── olora/
│ │ │ └── train.sh
│ │ ├── pissa/
│ │ │ └── train.sh
│ │ ├── qlora/
│ │ │ └── train.sh
│ │ ├── reft/
│ │ │ └── train.sh
│ │ └── unsloth/
│ │ └── train.sh
│ └── yaml/
│ ├── sft.sh
│ └── sft.yaml
├── requirements/
│ ├── docs.txt
│ ├── eval.txt
│ ├── framework.txt
│ ├── install_all.sh
│ ├── ray.txt
│ ├── swanlab.txt
│ └── tests.txt
├── requirements.txt
├── scripts/
│ ├── benchmark/
│ │ ├── config/
│ │ │ └── tuner.json
│ │ ├── exp.py
│ │ ├── exp_utils.py
│ │ └── generate_report.py
│ └── utils/
│ ├── plot_loss.py
│ ├── run_dataset_info.py
│ ├── run_model_info.py
│ ├── run_template.py
│ └── test_link_valid.py
├── setup.cfg
├── setup.py
├── swift/
│ ├── __init__.py
│ ├── agent_template/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── deepseek_v3_1.py
│ │ ├── extra.py
│ │ ├── glm4.py
│ │ ├── hermes.py
│ │ ├── llama.py
│ │ ├── mapping.py
│ │ ├── minimax_m2.py
│ │ ├── mistral.py
│ │ ├── qwen.py
│ │ ├── qwen3_coder.py
│ │ ├── react.py
│ │ ├── seed_oss.py
│ │ ├── toolbench.py
│ │ └── youtu.py
│ ├── arguments/
│ │ ├── __init__.py
│ │ ├── app_args.py
│ │ ├── base_args/
│ │ │ ├── __init__.py
│ │ │ ├── base_args.py
│ │ │ ├── data_args.py
│ │ │ ├── generation_args.py
│ │ │ ├── model_args.py
│ │ │ ├── quant_args.py
│ │ │ └── template_args.py
│ │ ├── deploy_args.py
│ │ ├── eval_args.py
│ │ ├── export_args.py
│ │ ├── infer_args.py
│ │ ├── merge_args.py
│ │ ├── pretrain_args.py
│ │ ├── rlhf_args.py
│ │ ├── sampling_args.py
│ │ ├── sft_args.py
│ │ ├── tuner_args.py
│ │ └── webui_args.py
│ ├── callbacks/
│ │ ├── __init__.py
│ │ ├── activation_cpu_offload.py
│ │ ├── adalora.py
│ │ ├── base.py
│ │ ├── deepspeed_elastic.py
│ │ ├── early_stop.py
│ │ ├── lisa.py
│ │ ├── mapping.py
│ │ └── perf_log.py
│ ├── cli/
│ │ ├── __init__.py
│ │ ├── _megatron/
│ │ │ ├── __init__.py
│ │ │ ├── export.py
│ │ │ ├── main.py
│ │ │ ├── pt.py
│ │ │ ├── rlhf.py
│ │ │ └── sft.py
│ │ ├── app.py
│ │ ├── deploy.py
│ │ ├── eval.py
│ │ ├── export.py
│ │ ├── infer.py
│ │ ├── main.py
│ │ ├── merge_lora.py
│ │ ├── pt.py
│ │ ├── rlhf.py
│ │ ├── rollout.py
│ │ ├── sample.py
│ │ ├── sft.py
│ │ ├── utils.py
│ │ └── web_ui.py
│ ├── config/
│ │ ├── fsdp2.json
│ │ ├── zero0.json
│ │ ├── zero1.json
│ │ ├── zero2.json
│ │ ├── zero2_offload.json
│ │ ├── zero3.json
│ │ └── zero3_offload.json
│ ├── dataloader/
│ │ ├── __init__.py
│ │ ├── dispatcher.py
│ │ └── shard.py
│ ├── dataset/
│ │ ├── __init__.py
│ │ ├── data/
│ │ │ └── dataset_info.json
│ │ ├── dataset/
│ │ │ ├── __init__.py
│ │ │ ├── llm.py
│ │ │ └── mllm.py
│ │ ├── dataset_meta.py
│ │ ├── dataset_syntax.py
│ │ ├── indexed_dataset.py
│ │ ├── loader.py
│ │ ├── media.py
│ │ ├── packing.py
│ │ ├── preprocessor/
│ │ │ ├── __init__.py
│ │ │ ├── core.py
│ │ │ └── extra.py
│ │ ├── register.py
│ │ └── utils.py
│ ├── hub/
│ │ ├── __init__.py
│ │ ├── constant.py
│ │ └── hub.py
│ ├── infer_engine/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── grpo_vllm_engine.py
│ │ ├── infer_client.py
│ │ ├── infer_engine.py
│ │ ├── lmdeploy_engine.py
│ │ ├── patch.py
│ │ ├── protocol.py
│ │ ├── sglang_engine.py
│ │ ├── transformers_engine.py
│ │ ├── utils.py
│ │ └── vllm_engine.py
│ ├── loss/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── causal_lm.py
│ │ ├── embedding.py
│ │ ├── mapping.py
│ │ └── reranker.py
│ ├── loss_scale/
│ │ ├── __init__.py
│ │ ├── agent.py
│ │ ├── base.py
│ │ ├── config/
│ │ │ ├── agentflan.json
│ │ │ ├── alpha_umi.json
│ │ │ ├── hermes.json
│ │ │ ├── ignore_empty_think.json
│ │ │ ├── qwen.json
│ │ │ └── react.json
│ │ ├── mapping.py
│ │ ├── other.py
│ │ └── utils.py
│ ├── megatron/
│ │ ├── __init__.py
│ │ ├── arguments/
│ │ │ ├── __init__.py
│ │ │ ├── export_args.py
│ │ │ ├── megatron_args.py
│ │ │ ├── megatron_base_args.py
│ │ │ ├── pretrain_args.py
│ │ │ ├── rlhf_args.py
│ │ │ └── sft_args.py
│ │ ├── callbacks/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── default_flow.py
│ │ │ ├── mapping.py
│ │ │ ├── print.py
│ │ │ ├── swanlab.py
│ │ │ ├── tensorboard.py
│ │ │ ├── utils.py
│ │ │ └── wandb.py
│ │ ├── convert.py
│ │ ├── init.py
│ │ ├── model/
│ │ │ ├── __init__.py
│ │ │ ├── constant.py
│ │ │ ├── gpt_bridge.py
│ │ │ ├── gpt_model.py
│ │ │ ├── gpts/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── glm4.py
│ │ │ │ ├── minimax_m2.py
│ │ │ │ ├── olmoe.py
│ │ │ │ ├── qwen3_emb.py
│ │ │ │ └── qwen3_next.py
│ │ │ ├── mm_gpt_model.py
│ │ │ ├── mm_gpts/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── glm.py
│ │ │ │ ├── internvl.py
│ │ │ │ ├── kimi_vl.py
│ │ │ │ ├── llama4.py
│ │ │ │ ├── qwen.py
│ │ │ │ ├── qwen3_5.py
│ │ │ │ ├── qwen3_5_gdn.py
│ │ │ │ ├── qwen3_vl.py
│ │ │ │ └── utils.py
│ │ │ ├── model_config.py
│ │ │ ├── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── gated_delta_net.py
│ │ │ │ └── gated_self_attention.py
│ │ │ ├── register.py
│ │ │ └── rope.py
│ │ ├── pipelines/
│ │ │ ├── __init__.py
│ │ │ ├── export/
│ │ │ │ ├── __init__.py
│ │ │ │ └── export.py
│ │ │ └── train/
│ │ │ ├── __init__.py
│ │ │ ├── pretrain.py
│ │ │ ├── rlhf.py
│ │ │ └── sft.py
│ │ ├── trainers/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── batch_sampler.py
│ │ │ ├── dpo_trainer.py
│ │ │ ├── embedding_trainer.py
│ │ │ ├── gkd_trainer.py
│ │ │ ├── grpo_trainer.py
│ │ │ ├── kto_trainer.py
│ │ │ ├── reranker_trainer.py
│ │ │ ├── reward_trainer.py
│ │ │ ├── rlhf_mixin.py
│ │ │ ├── rollout_mixin.py
│ │ │ ├── trainer.py
│ │ │ ├── utils.py
│ │ │ └── vocab_parallel_utils.py
│ │ ├── tuners/
│ │ │ ├── __init__.py
│ │ │ └── lora.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── convert_utils.py
│ │ ├── megatron_lm_utils.py
│ │ ├── parallel_utils.py
│ │ ├── patcher.py
│ │ └── utils.py
│ ├── metrics/
│ │ ├── __init__.py
│ │ ├── acc.py
│ │ ├── base.py
│ │ ├── embedding.py
│ │ ├── mapping.py
│ │ ├── nlg.py
│ │ ├── reranker.py
│ │ └── utils.py
│ ├── model/
│ │ ├── __init__.py
│ │ ├── constant.py
│ │ ├── model_arch.py
│ │ ├── model_meta.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── baai.py
│ │ │ ├── baichuan.py
│ │ │ ├── baidu.py
│ │ │ ├── bert.py
│ │ │ ├── codefuse.py
│ │ │ ├── deepseek.py
│ │ │ ├── gemma.py
│ │ │ ├── glm.py
│ │ │ ├── internlm.py
│ │ │ ├── llama.py
│ │ │ ├── llava.py
│ │ │ ├── llm.py
│ │ │ ├── mamba.py
│ │ │ ├── microsoft.py
│ │ │ ├── minicpm.py
│ │ │ ├── minimax.py
│ │ │ ├── mistral.py
│ │ │ ├── mllm.py
│ │ │ ├── moonshot.py
│ │ │ ├── mplug.py
│ │ │ ├── openbuddy.py
│ │ │ ├── qwen.py
│ │ │ ├── seed.py
│ │ │ ├── skywork.py
│ │ │ ├── stepfun.py
│ │ │ ├── telechat.py
│ │ │ ├── tencent.py
│ │ │ ├── valley.py
│ │ │ └── yi.py
│ │ ├── npu_patcher.py
│ │ ├── patcher.py
│ │ ├── register.py
│ │ └── utils.py
│ ├── optimizers/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── galore/
│ │ │ ├── __init__.py
│ │ │ ├── adafactor.py
│ │ │ ├── adamw.py
│ │ │ ├── adamw8bit.py
│ │ │ ├── galore_projector.py
│ │ │ └── utils.py
│ │ ├── lorap.py
│ │ ├── mapping.py
│ │ ├── multimodal.py
│ │ ├── muon.py
│ │ └── muonclip.py
│ ├── pipelines/
│ │ ├── __init__.py
│ │ ├── app/
│ │ │ ├── __init__.py
│ │ │ ├── app.py
│ │ │ ├── build_ui.py
│ │ │ └── locale.py
│ │ ├── base.py
│ │ ├── eval/
│ │ │ ├── __init__.py
│ │ │ ├── eval.py
│ │ │ └── utils.py
│ │ ├── export/
│ │ │ ├── __init__.py
│ │ │ ├── cached_dataset.py
│ │ │ ├── export.py
│ │ │ ├── merge_lora.py
│ │ │ ├── ollama.py
│ │ │ └── quant.py
│ │ ├── infer/
│ │ │ ├── __init__.py
│ │ │ ├── deploy.py
│ │ │ ├── infer.py
│ │ │ ├── rollout.py
│ │ │ └── utils.py
│ │ ├── sampling/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── distill_sampler.py
│ │ │ ├── sampling.py
│ │ │ ├── utils.py
│ │ │ └── vanilla_sampler.py
│ │ ├── train/
│ │ │ ├── __init__.py
│ │ │ ├── kto.py
│ │ │ ├── pretrain.py
│ │ │ ├── rlhf.py
│ │ │ ├── sft.py
│ │ │ └── tuner.py
│ │ └── utils.py
│ ├── ray/
│ │ ├── __init__.py
│ │ ├── arguments.py
│ │ ├── base.py
│ │ └── resource_manager.py
│ ├── rewards/
│ │ ├── __init__.py
│ │ ├── orm.py
│ │ ├── prm.py
│ │ └── rm_plugin.py
│ ├── rlhf_trainers/
│ │ ├── __init__.py
│ │ ├── args_mixin.py
│ │ ├── arguments.py
│ │ ├── cpo_trainer.py
│ │ ├── dpo_trainer.py
│ │ ├── gkd_trainer.py
│ │ ├── grpo_trainer.py
│ │ ├── kto_trainer.py
│ │ ├── orpo_trainer.py
│ │ ├── ppo_trainer.py
│ │ ├── reward_trainer.py
│ │ ├── rlhf_mixin.py
│ │ ├── rollout_mixin.py
│ │ ├── utils.py
│ │ └── vllm_client.py
│ ├── rollout/
│ │ ├── __init__.py
│ │ ├── gym_env.py
│ │ └── multi_turn.py
│ ├── sequence_parallel/
│ │ ├── __init__.py
│ │ ├── ulysses.py
│ │ ├── utils.py
│ │ └── zigzag_ring_attn.py
│ ├── template/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── constant.py
│ │ ├── grounding.py
│ │ ├── register.py
│ │ ├── template_inputs.py
│ │ ├── template_meta.py
│ │ ├── templates/
│ │ │ ├── __init__.py
│ │ │ ├── baai.py
│ │ │ ├── baidu.py
│ │ │ ├── bert.py
│ │ │ ├── deepseek.py
│ │ │ ├── dots.py
│ │ │ ├── gemma.py
│ │ │ ├── glm.py
│ │ │ ├── idefics3.py
│ │ │ ├── internlm.py
│ │ │ ├── internvl.py
│ │ │ ├── kwai.py
│ │ │ ├── llama.py
│ │ │ ├── llava.py
│ │ │ ├── llm.py
│ │ │ ├── megrez.py
│ │ │ ├── microsoft.py
│ │ │ ├── midashenglm.py
│ │ │ ├── minicpm.py
│ │ │ ├── minimax.py
│ │ │ ├── minimind.py
│ │ │ ├── mistral.py
│ │ │ ├── molmo.py
│ │ │ ├── moonshot.py
│ │ │ ├── mplug.py
│ │ │ ├── openbuddy.py
│ │ │ ├── pixtral.py
│ │ │ ├── qwen.py
│ │ │ ├── seed.py
│ │ │ ├── stepfun.py
│ │ │ ├── tencent.py
│ │ │ ├── utils.py
│ │ │ ├── valley.py
│ │ │ └── yi.py
│ │ ├── utils.py
│ │ └── vision_utils.py
│ ├── trainers/
│ │ ├── __init__.py
│ │ ├── arguments.py
│ │ ├── embedding_trainer.py
│ │ ├── mixin.py
│ │ ├── patcher.py
│ │ ├── reranker_trainer.py
│ │ ├── seq2seq_trainer.py
│ │ ├── trainer.py
│ │ ├── trainer_factory.py
│ │ └── utils.py
│ ├── tuner_plugin/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── dummy.py
│ │ ├── ia3.py
│ │ ├── lora_llm.py
│ │ └── mapping.py
│ ├── tuners/
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── base.py
│ │ ├── llamapro.py
│ │ ├── longlora/
│ │ │ ├── __init__.py
│ │ │ ├── llama.py
│ │ │ └── longlora.py
│ │ ├── lora.py
│ │ ├── lora_layers.py
│ │ ├── mapping.py
│ │ ├── neftune.py
│ │ ├── part.py
│ │ ├── peft.py
│ │ ├── prompt.py
│ │ ├── reft.py
│ │ ├── restuning.py
│ │ ├── restuning_components.py
│ │ ├── scetuning/
│ │ │ ├── __init__.py
│ │ │ ├── scetuning.py
│ │ │ └── scetuning_components.py
│ │ ├── side.py
│ │ └── utils.py
│ ├── ui/
│ │ ├── __init__.py
│ │ ├── app.py
│ │ ├── base.py
│ │ ├── llm_eval/
│ │ │ ├── __init__.py
│ │ │ ├── eval.py
│ │ │ ├── llm_eval.py
│ │ │ ├── model.py
│ │ │ └── runtime.py
│ │ ├── llm_export/
│ │ │ ├── __init__.py
│ │ │ ├── export.py
│ │ │ ├── llm_export.py
│ │ │ ├── model.py
│ │ │ └── runtime.py
│ │ ├── llm_grpo/
│ │ │ ├── __init__.py
│ │ │ ├── advanced.py
│ │ │ ├── dataset.py
│ │ │ ├── external_rollout.py
│ │ │ ├── external_runtime.py
│ │ │ ├── grpo_advanced.py
│ │ │ ├── hyper.py
│ │ │ ├── llm_grpo.py
│ │ │ ├── lora.py
│ │ │ ├── model.py
│ │ │ ├── optimizer.py
│ │ │ ├── quantization.py
│ │ │ ├── report_to.py
│ │ │ ├── reward.py
│ │ │ ├── rollout.py
│ │ │ ├── runtime.py
│ │ │ ├── save.py
│ │ │ ├── target.py
│ │ │ └── tuner.py
│ │ ├── llm_infer/
│ │ │ ├── __init__.py
│ │ │ ├── generate.py
│ │ │ ├── llm_infer.py
│ │ │ ├── model.py
│ │ │ └── runtime.py
│ │ ├── llm_rlhf/
│ │ │ ├── __init__.py
│ │ │ ├── advanced.py
│ │ │ ├── dataset.py
│ │ │ ├── hyper.py
│ │ │ ├── llm_rlhf.py
│ │ │ ├── lora.py
│ │ │ ├── model.py
│ │ │ ├── optimizer.py
│ │ │ ├── quantization.py
│ │ │ ├── report_to.py
│ │ │ ├── rlhf.py
│ │ │ ├── runtime.py
│ │ │ ├── save.py
│ │ │ ├── target.py
│ │ │ └── tuner.py
│ │ ├── llm_sample/
│ │ │ ├── __init__.py
│ │ │ ├── llm_sample.py
│ │ │ ├── model.py
│ │ │ ├── runtime.py
│ │ │ └── sample.py
│ │ └── llm_train/
│ │ ├── __init__.py
│ │ ├── advanced.py
│ │ ├── dataset.py
│ │ ├── hyper.py
│ │ ├── llm_train.py
│ │ ├── lora.py
│ │ ├── model.py
│ │ ├── optimizer.py
│ │ ├── quantization.py
│ │ ├── report_to.py
│ │ ├── runtime.py
│ │ ├── save.py
│ │ ├── self_cog.py
│ │ ├── target.py
│ │ ├── task.py
│ │ ├── tuner.py
│ │ └── utils.py
│ ├── utils/
│ │ ├── __init__.py
│ │ ├── constants.py
│ │ ├── dequantizer.py
│ │ ├── env.py
│ │ ├── hf_config.py
│ │ ├── hub_utils.py
│ │ ├── import_utils.py
│ │ ├── io_utils.py
│ │ ├── logger.py
│ │ ├── np_utils.py
│ │ ├── processor_utils.py
│ │ ├── safetensors.py
│ │ ├── shutdown_manager.py
│ │ ├── tb_utils.py
│ │ ├── torch_utils.py
│ │ ├── transformers_utils.py
│ │ └── utils.py
│ └── version.py
└── tests/
├── __init__.py
├── app/
│ └── test_app.py
├── deploy/
│ ├── test_dataset.py
│ └── test_logprobs.py
├── eval/
│ └── test_eval.py
├── export/
│ └── test_quant.py
├── general/
│ ├── test_arch.py
│ ├── test_dataset.py
│ ├── test_model.py
│ ├── test_stream.py
│ └── test_template.py
├── hub/
│ ├── __init__.py
│ └── test_check_model.py
├── infer/
│ ├── test_agent.py
│ ├── test_infer.py
│ ├── test_logprobs.py
│ ├── test_main.py
│ ├── test_max_memory.py
│ ├── test_mllm.py
│ └── test_sglang.py
├── llm/
│ ├── __init__.py
│ ├── config/
│ │ ├── infer.json
│ │ └── sft.json
│ ├── data/
│ │ ├── alpaca.csv
│ │ ├── alpaca.jsonl
│ │ ├── alpaca2.csv
│ │ ├── chatml.jsonl
│ │ ├── conversations.jsonl
│ │ ├── multi_modal_1.jsonl
│ │ ├── multi_modal_2.jsonl
│ │ ├── multi_modal_3.jsonl
│ │ ├── sharegpt.jsonl
│ │ ├── swift_multi.json
│ │ ├── swift_multi.jsonl
│ │ ├── swift_pre.csv
│ │ ├── swift_pre.jsonl
│ │ ├── swift_single.csv
│ │ └── swift_single.jsonl
│ ├── test_custom.py
│ ├── test_dataset.py
│ ├── test_ollama_export.py
│ ├── test_run.py
│ ├── test_template.py
│ ├── test_utils.py
│ └── test_web_ui.py
├── megatron/
│ ├── export/
│ │ └── test_export.py
│ ├── test_align/
│ │ ├── test_llm.py
│ │ └── test_mllm.py
│ ├── test_embedding.py
│ ├── test_export.py
│ ├── test_gkd.py
│ ├── test_grpo.py
│ ├── test_kto.py
│ ├── test_lora.py
│ ├── test_rlhf.py
│ └── test_train.py
├── model_tag.py
├── models/
│ ├── test_flash_attn.py
│ ├── test_llm.py
│ └── test_mllm.py
├── run.py
├── run_config.yaml
├── sample/
│ └── test_client.py
├── test_align/
│ ├── test_cls.py
│ ├── test_lmdeploy_vlm.py
│ ├── test_padding_side.py
│ ├── test_rlhf_loss.py
│ ├── test_template/
│ │ ├── test_agent.py
│ │ ├── test_audio.py
│ │ ├── test_gene.py
│ │ ├── test_llm.py
│ │ ├── test_template.py
│ │ ├── test_tool.py
│ │ ├── test_video.py
│ │ └── test_vision.py
│ └── test_vllm_vlm.py
├── test_utils.py
├── train/
│ ├── test_channel.py
│ ├── test_cls.py
│ ├── test_embedding.py
│ ├── test_export_cached_dataset.py
│ ├── test_freeze.py
│ ├── test_gkd.py
│ ├── test_grounding.py
│ ├── test_grpo.py
│ ├── test_kto.py
│ ├── test_liger.py
│ ├── test_multilabel.py
│ ├── test_packing.py
│ ├── test_ppo.py
│ ├── test_pt.py
│ ├── test_resume_from_checkpoint.py
│ ├── test_rlhf.py
│ ├── test_sample.py
│ ├── test_sft.py
│ ├── test_train_eval.py
│ ├── test_vit_lr.py
│ └── test_vllm_importance_sampling_basic.py
├── tuners/
│ ├── __init__.py
│ ├── test_extra_state_dict.py
│ ├── test_merged_linear.py
│ ├── test_neft.py
│ ├── test_peft.py
│ ├── test_scetuning.py
│ ├── test_swift_base.py
│ ├── test_swift_device_map.py
│ └── test_swift_restuning.py
└── utils/
├── __init__.py
├── test_async_rewards.py
├── test_file_utils.py
├── test_io_utils.py
├── test_rewards.py
├── test_split_str_parts_by.py
└── test_torch_utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .dev_scripts/build_docs.sh
================================================
# Build the HTML docs via the Makefile in docs/.
# Must be started from the repository root: requirements/docs.txt and docs/
# are both resolved relative to the current working directory.
pip install -r requirements/docs.txt

# Abort if docs/ is missing; otherwise the `rm -rf build` below would remove a
# `build` directory in whatever directory the script was started from.
cd docs || exit 1
rm -rf build

# update api rst
#rm -rf source/api/
#sphinx-apidoc --module-first -o source/api/ ../modelscope/
make html
================================================
FILE: .dev_scripts/ci_container_test.sh
================================================
# CI entry point for a test run.
#
# Usage: ci_container_test.sh [command [args...]]
#   With no arguments, `python tests/run.py --subprocess` is executed.
#   Otherwise the given command line is executed as-is.
#
# When MODELSCOPE_SDK_DEBUG is "True", test dependencies are installed, the
# linter is optionally run, and the package is installed from the working tree
# before the tests run.
if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
    # pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
    pip install -r requirements/tests.txt -i https://mirrors.aliyun.com/pypi/simple/
    git config --global --add safe.directory /ms-swift
    git config --global user.email tmp
    git config --global user.name tmp.com

    # linter test
    # use internal project for pre-commit due to the network problem
    if [ "$(git remote -v | grep alibaba | wc -l)" -gt 1 ]; then
        if ! pre-commit run -c .pre-commit-config_local.yaml --all-files; then
            echo "linter test failed, please run 'pre-commit run --all-files' to check"
            echo "From the repository folder"
            echo "Run 'pip install -r requirements/tests.txt' install test dependencies."
            echo "Run 'pre-commit install' install pre-commit hooks."
            echo "Finally run linter with command: 'pre-commit run --all-files' to check."
            echo "Ensure there is no failure!!!!!!!!"
            # Exit statuses must be in 0-255; `exit -1` is not portable.
            exit 1
        fi
    fi

    pip install -r requirements/framework.txt -U -i https://mirrors.aliyun.com/pypi/simple/
    pip install decord einops -U -i https://mirrors.aliyun.com/pypi/simple/
    pip uninstall autoawq -y
    pip install optimum
    pip install diffusers
    pip install "transformers<5.0"
    # pip install autoawq -U --no-deps
    # test with install
    pip install .
    pip install auto_gptq bitsandbytes deepspeed -U -i https://mirrors.aliyun.com/pypi/simple/
else
    echo "Running case in release image, run case directly!"
fi

# remove torch_extensions folder to avoid ci hang.
rm -rf ~/.cache/torch_extensions

# Keep the command as an array so each argument stays a separate word and is
# not re-split or glob-expanded when executed.
if [ $# -eq 0 ]; then
    ci_command=(python tests/run.py --subprocess)
else
    ci_command=("$@")
fi
echo "Running case with command: ${ci_command[*]}"
"${ci_command[@]}"
================================================
FILE: .dev_scripts/dockerci.sh
================================================
#!/bin/bash
# Host-side CI launcher.
#
# Tries each GPU group in $gpus in turn, takes a non-blocking file lock on
# /tmp/gpu<group>, and runs $CI_COMMAND inside a docker container pinned to
# that GPU group and the matching CPU set. Only the first free group is used.
#
# Read from the environment: IMAGE_NAME, IMAGE_VERSION, MODELSCOPE_CACHE,
# MODELSCOPE_HOME_CACHE, TEST_LEVEL, PR_CHANGED_FILES, CI_COMMAND and the
# token/endpoint variables forwarded to the container below.
#
# Exit status: 0 on success, 1 if the container run fails, the lock file
# cannot be opened, or no GPU group is free.
MODELSCOPE_CACHE_DIR_IN_CONTAINER=/modelscope_cache
CODE_DIR=$PWD
CODE_DIR_IN_CONTAINER=/ms-swift
MODELSCOPE_SDK_DEBUG=True
echo "$USER"
gpus='0,1 2,3'
cpu_sets='0-15 16-31'
# Intentionally unquoted: split the space-separated list into array elements.
cpu_sets_arr=($cpu_sets)
is_get_file_lock=false
CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh python tests/run.py --parallel 2 --run_config tests/run_config.yaml}
echo "ci command: $CI_COMMAND"
PR_CHANGED_FILES="${PR_CHANGED_FILES:-}"
echo "PR modified files: $PR_CHANGED_FILES"
# Replace every space with '#'.
# NOTE(review): presumably so the file list travels as a single word — confirm with the consumer.
PR_CHANGED_FILES=${PR_CHANGED_FILES//[ ]/#}
echo "PR_CHANGED_FILES: $PR_CHANGED_FILES"

idx=0
for gpu in $gpus
do
    # Open (and create if needed) the per-GPU-group lock file on a fresh fd.
    exec {lock_fd}>"/tmp/gpu$gpu" || exit 1
    # Non-blocking lock: if the group is busy, move on to the next one.
    flock -n "$lock_fd" || { echo "WARN: gpu $gpu is in use!" >&2; idx=$((idx+1)); continue; }
    echo "get gpu lock $gpu"
    CONTAINER_NAME="swift-ci-$idx"
    # Plain assignment: `let is_get_file_lock=true` is arithmetic and would store 0.
    is_get_file_lock=true

    # pull image if there are update
    docker pull "${IMAGE_NAME}:${IMAGE_VERSION}"

    # Options shared by the debug and non-debug runs.
    docker_args=(
        --rm --name "$CONTAINER_NAME" --shm-size=16gb
        --cpuset-cpus="${cpu_sets_arr[$idx]}"
        --gpus='"'"device=$gpu"'"'
        -v "$CODE_DIR:$CODE_DIR_IN_CONTAINER"
        -v "$MODELSCOPE_CACHE:$MODELSCOPE_CACHE_DIR_IN_CONTAINER"
        -v "$MODELSCOPE_HOME_CACHE/$idx:/root"
        -v /home/admin/pre-commit:/home/admin/pre-commit
        -e CI_TEST=True
        -e "TEST_LEVEL=$TEST_LEVEL"
        -e "MODELSCOPE_CACHE=$MODELSCOPE_CACHE_DIR_IN_CONTAINER"
        -e "MODELSCOPE_DOMAIN=$MODELSCOPE_DOMAIN"
        -e "HUB_DATASET_ENDPOINT=$HUB_DATASET_ENDPOINT"
        -e "TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST"
        -e "TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV"
        -e MODELSCOPE_ENVIRONMENT='ci'
        -e "TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN"
        -e "MODEL_TAG_URL=$MODEL_TAG_URL"
        -e "MODELSCOPE_API_TOKEN=$MODELSCOPE_API_TOKEN"
        -e "PR_CHANGED_FILES=$PR_CHANGED_FILES"
        --workdir="$CODE_DIR_IN_CONTAINER"
    )
    # Debug mode only adds one extra environment variable to the container.
    if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
        echo 'debugging'
        docker_args+=(-e MODELSCOPE_SDK_DEBUG=True)
    fi

    # $CI_COMMAND is intentionally unquoted: it holds a whole command line
    # that must be split into separate words for the container entry point.
    if ! docker run "${docker_args[@]}" "${IMAGE_NAME}:${IMAGE_VERSION}" $CI_COMMAND; then
        echo "Running test case failed, please check the log!"
        # Exit statuses must be in 0-255; `exit -1` is not portable.
        exit 1
    fi
    break
done

if [ "$is_get_file_lock" = false ] ; then
    echo 'No free GPU!'
    exit 1
fi
================================================
FILE: .dev_scripts/dockerci_npu.sh
================================================
#!/bin/bash
# CI launcher that runs $CI_COMMAND directly in the current shell (via eval)
# instead of starting a docker container.
#
# Tries each device group in $gpus in turn, takes a non-blocking file lock on
# /tmp/gpu<group>, exports the test environment and evaluates $CI_COMMAND
# from the code directory. Only the first free group is used.
#
# Exit status: 0 on success, 1 if the command fails, the lock file or code
# directory cannot be opened, or no device group is free.
MODELSCOPE_CACHE_DIR=/modelscope_cache
CODE_DIR=$PWD
MODELSCOPE_SDK_DEBUG=True
echo "$USER"
gpus='0,1 2,3'
is_get_file_lock=false
CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh python tests/run.py --parallel 2 --run_config tests/run_config.yaml}
echo "ci command: $CI_COMMAND"
PR_CHANGED_FILES="${PR_CHANGED_FILES:-}"
echo "PR modified files: $PR_CHANGED_FILES"
# Replace every space with '#'.
# NOTE(review): presumably so the file list travels as a single word — confirm with the consumer.
PR_CHANGED_FILES=${PR_CHANGED_FILES//[ ]/#}
echo "PR_CHANGED_FILES: $PR_CHANGED_FILES"

idx=0
for gpu in $gpus
do
    # Open (and create if needed) the per-group lock file on a fresh fd.
    exec {lock_fd}>"/tmp/gpu$gpu" || exit 1
    # Non-blocking lock: if the group is busy, move on to the next one.
    flock -n "$lock_fd" || { echo "WARN: gpu $gpu is in use!" >&2; idx=$((idx+1)); continue; }
    echo "get gpu lock $gpu"
    # Plain assignment: `let is_get_file_lock=true` is arithmetic and would store 0.
    is_get_file_lock=true

    # Set environment variables
    export CI_TEST=True
    export TEST_LEVEL=$TEST_LEVEL
    # Keep a caller-provided MODELSCOPE_CACHE; fall back to the default otherwise.
    export MODELSCOPE_CACHE=${MODELSCOPE_CACHE:-$MODELSCOPE_CACHE_DIR}
    export MODELSCOPE_DOMAIN=$MODELSCOPE_DOMAIN
    export HUB_DATASET_ENDPOINT=$HUB_DATASET_ENDPOINT
    export TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST
    export TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV
    export MODELSCOPE_ENVIRONMENT='ci'
    export TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN
    export MODEL_TAG_URL=$MODEL_TAG_URL
    export MODELSCOPE_API_TOKEN=$MODELSCOPE_API_TOKEN
    export PR_CHANGED_FILES=$PR_CHANGED_FILES
    export CUDA_VISIBLE_DEVICES=$gpu
    if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
        export MODELSCOPE_SDK_DEBUG=True
        echo 'debugging'
    fi

    # Switch to the code directory and run the command.
    # Abort if the directory is unavailable rather than running elsewhere.
    cd "$CODE_DIR" || exit 1
    # Quoted so the command string reaches eval verbatim, without an extra
    # round of word splitting and glob expansion beforehand.
    if ! eval "$CI_COMMAND"; then
        echo "Running test case failed, please check the log!"
        # Exit statuses must be in 0-255; `exit -1` is not portable.
        exit 1
    fi
    break
done

if [ "$is_get_file_lock" = false ] ; then
    echo 'No free GPU!'
    exit 1
fi
================================================
FILE: .github/ISSUE_TEMPLATE/1-bug-report.yml
================================================
name: "🐛 Bug Report"
description: Create a bug report to help us improve ms-swift
labels: ["bug"]
body:
- type: markdown
attributes:
value: |
Thank you for supporting ms-swift and taking the time to submit this issue.
感谢你对 ms-swift 的支持和抽出时间提交相关 issue。
- type: checkboxes
id: checklist
attributes:
label: Checklist / 检查清单
options:
- label: I have searched existing issues, and this is a new bug report. / 我已经搜索过现有的 issues,确认这是一个新的 bug report。
required: true
- type: textarea
id: bug-description
validations:
required: true
attributes:
label: Bug Description / Bug 描述
description: |
Please describe the issue you encountered. It's better to include error screenshots or stack trace information.
请详细描述你遇到的问题,最好包含报错截图或报错栈信息。
- type: textarea
id: reproduction-steps
validations:
required: true
attributes:
label: How to Reproduce / 如何复现
description: |
Please provide steps to reproduce the issue, including ms-swift version, runtime environment, and detailed reproduction steps.
请提供复现问题的步骤,包括 ms-swift 的版本、运行环境、详细的复现步骤等。
- type: textarea
id: additional-information
attributes:
label: Additional Information / 补充信息
description: |
Please provide any additional information here.
在这里补充其他相关信息。
================================================
FILE: .github/ISSUE_TEMPLATE/2-feature-request.yml
================================================
name: "🚀 Feature Request"
description: Submit a request for a new feature
labels: ["enhancement"]
body:
- type: markdown
attributes:
value: |
Thank you for supporting ms-swift and taking the time to submit this issue.
感谢你对 ms-swift 的支持和抽出时间提交相关 issue。
- type: checkboxes
id: checklist
attributes:
label: Checklist / 检查清单
options:
- label: I have searched existing issues, and this is a new feature request. / 我已经搜索过现有的 issues,确认这是一个新的 Feature Request。
required: true
- type: textarea
id: feature-request-description
validations:
required: true
attributes:
label: Feature Request Description / Feature Request 描述
description: |
Please provide a detailed description of the new feature you would like to see added.
请详细描述您希望添加的新功能特性。
- type: textarea
id: pull-request
attributes:
label: Pull Request / Pull Request 信息
description: |
Have you already submitted, or do you plan to submit, a Pull Request? Please share your plans.
你是否已经提交或即将提交 Pull Request?请说明你的计划。
================================================
FILE: .github/ISSUE_TEMPLATE/3-question-discussion.yml
================================================
name: "🤔 Question & Discussion"
description: Create an issue for questions and discussions
labels: ["question"]
body:
- type: markdown
attributes:
value: |
Thank you for supporting ms-swift and taking the time to submit this issue.
感谢你对 ms-swift 的支持和抽出时间提交相关 issue。
- type: checkboxes
id: checklist
attributes:
label: Checklist / 检查清单
options:
- label: I have searched existing issues, and this is a new question or discussion topic. / 我已经搜索过现有的 issues,确认这是一个新的问题与讨论。
required: true
- type: textarea
id: question-description
validations:
required: true
attributes:
label: Question Description / 问题描述
description: |
Please describe the question or topic you would like to discuss.
请描述你想要讨论的问题或话题。
================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: false
================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
# PR type
- [ ] Bug Fix
- [ ] New Feature
- [ ] Document Updates
- [ ] More Models or Datasets Support
# PR information
Write the detailed information about this PR.
## Experiment results
Paste your experiment results here (if needed).
================================================
FILE: .github/SECURITY.md
================================================
# Reporting Security Issues
Security issues in a deep learning project usually come from non-standard third-party packages or continuously running services. If you encounter a security issue in our project, please consider reporting it to us. We appreciate your efforts to responsibly disclose your findings, and will make every effort to acknowledge your contributions.
================================================
FILE: .github/workflows/citest.yaml
================================================
name: citest
on:
push:
branches:
- master
- "release/**"
paths-ignore:
- "setup.*"
- "requirements.txt"
- "requirements/**"
- "docs/**"
- "tools/**"
- ".dev_scripts/**"
- "README.md"
- "README_*.md"
- "NOTICE"
- ".github/workflows/lint.yaml"
- ".github/workflows/publish.yaml"
pull_request:
paths-ignore:
- "setup.*"
- "requirements.txt"
- "requirements/**"
- "docs/**"
- "tools/**"
- ".dev_scripts/**"
- "README.md"
- "README_*.md"
- "NOTICE"
- ".github/workflows/lint.yaml"
- ".github/workflows/publish.yaml"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
unittest:
# The type of runner that the job will run on
runs-on: [self-hosted]
timeout-minutes: 240
steps:
- name: ResetFileMode
shell: bash
run: |
# reset filemode to allow action runner to delete files
# generated by root in docker
set -e
source ~/.bashrc
sudo chown -R $USER:$USER $GITHUB_WORKSPACE
- name: Checkout
uses: actions/checkout@v3
env:
GIT_CONFIG_PARAMETERS: "'core.hooksPath='"
with:
lfs: 'true'
submodules: 'false'
fetch-depth: ${{ github.event_name == 'pull_request' && 2 || 0 }}
- name: Get changed files
id: changed-files
run: |
if ${{ github.event_name == 'pull_request' }}; then
echo "PR_CHANGED_FILES=$(git diff --name-only -r HEAD^1 HEAD | xargs)" >> $GITHUB_ENV
else
echo "PR_CHANGED_FILES=$(git diff --name-only ${{ github.event.before }} ${{ github.event.after }} | xargs)" >> $GITHUB_ENV
fi
- name: Checkout LFS objects
run: git lfs checkout
- name: Run unittest
shell: bash
run: |
set -e
source /mnt/modelscope/ci_env.sh
bash .dev_scripts/dockerci.sh
================================================
FILE: .github/workflows/citest_npu.yaml
================================================
name: citest-npu
on:
push:
branches:
- master
- "release/**"
paths-ignore:
- "setup.*"
- "requirements.txt"
- "requirements/**"
- "docs/**"
- "tools/**"
- ".dev_scripts/**"
- "README.md"
- "README_*.md"
- "NOTICE"
- ".github/workflows/lint.yaml"
- ".github/workflows/publish.yaml"
pull_request:
paths-ignore:
- "setup.*"
- "requirements.txt"
- "requirements/**"
- "docs/**"
- "tools/**"
- ".dev_scripts/**"
- "README.md"
- "README_*.md"
- "NOTICE"
- ".github/workflows/lint.yaml"
- ".github/workflows/publish.yaml"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
unittest:
# The type of runner that the job will run on
runs-on: [linux-aarch64-a2-1]
timeout-minutes: 240
container:
image: 'ascendai/cann:8.3.rc2-910b-ubuntu22.04-py3.11'
steps:
- name: Config mirrors
run: |
sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: ${{ github.event_name == 'pull_request' && 2 || 0 }}
- name: Get changed files
id: changed-files
run: |
if ${{ github.event_name == 'pull_request' }}; then
echo "PR_CHANGED_FILES=$(git diff --name-only -r HEAD^1 HEAD | xargs)" >> $GITHUB_ENV
else
echo "PR_CHANGED_FILES=$(git diff --name-only ${{ github.event.before }} ${{ github.event.after }} | xargs)" >> $GITHUB_ENV
fi
- name: Run unittest
shell: bash
run: |
set -e
export IMAGE_NAME=ascendai/cann
export IMAGE_VERSION=8.3.rc2-910b-ubuntu22.04-py3.11
export TEST_LEVEL=0
mkdir -p ~/.cache
export MODELSCOPE_CACHE=~/.cache
export CI_COMMAND='bash .dev_scripts/ci_container_test.sh python tests/run.py --parallel 2 --subprocess --run_config tests/run_config.yaml'
bash .dev_scripts/dockerci_npu.sh
================================================
FILE: .github/workflows/close_tale_issue.yaml
================================================
name: Close Stale Issues
on:
schedule:
- cron: '0 0 * * *'
workflow_dispatch:
jobs:
close-stale:
runs-on: ubuntu-latest
steps:
- name: Close stale issues
uses: actions/stale@v8
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
days-before-stale: 90
days-before-close: 7
stale-issue-message: 'This issue has been inactive for over 3 months and will be automatically closed in 7 days. If this issue is still relevant, please reply to this message.'
close-issue-message: 'This issue has been automatically closed due to inactivity. If needed, it can be reopened.'
stale-issue-label: 'stale'
exempt-all-issue-labels: true
================================================
FILE: .github/workflows/lint.yaml
================================================
name: Lint test
on: [push, pull_request]
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.10
uses: actions/setup-python@v2
with:
python-version: '3.10'
- name: Install pre-commit hook
run: |
pip install pre-commit
- name: Linting
run: pre-commit run --all-files
================================================
FILE: .github/workflows/publish.yaml
================================================
name: release
on:
push:
tags:
- 'v**'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-publish
cancel-in-progress: true
jobs:
build-n-publish:
runs-on: ubuntu-22.04
#if: startsWith(github.event.ref, 'refs/tags')
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.10
uses: actions/setup-python@v2
with:
python-version: '3.10'
- name: Install wheel
run: pip install wheel packaging setuptools==69.5.1
- name: Build ModelScope Swift
run: python setup.py sdist bdist_wheel
- name: Publish package to PyPI
run: |
pip install twine
twine upload dist/* --skip-existing -u __token__ -p ${{ secrets.PYPI_API_TOKEN }}
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
tmp
*.ttf
__pycache__/
*.py[cod]
*$py.class
test.py
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
/package
/temp
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.vscode
.idea
.run
# custom
*.pkl
*.pkl.json
*.log.json
*.whl
*.tar.gz
*.swp
*.log
*.tar.gz
source.sh
tensorboard.sh
.DS_Store
replace.sh
result.png
result.jpg
result.mp4
output/
outputs/
wandb/
swanlog/
*.out
benchmarks/
eval_output/
eval_outputs/
vlmeval/
my_model/
/data
result/
images
/custom/
megatron_output/
/*-mcore/
/*-hf/
/*_cached_dataset/
/sample_output/
# Pytorch
*.pth
*.pt
# ast template
ast_index_file.py
================================================
FILE: .pre-commit-config.yaml
================================================
repos:
- repo: https://github.com/pycqa/flake8.git
rev: 7.3.0
hooks:
- id: flake8
- repo: https://github.com/PyCQA/isort.git
rev: 8.0.0
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-yapf.git
rev: v0.32.0
hooks:
- id: yapf
- repo: https://github.com/pre-commit/pre-commit-hooks.git
rev: v6.0.0
hooks:
- id: trailing-whitespace
- id: check-yaml
- id: end-of-file-fixer
- id: requirements-txt-fixer
- id: double-quote-string-fixer
- id: check-merge-conflict
- id: mixed-line-ending
args: ["--fix=lf"]
================================================
FILE: .pre-commit-config_local.yaml
================================================
repos:
- repo: /home/admin/pre-commit/flake8
rev: 7.3.0
hooks:
- id: flake8
- repo: /home/admin/pre-commit/isort
rev: 8.0.0
hooks:
- id: isort
- repo: /home/admin/pre-commit/mirrors-yapf
rev: v0.32.0
hooks:
- id: yapf
- repo: /home/admin/pre-commit/pre-commit-hooks
rev: v6.0.0
hooks:
- id: trailing-whitespace
- id: check-yaml
- id: end-of-file-fixer
- id: requirements-txt-fixer
- id: double-quote-string-fixer
- id: check-merge-conflict
- id: mixed-line-ending
args: ["--fix=lf"]
================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
community
Examples of unacceptable behavior include:
* The use of sexualized language or imagery, and sexual attention or advances of
any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
contact@modelscope.cn.
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series of
actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within the
community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.1, available at
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
[https://www.contributor-covenant.org/translations][translations].
[homepage]: https://www.contributor-covenant.org
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
[Mozilla CoC]: https://github.com/mozilla/diversity
[FAQ]: https://www.contributor-covenant.org/faq
[translations]: https://www.contributor-covenant.org/translations
================================================
FILE: CONTRIBUTING.md
================================================
# Contributor Guide
_Welcome to offer PRs, bug reports, documentation supplements or other types of contributions to SWIFT!_
## Table of Contents
- [Code of Conduct](#-code-of-conduct)
- [Contribution Process](#-contribution-process)
- [Hardware support](#-hardware-support)
## 📖 Code of Conduct
Please refer to our [Code of Conduct documentation](./CODE_OF_CONDUCT.md).
## 🔁 Contribution Process
### What We Need
- New Technologies and New Models: SWIFT needs to support more open-source models and datasets, or new technologies that we have not paid attention to. If you are interested please submit a PR to us.
- Technical Propagation: If you are interested in technical propagation, you are welcome to help us write tutorials, documents or videos on any website, and send us the link.
- Community Contribution: You can write technical articles related to SWIFT, and submit them to us. After review and approval, we will publish them on the official ModelScope accounts (Zhihu, WeChat, etc.), with your name credited.
### Incentives
- We will issue electronic certificates to contributors on behalf of the ModelScope community, to encourage your selfless contributions.
- We will offer small souvenirs related to the ModelScope Community.
- We will provide free A10 computing power during the development period. For more details, please refer to the [Hardware support](#-hardware-support) section.
### Submitting PR (Pull Requests)
Any feature development is carried out in the form of Fork and then PR on GitHub.
1. Fork: Go to the [ms-swift](https://github.com/modelscope/ms-swift) page and click the **Fork button**. After completion, a SWIFT code repository will be cloned under your personal organization.
2. Clone: Clone the code repository generated in the first step to your local machine and **create a new branch** for development. During development, please click the **Sync Fork button** in time to synchronize with the `main` branch to prevent code expiration and conflicts.
3. Submit PR: After development and testing, push the code to the remote branch. On GitHub, go to the **Pull Requests page**, create a new PR, select your code branch as the source branch, and the `modelscope/ms-swift:main` branch as the target branch.
4. Write Description: It is necessary to provide a good feature description in the PR, so that the reviewers know the content of your modification.
5. Review: We hope that the code to be merged is concise and efficient, so we may raise some questions and discuss them. Please note that any issues raised in the review are aimed at the code itself, not at you personally. Once all issues are discussed and resolved, your code will be approved.
### Code Standards and Development Approach
SWIFT has established variable naming conventions and development approaches. Please follow these approaches as much as possible during development.
1. Variable names are separated by underscores, and class names are named with the first letter of each word capitalized.
2. All Python indentation uses four spaces instead of a tab.
3. Choose well-known open-source libraries, avoid using closed-source libraries or unstable open-source libraries, and avoid repeating the existing code.
After the PR is submitted, SWIFT will perform two types of tests:
- Code Lint Test: A static code compliance check test. Please make sure that you have performed code lint locally in advance.
```shell
pip install pre-commit # In the swift folder
pre-commit run --all-files # Fix the errors reported by pre-commit until all checks are successful
```
- CI Tests: Smoke tests and unit tests, please refer to the next section.
### Running CI Tests
Before submitting the PR, please ensure that your development code is protected by test cases, such as smoke tests for new features, or unit tests for various edge cases. Reviewers will also pay attention to this during code review. At the same time, there will be dedicated services running CI Tests, running all test cases, and the code can only be merged after the test cases pass.
## ✅ Hardware support
SWIFT will provide hardware support for developers, including free GPUs. If needed, please email us ([contact@modelscope.cn](mailto:contact@modelscope.cn)) or join our WeChat group:
<p align="left">
<img src="asset/wechat.png" width="250" style="display: inline-block;">
</p>
================================================
FILE: CONTRIBUTING_CN.md
================================================
# 贡献者指引
*欢迎帮SWIFT提供Feature PR、Bug反馈、文档补充或其他类型的贡献!*
## 目录
- [代码规约](#-代码规约)
- [贡献流程](#-贡献流程)
- [资源支持](#-资源支持)
## 📖 代码规约
请查看我们的[代码规约文档](./CODE_OF_CONDUCT.md).
## 🔁 贡献流程
### 我们需要什么
- 新技术和新模型:SWIFT需要支持更多的开源模型和数据集,或我们没有关注到的新技术,如果您对此有兴趣,可以提交PR给我们。
- 技术布道:如果您对技术布道有兴趣,欢迎在任何网站上帮我们撰写教程文档或视频等,并将链接发给我们。
- 社区供稿:您可以撰写和SWIFT有关的技术文章,并供稿给我们,我们审核通过后会在魔搭官方账号(知乎、公众号等)上进行发布,并署上您的名字。
### 激励
- 我们会以魔搭社区的身份给贡献者颁发电子证书,以鼓励您的无私贡献。
- 我们会赠送魔搭社区相关周边小礼品。
- 我们会赠送开发期间的免费A10算力,具体可以查看[资源支持](#-资源支持)章节。
### 提交PR(Pull Requests)
任何feature开发都在github上以先Fork后PR的形式进行。
1. Fork:进入[ms-swift](https://github.com/modelscope/ms-swift)页面后,点击**Fork按钮**执行。完成后会在您的个人组织下克隆出一个SWIFT代码库
2. Clone:将第一步产生的代码库clone到本地并**拉新分支**进行开发,开发中请及时点击**Sync Fork按钮**同步`main`分支,防止代码过期并冲突
3. 提交PR:开发、测试完成后将代码推送到远程分支。在github上点击**Pull Requests页面**,新建一个PR,源分支选择您提交的代码分支,目标分支选择`modelscope/ms-swift:main`分支
4. 撰写描述:在PR中填写良好的feature描述是必要的,让Reviewers知道您的修改内容
5. Review:我们希望合入的代码简洁高效,因此可能会提出一些问题并讨论。请注意,任何review中提出的问题是针对代码本身,而非您个人。在所有问题讨论通过后,您的代码会被通过
### 代码规范和开发方式
SWIFT有约定俗成的变量命名方式和开发方式。在开发中请尽量遵循这些方式。
1. 变量命名以下划线分割,类名以所有单词首字母大写方式命名
2. 所有的python缩进都是四个空格取代一个tab
3. 选用知名的开源库,避免使用闭源库或不稳定的开源库,避免重复造轮子
SWIFT在PR提交后会进行两类测试:
- Code Lint测试:对代码进行静态规范走查的测试,为保证该测试通过,请保证本地预先进行了Code lint。方法是:
```shell
pip install pre-commit
# 在swift文件夹内
pre-commit run --all-files
# 对pre-commit报的错误进行修改,直到所有的检查都是成功状态
```
- CI Tests 冒烟测试和单元测试,请查看下一章节
### Running CI Tests
在提交PR前,请保证您的开发代码已经受到了测试用例的保护。例如,对新功能的冒烟测试,或者各种边缘case的单元测试等。在代码review时Reviewers也会关注这一点。同时,也会有服务专门运行CI Tests,运行所有的测试用例,测试用例通过后代码才可以合并。
## ✅ 资源支持
SWIFT会为开发者提供资源支持,包括免费的GPU算力。如果需要请邮件联系我们([contact@modelscope.cn](mailto:contact@modelscope.cn))或加入我们的微信群:
<p align="left">
<img src="asset/wechat.png" width="250" style="display: inline-block;">
</p>
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: MANIFEST.in
================================================
recursive-include requirements *.txt
================================================
FILE: Makefile
================================================
WHL_BUILD_DIR :=package
DOC_BUILD_DIR :=docs/build/
# default rule
default: whl docs
.PHONY: docs
docs:
bash .dev_scripts/build_docs.sh
.PHONY: linter
linter:
bash .dev_scripts/linter.sh
.PHONY: test
test:
bash .dev_scripts/citest.sh
.PHONY: whl
whl:
python setup.py sdist bdist_wheel
.PHONY: clean
clean:
rm -rf $(WHL_BUILD_DIR) $(DOC_BUILD_DIR)
================================================
FILE: README.md
================================================
# SWIFT (Scalable lightWeight Infrastructure for Fine-Tuning)
<p align="center">
<br>
<img src="asset/banner.png"/>
<br>
</p>
<p align="center">
<a href="https://modelscope.cn/home">ModelScope Community Website</a>
<br>
<a href="README_CN.md">中文</a>   |   English  
</p>
<p align="center">
<img src="https://img.shields.io/badge/python-3.11-5be.svg">
<img src="https://img.shields.io/badge/pytorch-%E2%89%A52.0-orange.svg">
<a href="https://github.com/modelscope/modelscope/"><img src="https://img.shields.io/badge/modelscope-%E2%89%A51.23-5D91D4.svg"></a>
<a href="https://pypi.org/project/ms-swift/"><img src="https://badge.fury.io/py/ms-swift.svg"></a>
<a href="https://github.com/modelscope/ms-swift/blob/main/LICENSE"><img src="https://img.shields.io/github/license/modelscope/ms-swift"></a>
<a href="https://pepy.tech/project/ms-swift"><img src="https://pepy.tech/badge/ms-swift"></a>
<a href="https://github.com/modelscope/ms-swift/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
</p>
<p align="center">
<a href="https://trendshift.io/repositories/6427" target="_blank"><img src="https://trendshift.io/api/badge/repositories/6427" alt="modelscope%2Fswift | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</p>
<p align="center">
<a href="https://arxiv.org/abs/2408.05517">Paper</a>   | <a href="https://swift.readthedocs.io/en/latest/">English Documentation</a>   |   <a href="https://swift.readthedocs.io/zh-cn/latest/">中文文档</a>  
</p>
## 📖 Table of Contents
- [Groups](#-groups)
- [Introduction](#-introduction)
- [News](#-news)
- [Installation](#%EF%B8%8F-installation)
- [Quick Start](#-quick-start)
- [Usage](#-usage)
- [License](#-license)
- [Citation](#-citation)
## ☎ Groups
You can contact us and communicate with us by adding our group:
[Discord Group](https://discord.com/invite/D27yfEFVz5) | WeChat Group
:-------------------------:|:-------------------------:
<img src="asset/discord_qr.jpg" width="200" height="200"> | <img src="asset/wechat.png" width="200" height="200">
## 📝 Introduction
🍲 **ms-swift** is a large model and multimodal large model fine-tuning and deployment framework provided by the ModelScope community. It now supports training (pre-training, fine-tuning, human alignment), inference, evaluation, quantization, and deployment for 600+ text-only large models and 400+ multimodal large models. Large models include: Qwen3, Qwen3.5, InternLM3, GLM4.5, Mistral, DeepSeek-R1, Llama4, etc. Multimodal large models include: Qwen3-VL, Qwen3-Omni, Llava, InternVL3.5, MiniCPM-V-4, Ovis2.5, GLM4.5-V, DeepSeek-VL2, etc.
🍔 In addition, ms-swift integrates the latest training technologies, including Megatron parallelism techniques such as TP, PP, CP, and EP to accelerate training, as well as numerous reinforcement learning algorithms from the GRPO family, including GRPO, DAPO, GSPO, SAPO, CISPO, RLOO, Reinforce++, etc., to enhance model intelligence. ms-swift supports a wide range of training tasks, including preference learning algorithms such as DPO, KTO, RM, CPO, SimPO, ORPO, as well as Embedding, Reranker, and sequence classification tasks. ms-swift provides full-pipeline support for large model training, including acceleration for inference, evaluation, and deployment modules using vLLM, SGLang, and LMDeploy, as well as model quantization using GPTQ, AWQ, BNB, and FP8 technologies.
**Why Choose ms-swift?**
- 🍎 **Model Types**: Supports **600+ text-only large models**, **400+ multimodal large models**, and All-to-All omni-modal models across the full pipeline from training to deployment, with Day-0 support for popular models.
- **Dataset Types**: Built-in 150+ datasets for pre-training, fine-tuning, human alignment, multimodal and various other tasks, with support for custom datasets. Users only need to prepare datasets for one-click training.
- **Hardware Support**: Supports A10/A100/H100, RTX series, T4/V100, CPU, MPS, and domestic hardware Ascend NPU, etc.
- **Lightweight Training**: Supports lightweight fine-tuning methods such as LoRA, QLoRA, DoRA, LoRA+, LLaMAPro, LongLoRA, LoRA-GA, ReFT, RS-LoRA, Adapter, LISA, etc.
- **Quantized Training**: Supports training on BNB, AWQ, GPTQ, AQLM, HQQ, EETQ quantized models, requiring only 9GB training resources for 7B models.
- **Memory Optimization**: Supports GaLore, Q-GaLore, UnSloth, Liger-Kernel, Flash-Attention 2/3, and **Ulysses and Ring-Attention sequence parallelism techniques**, reducing memory consumption for long-text training.
- **Distributed Training**: Supports distributed data parallelism (DDP), device_map simple model parallelism, DeepSpeed ZeRO2 ZeRO3, FSDP/FSDP2, and Megatron distributed training technologies.
- 🍓 **Multimodal Training**: Supports multimodal packing technology to improve training speed by 100%+, supports mixed modality data training with text, images, video and audio, and supports independent control of vit/aligner/llm.
- **Agent Training**: Supports Agent templates, allowing one dataset to be used for training different models.
- 🍊 **Training Tasks**: Supports pre-training and instruction fine-tuning, as well as training tasks such as DPO, GKD, KTO, RM, CPO, SimPO, ORPO, and supports **Embedding/Reranker** and sequence classification tasks.
- 🥥 **Megatron Parallelism**: Provides TP/PP/SP/CP/ETP/EP/VPP parallel strategies to significantly boost **MoE model training speed**. Supports full-parameter and LoRA training methods for 300+ pure text large models and 100+ multimodal large models. Supports CPT/SFT/GRPO/DPO/KTO/RM training tasks.
- 🍉 **Reinforcement Learning**: Built-in **rich GRPO family algorithms**, including GRPO, DAPO, GSPO, SAPO, CISPO, CHORD, RLOO, Reinforce++, etc. Supports synchronous and asynchronous vLLM engine inference acceleration, with extensible reward functions, multi-turn inference Schedulers, and environments through plugins.
- **Full-Pipeline Capabilities**: Covers the entire workflow of training, inference, evaluation, quantization, and deployment.
- **UI Training**: Provides Web-UI interface for training, inference, evaluation, and quantization, completing the full pipeline for large models.
- **Inference Acceleration**: Supports Transformers, vLLM, SGLang, and LmDeploy inference acceleration engines, providing OpenAI interfaces for accelerating inference, deployment, and evaluation modules.
- **Model Evaluation**: Uses EvalScope as the evaluation backend, supporting 100+ evaluation datasets for evaluating text-only and multimodal models.
- **Model Quantization**: Supports quantization export for AWQ, GPTQ, FP8, and BNB. Exported models support inference acceleration using vLLM/SGLang/LmDeploy.
## 🎉 News
- 🎁 2026.03.03: **ms-swift v4.0** major version is officially released. For release notes, please refer to [here](https://github.com/modelscope/ms-swift/releases/tag/v4.0.0). You can provide your suggestions to us in [this issue](https://github.com/modelscope/ms-swift/issues/7250). Thank you for your support.
- 🎁 2025.11.14: Megatron GRPO is now available! Check out the [docs](./docs/source_en/Megatron-SWIFT/GRPO.md) and [examples](examples/megatron/grpo).
- 🎁 2025.11.04: Support for [Mcore-Bridge](docs/source_en/Megatron-SWIFT/Mcore-Bridge.md), making Megatron training as simple and easy to use as transformers.
- 🎁 2025.10.28: Added Ray support. Documentation can be found [here](docs/source_en/Instruction/Ray.md).
- 🎁 2025.09.07: Added support for CHORD training algorithm. See the [documentation](./docs/source_en/Instruction/GRPO/AdvancedResearch/CHORD.md).
- 🎁 2025.09.06: Ulysses can now be used with ring-attention, allowing sequences to be sharded into any number of chunks (no longer limited by the number of heads). The argument remains `--sequence_parallel_size N`.
- 🎁 2025.09.02: Megatron-SWIFT now supports multimodal model training. Documentation can be found [here](./docs/source_en/Megatron-SWIFT/Multimodal-Model.md).
- 🎁 2025.08.12: Support [Dynamic Fine-Tuning](https://arxiv.org/abs/2508.05629)(DFT) in SFT training, use parameter `--enable_dft_loss true`. Training scripts can be found [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/full/dft.sh).
- 🎁 2025.07.09: Megatron-SWIFT supports LoRA training. Compared to ms-swift, it achieves significant speedup on MoE models. Training scripts can be found [here](https://github.com/modelscope/ms-swift/blob/main/examples/megatron/lora).
- 🎁 2025.06.23: Fine-tuning of reranker models is supported. Training scripts can be found here: [Reranker](https://github.com/modelscope/ms-swift/blob/main/examples/train/reranker/train_reranker.sh).
- 🎁 2025.06.15: Support for GKD training on both pure text large models and multimodal models. Training scripts can be found here: [Pure Text](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/gkd), [Multimodal](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/gkd).
<details><summary>More</summary>
- 🎁 2025.06.11: Support for using Megatron parallelism techniques for RLHF training. The training script can be found [here](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf).
- 🎁 2025.05.29: Support sequence parallel in pretrain, sft, dpo and grpo, check script [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/sequence_parallel).
- 🎁 2025.05.11: GRPO now supports custom processing logic for reward models. See the GenRM example [here](./docs/source_en/Instruction/GRPO/DeveloperGuide/reward_model.md).
- 🎁 2025.04.15: The ms-swift paper has been accepted by AAAI 2025. You can find the paper at [this link](https://ojs.aaai.org/index.php/AAAI/article/view/35383).
- 🎁 2025.03.23: Multi-round GRPO is now supported for training multi-turn dialogue scenarios (e.g., agent tool calling). Please refer to the [doc](./docs/source_en/Instruction/GRPO/DeveloperGuide/multi_turn.md).
- 🎁 2025.03.16: Support for Megatron's parallel training techniques is now available. Please see the [Megatron-SWIFT training documentation](https://swift.readthedocs.io/en/latest/Megatron-SWIFT/Quick-start.html).
- 🎁 2025.03.15: Fine-tuning of embedding models for both pure text and multimodal models is supported. Please check the [training script](examples/train/embedding).
- 🎁 2025.03.05: The hybrid mode for GRPO is supported, with a script for training a 72B model on 4 GPUs (4*80G) available [here](examples/train/grpo/internal/vllm_72b_4gpu.sh). Tensor parallelism with vllm is also supported, with the training script available [here](examples/train/grpo/internal).
- 🎁 2025.02.21: The GRPO algorithm now supports LMDeploy, with the training script available [here](examples/train/grpo/internal/full_lmdeploy.sh). Additionally, the performance of the GRPO algorithm has been tested, achieving a training speed increase of up to 300% using various tricks. Please check the WanDB table [here](https://wandb.ai/tastelikefeet/grpo_perf_test?nw=nwuseryuzezyz).
- 🎁 2025.02.21: The `swift sample` command is now supported. The reinforcement fine-tuning script can be found [here](docs/source_en/Instruction/Reinforced-Fine-tuning.md), and the large model API distillation sampling script is available [here](examples/sampler/distill/distill.sh).
- 🔥 2025.02.12: Support for the GRPO (Group Relative Policy Optimization) training algorithm has been added. Documentation is available [here](docs/source_en/Instruction/GRPO/GetStarted/GRPO.md).
- 🎁 2024.12.04: Major update to **ms-swift 3.0**. Please refer to the [release notes and changes](docs/source_en/Instruction/ReleaseNote3.0.md).
- 🎉 2024.08.12: The ms-swift paper has been published on arXiv and can be read [here](https://arxiv.org/abs/2408.05517).
- 🔥 2024.08.05: Support for using [evalscope](https://github.com/modelscope/evalscope/) as a backend for evaluating large models and multimodal models.
- 🔥 2024.07.29: Support for using [vllm](https://github.com/vllm-project/vllm) and [lmdeploy](https://github.com/InternLM/lmdeploy) to accelerate inference for large models and multimodal models. When performing infer/deploy/eval, you can specify `--infer_backend vllm/lmdeploy`.
- 🔥 2024.07.24: Support for human preference alignment training for multimodal large models, including DPO/ORPO/SimPO/CPO/KTO/RM/PPO.
- 🔥 2024.02.01: Support for Agent training! The training algorithm is derived from [this paper](https://arxiv.org/pdf/2309.00986.pdf).
</details>
## 🛠️ Installation
To install using pip:
```shell
pip install ms-swift -U
# Using uv
pip install uv
uv pip install ms-swift -U --torch-backend=auto
```
To install from source:
```shell
# pip install git+https://github.com/modelscope/ms-swift.git
git clone https://github.com/modelscope/ms-swift.git
cd ms-swift
# The main branch is for swift 4.x. To install swift 3.x, please run the following command:
# git checkout release/3.12
pip install -e .
# Using uv
uv pip install -e . --torch-backend=auto
```
Running Environment:
| | Range | Recommended | Notes |
|--------------|--------------|---------------------|-------------------------------------------|
| python | >=3.9 | 3.11/3.12 | |
| cuda | | cuda12 | No need to install if using CPU, NPU, MPS |
| torch | >=2.0 | 2.8.0/2.10.0 | |
| transformers | >=4.33 | 4.57.6/5.2.0 | |
| modelscope | >=1.23 | | |
| peft | >=0.11,<0.19 | | |
| flash_attn | | 2.8.3/3.0.0b1 | |
| trl | >=0.15,<0.29 | 0.28.0 | RLHF |
| deepspeed | >=0.14 | 0.18.8 | Training |
| vllm | >=0.5.1 | 0.11.0/0.17.1 | Inference/Deployment |
| sglang | >=0.4.6 | | Inference/Deployment |
| lmdeploy | >=0.5 | 0.10.1 | Inference/Deployment |
| evalscope | >=1.0 | | Evaluation |
| gradio | | 5.32.1 | Web-UI/App |
For more optional dependencies, you can refer to [here](https://github.com/modelscope/ms-swift/blob/main/requirements/install_all.sh).
## 🚀 Quick Start
10 minutes of self-cognition fine-tuning of Qwen3-4B-Instruct-2507 on a single 3090 GPU:
### Command Line Interface (Recommended)
```shell
# 13GB
CUDA_VISIBLE_DEVICES=0 \
swift sft \
--model Qwen/Qwen3-4B-Instruct-2507 \
--tuner_type lora \
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
'AI-ModelScope/alpaca-gpt4-data-en#500' \
'swift/self-cognition#500' \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--gradient_accumulation_steps 16 \
--eval_steps 50 \
--save_steps 50 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--model_author swift \
--model_name swift-robot
```
Tips:
- If you want to train with a custom dataset, you can refer to [this guide](https://swift.readthedocs.io/en/latest/Customization/Custom-dataset.html) to organize your dataset format and specify `--dataset <dataset_path>`.
- The `--model_author` and `--model_name` parameters are only effective when the dataset includes `swift/self-cognition`.
- To train with a different model, simply modify `--model <model_id/model_path>`.
- By default, **ModelScope** is used for downloading models and datasets. If you want to use HuggingFace, simply specify `--use_hf true`.
After training is complete, use the following command to infer with the trained weights:
- Here, `--adapters` should be replaced with the last checkpoint folder generated during training. Since the adapters folder contains the training parameter file `args.json`, there is no need to specify `--model`, `--system` separately; Swift will automatically read these parameters. To disable this behavior, you can set `--load_args false`.
```shell
# Using an interactive command line for inference.
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--adapters output/vx-xxx/checkpoint-xxx \
--stream true \
--temperature 0 \
--max_new_tokens 2048
# merge-lora and use vLLM for inference acceleration
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--adapters output/vx-xxx/checkpoint-xxx \
--stream true \
--merge_lora true \
--infer_backend vllm \
--vllm_max_model_len 8192 \
--temperature 0 \
--max_new_tokens 2048
```
Finally, use the following command to push the model to ModelScope:
```shell
CUDA_VISIBLE_DEVICES=0 \
swift export \
--adapters output/vx-xxx/checkpoint-xxx \
--push_to_hub true \
--hub_model_id '<your-model-id>' \
--hub_token '<your-sdk-token>' \
--use_hf false
```
### Web-UI
The Web-UI is a **zero-threshold** training and deployment interface built on Gradio. For more details, you can check [here](https://swift.readthedocs.io/en/latest/GetStarted/Web-UI.html).
```shell
SWIFT_UI_LANG=en swift web-ui
```

### Using Python
ms-swift also supports training and inference using Python. Below is pseudocode for training and inference. For more details, you can refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/notebook/qwen2_5-self-cognition/self-cognition-sft.ipynb).
Training:
```python
from peft import LoraConfig, get_peft_model
from swift import get_model_processor, get_template, load_dataset, EncodePreprocessor
from swift.trainers import Seq2SeqTrainer, Seq2SeqTrainingArguments
# Retrieve the model and template, and add a trainable LoRA module
model, tokenizer = get_model_processor(model_id_or_path, ...)
template = get_template(tokenizer, ...)
lora_config = LoraConfig(...)
model = get_peft_model(model, lora_config)
# Download and load the dataset, and encode the text into tokens
train_dataset, val_dataset = load_dataset(dataset_id_or_path, ...)
train_dataset = EncodePreprocessor(template=template)(train_dataset, num_proc=num_proc)
val_dataset = EncodePreprocessor(template=template)(val_dataset, num_proc=num_proc)
# Train the model
training_args = Seq2SeqTrainingArguments(...)
trainer = Seq2SeqTrainer(
model=model,
args=training_args,
template=template,
train_dataset=train_dataset,
eval_dataset=val_dataset,
)
trainer.train()
```
Inference:
```python
from swift import TransformersEngine, InferRequest, RequestConfig
# Perform inference using the native Transformers engine
engine = TransformersEngine(model_id_or_path, adapters=[lora_checkpoint])
infer_request = InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}])
request_config = RequestConfig(max_tokens=max_new_tokens, temperature=temperature)
resp_list = engine.infer([infer_request], request_config)
print(f'response: {resp_list[0].choices[0].message.content}')
```
## ✨ Usage
Here is a minimal example covering the workflow from training to deployment with ms-swift. For more details, you can check the [examples](https://github.com/modelscope/ms-swift/tree/main/examples).
- If you want to use other models or datasets (including multimodal models and datasets), you only need to modify `--model` to specify the corresponding model's ID or path, and modify `--dataset` to specify the corresponding dataset's ID or path.
- By default, ModelScope is used for downloading models and datasets. If you want to use HuggingFace, simply specify `--use_hf true`.
| Useful Links |
| ------ |
| [🔥Command Line Parameters](https://swift.readthedocs.io/en/latest/Instruction/Command-line-parameters.html) |
| [Megatron-SWIFT](https://swift.readthedocs.io/en/latest/Megatron-SWIFT/Quick-start.html) |
| [GRPO](https://swift.readthedocs.io/en/latest/Instruction/GRPO/GetStarted/GRPO.html) |
| [Supported Models and Datasets](https://swift.readthedocs.io/en/latest/Instruction/Supported-models-and-datasets.html) |
| [Custom Models](https://swift.readthedocs.io/en/latest/Customization/Custom-model.html), [🔥Custom Datasets](https://swift.readthedocs.io/en/latest/Customization/Custom-dataset.html) |
| [LLM Tutorial](https://github.com/modelscope/modelscope-classroom/tree/main/LLM-tutorial) |
### Training
Supported Training Methods:
| Method | Full-Parameter | LoRA | QLoRA | Deepspeed | Multi-Machine | Multimodal |
| ------------------------------------------------------------ | ------------------------------------------------------------ | ---- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
| [Pre-training](https://github.com/modelscope/ms-swift/blob/main/examples/train/pretrain) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Supervised Fine-Tuning](https://github.com/modelscope/ms-swift/blob/main/examples/train/lora_sft.sh) | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/full/train.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/qlora) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-gpu/deepspeed) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-node) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal) |
| [GRPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [GKD](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/gkd) | ✅ | ✅ | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/gkd) |
| [PPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/ppo) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| [DPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/dpo) | ✅ | ✅ | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/dpo) |
| [KTO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/kto.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/kto.sh) |
| [Reward Model](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/rm.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [CPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/cpo.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [SimPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/simpo.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [ORPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/orpo.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Embedding](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Reranker](https://github.com/modelscope/ms-swift/tree/main/examples/train/reranker) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Sequence Classification](https://github.com/modelscope/ms-swift/blob/main/examples/train/seq_cls) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
Pre-training:
```shell
# 8*A100
NPROC_PER_NODE=8 \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
swift pt \
--model Qwen/Qwen2.5-7B \
--dataset swift/chinese-c4 \
--streaming true \
--tuner_type full \
--deepspeed zero2 \
--output_dir output \
--max_steps 10000 \
...
```
Fine-tuning:
```shell
CUDA_VISIBLE_DEVICES=0 swift sft \
--model Qwen/Qwen2.5-7B-Instruct \
--dataset AI-ModelScope/alpaca-gpt4-data-en \
--tuner_type lora \
--output_dir output \
...
```
RLHF:
```shell
CUDA_VISIBLE_DEVICES=0 swift rlhf \
--rlhf_type dpo \
--model Qwen/Qwen2.5-7B-Instruct \
--dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
--tuner_type lora \
--output_dir output \
...
```
### Megatron-SWIFT
ms-swift supports using Megatron parallelism techniques to accelerate training, including large-scale cluster training and MoE model training. The following training methods are supported:
| Method | Full-Parameter | LoRA | MoE | Multimodal | FP8 |
| ---------------------- | -------------- | ---- | ---- | ---------- | ---- |
| Pre-training | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Supervised Fine-Tuning](https://github.com/modelscope/ms-swift/tree/main/examples/megatron) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [GRPO](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/grpo) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [GKD](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/gkd) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [DPO](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/dpo) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [KTO](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/kto) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [RM](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/rm) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Embedding](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/embedding) | ✅ | ✅| ✅ | ✅ | ✅ |
| [Reranker](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/reranker) | ✅ | ✅| ✅ | ✅ | ✅ |
| [Sequence Classification](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/seq_cls) | ✅ | ✅ | ✅ | ✅ | ✅ |
```shell
NPROC_PER_NODE=2 CUDA_VISIBLE_DEVICES=0,1 megatron sft \
--model Qwen/Qwen2.5-7B-Instruct \
--save_safetensors true \
--dataset AI-ModelScope/alpaca-gpt4-data-zh \
--tuner_type lora \
--output_dir output \
...
```
### Reinforcement Learning
ms-swift supports a rich set of GRPO family algorithms:
| Method | Full-Parameter | LoRA | Multimodal | Multi-Machine |
| ------------------------------------------------------------ | -------------- | ---- | ---------- | ------------- |
| [GRPO](https://swift.readthedocs.io/en/latest/Instruction/GRPO/GetStarted/GRPO.html) | ✅ | ✅ | ✅ | ✅ |
| [DAPO](https://swift.readthedocs.io/en/latest/Instruction/GRPO/AdvancedResearch/DAPO.html) | ✅ | ✅ | ✅ | ✅ |
| [GSPO](https://swift.readthedocs.io/en/latest/Instruction/GRPO/AdvancedResearch/GSPO.html) | ✅ | ✅ | ✅ | ✅ |
| [SAPO](https://swift.readthedocs.io/en/latest/Instruction/GRPO/AdvancedResearch/SAPO.html) | ✅ | ✅ | ✅ | ✅ |
| [CISPO](https://swift.readthedocs.io/en/latest/Instruction/GRPO/AdvancedResearch/CISPO.html) | ✅ | ✅ | ✅ | ✅ |
| [CHORD](https://swift.readthedocs.io/en/latest/Instruction/GRPO/AdvancedResearch/CHORD.html) | ✅ | ✅ | ✅ | ✅ |
| [RLOO](https://swift.readthedocs.io/en/latest/Instruction/GRPO/AdvancedResearch/RLOO.html) | ✅ | ✅ | ✅ | ✅ |
| [Reinforce++](https://swift.readthedocs.io/en/latest/Instruction/GRPO/AdvancedResearch/REINFORCEPP.html) | ✅ | ✅ | ✅ | ✅ |
```shell
CUDA_VISIBLE_DEVICES=0,1,2,3 NPROC_PER_NODE=4 \
swift rlhf \
--rlhf_type grpo \
--model Qwen/Qwen2.5-7B-Instruct \
--tuner_type lora \
--use_vllm true \
--vllm_mode colocate \
--dataset AI-MO/NuminaMath-TIR#10000 \
--output_dir output \
...
```
### Inference
```shell
CUDA_VISIBLE_DEVICES=0 swift infer \
--model Qwen/Qwen2.5-7B-Instruct \
--stream true \
--infer_backend transformers \
--max_new_tokens 2048
# LoRA
CUDA_VISIBLE_DEVICES=0 swift infer \
--model Qwen/Qwen2.5-7B-Instruct \
--adapters swift/test_lora \
--stream true \
--infer_backend transformers \
--temperature 0 \
--max_new_tokens 2048
```
### Interface Inference
```shell
CUDA_VISIBLE_DEVICES=0 swift app \
--model Qwen/Qwen2.5-7B-Instruct \
--stream true \
--infer_backend transformers \
--max_new_tokens 2048
```
### Deployment
```shell
CUDA_VISIBLE_DEVICES=0 swift deploy \
--model Qwen/Qwen2.5-7B-Instruct \
--infer_backend vllm
```
### Sampling
```shell
CUDA_VISIBLE_DEVICES=0 swift sample \
--model LLM-Research/Meta-Llama-3.1-8B-Instruct \
--sampler_engine transformers \
--num_return_sequences 5 \
--dataset AI-ModelScope/alpaca-gpt4-data-zh#5
```
### Evaluation
```shell
CUDA_VISIBLE_DEVICES=0 swift eval \
--model Qwen/Qwen2.5-7B-Instruct \
--infer_backend lmdeploy \
--eval_backend OpenCompass \
--eval_dataset ARC_c
```
### Quantization
```shell
CUDA_VISIBLE_DEVICES=0 swift export \
--model Qwen/Qwen2.5-7B-Instruct \
--quant_bits 4 --quant_method awq \
--dataset AI-ModelScope/alpaca-gpt4-data-zh \
--output_dir Qwen2.5-7B-Instruct-AWQ
```
### Push Model
```shell
swift export \
--model <model-path> \
--push_to_hub true \
--hub_model_id '<model-id>' \
--hub_token '<sdk-token>'
```
## 🏛 License
This framework is licensed under the [Apache License (Version 2.0)](https://github.com/modelscope/ms-swift/blob/main/LICENSE). For models and datasets, please refer to the original resource page and follow the corresponding License.
## 📎 Citation
```bibtex
@misc{zhao2024swiftascalablelightweightinfrastructure,
title={SWIFT:A Scalable lightWeight Infrastructure for Fine-Tuning},
author={Yuze Zhao and Jintao Huang and Jinghan Hu and Xingjun Wang and Yunlin Mao and Daoze Zhang and Zeyinzi Jiang and Zhikai Wu and Baole Ai and Ang Wang and Wenmeng Zhou and Yingda Chen},
year={2024},
eprint={2408.05517},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2408.05517},
}
```
## Star History
[Star History Chart](https://star-history.com/#modelscope/ms-swift&Date)
================================================
FILE: README_CN.md
================================================
# SWIFT (Scalable lightWeight Infrastructure for Fine-Tuning)
<p align="center">
<br>
<img src="asset/banner.png"/>
<br>
</p>
<p align="center">
<a href="https://modelscope.cn/home">魔搭社区官网</a>
<br>
中文  |  <a href="README.md">English</a> 
</p>
<p align="center">
<img src="https://img.shields.io/badge/python-3.11-5be.svg">
<img src="https://img.shields.io/badge/pytorch-%E2%89%A52.0-orange.svg">
<a href="https://github.com/modelscope/modelscope/"><img src="https://img.shields.io/badge/modelscope-%E2%89%A51.23-5D91D4.svg"></a>
<a href="https://pypi.org/project/ms-swift/"><img src="https://badge.fury.io/py/ms-swift.svg"></a>
<a href="https://github.com/modelscope/ms-swift/blob/main/LICENSE"><img src="https://img.shields.io/github/license/modelscope/ms-swift"></a>
<a href="https://pepy.tech/project/ms-swift"><img src="https://pepy.tech/badge/ms-swift"></a>
<a href="https://github.com/modelscope/ms-swift/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
</p>
<p align="center">
<a href="https://trendshift.io/repositories/6427" target="_blank"><img src="https://trendshift.io/api/badge/repositories/6427" alt="modelscope%2Fswift | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</p>
<p align="center">
<a href="https://arxiv.org/abs/2408.05517">论文</a>   | <a href="https://swift.readthedocs.io/en/latest/">English Documentation</a>   |   <a href="https://swift.readthedocs.io/zh-cn/latest/">中文文档</a>  
</p>
## 📖 目录
- [用户群](#-用户群)
- [简介](#-简介)
- [新闻](#-新闻)
- [安装](#%EF%B8%8F-安装)
- [快速开始](#-快速开始)
- [如何使用](#-如何使用)
- [License](#-license)
- [引用](#-引用)
## ☎ 用户群
请扫描下面的二维码来加入我们的交流群:
[Discord Group](https://discord.com/invite/D27yfEFVz5) | 微信群
:-------------------------:|:-------------------------:
<img src="asset/discord_qr.jpg" width="200" height="200"> | <img src="asset/wechat.png" width="200" height="200">
## 📝 简介
🍲 **ms-swift**是魔搭社区提供的大模型与多模态大模型微调部署框架,现已支持600+纯文本大模型与400+多模态大模型的训练(预训练、微调、人类对齐)、推理、评测、量化与部署。其中大模型包括:Qwen3、Qwen3.5、InternLM3、GLM4.5、Mistral、DeepSeek-R1、Llama4等模型,多模态大模型包括:Qwen3-VL、Qwen3-Omni、Llava、InternVL3.5、MiniCPM-V-4、Ovis2.5、GLM4.5-V、DeepSeek-VL2等模型。
🍔 除此之外,ms-swift汇集了最新的训练技术,包括集成Megatron并行技术(TP、PP、CP、EP等)为训练提供加速,以及众多GRPO算法族的强化学习算法(GRPO、DAPO、GSPO、SAPO、CISPO、RLOO、Reinforce++等)提升模型智能。ms-swift支持广泛的训练任务,包括DPO、KTO、RM、CPO、SimPO、ORPO等偏好学习算法,以及Embedding、Reranker、序列分类任务。ms-swift提供了大模型训练全链路的支持,包括使用vLLM、SGLang和LMDeploy对推理、评测、部署模块提供加速,以及使用GPTQ、AWQ、BNB、FP8技术对大模型进行量化。
**为什么选择ms-swift?**
- 🍎 **模型类型**:支持**600+纯文本大模型**、**400+多模态大模型**以及All-to-All全模态模型训练到部署全流程,热门模型Day0支持。
- **数据集类型**:内置150+预训练、微调、人类对齐、多模态等各种任务数据集,并支持自定义数据集,用户只需准备数据集即可一键训练。
- **硬件支持**:支持A10/A100/H100、RTX系列、T4/V100、CPU、MPS以及国产硬件Ascend NPU等。
- **轻量训练**:支持了LoRA、QLoRA、DoRA、LoRA+、LLaMAPro、LongLoRA、LoRA-GA、ReFT、RS-LoRA、Adapter、LISA等轻量微调方式。
- **量化训练**:支持对BNB、AWQ、GPTQ、AQLM、HQQ、EETQ量化模型进行训练,7B模型训练只需9GB训练资源。
- **显存优化**:支持GaLore、Q-GaLore、Unsloth、Liger-Kernel、Flash-Attention 2/3 以及 **Ulysses和Ring-Attention序列并行技术**,降低长文本训练显存占用。
- **分布式训练**:支持分布式数据并行(DDP)、device_map简易模型并行、DeepSpeed ZeRO2 ZeRO3、FSDP/FSDP2以及Megatron等分布式训练技术。
- 🍓 **多模态训练**:支持多模态packing技术提升训练速度100%+,支持文本、图像、视频和语音混合模态数据训练,支持vit/aligner/llm单独控制。
- **Agent训练**:支持Agent template,准备一套数据集可用于不同模型的训练。
- 🍊 **训练任务**:支持预训练和指令微调,以及DPO、GKD、KTO、RM、CPO、SimPO、ORPO等训练任务,支持**Embedding/Reranker**和序列分类任务。
- 🥥 **Megatron并行技术**:提供TP/PP/SP/CP/ETP/EP/VPP并行策略,显著提升**MoE模型训练速度**。支持300+纯文本大模型和100+多模态大模型的全参数和LoRA训练方法。支持CPT/SFT/GRPO/DPO/KTO/RM训练任务。
- 🍉 **强化学习**:内置**丰富GRPO族算法**,包括GRPO、DAPO、GSPO、SAPO、CISPO、CHORD、RLOO、Reinforce++等,支持同步和异步vLLM引擎推理加速,可使用插件拓展奖励函数、多轮推理调度器以及环境等。
- **全链路能力**:覆盖训练、推理、评测、量化和部署全流程。
- **界面训练**:提供使用Web-UI界面的方式进行训练、推理、评测、量化,完成大模型的全链路。
- **推理加速**:支持Transformers、vLLM、SGLang和LMDeploy推理加速引擎,并提供OpenAI接口,为推理、部署和评测模块提供加速。
- **模型评测**:以EvalScope作为评测后端,支持100+评测数据集对纯文本和多模态模型进行评测。
- **模型量化**:支持AWQ、GPTQ、FP8和BNB的量化导出,导出的模型支持使用vLLM/SGLang/LMDeploy推理加速。
## 🎉 新闻
- 🎁 2026.03.03: **ms-swift v4.0**大版本正式发布,release note参考[这里](https://github.com/modelscope/ms-swift/releases/tag/v4.0.0),您的建议可以在[这个issue](https://github.com/modelscope/ms-swift/issues/7250)中反馈给我们,感谢您的支持。
- 🎁 2025.11.14: Megatron GRPO现已支持!查看[文档](./docs/source/Megatron-SWIFT/GRPO.md)和[示例](examples/megatron/grpo)。
- 🎁 2025.11.04: 支持[Mcore-Bridge](docs/source/Megatron-SWIFT/Mcore-Bridge.md),使Megatron训练像transformers一样简单易用。
- 🎁 2025.10.28: Ray [已支持](docs/source/Instruction/Ray.md)。
- 🎁 2025.09.07: 支持CHORD训练算法,请查看[文档](docs/source/Instruction/GRPO/AdvancedResearch/CHORD.md)。
- 🎁 2025.09.06: Ulysses现已支持与ring-attention结合使用,使得输入序列可以被切分成任意数量的块(不再受限于num_heads),命令参数仍然是`--sequence_parallel_size N`。
- 🎁 2025.09.02: Megatron-SWIFT支持多模态模型训练。文档参考[这里](./docs/source/Megatron-SWIFT/Mcore-Bridge.md)。
- 🎁 2025.08.12: 支持在SFT训练中使用[Dynamic Fine-Tuning](https://arxiv.org/abs/2508.05629)(DFT),使用参数 `--enable_dft_loss true`。训练脚本参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/full/dft.sh)。
- 🎁 2025.07.09: Megatron-SWIFT支持LoRA训练。相比ms-swift,在MoE模型提速显著。训练脚本参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/megatron/lora)。
- 🎁 2025.06.23: 支持Reranker模型训练,训练脚本参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/reranker/train_reranker.sh)。
- 🎁 2025.06.15: 支持对纯文本大模型和多模态模型进行GKD训练。训练脚本参考这里:[纯文本](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/gkd), [多模态](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/gkd)。
<details><summary>更多</summary>
- 🎁 2025.06.11: 支持使用Megatron并行技术进行RLHF训练,训练脚本参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf)。
- 🎁 2025.05.29: 支持pt、sft、dpo、grpo的序列并行,具体请查看[脚本](https://github.com/modelscope/ms-swift/tree/main/examples/train/sequence_parallel)。
- 🎁 2025.05.11: GRPO中的奖励模型支持自定义处理逻辑,GenRM的例子参考[这里](./docs/source/Instruction/GRPO/DeveloperGuide/reward_model.md)。
- 🎁 2025.04.15: ms-swift论文已经被AAAI 2025接收,论文地址在[这里](https://ojs.aaai.org/index.php/AAAI/article/view/35383)。
- 🎁 2025.03.23: 支持了多轮GRPO,用于构建多轮对话场景的训练(例如agent tool calling),请查看[文档](docs/source/Instruction/GRPO/DeveloperGuide/multi_turn.md)。
- 🎁 2025.03.16: 支持了Megatron的并行技术进行训练,请查看[Megatron-SWIFT训练文档](https://swift.readthedocs.io/zh-cn/latest/Megatron-SWIFT/Quick-start.html)。
- 🎁 2025.03.15: 支持纯文本和多模态模型的embedding模型的微调,请查看[训练脚本](examples/train/embedding)。
- 🎁 2025.03.05: 支持GRPO的hybrid模式,4GPU(4*80G)训练72B模型的脚本参考[这里](examples/train/grpo/internal/vllm_72b_4gpu.sh)。同时支持vllm的tensor并行,训练脚本参考[这里](examples/train/grpo/internal)。
- 🎁 2025.02.21: GRPO算法支持使用LMDeploy,训练脚本参考[这里](examples/train/grpo/internal/full_lmdeploy.sh)。此外测试了GRPO算法的性能,使用一些tricks使训练速度提高到300%。WandB表格请查看[这里](https://wandb.ai/tastelikefeet/grpo_perf_test?nw=nwuseryuzezyz)。
- 🎁 2025.02.21: 支持`swift sample`命令。强化微调脚本参考[这里](docs/source/Instruction/Reinforced-Fine-tuning.md),大模型API蒸馏采样脚本参考[这里](examples/sampler/distill/distill.sh)。
- 🔥 2025.02.12: 支持GRPO (Group Relative Policy Optimization) 训练算法,文档参考[这里](docs/source/Instruction/GRPO/GetStarted/GRPO.md)。
- 🎁 2024.12.04: **ms-swift3.0**大版本更新。请查看[发布说明和更改](docs/source/Instruction/ReleaseNote3.0.md)。
- 🎉 2024.08.12: ms-swift论文已经发布到arXiv上,可以点击[这里](https://arxiv.org/abs/2408.05517)阅读。
- 🔥 2024.08.05: 支持使用[evalscope](https://github.com/modelscope/evalscope/)作为后端进行大模型和多模态模型的评测。
- 🔥 2024.07.29: 支持使用[vllm](https://github.com/vllm-project/vllm), [lmdeploy](https://github.com/InternLM/lmdeploy)对大模型和多模态大模型进行推理加速,在infer/deploy/eval时额外指定`--infer_backend vllm/lmdeploy`即可。
- 🔥 2024.07.24: 支持对多模态大模型进行人类偏好对齐训练,包括DPO/ORPO/SimPO/CPO/KTO/RM/PPO。
- 🔥 2024.02.01: 支持Agent训练!训练算法源自这篇[论文](https://arxiv.org/pdf/2309.00986.pdf)。
</details>
## 🛠️ 安装
使用pip进行安装:
```shell
pip install ms-swift -U
# 使用uv
pip install uv
uv pip install ms-swift -U --torch-backend=auto
```
从源代码安装:
```shell
# pip install git+https://github.com/modelscope/ms-swift.git
git clone https://github.com/modelscope/ms-swift.git
cd ms-swift
# main分支为swift4.x。若安装swift3.x,请运行以下命令
# git checkout release/3.12
pip install -e .
# 使用uv
uv pip install -e . --torch-backend=auto
```
运行环境:
| | 范围 | 推荐 | 备注 |
|--------------|--------------|---------------------|--------------------|
| python | >=3.9 | 3.11/3.12 | |
| cuda | | cuda12 | 使用cpu、npu、mps则无需安装 |
| torch | >=2.0 | 2.8.0/2.10.0 | |
| transformers | >=4.33 | 4.57.6/5.2.0 | |
| modelscope | >=1.23 | | |
| peft | >=0.11,<0.19 | | |
| flash_attn | | 2.8.3/3.0.0b1 | |
| trl | >=0.15,<0.29 | 0.28.0 | RLHF |
| deepspeed | >=0.14 | 0.18.8 | 训练 |
| vllm | >=0.5.1 | 0.11.0/0.17.1 | 推理/部署 |
| sglang | >=0.4.6 | | 推理/部署 |
| lmdeploy | >=0.5 | 0.10.1 | 推理/部署 |
| evalscope | >=1.0 | | 评测 |
| gradio | | 5.32.1 | Web-UI/App |
更多可选依赖可以参考[这里](https://github.com/modelscope/ms-swift/blob/main/requirements/install_all.sh)。
## 🚀 快速开始
**10分钟**在单卡3090上对Qwen3-4B-Instruct-2507进行自我认知微调:
### 命令行(推荐)
```shell
# 13GB
CUDA_VISIBLE_DEVICES=0 \
swift sft \
--model Qwen/Qwen3-4B-Instruct-2507 \
--tuner_type lora \
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
'AI-ModelScope/alpaca-gpt4-data-en#500' \
'swift/self-cognition#500' \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--gradient_accumulation_steps 16 \
--eval_steps 50 \
--save_steps 50 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--model_author swift \
--model_name swift-robot
```
小贴士:
- 如果要使用自定义数据集进行训练,你可以参考[这里](https://swift.readthedocs.io/zh-cn/latest/Customization/Custom-dataset.html)组织数据集格式,并指定`--dataset <dataset_path>`。
- `--model_author`和`--model_name`参数只有当数据集中包含`swift/self-cognition`时才生效。
- 如果要使用其他模型进行训练,你只需要修改`--model <model_id/model_path>`即可。
- 默认使用**ModelScope**进行模型和数据集的下载。如果要使用HuggingFace,指定`--use_hf true`即可。
训练完成后,使用以下命令对训练后的权重进行推理:
- 这里的`--adapters`需要替换成训练生成的last checkpoint文件夹。由于adapters文件夹中包含了训练的参数文件`args.json`,因此不需要额外指定`--model`,`--system`,swift会自动读取这些参数。如果要关闭此行为,可以设置`--load_args false`。
```shell
# 使用交互式命令行进行推理
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--adapters output/vx-xxx/checkpoint-xxx \
--stream true \
--temperature 0 \
--max_new_tokens 2048
# merge-lora并使用vLLM进行推理加速
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--adapters output/vx-xxx/checkpoint-xxx \
--stream true \
--merge_lora true \
--infer_backend vllm \
--vllm_max_model_len 8192 \
--temperature 0 \
--max_new_tokens 2048
```
最后,使用以下命令将模型推送到ModelScope:
```shell
CUDA_VISIBLE_DEVICES=0 \
swift export \
--adapters output/vx-xxx/checkpoint-xxx \
--push_to_hub true \
--hub_model_id '<your-model-id>' \
--hub_token '<your-sdk-token>' \
--use_hf false
```
### Web-UI
Web-UI是基于gradio界面技术的**零门槛**训练、部署界面方案,具体可以查看[这里](https://swift.readthedocs.io/zh-cn/latest/GetStarted/Web-UI.html)。
```shell
swift web-ui
```

### 使用Python
ms-swift也支持使用python的方式进行训练和推理。下面给出训练和推理的**伪代码**,具体可以查看[这里](https://github.com/modelscope/ms-swift/blob/main/examples/notebook/qwen2_5-self-cognition/self-cognition-sft.ipynb)。
训练:
```python
from peft import LoraConfig, get_peft_model
from swift import get_model_processor, get_template, load_dataset, EncodePreprocessor
from swift.trainers import Seq2SeqTrainer, Seq2SeqTrainingArguments
# 获取模型和template,并加入可训练的LoRA模块
model, tokenizer = get_model_processor(model_id_or_path, ...)
template = get_template(tokenizer, ...)
lora_config = LoraConfig(...)
model = get_peft_model(model, lora_config)
# 下载并载入数据集,并将文本encode成tokens
train_dataset, val_dataset = load_dataset(dataset_id_or_path, ...)
train_dataset = EncodePreprocessor(template=template)(train_dataset, num_proc=num_proc)
val_dataset = EncodePreprocessor(template=template)(val_dataset, num_proc=num_proc)
# 进行训练
training_args = Seq2SeqTrainingArguments(...)
trainer = Seq2SeqTrainer(
model=model,
args=training_args,
template=template,
train_dataset=train_dataset,
eval_dataset=val_dataset,
)
trainer.train()
```
推理:
```python
from swift import TransformersEngine, InferRequest, RequestConfig
# 使用原生 transformers 引擎进行推理
engine = TransformersEngine(model_id_or_path, adapters=[lora_checkpoint])
infer_request = InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}])
request_config = RequestConfig(max_tokens=max_new_tokens, temperature=temperature)
resp_list = engine.infer([infer_request], request_config)
print(f'response: {resp_list[0].choices[0].message.content}')
```
## ✨ 如何使用
这里给出使用ms-swift进行训练到部署的最简示例,具体可以查看[examples](https://github.com/modelscope/ms-swift/tree/main/examples)。
- 若想使用其他模型或者数据集(含多模态模型和数据集),你只需要修改`--model`指定对应模型的id或者path,修改`--dataset`指定对应数据集的id或者path即可。
- 默认使用ModelScope进行模型和数据集的下载。如果要使用HuggingFace,指定`--use_hf true`即可。
| 常用链接 |
| ------ |
| [🔥命令行参数](https://swift.readthedocs.io/zh-cn/latest/Instruction/Command-line-parameters.html) |
| [Megatron-SWIFT](https://swift.readthedocs.io/zh-cn/latest/Megatron-SWIFT/Quick-start.html) |
| [GRPO](https://swift.readthedocs.io/zh-cn/latest/Instruction/GRPO/GetStarted/GRPO.html) |
| [支持的模型和数据集](https://swift.readthedocs.io/zh-cn/latest/Instruction/Supported-models-and-datasets.html) |
| [自定义模型](https://swift.readthedocs.io/zh-cn/latest/Customization/Custom-model.html), [🔥自定义数据集](https://swift.readthedocs.io/zh-cn/latest/Customization/Custom-dataset.html) |
| [大模型教程](https://github.com/modelscope/modelscope-classroom/tree/main/LLM-tutorial) |
### 训练
支持的训练方法:
| 方法 | 全参数 | LoRA | QLoRA | Deepspeed | 多机 | 多模态 |
| ------ | ------ |---------------------------------------------------------------------------------------------| ----- | ------ | ------ |----------------------------------------------------------------------------------------------|
| [预训练](https://github.com/modelscope/ms-swift/blob/main/examples/train/pretrain) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [指令监督微调](https://github.com/modelscope/ms-swift/blob/main/examples/train/lora_sft.sh) | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/full/train.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/qlora) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-gpu/deepspeed) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-node) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal) |
| [GRPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [GKD](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/gkd) | ✅ | ✅ | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/gkd) |
| [PPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/ppo) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| [DPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/dpo) | ✅ | ✅ | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/dpo) |
| [KTO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/kto.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/kto.sh) |
| [奖励模型](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/rm.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [CPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/cpo.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [SimPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/simpo.sh) | ✅ | ✅ | ✅ | ✅| ✅ | ✅ |
| [ORPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/orpo.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Embedding](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Reranker](https://github.com/modelscope/ms-swift/tree/main/examples/train/reranker) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [序列分类](https://github.com/modelscope/ms-swift/blob/main/examples/train/seq_cls) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
预训练:
```shell
# 8*A100
NPROC_PER_NODE=8 \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
swift pt \
--model Qwen/Qwen2.5-7B \
--dataset swift/chinese-c4 \
--streaming true \
--tuner_type full \
--deepspeed zero2 \
--output_dir output \
--max_steps 10000 \
...
```
微调:
```shell
CUDA_VISIBLE_DEVICES=0 swift sft \
--model Qwen/Qwen2.5-7B-Instruct \
--dataset AI-ModelScope/alpaca-gpt4-data-zh \
--tuner_type lora \
--output_dir output \
...
```
RLHF:
```shell
CUDA_VISIBLE_DEVICES=0 swift rlhf \
--rlhf_type dpo \
--model Qwen/Qwen2.5-7B-Instruct \
--dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
--tuner_type lora \
--output_dir output \
...
```
### Megatron-SWIFT
ms-swift支持使用Megatron并行技术加速训练,包括大规模集群训练和MoE模型训练。以下为支持的训练方法:
| 方法 | 全参数 | LoRA | MoE | 多模态 | FP8 |
| ------ | ------ | ---- | ----- | ----- | ----- |
| 预训练 | ✅ | ✅| ✅ | ✅ | ✅ |
| [指令监督微调](https://github.com/modelscope/ms-swift/tree/main/examples/megatron) | ✅ | ✅| ✅ | ✅ | ✅ |
| [GRPO](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/grpo) | ✅ | ✅| ✅ | ✅ | ✅ |
| [GKD](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/gkd) | ✅ | ✅| ✅ | ✅ | ✅ |
| [DPO](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/dpo) | ✅ | ✅| ✅ | ✅ | ✅ |
| [KTO](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/kto) | ✅ | ✅| ✅ | ✅ | ✅ |
| [RM](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/rm) | ✅ | ✅| ✅ | ✅ | ✅ |
| [Embedding](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/embedding) | ✅ | ✅| ✅ | ✅ | ✅ |
| [Reranker](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/reranker) | ✅ | ✅| ✅ | ✅ | ✅ |
| [序列分类](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/seq_cls) | ✅ | ✅| ✅ | ✅ | ✅ |
```shell
NPROC_PER_NODE=2 CUDA_VISIBLE_DEVICES=0,1 megatron sft \
--model Qwen/Qwen2.5-7B-Instruct \
--save_safetensors true \
--dataset AI-ModelScope/alpaca-gpt4-data-zh \
--tuner_type lora \
--output_dir output \
...
```
### 强化学习
ms-swift支持丰富GRPO族算法:
| 方法 | 全参数 | LoRA | 多模态 | 多机 |
| ------ | ------ | ---- | ----- | ----- |
| [GRPO](https://swift.readthedocs.io/zh-cn/latest/Instruction/GRPO/GetStarted/GRPO.html) | ✅ | ✅| ✅ | ✅ |
| [DAPO](https://swift.readthedocs.io/zh-cn/latest/Instruction/GRPO/AdvancedResearch/DAPO.html) | ✅ | ✅| ✅ | ✅ |
| [GSPO](https://swift.readthedocs.io/zh-cn/latest/Instruction/GRPO/AdvancedResearch/GSPO.html) | ✅ | ✅| ✅ | ✅ |
| [SAPO](https://swift.readthedocs.io/zh-cn/latest/Instruction/GRPO/AdvancedResearch/SAPO.html) | ✅ | ✅| ✅ | ✅ |
| [CISPO](https://swift.readthedocs.io/zh-cn/latest/Instruction/GRPO/AdvancedResearch/CISPO.html) | ✅ | ✅| ✅ | ✅ |
| [CHORD](https://swift.readthedocs.io/zh-cn/latest/Instruction/GRPO/AdvancedResearch/CHORD.html) | ✅ | ✅| ✅ | ✅ |
| [RLOO](https://swift.readthedocs.io/zh-cn/latest/Instruction/GRPO/AdvancedResearch/RLOO.html) | ✅ | ✅| ✅ | ✅ |
| [Reinforce++](https://swift.readthedocs.io/zh-cn/latest/Instruction/GRPO/AdvancedResearch/REINFORCEPP.html) | ✅ | ✅| ✅ | ✅ |
```shell
CUDA_VISIBLE_DEVICES=0,1,2,3 NPROC_PER_NODE=4 \
swift rlhf \
--rlhf_type grpo \
--model Qwen/Qwen2.5-7B-Instruct \
--tuner_type lora \
--use_vllm true \
--vllm_mode colocate \
--dataset AI-MO/NuminaMath-TIR#10000 \
--output_dir output \
...
```
### 推理
```shell
CUDA_VISIBLE_DEVICES=0 swift infer \
--model Qwen/Qwen2.5-7B-Instruct \
--stream true \
--infer_backend transformers \
--max_new_tokens 2048
# LoRA
CUDA_VISIBLE_DEVICES=0 swift infer \
--model Qwen/Qwen2.5-7B-Instruct \
--adapters swift/test_lora \
--stream true \
--infer_backend transformers \
--temperature 0 \
--max_new_tokens 2048
```
### 界面推理
```shell
CUDA_VISIBLE_DEVICES=0 swift app \
--model Qwen/Qwen2.5-7B-Instruct \
--stream true \
--infer_backend transformers \
--max_new_tokens 2048 \
--lang zh
```
### 部署
```shell
CUDA_VISIBLE_DEVICES=0 swift deploy \
--model Qwen/Qwen2.5-7B-Instruct \
--infer_backend vllm
```
### 采样
```shell
CUDA_VISIBLE_DEVICES=0 swift sample \
--model LLM-Research/Meta-Llama-3.1-8B-Instruct \
--sampler_engine transformers \
--num_return_sequences 5 \
--dataset AI-ModelScope/alpaca-gpt4-data-zh#5
```
### 评测
```shell
CUDA_VISIBLE_DEVICES=0 swift eval \
--model Qwen/Qwen2.5-7B-Instruct \
--infer_backend lmdeploy \
--eval_backend OpenCompass \
--eval_dataset ARC_c
```
### 量化
```shell
CUDA_VISIBLE_DEVICES=0 swift export \
--model Qwen/Qwen2.5-7B-Instruct \
--quant_bits 4 --quant_method awq \
--dataset AI-ModelScope/alpaca-gpt4-data-zh \
--output_dir Qwen2.5-7B-Instruct-AWQ
```
### 推送模型
```shell
swift export \
--model <model-path> \
--push_to_hub true \
--hub_model_id '<model-id>' \
--hub_token '<sdk-token>'
```
## 🏛 License
本框架使用[Apache License (Version 2.0)](https://github.com/modelscope/ms-swift/blob/main/LICENSE)进行许可。模型和数据集请查看原资源页面并遵守对应License。
## 📎 引用
```bibtex
@misc{zhao2024swiftascalablelightweightinfrastructure,
title={SWIFT:A Scalable lightWeight Infrastructure for Fine-Tuning},
author={Yuze Zhao and Jintao Huang and Jinghan Hu and Xingjun Wang and Yunlin Mao and Daoze Zhang and Zeyinzi Jiang and Zhikai Wu and Baole Ai and Ang Wang and Wenmeng Zhou and Yingda Chen},
year={2024},
eprint={2408.05517},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2408.05517},
}
```
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=modelscope/ms-swift&type=Date)](https://star-history.com/#modelscope/ms-swift&Date)
================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
# SPHINXOPTS: extra options appended to every sphinx-build invocation below.
# SPHINXBUILD: the sphinx-build executable to run.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
# SOURCEDIR / BUILDDIR: passed to sphinx-build as the source and output
# directories, in that order.
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# `help` and `Makefile` are phony so make always runs `help` and never tries
# to rebuild this Makefile through the catch-all rule below.
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
# $@ is the target name typed on the command line (e.g. html).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
================================================
FILE: docs/README.md
================================================
## maintain docs
1. build docs
```shell
# in root directory:
make docs
```
2. doc string format
We adopt the google style docstring format as the standard, please refer to the following documents.
1. Google Python style guide docstring [link](http://google.github.io/styleguide/pyguide.html#381-docstrings)
2. Google docstring example [link](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html)
3. sample:torch.nn.modules.conv [link](https://pytorch.org/docs/stable/_modules/torch/nn/modules/conv.html#Conv1d)
4. load function as an example:
```python
def load(file, file_format=None, **kwargs):
"""Load data from json/yaml/pickle files.
This method provides a unified api for loading data from serialized files.
Args:
file (str or :obj:`Path` or file-like object): Filename or a file-like
object.
file_format (str, optional): If not specified, the file format will be
inferred from the file extension, otherwise use the specified one.
Currently supported formats include "json", "yaml/yml".
Examples:
>>> load('/path/of/your/file') # file is stored in disk
>>> load('https://path/of/your/file') # file is stored on internet
>>> load('oss://path/of/your/file') # file is stored in OSS
Returns:
The content from the file.
"""
```
================================================
FILE: docs/make.bat
================================================
@ECHO OFF
REM Switch to the directory containing this script; the popd at :end undoes it.
pushd %~dp0
REM Command file for Sphinx documentation
REM Default to "sphinx-build" when the caller has not set SPHINXBUILD.
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
REM With no target argument, show the Sphinx help text.
if "%1" == "" goto help
REM Probe that the Sphinx command can be launched, discarding its output.
%SPHINXBUILD% >NUL 2>NUL
REM Errorlevel 9009 or higher is treated as "command not found": print
REM installation guidance and exit with status 1.
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
REM Forward the requested target (%1) to Sphinx "make mode" (-M).
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd
================================================
FILE: docs/source/.readthedocs.yaml
================================================
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# Required
version: 2
# Set the OS, Python version and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.10"
# Build documentation in the "docs/" directory with Sphinx
sphinx:
configuration: docs/source/conf.py
# Optionally build your docs in additional formats such as PDF and ePub
# formats:
# - pdf
# - epub
# Optional but recommended, declare the Python requirements required
# to build your documentation
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
install:
- requirements: requirements/docs.txt
- requirements: requirements/framework.txt
================================================
FILE: docs/source/BestPractices/Elastic.md
================================================
# Elastic
## 安装依赖
集群部署K8S,并在集群中部署[DLRover](https://github.com/intelligent-machine-learning/dlrover),然后安装以下依赖:
`pip install dlrover && pip install tornado && pip install kubernetes && pip install ms-swift`
经过反复测试验证的训练镜像中的其它依赖以及版本:
deepspeed 0.16.5(需参考https://github.com/deepspeedai/DeepSpeed/pull/7585/files 修复universal checkpoint 相关问题)
pytorch 2.6.0
## 如何启动
通过在`--callbacks`中添加`deepspeed_elastic`(可选`graceful_exit`)启用弹性训练,并配置DeepSpeed弹性参数。
命令组成 = dlrover-run + dlrover 命令参数 + swift 启动命令 + swift 参数。dlrover-run 除自定义的参数外,其他参数与 torchrun 一致。
dlrover-run 参数如下:
```
usage: dlrover-run [-h] [--nnodes NNODES] [--nproc-per-node NPROC_PER_NODE]
[--rdzv-backend RDZV_BACKEND] [--rdzv-endpoint RDZV_ENDPOINT] [--rdzv-id RDZV_ID]
[--rdzv-conf RDZV_CONF] [--standalone] [--max-restarts MAX_RESTARTS]
[--monitor-interval MONITOR_INTERVAL] [--start-method {spawn,fork,forkserver}]
[--role ROLE] [-m] [--no-python] [--run-path] [--log-dir LOG_DIR] [-r REDIRECTS]
[-t TEE] [--local-ranks-filter LOCAL_RANKS_FILTER] [--node-rank NODE_RANK]
[--master-addr MASTER_ADDR] [--master-port MASTER_PORT] [--local-addr LOCAL_ADDR]
[--logs-specs LOGS_SPECS] [--precheck {0,1,2}] [--node_unit NODE_UNIT]
[--auto_config] [--auto_tunning] [--exclude-straggler] [--save_at_breakpoint]
[--accelerator {nvidia.com/gpu,ascend-npu}] [--training_port TRAINING_PORT]
[--switchbox-check] [--box-pairs PAIR [PAIR ...]] [--min-bandwidth MIN_BANDWIDTH]
[--min-channels MIN_CHANNELS] [--numa-affinity] [--network-check]
[--comm-perf-test] [--ucp_device_type UCP_DEVICE_TYPE]
training_script
```
在弹性训练中我们需要关注的参数为:
--nnodes NNODES Number of nodes, or the range of nodes in form
<minimum_nodes>:<maximum_nodes>.
--nproc-per-node NPROC_PER_NODE Number of processes per node.
示例:
```bash
model="your model path"
dataset="your dataset"
output="your output dir"
export CUDA_VISIBLE_DEVICES=0  # 根据实际使用的GPU情况设置
deepspeed_config_or_type=zero1  # deepspeed类型或者配置文件的路径,如 zero1 或者 /xxx/ms-swift/swift/llm/ds_config/zero1.json
dlrover-run --nnodes 1:$NODE_NUM --nproc_per_node=1 \
/opt/conda/lib/python3.10/site-packages/swift/cli/sft.py --model $model \
--model_type qwen3 \
--tuner_type lora \
--torch_dtype bfloat16 \
--dataset $dataset \
--num_train_epochs 4 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 5e-7 \
--gradient_accumulation_steps 8 \
--eval_steps 500 \
--save_steps 10 \
--save_total_limit 20 \
--logging_steps 1 \
--output_dir $output \
--warmup_ratio 0.01 \
--dataloader_num_workers 4 \
--temperature 1.0 \
--system 'You are a helpful assistant.' \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--dataset_num_proc 1 \
--use_flash_ckpt true \
--callbacks deepspeed_elastic graceful_exit \
--deepspeed $deepspeed_config_or_type
```
## 配置文件示例
默认情况下的zero1为以下示例配置:
```json
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"zero_optimization": {
"stage": 1,
"offload_optimizer": {
"device": "none",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"overlap_comm": false,
"reduce_scatter": true,
"reduce_bucket_size": 2e8,
"contiguous_gradients": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false,
"elasticity": {
"ignore_non_elastic_batch_info": true,
"enabled": true,
"max_train_batch_size": 8,
"micro_batch_sizes": [
4,
2
],
"min_gpus": 1,
"max_gpus": 4,
"min_time": 20,
"version": 0.1
}
}
```
如果用户需要自定义,可以在启动命令中deepspeed_config_or_type指定自定义的zero1.json的存放路径,其中弹性相关的配置为:
```json
...
"elasticity": {
"ignore_non_elastic_batch_info": true,
"enabled": true,
"max_train_batch_size": 8,
"micro_batch_sizes": [
4,
2
],
"min_gpus": 1,
"max_gpus": 4,
"min_time": 20,
"version": 0.1
}
```
- ignore_non_elastic_batch_info:代表在elasticity里的配置会忽略外层的batch_size相关的配置,训练过程中会根据实际的训练进程个数实时修改batch_size等相关的参数
计算原则为:
global-training-batch-size = micro-batch-size * gradient-accumulation-steps * world-size
- max_train_batch_size:最大batch_size数
- micro_batch_sizes:elasticity下允许的每卡micro-batch size列表,相当于train_micro_batch_size_per_gpu的候选值
- min_gpus:最小gpu数目
- max_gpus:最大gpu数目
更详细的内容见:[Deepspeed](https://www.deepspeed.ai/docs/config-json/#elastic-training-config-v01-and-v02)
## 启动训练
```yaml
---
apiVersion: elastic.iml.github.io/v1alpha1
kind: ElasticJob
metadata:
name: deepspeed-elastic-swift
namespace: dlrover
spec:
distributionStrategy: AllreduceStrategy
optimizeMode: single-job
replicaSpecs:
worker:
replicas: 1 #【这里需要与启动命令中的--nnodes NNODES的最大值一致】
template:
spec:
restartPolicy: Never
containers:
- name: main
image: #【训练镜像,需要安装deepspeed,dlrover 和swift 】
imagePullPolicy: IfNotPresent
command:
- /bin/bash
- -c
- sh start.sh # 启动脚本
resources:
limits:
cpu: '8'
memory: 16Gi
nvidia.com/gpu: '1'
volumeMounts:
- mountPath: /model
name: volume-model
- mountPath: /dev/shm
name: volume-shm
restartPolicy: Never
volumes:
- hostPath:
path: /model
type: Directory
name: volume-model
- emptyDir:
medium: Memory
sizeLimit: 200Gi
name: volume-shm
```
================================================
FILE: docs/source/BestPractices/Embedding.md
================================================
# Embedding训练
SWIFT已经支持Embedding模型的训练,包括纯文本和多模态两个类型。目前已经支持的模型有:
1. modernbert embedding模型
- [ModelScope](https://modelscope.cn/models/iic/gte-modernbert-base) [Hugging Face](https://huggingface.co/Alibaba-NLP/gte-modernbert-base)
2. gte embedding模型
- 1.5B: [ModelScope](https://www.modelscope.cn/models/iic/gte_Qwen2-1.5B-instruct) [Hugging Face](https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct)
- 7B: [ModelScope](https://www.modelscope.cn/models/iic/gte_Qwen2-7B-instruct) [Hugging Face](https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct)
3. gme embedding模型
- 2B: [ModelScope](https://www.modelscope.cn/models/iic/gme-Qwen2-VL-2B-Instruct) [Hugging Face](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct)
- 7B: [ModelScope](https://www.modelscope.cn/models/iic/gme-Qwen2-VL-7B-Instruct) [Hugging Face](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct)
4. qwen3-embedding模型
- 0.6B: [ModelScope](https://www.modelscope.cn/models/Qwen/Qwen3-Embedding-0.6B) [Hugging Face](https://huggingface.co/Qwen/Qwen3-Embedding-0.6B)
- 4B: [ModelScope](https://www.modelscope.cn/models/Qwen/Qwen3-Embedding-4B) [Hugging Face](https://huggingface.co/Qwen/Qwen3-Embedding-4B)
- 8B: [ModelScope](https://www.modelscope.cn/models/Qwen/Qwen3-Embedding-8B) [Hugging Face](https://huggingface.co/Qwen/Qwen3-Embedding-8B)
5. qwen3-vl-embedding模型
- 2B: [ModelScope](https://www.modelscope.cn/models/Qwen/Qwen3-VL-Embedding-2B) [Hugging Face](https://huggingface.co/Qwen/Qwen3-VL-Embedding-2B)
- 8B: [ModelScope](https://www.modelscope.cn/models/Qwen/Qwen3-VL-Embedding-8B) [Hugging Face](https://huggingface.co/Qwen/Qwen3-VL-Embedding-8B)
开发者可以自行集成自己的模型,模型forward输出值需要满足:
```text
{"last_hidden_state": some-embedding-tensor}
```
返回值是一个json,具有`last_hidden_state` key,value是embedding tensor即可,输入部分可以使用我们已经支持的template。用户也可以通过指定
```shell
--task_type embedding
```
参数来将任意一个其他模型转换为embedding模型进行训练。
需要注意的是,SWIFT目前支持的embedding模型均为纯文本或多模态LLM,目前并不支持CLIP类型的模型训练。
此外,SWIFT支持的所有embedding模型在模型forward最后都增加了normalize,如自行增加新模型请注意增加normalize层。
## loss
目前SWIFT支持的Embedding模型可以使用的loss有:
- cosine_similarity: cosine相似度loss,计算两个embedding的相似度,并根据label的值拟合,实际为MSE loss
- contrastive: 可调margin的对比学习loss,label仅支持0和1两个值
- online_contrastive: 考虑hard negative和hard positive部分的contrastive loss,label仅支持0和1两个值
- infonce: 在同一个batch中不同row两两计算cosine相似度,并使row内部相似度最大,不同row相似度最小,不需要label
loss的源代码可以在[这里](https://github.com/modelscope/ms-swift/blob/main/swift/loss/mapping.py)找到。
## 数据集格式
> 注:
> 1. `<image>`标签可以出现在`messages`/`positive_messages`/`negative_messages`的任意位置;它们各自拥有独立的`images`/`positive_images`/`negative_images`字段用于提供图片路径或URL。
> 2. 不再需要跨字段的“对应顺序”。对齐规则为:`images`的长度等于`messages`中`<image>`标签的数量;`positive_images`与`negative_images`均为“list of list”,其外层长度分别等于`positive_messages`与`negative_messages`的长度;并且外层每一项的内层列表长度等于该条消息序列中`<image>`标签的数量。
> 3. `messages`代表anchor样本(anchor sample);`positive_messages`/`negative_messages`为“list of messages”(因此多一层`[]`);相应地,`positive_images`/`negative_images`也多一层`[]`并与之逐项对齐。
> 4. 也支持`<video>`, `<audio>`标签;可按相同规则分别通过`videos`/`positive_videos`/`negative_videos`与`audios`/`positive_audios`/`negative_audios`提供对应模态数据。
> 5. 当前约束:`positive_messages`的外层长度必须为1(即仅提供一个positive样本);对应地,`positive_images`的外层长度也必须为1。
### cosine_similarity loss对应的格式
```json lines
# LLM
{"messages": [{"role": "user", "content": "sentence1"}], "positive_messages": [[{"role": "user", "content": "sentence2"}]], "label": 0.8}
# MLLM
{"messages": [{"role": "user", "content": "<image>"}], "images": ["/some/images1.jpg"],"positive_messages": [[{"role": "user", "content": "<image>sentence"}]], "positive_images": [["/some/images2.jpg"]], "label": 0.7}
{"messages": [{"role": "user", "content": "sentence1"}], "positive_messages": [[{"role": "user", "content": "<image>sentence2"}]], "positive_images": [["/some/images.jpg"]], "label": 0.7}
```
### contrastive/online_contrastive loss对应的格式
```json lines
# LLM
{"messages": [{"role": "user", "content": "sentence1"}], "positive_messages": [[{"role": "user", "content": "sentence2"}]], "label": 1}
# MLLM
{"messages": [{"role": "user", "content": "<image>"}], "images": ["/some/images1.jpg"], "positive_messages": [[{"role": "user", "content": "<image>sentence"}]], "positive_images": [["/some/images2.jpg"]], "label": 1}
{"messages": [{"role": "user", "content": "sentence1"}], "positive_messages": [[{"role": "user", "content": "<image>sentence2"}]], "positive_images": [["/some/images.jpg"]], "label": 0}
```
评测的指标分别是两个embedding的欧式距离、点积等的pearson系数以及spearman系数,共八个指标。
### infonce 格式
```json lines
# LLM
{"messages": [{"role": "user", "content": "sentence1"}], "positive_messages": [[{"role": "user", "content": "sentence2"}]]}
# MLLM
{"messages": [{"role": "user", "content": "<image>"}], "images": ["/some/images.jpg"], "positive_messages": [[{"role": "user", "content": "sentence"}]]}
{"messages": [{"role": "user", "content": "<image>sentence1"}], "images": ["/some/images.jpg"], "positive_messages": [[{"role": "user", "content": "<image>sentence2"}]], "positive_images": [["/some/positive_images.jpg"]], "negative_messages": [[{"role": "user", "content": "<image><image>sentence3"}], [{"role": "user", "content": "<image>sentence4"}]], "negative_images": [["/some/negative_images1.jpg", "/some/negative_images2.jpg"], ["/some/negative_images3.jpg"]]}
```
infonce loss支持几个环境变量:
1. `INFONCE_TEMPERATURE`: temperature参数,不设置的话默认值是0.1
2. `INFONCE_USE_BATCH`: 使用sample内部的`negative_messages`(hard negative样例)还是使用一个batch内其他样本作为in-batch negatives;默认为True,表示使用batch内部的样本作为负例
3. `INFONCE_HARD_NEGATIVES`: hard negatives的数量;如果不设置会使用数据中提供的所有`negative_messages`。由于长度未必一致,因此会采用for循环计算loss(计算会慢)。若设置为某个数值,则不足会随机采样补齐,超长会选用前`INFONCE_HARD_NEGATIVES`个
4. `INFONCE_MASK_FAKE_NEGATIVE`: mask掉假negative。默认为False,开启时会判断 `positive_similarity + INFONCE_FAKE_NEG_MARGIN`,比该阈值大的样本相似度会被设置为 `-inf`,以防止正样本泄露问题
5. `INFONCE_FAKE_NEG_MARGIN`:假负样本屏蔽的边际,默认 `0.1`。
6. `INFONCE_INCLUDE_QQ`:是否在分母中加入 q–q 分量(query 间相似度)作为负例,默认 `False`。
7. `INFONCE_INCLUDE_DD`:是否在分母中加入 d–d 分量(正样本文档与 batch 内所有文档的相似度)作为负例,默认 `False`。
> 也可以在数据集中将hard negatives数量设置为数量相等,这样即使不设置也不会使用for循环方式,加快计算速度
> `negative_messages`也可以不提供。在这种情况下,保持`INFONCE_USE_BATCH=True`,会使用一个batch内部的其他样本作为负例
infonce loss的评测会有下面几个指标:
- mean_neg 所有hard_negative的平均值
- mean_pos 所有positive的平均值
- margin positive-max_hard_negative的平均值
## 训练
SWIFT提供的脚手架训练脚本:
- [Qwen3-Embedding/Qwen3-VL-Embedding模型](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding/qwen3)
- [GME模型](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding/train_gme.sh)
## 推理
SWIFT已经支持GME、GTE、Qwen3-Embedding模型的部署,请查看[这里](https://github.com/modelscope/ms-swift/blob/main/examples/deploy/embedding/client.py)。
- 推理脚本参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_embedding.py)。
也可以使用原模型的代码进行推理:
https://www.modelscope.cn/models/iic/gte_Qwen2-7B-instruct
https://www.modelscope.cn/models/iic/gme-Qwen2-VL-7B-Instruct
如果使用了其他模型从0训练embedding(例如,原版`qwen2-vl`模型+`--task_type embedding`),也可以使用gme的推理代码,但请注意:
https://www.modelscope.cn/models/iic/gme-Qwen2-VL-7B-Instruct/file/view/master/gme_inference.py?status=1#L111
这里的模板请修改为模型自身的template,以免最后的embedding对不上。需要额外注意的是,gme模型的template和`qwen2-vl`或`qwen2.5-vl`系列的chatml template并不相同,其推理代码最后的结束字符是`<|endoftext|>`而非`<|im_end|>`。
## 高级功能
- Qwen3-Embedding 自定义 Instruction:
- 默认无 Instruction,输入模板为:`{Query}<|endoftext|>`。
- 通过在 system message 中添加 Instruction,可将输入改为:`{Instruction} {Query}<|endoftext|>`。
- 示例:
```json lines
{"messages": [
{"role": "system", "content": "请用中文回答,并输出简洁要点"},
{"role": "user", "content": "介绍一下Qwen3-Embedding"}
]}
```
> 说明:Qwen3-Embedding 模板会将 system 内容前置拼接到首条 user 消息中,并使用 `<|endoftext|>` 作为结束标记。
### 转换前后示例
- 不加 Instruction:
输入数据(messages):
```json lines
{"messages": [
{"role": "user", "content": "北京明天天气如何?"}
]}
```
模板转换后(送入模型的实际文本):
```text
北京明天天气如何?<|endoftext|>
```
- 加 Instruction:
输入数据(messages,包含system):
```json lines
{"messages": [
{"role": "system", "content": "请使用中文、精炼输出要点"},
{"role": "user", "content": "北京明天天气如何?"}
]}
```
模板转换后(送入模型的实际文本):
```text
请使用中文、精炼输出要点 北京明天天气如何?<|endoftext|>
```
- positive/negative 同理:
若在某个 positive/negative 的消息序列中提供 system,则会将该 system 内容前置到该序列首条 user 内容之前;未提供 system 则不前置。
输入数据(包含一个 positive 带 system,和一个 negative 无 system):
```json lines
{
"messages": [
{"role": "user", "content": "Anchor"}
],
"positive_messages": [[
{"role": "system", "content": "指令"},
{"role": "user", "content": "Positive"}
]],
"negative_messages": [[
{"role": "user", "content": "Negative"}
]]
}
```
模板转换后(送入模型的实际文本):
```text
Anchor<|endoftext|>
指令 Positive<|endoftext|>
Negative<|endoftext|>
```
================================================
FILE: docs/source/BestPractices/GRPO-Code-Training.md
================================================
# GRPO代码训练
本文档介绍如何使用GRPO对模型进行代码训练
模型:[Qwen/Qwen2.5-7B-Instruct](https://www.modelscope.cn/models/Qwen/Qwen2.5-7B-Instruct)
数据集:[open-r1/verifiable-coding-problems-python-10k](https://www.modelscope.cn/datasets/open-r1/verifiable-coding-problems-python-10k/dataPeview)
数据集样例
```json
{
"problem": "Solve the following coding problem using the programming language python: Polycarp has $n$ different binary words. A word called binary if it contains only characters '0' and '1'. For example, these words are binary: \"0001\", \"11\", \"0\" and \"0011100\". Polycarp wants to offer his set of $n$ binary words to play a game \"words\". In this game, players name words and each next word (starting from the second) must start with the last character of the previous word. The first word can be any. For example, these sequence of words can be named during the game: \"0101\", \"1\", \"10\", \"00\", \"00001\". Word reversal is the operation of reversing the order of the characters. For example, the word \"0111\" after the reversal becomes \"1110\", the word \"11010\" after the reversal becomes \"01011\". Probably, Polycarp has such a set of words that there is no way to put them in the order correspondent to the game rules. In this situation, he wants to reverse some words from his set so that: the final set of $n$ words still contains different words (i.e. all words are unique); there is a way to put all words of the final set of words in the order so that the final sequence of $n$ words is consistent with the game rules. Polycarp wants to reverse minimal number of words. Please, help him. -----Input----- The first line of the input contains one integer $t$ ($1 \\le t \\le 10^4$) — the number of test cases in the input. Then $t$ test cases follow. The first line of a test case contains one integer $n$ ($1 \\le n \\le 2\\cdot10^5$) — the number of words in the Polycarp's set. Next $n$ lines contain these words. All of $n$ words aren't empty and contains only characters '0' and '1'. The sum of word lengths doesn't exceed $4\\cdot10^6$. All words are different. Guaranteed, that the sum of $n$ for all test cases in the input doesn't exceed $2\\cdot10^5$. Also, guaranteed that the sum of word lengths for all test cases in the input doesn't exceed $4\\cdot10^6$. 
-----Output----- Print answer for all of $t$ test cases in the order they appear. If there is no answer for the test case, print -1. Otherwise, the first line of the output should contain $k$ ($0 \\le k \\le n$) — the minimal number of words in the set which should be reversed. The second line of the output should contain $k$ distinct integers — the indexes of the words in the set which should be reversed. Words are numerated from $1$ to $n$ in the order they appear. If $k=0$ you can skip this line (or you can print an empty line). If there are many answers you can print any of them. -----Example----- Input 4 4 0001 1000 0011 0111 3 010 101 0 2 00000 00001 4 01 001 0001 00001 Output 1 3 -1 0 2 1 2 The input will be stdin and you should print your solution to stdout Now solve the problem and return the code.",
"verification_info": {
"language": "python",
"test_cases": [
{
"input": "4\n4\n0001\n1000\n0011\n0111\n3\n010\n101\n0\n2\n00000\n00001\n4\n01\n001\n0001\n00001\n",
"output": "1\n3 \n-1\n0\n\n2\n1 2 \n",
"type": "stdin_stdout"
}
]
}
}
```
`verification_info` 提供了程序语言以及测试用例,其中包含输入和预期的输出。
## 奖励函数
使用`code_reward`和`code_format`奖励进行训练,实现细节见[代码](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/plugin/plugin.py)
- `code_reward`通过[e2b](https://e2b.dev/)或[judge0](https://judge0.com/)执行生成的代码,根据数据集中的测试用例对代码进行验证给出奖励值。
- `code_format`要求模型输出包含代码块的格式化回答。
注:当前通过e2b执行代码仅支持python语言,如需执行其他语言,可以使用judge0执行([judge0支持语言列表](https://github.com/judge0/judge0?tab=readme-ov-file#supported-languages))。
## 训练脚本
### e2b
- 在[e2b](https://e2b.dev/dashboard)注册获取E2B_API_KEY,并设置为环境变量。
- `--reward_funcs`添加`external_code_reward`作为奖励函数。
- `--external_plugins`设置为plugin.py的路径。
首先拉起 vLLM server
```bash
CUDA_VISIBLE_DEVICES=7 \
swift rollout \
--model Qwen/Qwen2.5-7B-Instruct \
--vllm_enable_lora true \
--vllm_max_lora_rank 16
```
```bash
E2B_API_KEY=xxx \
WANDB_API_KEY=xxx \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6 \
NPROC_PER_NODE=7 \
swift rlhf \
--rlhf_type grpo \
--model Qwen/Qwen2.5-7B-Instruct \
--external_plugins examples/train/grpo/plugin/plugin.py \
--reward_funcs external_code_reward external_code_format \
--reward_weights 1.0 0.1 \
--vllm_mode server \
--use_vllm true \
--vllm_server_host 127.0.0.1 \
--vllm_server_port 8000 \
--tuner_type lora \
--lora_rank 16 \
--lora_alpha 32 \
--torch_dtype bfloat16 \
--dataset 'open-r1/verifiable-coding-problems-python-10k' \
--load_from_cache_file true \
--max_completion_length 2048 \
--num_train_epochs 1 \
--per_device_train_batch_size 2 \
--per_device_eval_batch_size 2 \
--learning_rate 1e-6 \
--gradient_accumulation_steps 1 \
--eval_steps 200 \
--save_steps 200 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 4096 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--dataset_num_proc 4 \
--num_generations 14 \
--temperature 0.9 \
--system 'examples/train/grpo/prompt.txt' \
--deepspeed zero2 \
--log_completions true \
--report_to wandb
```
### judge0
- 设置环境变量:
- (必需)JUDGE0_ENDPOINT: judge0访问地址。
- (可选)JUDGE0_X_AUTH_TOKEN: judge0访问Token。
- `--reward_funcs`添加`external_code_reward_by_judge0`作为奖励函数。
- `--external_plugins`设置为plugin.py的路径。
```bash
JUDGE0_ENDPOINT=xxx \
JUDGE0_X_AUTH_TOKEN=xxx \
WANDB_API_KEY=xxx \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6 \
NPROC_PER_NODE=7 \
swift rlhf \
--rlhf_type grpo \
--model Qwen/Qwen2.5-7B-Instruct \
--external_plugins examples/train/grpo/plugin/plugin.py \
--reward_funcs external_code_reward_by_judge0 external_code_format \
--reward_weights 1.0 0.1 \
--vllm_mode server \
--use_vllm true \
--vllm_server_host 127.0.0.1 \
--vllm_server_port 8000 \
--tuner_type lora \
--torch_dtype bfloat16 \
--dataset 'open-r1/verifiable-coding-problems-python-10k' \
--load_from_cache_file true \
--max_completion_length 2048 \
--num_train_epochs 1 \
--per_device_train_batch_size 2 \
--per_device_eval_batch_size 2 \
--learning_rate 1e-6 \
--gradient_accumulation_steps 1 \
--eval_steps 200 \
--save_steps 200 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 4096 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--dataset_num_proc 4 \
--num_generations 14 \
--temperature 0.9 \
--system 'examples/train/grpo/prompt.txt' \
--deepspeed zero2 \
--log_completions true \
--report_to wandb
```
训练奖励曲线图

================================================
FILE: docs/source/BestPractices/GRPO-Multi-Modal-Training.md
================================================
# 多模态GRPO完整实验流程
本文介绍如何使用SWIFT GRPO进行多模态模型和任务的训练。目标是对多个多模态任务进行训练,提升任务精度,任务定义和训练参数等参考了 [R1-V](https://github.com/Deep-Agent/R1-V.git) 和 [open-r1-multimodal](https://github.com/EvolvingLMMs-Lab/open-r1-multimodal.git)
## ClevrCount 任务
### 任务与数据集定义
本任务从clevr_cogen_a_train数据集出发,模型的目标是输出图像中包含的物体数量,因此,我们定义数据集如下:
```python
class ClevrPreprocessor(ResponsePreprocessor):
def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
query = row.get('query', '')
query = f"""{query} Output the thinking process in <think> </think> and
final answer (number) in <answer> </answer> tags."""
row.update({'query': query})
return super().preprocess(row)
register_dataset(
DatasetMeta(
ms_dataset_id='AI-ModelScope/clevr_cogen_a_train',
subsets=[
SubsetDataset(
name='default',
subset='default',
split=['train'],
),
],
preprocess_func=ClevrPreprocessor(),
tags=['qa', 'math']))
```
这里重新定义dataset preprocessor的目的是修改query。数据集示例样本如下,包含messages,images和solution字段,solution会送入后续的奖励函数中,而messages和images则会作为模型输入。
- 注意:`{'role': 'assistant', 'content': '<answer> 3 </answer>'}`将会在GRPOTrainer中被移除,可以忽略。'solution'字段将会透传入ORM中。在自定义数据集时,'images'字段组织成`["image_path1", "image_path2"]`即可。
```json
{
"images": ["image_path1", "image_path2"],
"messages": [
{
"role": "user",
"content": "How many items are there in the image? Output the thinking process in <think> </think> and \n final answer (number) in <answer> </answer> tags."
}
],
"solution": "<answer> 3 </answer>"
}
```
### 奖励函数定义:
本任务使用的奖励函数有两个,一个是 Deepseek-R1 中提到的格式奖励函数,另一个是 ClevrCount 的准确性奖励函数。前者已经在swift中内置,通过 `--reward_funcs format` 可以直接使用,而后者需要我们自己定义,在这里我们使用 external_plugin 的方式定义准确性奖励函数,将代码放在`swift/examples/train/grpo/plugin/plugin.py`中。
在这里,奖励函数的输入包括completions和solution两个字段,分别表示模型生成的文本和真值。每个都是list,支持多个completion同时计算。注意,在这里,solution字段是数据集中定义的字段透传而来,如果有任务上的变动,可以分别对数据集和奖励函数做对应的改变即可。
```python
class MultiModalAccuracyORM(ORM):
    """Accuracy reward: 1.0 when a completion matches the ground truth, else 0.0.

    Symbolic verification with ``math_verify`` is tried first; if it does not
    succeed, the text inside ``<answer>...</answer>`` is compared as a string.
    """

    def __call__(self, completions, solution, **kwargs) -> List[float]:
        """
        Reward function that checks if the completion is correct.
        Args:
            completions (list[str]): Generated outputs
            solution (list[str]): Ground Truths.
        Returns:
            list[float]: Reward scores
        """
        from math_verify import parse, verify

        # DOTALL lets the tag body span several lines, e.g. "<answer>\n29\n</answer>".
        answer_pattern = re.compile(r'<answer>(.*?)</answer>', re.DOTALL)

        rewards = []
        for content, sol in zip(completions, solution):
            reward = 0.0
            # Try symbolic verification first
            try:
                answer = parse(content)
                if float(verify(answer, parse(sol))) > 0:
                    reward = 1.0
            except Exception:
                pass  # Continue to next verification method if this fails

            # If symbolic verification failed, try string matching
            if reward == 0.0:
                try:
                    # Extract answer from solution if it has think/answer tags
                    sol_match = answer_pattern.search(sol)
                    ground_truth = sol_match.group(1).strip() if sol_match else sol.strip()

                    # Extract answer from content if it has think/answer tags
                    content_match = answer_pattern.search(content)
                    student_answer = content_match.group(1).strip() if content_match else content.strip()

                    # Compare the extracted answers
                    if student_answer == ground_truth:
                        reward = 1.0
                except Exception:
                    pass  # Keep reward as 0.0 if both methods fail
            rewards.append(reward)
        return rewards


# Expose the reward under the name used by `--reward_funcs external_r1v_acc`.
orms['external_r1v_acc'] = MultiModalAccuracyORM
```
### GRPO训练实验记录
#### 训练参数:
我们选取 Qwen2.5-VL-3B-Instruct 作为基础模型进行训练,选取 Instruct 而不是基模的主要原因是可以更快地获取 format reward。我们在八卡 GPU 上进行实验。如果遇到vllm部署qwen2.5-vl报错,可以参考[issue](https://github.com/vllm-project/vllm/issues/13285)
由于任务简单,我们设置max_completion_length为1024,奖励函数选择external_r1v_acc和format,学习率和beta分别设置为1e-6和0.001。其他设置如下所示,batch_size和num_generations的设置原则可以参考[GRPO完整流程](./GRPO.md)。
首先拉起 external vLLM server
```bash
# Start the external vLLM rollout server on GPUs 6 and 7 (data-parallel size 2).
CUDA_VISIBLE_DEVICES=6,7 \
swift rollout \
    --model Qwen/Qwen2.5-VL-3B-Instruct \
    --vllm_data_parallel_size 2
```
```shell
# GRPO training on GPUs 0-5 (6 processes); rollouts are requested from the
# vLLM server at 127.0.0.1:8000 started above.
WANDB_API_KEY=your_wandb_api_key \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 \
NPROC_PER_NODE=6 \
swift rlhf \
    --rlhf_type grpo \
    --model Qwen/Qwen2.5-VL-3B-Instruct \
    --external_plugins examples/train/grpo/plugin/plugin.py \
    --reward_funcs external_r1v_acc format \
    --use_vllm true \
    --vllm_mode server \
    --vllm_server_host 127.0.0.1 \
    --vllm_server_port 8000 \
    --tuner_type full \
    --torch_dtype bfloat16 \
    --dataset 'AI-ModelScope/clevr_cogen_a_train' \
    --load_from_cache_file true \
    --max_completion_length 1024 \
    --num_train_epochs 1 \
    --per_device_train_batch_size 8 \
    --per_device_eval_batch_size 8 \
    --learning_rate 1e-6 \
    --gradient_accumulation_steps 2 \
    --save_strategy 'steps' \
    --eval_strategy 'steps' \
    --eval_steps 1000 \
    --save_steps 1000 \
    --save_total_limit 10 \
    --logging_steps 1 \
    --output_dir output/GRPO_CLEVR_COUNTDOWN \
    --warmup_ratio 0.01 \
    --dataloader_num_workers 4 \
    --num_generations 24 \
    --temperature 1.0 \
    --system 'examples/train/grpo/prompt.txt' \
    --deepspeed zero3 \
    --log_completions true \
    --report_to wandb \
    --num_iterations 1 \
    --async_generate false \
    --beta 0.001
```
#### 实验现象

由于数据集和任务比较简单,模型训练了500个step已经基本收敛,有以下观察。
1. 自定义的ClevrORM在不断增加,证明模型学习到了如何完成这一任务,最终任务成功率从初始的0.4攀升到1左右。
2. 由于是Instruct模型,Format Reward基本稳定在1,这可能也与数据集所有样本都是一样的query有一定关系。
3. reward_std 稳定到0.1以下。
4. completion length最终稳定在60-80,模型学习到的固定输出范式是逐个物体列举数数。
step 1:
```
"<think> The image shows 5 distinct items: a cyan metallic cylinder, a red metallic cylinder, a small purple metallic cylinder, a large gray metallic sphere, and a small yellow metallic cube. To count the number of items, I add the counts of each type of object: 1 cyan cylinder, 1 red cylinder, 1 purple cylinder, 1 gray sphere, and 1 yellow cube. Therefore, the total number of items is 5. </think>\n<answer>5</answer>"
"Let's count the number of items in the image:\n1. A large cyan metallic cylinder.\n2. A small purple metallic cylinder.\n3. A large red metallic cylinder.\n4. A large gray metallic sphere.\n5. A small yellow metallic cube.\n\n<answer>5</answer>"
"<think> The image shows four distinct objects: a large cyan metallic cylinder, a smaller purple metallic cylinder, a large red metallic cylinder, and a small yellow metallic cube. Counting each of these items individually, there are five items in total. </think>\n<answer>5</answer>"
```
step 400:
```
"<think> Counting the number of items in the image:\n1. Green matte sphere\n2. Large metallic yellow sphere\n3. Small metallic brown cube\n\nThere are three distinct objects in total.\n</think>\n<answer>3</answer>"
"<think> Counting the objects in the image:\n1. Large yellow metallic sphere\n2. Small green rubber sphere\n3. Small brown metallic cube\n\nThere are three distinct items in the image.\n</think>\n<answer>3</answer>"
```
总体来讲,这一任务比较简单,reward的收敛也比较典型。
## Geometric QA任务
### 任务与数据集定义
本任务为Geometric QA任务,任务描述为:给定一个几何图形,回答有关几何图形的数学问题。原始数据来自于[论文](https://arxiv.org/pdf/2312.11370),[R1-V](https://github.com/Deep-Agent/R1-V.git)对数据进行了预处理,将所有数据全部处理成了problem-solution的格式,而图像则保留在image字段中,因此,我们不需要额外定义数据集,直接使用`--dataset AI-ModelScope/GEOQA_R1V_Train_8K`即可。
### 奖励函数
由于也是数学题,同时,答案也处理成了最终结果,因此,我们直接使用以上定义过的`MultiModalAccuracyORM`奖励函数。
### GRPO训练实验记录
#### 训练参数:
选取的模型和大部分超参数与上一个实验相似,主要有两点不同:
1. SWIFT 已支持`--num_iterations`参数,单次rollout可以进行多次更新,这里设置为2。
2. 在实验时发现,在数学问题中,训练可能会出现不稳定现象,导致模型训崩,具体表现为所有reward迅速降低,loss、grad_norm和kl都迅速增大,后续也难以恢复正常状态。因此,这里设置`--max_grad_norm 0.5`,保证稳定训练,当然,这种现象的出现也有一定的随机性。
```shell
# GRPO training on GEOQA: two policy updates per rollout (--num_iterations 2)
# and gradient clipping at 0.5 to keep training stable.
WANDB_API_KEY=your_wandb_api_key \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 \
MAX_PIXELS=401408 \
NPROC_PER_NODE=6 \
swift rlhf \
    --rlhf_type grpo \
    --model Qwen/Qwen2.5-VL-3B-Instruct \
    --external_plugins examples/train/grpo/plugin/plugin.py \
    --reward_funcs external_r1v_acc format \
    --use_vllm true \
    --vllm_mode server \
    --vllm_server_host 127.0.0.1 \
    --vllm_server_port 8000 \
    --tuner_type full \
    --torch_dtype bfloat16 \
    --dataset 'AI-ModelScope/GEOQA_R1V_Train_8K' \
    --load_from_cache_file true \
    --max_completion_length 1024 \
    --num_train_epochs 1 \
    --per_device_train_batch_size 8 \
    --per_device_eval_batch_size 8 \
    --learning_rate 1e-6 \
    --gradient_accumulation_steps 2 \
    --save_strategy 'steps' \
    --eval_strategy 'steps' \
    --eval_steps 400 \
    --save_steps 400 \
    --save_total_limit 10 \
    --logging_steps 1 \
    --output_dir output/GRPO_GEOQA \
    --warmup_ratio 0.05 \
    --dataloader_num_workers 4 \
    --num_generations 8 \
    --temperature 1.0 \
    --repetition_penalty 1.1 \
    --system 'examples/train/grpo/prompt.txt' \
    --deepspeed zero3 \
    --log_completions true \
    --report_to wandb \
    --num_iterations 2 \
    --async_generate false \
    --beta 0.001 \
    --max_grad_norm 0.5
```
#### 实验现象

训练曲线如上图所示。
1. 相比于Count任务,Geometric QA任务的难度明显更大,因此,Accuracy Reward收敛明显更慢,同时震荡幅度也明显更大。
2. completion_length的变化趋势比较明显,最终收敛到250 token左右。
3. reward基本收敛,标准差在一共1300个step时稳定在0.2左右。
以下是训练后的模型completion示例:
```
User:
"If AB is the tangent to circle O with the tangent point A, and BO intersects circle O at point C, and point D is located on circle O, what is the measure of angle ADC if the degree of angle ABO is 32.0?"
Assistant:
"<think>\nTo find the measure of angle ADC, we first need to understand the properties of the tangent and the circle. Since AB is a tangent to the circle at point A, the line segment OA (which is perpendicular to the tangent AB) is perpendicular to AB. This means that angle OAB is 90 degrees.\n\nGiven that angle ABO is 32 degrees, we can find angle BAO by subtracting angle ABO from 90 degrees:\n∠BAO = 90° - ∠ABO = 90° - 32° = 58°\n\nSince angle BAO is an angle in the minor arc AD, the angle ADC, which is an angle in the minor arc AD, is half the angle BAO. Therefore, angle ADC is:\n∠ADC = 1/2 × ∠BAO = 1/2 × 58° = 29°\n\nSo, the measure of angle ADC is 29 degrees.\n</think>\n<answer>\nThe measure of angle ADC is 29 degrees.\n</answer>"
```
## Multimodal Open R1 数据集实验
### 任务与数据集定义
本任务为参考[open-r1-multimodal](https://github.com/EvolvingLMMs-Lab/open-r1-multimodal.git)的实验,使用数据集:[lmms-lab/multimodal-open-r1-8k-verified](https://www.modelscope.cn/datasets/lmms-lab/multimodal-open-r1-8k-verified),该数据集专注于多模态的数学推理任务,数据由GPT4o基于`Math360K`和`Geo170K`数据集生成,包含推理路径和可验证答案。数据集中已包含了image, problem和solution字段,我们也不需要针对prompt进行修改,因此无需额外定义数据集。
### 奖励函数
我们直接使用以上定义过的`MultiModalAccuracyORM`奖励函数。
### GRPO训练实验记录
#### 训练参数:
选取的模型和大部分超参数与上一个实验相似,由于训练的时候出现了OOM,我们设置`MAX_PIXELS=262144`以降低显存占用。
```shell
# GRPO training on multimodal-open-r1-8k-verified; MAX_PIXELS is lowered to
# 262144 to reduce GPU memory usage.
WANDB_API_KEY=your_wandb_api_key \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 \
MAX_PIXELS=262144 \
MASTER_PORT=29600 \
NPROC_PER_NODE=6 \
swift rlhf \
    --rlhf_type grpo \
    --model Qwen/Qwen2.5-VL-3B-Instruct \
    --external_plugins examples/train/grpo/plugin/plugin.py \
    --reward_funcs external_r1v_acc format \
    --use_vllm true \
    --vllm_mode server \
    --vllm_server_host 127.0.0.1 \
    --vllm_server_port 8000 \
    --tuner_type full \
    --torch_dtype bfloat16 \
    --dataset 'lmms-lab/multimodal-open-r1-8k-verified' \
    --load_from_cache_file true \
    --max_completion_length 1024 \
    --num_train_epochs 1 \
    --per_device_train_batch_size 8 \
    --per_device_eval_batch_size 8 \
    --learning_rate 1e-6 \
    --gradient_accumulation_steps 2 \
    --save_strategy 'steps' \
    --eval_strategy 'steps' \
    --eval_steps 400 \
    --save_steps 400 \
    --save_total_limit 10 \
    --logging_steps 1 \
    --output_dir output/GRPO_GEOQA \
    --warmup_ratio 0.05 \
    --dataloader_num_workers 4 \
    --num_generations 8 \
    --temperature 1.0 \
    --repetition_penalty 1.1 \
    --system 'examples/train/grpo/prompt.txt' \
    --deepspeed zero3 \
    --log_completions true \
    --report_to wandb \
    --num_iterations 2 \
    --async_generate false \
    --beta 0.001 \
    --max_grad_norm 0.5
```
#### 实验现象

训练曲线如上图所示。
一共8k数据,训练了1268个step,Accuracy Reward收敛到0.5左右,completion_length基本收敛到200token左右,reward_std最终在0.2左右震荡。
以下是训练后的模型completion示例:
```
User:
"Based on the map, which state falls into the lowest percentage range of lots, and what is that percentage range?"
Assistant:
"<think>\nThe image provided shows a color-coded map of the USA indicating different ranges of lot percentages across various states. According to the legend at the bottom, the lowest percentage range (6.0% - 6.5%) is represented by white. In the image, Alabama (AL) is shaded in white, which corresponds to the 6.0% - 6.5% category. Therefore, based on the map, the state that falls into the lowest percentage range of lots is Alabama, with the percentage range of 6.0% - 6.5%.\nTherefore, the answer is 6.0% - 6.5%.\n</think>\n<answer>Alabama</answer>"
```
================================================
FILE: docs/source/BestPractices/GRPO.md
================================================
# GRPO完整实验流程
本文从较为简单的数学任务 Countdown Game 出发,从数据集定义、奖励函数定义和GRPO训练几个步骤介绍完整的GRPO训练流程。任务定义和训练参数等参考了 [mini-deepseek-r1](https://github.com/philschmid/deep-learning-pytorch-huggingface/blob/main/training/mini-deepseek-r1-aha-grpo.ipynb)。
## 任务与数据集定义
Countdown Game 的任务目标是根据给定的几个数字和加减乘除四种运算,得到目标数字,因此,我们定义数据集如下:
```python
class CoundownTaskPreprocessor(ResponsePreprocessor):
    """Dataset preprocessor that turns a Countdown sample into a GRPO prompt."""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        """Build the query from ``nums`` and the target, then delegate to the parent ``preprocess``.

        Args:
            row: One dataset sample. ``'nums'`` is required; ``'response'`` is removed
                from the row and its value is stored under ``'target'``.

        Returns:
            The result of ``ResponsePreprocessor.preprocess`` for the updated row.
        """
        numbers = row['nums']
        # Pop 'response' so it is no longer present under that key; it is re-added as 'target' below.
        target = row.pop('response', None)
        query = f"""
Using the numbers {numbers}, create an equation that equals {target}.
You can use basic arithmetic operations (+, -, *, /) and each number can only be used once.
Show your work in <think> </think> tags. And return the final equation and answer in <answer> </answer> tags,
for example <answer> (1 + 2) / 3 * 4 = 4 </answer>.
"""
        row.update({'target': target, 'query': query})
        return super().preprocess(row)


# Register the dataset so that it can be selected with `--dataset zouxuhong/Countdown-Tasks-3to4`.
register_dataset(
    DatasetMeta(
        ms_dataset_id='zouxuhong/Countdown-Tasks-3to4',
        subsets=['default'],
        preprocess_func=CoundownTaskPreprocessor(),
        tags=['math']))
```
通过 template, 使用 numbers 和 target 完成任务定义,并给到 query 字段供模型采样使用。同时,我们需要保留 nums 和 target 两个字段,用于后续的奖励函数计算。
## 奖励函数定义:
本任务使用的奖励函数有两个,一个是 Deepseek-R1 中提到的格式奖励函数,另一是 Countdown Game 的准确性奖励函数。前者已经在swift中内置,通过 `--reward_funcs format` 可以直接使用,而后者需要我们自己定义,在这里我们使用 external_plugin 的方式定义准确性奖励函数,将代码放在`swift/examples/train/grpo/plugin/plugin.py`中。
在这里,奖励函数的输入包括 completions、target 和 nums 三个字段,分别表示模型生成的文本、目标答案和可用的数字。每个都是list,支持多个 completion 同时计算。注意,在这里,除了 completions 之外的参数都是数据集中定义的字段透传而来,如果有任务上的变动,可以分别对数据集和奖励函数做对应的改变即可。
```python
class CountdownORM(ORM):
    """Accuracy reward for the Countdown task.

    A completion earns 1.0 when the expression inside ``<answer>...</answer>``
    uses every provided number exactly once, contains only digits, ``+ - * /``,
    parentheses, dots and whitespace, and evaluates to the target; otherwise 0.0.
    """

    def __call__(self, completions, target, nums, **kwargs) -> List[float]:
        """
        Evaluates completions based on Mathematical correctness of the answer
        Args:
            completions (list[str]): Generated outputs
            target (list[str]): Expected answers
            nums (list[str]): Available numbers
        Returns:
            list[float]: Reward scores
        """
        rewards = []
        for completion, gt, numbers in zip(completions, target, nums):
            try:
                rewards.append(self._score(completion, gt, numbers))
            except Exception:
                # If evaluation fails, reward is 0
                rewards.append(0.0)
        return rewards

    @staticmethod
    def _score(completion, gt, numbers) -> float:
        """Score a single completion; any exception is handled by the caller as 0.0."""
        # Check if the format is correct
        match = re.search(r"<answer>(.*?)<\/answer>", completion)
        if match is None:
            return 0.0
        # Extract the "answer" part from the completion
        equation = match.group(1).strip()
        if '=' in equation:
            equation = equation.split('=')[0]
        # Extract all numbers from the equation
        used_numbers = [int(n) for n in re.findall(r'\d+', equation)]
        # Check if all numbers are used exactly once
        if sorted(used_numbers) != sorted(numbers):
            return 0.0
        # Only numbers, operators, parentheses, dots and whitespace are allowed
        allowed_pattern = r'^[\d+\-*/().\s]+$'
        if not re.match(allowed_pattern, equation):
            return 0.0
        # The character whitelist also lets `**` and `//` through. Neither is one of
        # the four permitted operations, and a power tower such as `70**5**42**30`
        # can stall the trainer or exhaust memory, so reject both explicitly.
        if '**' in equation or '//' in equation:
            return 0.0
        # NOTE(security): `eval` runs on model-generated text. It is confined by the
        # whitelist above and by empty builtins; keep both guards if this is modified.
        result = eval(equation, {'__builtins__': None}, {})
        # Check if the equation is correct and matches the ground truth
        return 1.0 if abs(float(result) - float(gt)) < 1e-5 else 0.0


# Expose the reward under the name used by `--reward_funcs external_countdown`.
orms['external_countdown'] = CountdownORM
```
## GRPO训练实验记录
首先贴上GRPO的公式:
$$
{\scriptstyle
\begin{aligned}
\mathcal{J}_{G R P O}(\theta) & =\mathbb{E}\left[q \sim P(Q),\left\{o_i\right\}_{i=1}^G \sim \pi_{\theta_{o l d}}(O \mid q)\right] \\
& \frac{1}{G} \sum_{i=1}^G \frac{1}{\left|o_i\right|} \sum_{t=1}^{\left|o_i\right|}\left\{\min \left[\frac{\pi_\theta\left(o_{i, t} \mid q, o_{i,<t}\right)}{\pi_{\theta_{o l d}}\left(o_{i, t} \mid q, o_{i,<t}\right)} \hat{A}_{i, t}, \operatorname{clip}\left(\frac{\pi_\theta\left(o_{i, t} \mid q, o_{i,<t}\right)}{\pi_{\theta_{o l d}}\left(o_{i, t} \mid q, o_{i,<t}\right)}, 1-\varepsilon, 1+\varepsilon\right) \hat{A}_{i, t}\right]-\beta \mathbb{D}_{K L}\left[\pi_\theta| | \pi_{r e f}\right]\right\}
\end{aligned}
}
$$
### 训练参数:
我们选取 Qwen2.5-3B-Instruct 作为基础模型进行训练,选取 Instruct 而不是基模的主要原因是可以更快地获取 format reward。我们在三卡 GPU 上进行实验,因此vllm的推理部署在最后一张卡上,而进程数设置为2,在剩下两张卡上进行梯度更新。
由于任务较为简单,我们设置 max_completion_length 和 vllm_max_model_len 为1024,如果有更复杂的任务,可以适当加大模型输出长度,但请注意,**这两个参数越大,模型训练需要的显存越多,训练速度越慢,单个step的训练时间与max_completion_length呈现线性关系**。
在我们的实验中,总batch_size为
```
num_processes * per_device_train_batch_size * gradient_accumulation_steps = 2 * 8 * 8 = 128
```
注意,这里单卡batch_size设置也与显存息息相关,请根据显存上限设置一个合适的值。同时,还有一个公式,即总的steps数量:$num\_steps = epochs \times len(datasets) \times num\_generations \div batch\_size$,需要根据这个来合理规划训练的学习率和warmup设置。
最后比较重要的设置是学习率和 beta,学习率比较好理解,而beta则是以上公式的 $\beta$,即KL散度的梯度的权重。这两个参数设置的越大,模型收敛原则上更快,但训练往往会不稳定。经过实验,我们分别设置为 `5e-7` 和 `0.001`。在实际训练中,请根据是否出现不稳定的震荡情况适当调整这两个参数。
对于KL散度,社区有很多的讨论,可以参考[为什么GRPO坚持用KL散度](https://zhuanlan.zhihu.com/p/25862547100)。
其他参数的设置,没有做太多探讨,所以这里不进行详细说明。
```bash
# Start the vLLM rollout server on the last GPU (index 2).
CUDA_VISIBLE_DEVICES=2 \
swift rollout \
    --model Qwen/Qwen2.5-3B-Instruct
```
```bash
# GRPO training on GPUs 0 and 1 (2 processes); rollouts are requested from the
# vLLM server at 127.0.0.1:8000.
CUDA_VISIBLE_DEVICES=0,1 \
WANDB_API_KEY=your_wandb_key \
NPROC_PER_NODE=2 \
swift rlhf \
    --rlhf_type grpo \
    --model Qwen/Qwen2.5-3B-Instruct \
    --external_plugins examples/train/grpo/plugin/plugin.py \
    --reward_funcs external_countdown format \
    --use_vllm true \
    --vllm_mode server \
    --vllm_server_host 127.0.0.1 \
    --vllm_server_port 8000 \
    --tuner_type full \
    --torch_dtype bfloat16 \
    --dataset 'zouxuhong/Countdown-Tasks-3to4#50000' \
    --load_from_cache_file true \
    --max_length 2048 \
    --max_completion_length 1024 \
    --num_train_epochs 1 \
    --per_device_train_batch_size 8 \
    --per_device_eval_batch_size 8 \
    --learning_rate 5e-7 \
    --gradient_accumulation_steps 8 \
    --eval_steps 500 \
    --save_steps 100 \
    --save_total_limit 20 \
    --logging_steps 1 \
    --output_dir output/GRPO_COUNTDOWN \
    --warmup_ratio 0.01 \
    --dataloader_num_workers 4 \
    --num_generations 8 \
    --temperature 1.0 \
    --system 'You are a helpful assistant. You first thinks about the reasoning process in the mind and then provides the user with the answer.' \
    --deepspeed zero3 \
    --log_completions true \
    --report_to wandb \
    --beta 0.001 \
    --num_iterations 1
```
### 实验现象

从训练了2000 step的实验结果来看,有几点观察:
1. 自定义的CountdownORM在不断增加,证明模型学习到了如何完成这一任务,最终任务成功率在0.5左右。
2. Format Reward整体处在较高的水平,在0.9-1左右震荡。
3. reward_std 最终在0.2-0.3左右,仍未完全收敛,模型仍然有继续提升reward的空间。
4. 观察模型输出可以发现,相当一部分失败案例是由于模型简单加减乘除做不对,因此模型最终的收敛方向也变成了逐步运算上,如果换成更大的模型,可能有不同的收敛方向。
5. completion length呈现先下降再增加,在200 step左右最低,然后逐渐上升到350左右。证明模型经历了思考方式的转变。
6. 模型的思考方式在不断变化,最终模型发现做排列组合并逐步运算能更好地完成这一任务,最终的采样输出也大多是这一格式,以下是不同step的completion变化:
step 1:
```
"<think>\nWe need to create an equation using the given numbers [70, 5, 42, 30] exactly once and make it equal to 77. \n\nFirst, let's consider the possibility of multiplication or division to get closer to 77. Notice that 77 is close to 70 and 42. If we can use these numbers together, it might work.\n\nMultiplying 70 by 1.1 would give us approximately 77, which is very close. So, let's try to make 1.1 using the remaining numbers 5 and 30. \nIf we take 30/5 = 6, and then 5/6 = 0.8333..., this isn't useful here, so let's look at other possibilities.\n\nNext, let's consider if there's another way to achieve 77 using addition, subtraction, or another combination of our numbers. Given the numbers provided, it seems that multiplication or division will be key. \n\nWe can try subtracting 5 from 30 to get 25. Then, if we divide 42 by 25, we don't get a whole number, but we can round it to the nearest whole number, 1. Now, we need to make the equation fit 77. Let's multiply 25 by 3 and add 7 (since 77 = 25*3 + 7). However, we only have one 7 available and we need to find a way to use the remaining numbers to get 7.\n\nLet's try: \n70 - 42 = 28 \nThen, 28 + 5 = 33 \nNext, 33 + 30 = 63 \nFinally, 77 - 63 = 14, which doesn't help.\n\nConsidering different combinations, one viable way is:\n(70 - 42) * (30 / 5) = 28 * 6 = 168 / 2 = 84 / 2 = 42 / 2 = 21 / 3 = 7\n\nSo, (70 - 42) * (30 / 5) = 77.\n</think>\n<answer>(70 - 42) * (30 / 5) = 77</answer>"
```
step 200:
```
"<think>\nWe need to use all the given numbers in a way that the resultant value of the calculation is 99. Given these restrictions, it's clear we should look into how the numbers relate and the mathematical operations that will bring them together to meet our condition.\n\nFirstly, let's evaluate the sums, differences, and multiplication/division among the numbers and their relations to see how this could lead us towards an outcome of 99.\n- Summing the three numbers: 96 + 70 + 73 = 239, which is far from 99. This shows addition isn't practical.\n- Subtracting numbers: For instance, subtracting the lowest two (i.e., 73 - 70 = 3) and combining with the highest one (96) does not work. Also, there is no other combination of subtraction that would result in the number needing to add up to 99.\n- Multiplying or dividing the numbers alone wouldn’t reach to any near solution since none of the products or division results of the numbers themselves is very close to 99. We might think more carefully about the arrangement of them using a combination of the operations provided.\nTherefore, considering the possible arrangement, a combination like taking (73 - 70 + 96) may produce an effective solution as this will sum the values in such a manner that it aligns with the goal of 99.\n\nSo we conclude that the operation: 73 - 70 + 96 gives the right sequence to equal 99.\n</think>\n\n<answer>73 - 70 + 96 = 99</answer>"
```
step 500:
```
"<think>\nLet's begin by exploring various possible combinations of the given numbers using the arithmetic operators we have access to.\n\n- If I add the numbers, I get: 56 + 73 + 51 = 180 which is not equal to our target number (68)\n- If I subtract one number from another I will still exceed our target value as well since the maximum difference is: |(difference between max-min):| | (73 - 51) or | (73 - 56), etc.|\n- Let's multiply the numbers individually and see if the division operation can bring us closer to the goal:\n - (73 * 51) - 56 = 3723 - 56 = 3667 which isn't the goal either.\n - The only operator left is addition and subtraction; I'll try the combination: \n ((73 - 56) + 51) gives me (17 + 51) = 68.\n \nThus, our solution fits the provided problem constraints and meets the required outcome.\n</think>\n<answer>(73 - 56) + 51 = 68</answer>"
```
step 1000:
```
"<think>\nWe need to form an expression using the given numbers (50, 86, and 49) such that it equals 85. Let's try different combinations:\n\n(86 - 50) + 49 = 76 + 49 = 125 (too big)\n(86 - 49) + 50 = 37 + 50 = 87 (too big)\n(86 - 50) + 49 = 37 + 49 = 86 (too big)\n(86 + 50) - 49 = 136 - 49 = 87 (too big)\n(86 - 50) + (49 - x)? Not working!\n(50 + 86) - 49: this is 136 which isn't our aim.\n50 + (86 - 49) = 50 + 37 = 87 which wasn't correct.\n50 - 86 + 49 = -16 + 49: This doesn't match up as well.\n86 - (50 + 49) = 86 - 99 which doesn't work too.\n\nWe try one more:\n(86 + 49) - 50 = 135 - 50 = 85!\n\nThat's it!\n</think>\n\n<answer>(86 + 49) - 50 = 85</answer>"
```
step 2000:
```
"<think>\nWe need to form an expression using the numbers 58, 9, 78, and 44 which evaluates to 73. Let's try different combinations:\n(78 - 58) + (9 + 44) = 10 + 53 = 63 (too low)\n(78 - 58) + (9 - 44) = 20 - 35 = -15 (too low)\n(78 - 58) + (44 - 9) = 20 + 35 = 55 (too low)\n(78 + 58) - (9 + 44) = 136 - 53 = 83 (too high)\n(78 + 58) - (9 - 44) = 136 + 35 = 171 (too high)\n(78 + 58) - (44 + 9) = 136 + 53 = 189 (too high)\n(78 + 9) - (58 + 44) = 87 - 102 = -15 (too low)\n(78 + 9) - (58 - 44) = 87 - 14 = 73\n\nSo our solution is: (78 + 9) - (58 - 44) = 73</think>\n\n<answer>(78 + 9) - (58 - 44) = 73</answer>"
```
附learning_rate和beta分别取值1e-6和0.04的不稳定实验记录,模型在step 200左右出现了震荡,format和CountdownORM瞬间走低:

================================================
FILE: docs/source/BestPractices/MLLM-Registration.md
================================================
# 注册多模态模型最佳实践
本文将介绍如何在ms-swift中注册多模态模型,并成功推理和训练。本文将以Qwen2.5-Omni为例子,注册新的model_type和template `my_qwen2_5_omni`,并支持文本、图片、视频和音频的训练。由于Qwen2.5-Omni已经在ms-swift中注册,我们可以通过显式指定model_type和template来使用我们自定义的部分。
## 环境准备
```shell
# 避免未来出现与文档的不兼容情况
pip install "ms-swift>=4.0"
pip install "transformers==4.57.*" "qwen_omni_utils==0.0.8"
```
## 注册模型
第一步,我们需要注册模型,来获取模型和processor。
```python
from transformers import PretrainedConfig, PreTrainedModel
from swift.model import (Model, ModelGroup, ModelMeta, MultiModelKeys, get_model_processor, register_model,
register_model_arch, ModelLoader)
from swift.model.models.qwen import patch_qwen_vl_utils
from swift.model.patcher import patch_get_input_embeddings
from swift.model.utils import use_submodel_func
from swift.utils import get_env_args, Processor
register_model_arch(
    MultiModelKeys(
        'my_qwen2_5_omni',
        # `freeze_llm`, `freeze_vit` and `freeze_aligner` act on the module prefixes listed below.
        # Example (full-parameter training): with `freeze_vit=True`, parameters of layers whose names
        # start with `thinker.audio_tower` or `thinker.visual` are frozen.
        # Example (LoRA training): with `freeze_vit=False`, LoRA is additionally attached to Linear
        # layers whose names start with `thinker.audio_tower` or `thinker.visual`.
        language_model=['thinker.model', 'thinker.lm_head'],
        vision_tower=['thinker.audio_tower', 'thinker.visual'],
        aligner=['thinker.audio_tower.proj', 'thinker.visual.merger'],
        # Modules listed under `generator` are never trained; they always stay frozen.
        # To keep `thinker.audio_tower` and `thinker.audio_tower.proj` permanently frozen, move them
        # into `generator` and remove them from `vision_tower` and `aligner`.
        generator=['talker', 'token2wav'],
    ))
class Qwen2_5OmniLoader(ModelLoader):
    """Loader that supplies the config, processor and model for `my_qwen2_5_omni`."""

    def get_config(self, model_dir: str) -> PretrainedConfig:
        """Load the Qwen2.5-Omni config, optionally overriding `enable_audio_output` from the environment."""
        from transformers import Qwen2_5OmniConfig
        cfg = Qwen2_5OmniConfig.from_pretrained(model_dir, trust_remote_code=True)
        audio_output_override = get_env_args('ENABLE_AUDIO_OUTPUT', bool, None)
        if audio_output_override is None:
            # No override requested: keep the value stored in the checkpoint config.
            return cfg
        cfg.enable_audio_output = audio_output_override
        return cfg

    def get_processor(self, model_dir: str, config: PretrainedConfig) -> Processor:
        """Load the Qwen2.5-Omni processor and patch `qwen_omni_utils.vision_process`."""
        from transformers import Qwen2_5OmniProcessor
        from qwen_omni_utils import vision_process
        omni_processor = Qwen2_5OmniProcessor.from_pretrained(model_dir, trust_remote_code=True)
        # Lets constants of the qwen_omni_utils library (e.g. `MAX_PIXELS`) be
        # controlled through environment variables.
        patch_qwen_vl_utils(vision_process)
        return omni_processor

    def get_model(self, model_dir: str, config: PretrainedConfig, processor: Processor,
                  model_kwargs) -> PreTrainedModel:
        """Instantiate the model through the parent loader and apply Omni-specific patches."""
        from transformers import Qwen2_5OmniForConditionalGeneration
        print('Run my_qwen2_5_omni...')
        # Fall back to the Omni generation class when no model class was chosen explicitly.
        self.auto_model_cls = self.auto_model_cls or Qwen2_5OmniForConditionalGeneration
        omni_model = super().get_model(model_dir, config, processor, model_kwargs)

        # For consistency across multimodal models, route forward/generate to the
        # `thinker` submodule; the remaining parts are handled separately.
        use_submodel_func(omni_model, 'thinker')

        # Avoid in-place operations on a leaf variable during training
        # (parts of input_embeds get replaced by image embeddings).
        patch_get_input_embeddings(omni_model.thinker.visual, 'patch_embed')

        # Model/config tweaks; usually unnecessary, adjust per model if
        # training or inference raises errors.
        omni_model.config.keys_to_ignore_at_inference += ['hidden_states', 'attention_mask']
        omni_model.config.talker_config.pad_token_id = None
        return omni_model
register_model(
    ModelMeta(
        'my_qwen2_5_omni',
        [
            ModelGroup([
                Model('Qwen/Qwen2.5-Omni-3B', 'Qwen/Qwen2.5-Omni-3B'),
                Model('Qwen/Qwen2.5-Omni-7B', 'Qwen/Qwen2.5-Omni-7B'),
            ]),
        ],
        # Loader class used to obtain the model and the processor.
        Qwen2_5OmniLoader,
        template='my_qwen2_5_omni',
        is_multimodal=True,  # whether this is a multimodal model
        model_arch='my_qwen2_5_omni',  # usually only set for multimodal models
        # Used to match the model_type automatically
        architectures=['Qwen2_5OmniModel', 'Qwen2_5OmniForConditionalGeneration'],
        # Dependency-version hints shown to the user (may be removed)
        requires=['transformers>=4.50', 'soundfile', 'qwen_omni_utils', 'decord'],
        # Informational tags shown to the user (may be removed)
        tags=['vision', 'video', 'audio'],
        # Extra files that must be saved for full-parameter training / merge-lora
        additional_saved_files=['spk_dict.pt'],
    ))

if __name__ == '__main__':
    # Testing and debugging
    model, processor = get_model_processor('Qwen/Qwen2.5-Omni-7B', model_type='my_qwen2_5_omni')
```
## 注册模板
第二步,我们需要注册模板,来自定义如何将文本、图片、视频和音频进行预处理(`_encode`和`_data_collator`方法)。这是ms-swift支持多模态模型训练的关键模块。预处理方式请参考transformers推理实现,并进行对齐。
template的功能如下:
1. 支持正常推理与训练,预处理文本和多模态信息,并支持grounding任务。
2. 支持padding_free和packing训练。
3. 支持混合模态数据训练。
```python
from functools import partial
from typing import Any, Dict, List, Literal, Optional
import torch
from transformers.integrations import is_deepspeed_zero3_enabled
from swift import get_model_processor
from swift.template import StdTemplateInputs, Template, TemplateMeta, get_template, register_template
from swift.template.utils import Context, findall
from swift.template.vision_utils import load_audio
from swift.utils import Processor, get_env_args, get_logger, get_packed_seq_params, is_deepspeed_enabled, to_float_dtype
logger = get_logger()
class Qwen2_5OmniTemplate(Template):
use_model = True # 是否在预处理的过程中需要model参与
# 需要注意是:并不是所有的多模态模型都能支持padding_free/packing。`transformers`库内的模型通常可以支持
support_padding_free = True # 是否支持padding_free和packing(多模态模型)
norm_bbox = 'none' # grounding任务使用绝对坐标还是norm1000坐标
# 这里的tokens将不会被裁剪(例如设置`--truncation_strategy left/right`)
# 并会使用简略方式打印(调用`template.safe_decode`)
placeholder_tokens = ['<|IMAGE|>', '<|AUDIO|>', '<|VIDEO|>']
def init_processor(self, processor: Processor) -> None:
    """Initialize the processor and cache the extra constants this template needs."""
    if processor is None:
        return
    super().init_processor(processor)
    from transformers.models.qwen2_5_omni.processing_qwen2_5_omni import Qwen2_5OmniProcessorKwargs

    # Chunking constants come from the processor's default video kwargs.
    video_defaults = Qwen2_5OmniProcessorKwargs._defaults['videos_kwargs']
    self.seconds_per_chunk = video_defaults['seconds_per_chunk']
    self.position_id_per_seconds = video_defaults['position_id_per_seconds']

    # Settings that may be overridden through environment variables.
    self.use_audio_in_video = get_env_args('use_audio_in_video', bool, False)
    fallback_rate = self.processor.feature_extractor.sampling_rate
    self.sampling_rate = get_env_args('sampling_rate', int, fallback_rate)
    # See the custom grounding-dataset documentation for the meaning of `QWENVL_BBOX_FORMAT`.
    self.bbox_format = get_env_args('QWENVL_BBOX_FORMAT', str, 'legacy')
def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                inputs: StdTemplateInputs) -> List[Context]:
    """Load the media item at `index` and return the model-specific replacement for its generic tag.

    For example the image tag `<image>` becomes `<|vision_bos|><|IMAGE|><|vision_eos|>`.
    Media loading could equally be done in `_encode`; it is done here for convenience.
    """
    from qwen_omni_utils import fetch_image, fetch_video

    if media_type == 'image':
        inputs.images[index] = fetch_image({'image': inputs.images[index]})
        return ['<|vision_bos|><|IMAGE|><|vision_eos|>']

    if media_type == 'audio':
        # Audio is left untouched in the 'vllm' inference mode.
        if self.mode != 'vllm':
            inputs.audios[index] = load_audio(inputs.audios[index], self.sampling_rate)
        return ['<|audio_bos|><|AUDIO|><|audio_eos|>']

    if media_type == 'video':
        source = inputs.videos[index]
        frames = fetch_video({'video': source})
        if isinstance(frames, torch.Tensor):
            frames = frames.to(torch.uint8)
        inputs.videos[index] = frames

        if not self.use_audio_in_video:
            return ['<|vision_bos|><|VIDEO|><|vision_eos|>']

        # Also extract the video's audio track and queue it as an audio input,
        # tagged with 'video' so it can be told apart from standalone audio.
        import librosa
        if source.startswith('http://') or source.startswith('https://'):
            import audioread
            source = audioread.ffdec.FFmpegAudioFile(source)
        waveform = librosa.load(source, sr=self.sampling_rate)[0]
        inputs.audios.insert(inputs.audio_idx, (waveform, 'video'))
        inputs.audio_idx += 1
        return ['<|vision_bos|><|audio_bos|><|VIDEO|><|audio_eos|><|vision_eos|>']
def replace_ref(self, ref: str, index: int, inputs: StdTemplateInputs) -> List[Context]:
    """Return the replacement for the generic grounding tag `<ref-object>`."""
    if self.bbox_format != 'legacy':
        # Non-legacy formats emit the reference text as-is.
        return [ref]
    return [f'<|object_ref_start|>{ref}<|object_ref_end|>']
def replace_bbox(self, bbox: List[int], index: int, inputs: StdTemplateInputs) -> List[Context]:
    """Return the replacement for the generic grounding tag `<bbox>`."""
    if self.bbox_format != 'legacy':
        # Non-legacy formats emit the plain string form of the coordinate list.
        return [str(bbox)]
    return [f'<|box_start|>{self._get_bbox_str(bbox)}<|box_end|>']
def packing_row(self, row: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Pack several encoded samples into one, with custom mrope `position_ids`.

    Overriding this is normally unnecessary; it is done here so that the
    position ids are computed per sample and then concatenated.
    """

    def _sample_position_ids(sample):
        # Work on a copy so the caller's sample keeps its original `input_ids`.
        batched = sample.copy()
        batched['input_ids'] = torch.tensor(batched['input_ids'])[None]
        return self._get_position_ids(batched)

    per_sample_ids = [_sample_position_ids(sample) for sample in row]
    result = super().packing_row(row)
    result['position_ids'] = torch.concat(per_sample_ids, dim=-1)
    return result
def _get_new_tokens_use_audio_in_video(self, i, *, video_grid_thw, video_second_per_grid, audio_lengths,
                                       video_token_id, audio_token_id):
    """Helper for `use_audio_in_video=True`: build the expanded tokens for the i-th video.

    Video and audio positions are split into chunks with the processor's
    `get_chunked_index`, and the output alternates one video chunk and one
    audio chunk until both lists are exhausted.
    """
    spatial_merge = self.processor.image_processor.merge_size
    thw = video_grid_thw[i]
    merged_h = thw[1] // spatial_merge
    merged_w = thw[2] // spatial_merge

    audio_positions = torch.arange(audio_lengths[i])
    # One index per temporal grid step, repeated across the merged spatial grid.
    frame_ids = torch.arange(thw[0]).reshape(-1, 1, 1)
    video_positions = torch.broadcast_to(frame_ids, (frame_ids.shape[0], merged_h, merged_w)).reshape(-1)
    video_positions = (video_positions * video_second_per_grid[i] * self.position_id_per_seconds)

    chunk_span = int(self.position_id_per_seconds * self.seconds_per_chunk)
    video_chunks = self.processor.get_chunked_index(video_positions, chunk_span)
    audio_chunks = self.processor.get_chunked_index(audio_positions, chunk_span)

    new_tokens = []
    n_video, n_audio = len(video_chunks), len(audio_chunks)
    for j in range(max(n_video, n_audio)):
        if j < n_video:
            video_span = video_chunks[j][1] - video_chunks[j][0]
            new_tokens += video_token_id * video_span
        if j < n_audio:
            audio_span = audio_chunks[j][1] - audio_chunks[j][0]
            new_tokens += audio_token_id * audio_span
    return new_tokens
def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
    """Convert text/images/audios/videos into input_ids, labels, loss_scale and multimodal tensors.

    The logic can usually be adapted from the preprocessing code of the model
    itself. Recommendation: align inference first, then move on to training.
    """
    encoded = Template._encode(self, inputs)  # handles the text part; see the custom-model docs
    logger.info_once('Run qwen2_5_omni template')
    processor = self.processor

    # `replace_tag` stores audio extracted from a video as `(waveform, 'video')`.
    # Unwrap those entries BEFORE calling the processor so that it only receives
    # raw waveforms, and remember which ones came from a video.
    # video_audios_mask: False for standalone audio, True for audio taken from a video.
    video_audios_mask = []
    for i, audio in enumerate(inputs.audios):
        if isinstance(audio, tuple) and audio[1] == 'video':
            inputs.audios[i] = audio[0]
            video_audios_mask.append(True)
        else:
            video_audios_mask.append(False)
    video_audios_mask = torch.tensor(video_audios_mask)

    # Build the multimodal tensors.
    media_inputs = processor(
        text='',
        audio=inputs.audios or None,
        images=inputs.images or None,
        videos=inputs.videos or None,
        do_resize=False,
        return_tensors='pt')
    # The processor's input_ids/attention_mask are discarded: it does not produce `labels`.
    media_inputs.pop('input_ids')
    media_inputs.pop('attention_mask')
    media_inputs = to_float_dtype(media_inputs, self.model_info.torch_dtype)
    input_ids = encoded['input_ids']
    labels = encoded['labels']
    loss_scale = encoded.get('loss_scale', None)

    # Audio modality
    audio_token_id = self._tokenize('<|AUDIO|>')
    idx_list = findall(input_ids, audio_token_id)  # positions of every audio placeholder
    feature_attention_mask = media_inputs.get('feature_attention_mask')
    if feature_attention_mask is not None:
        audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
        audio_lengths = ((audio_feature_lengths - 1) // 2 + 1 - 2) // 2 + 1
    else:
        audio_lengths = None
    audio_lengths_origin = audio_lengths
    if idx_list:
        # Drop the audio that belongs to videos; it is handled in the video branch below.
        if self.use_audio_in_video:
            audio_lengths = audio_lengths[~video_audios_mask]

        def _get_new_audio_tokens(i):
            return audio_token_id * audio_lengths[i]

        # Expand each placeholder token to its real length.
        input_ids, labels, loss_scale = self._extend_tokens(input_ids, labels, loss_scale, idx_list,
                                                            _get_new_audio_tokens)

    # Image and video modalities
    for media_type in ['image', 'video']:
        token = f'<|{media_type.upper()}|>'
        token_id = self._tokenize(token)
        idx_list = findall(input_ids, token_id)
        if idx_list:
            merge_size = processor.image_processor.merge_size
            media_grid_thw = media_inputs.get(f'{media_type}_grid_thw')
            if media_type == 'video' and self.use_audio_in_video:
                # Only the audio tracks that were extracted from videos.
                audio_lengths = audio_lengths_origin[video_audios_mask]
                video_second_per_grid = media_inputs['video_second_per_grid']
                _get_new_tokens_use_audio_in_video = partial(
                    self._get_new_tokens_use_audio_in_video,
                    video_grid_thw=media_grid_thw,
                    video_second_per_grid=video_second_per_grid,
                    audio_lengths=audio_lengths,
                    video_token_id=token_id,
                    audio_token_id=audio_token_id)
                input_ids, labels, loss_scale = self._extend_tokens(input_ids, labels, loss_scale, idx_list,
                                                                    _get_new_tokens_use_audio_in_video)
            else:

                def _get_new_tokens(i):
                    token_len = (media_grid_thw[i].prod() // (merge_size**2))
                    return token_id * token_len

                input_ids, labels, loss_scale = self._extend_tokens(input_ids, labels, loss_scale, idx_list,
                                                                    _get_new_tokens)

    encoded['input_ids'] = input_ids
    encoded['labels'] = labels
    encoded['loss_scale'] = loss_scale
    encoded.update(media_inputs)  # attach the multimodal tensors
    return encoded
def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
"""该函数通常用于解决混合模型训练zero2/zero3卡住的问题,
即有的进程为纯文本数据未过vit,有的进程含图片数据过了vit。这里将创建dummy_image来解决。
该函数将被注册在`model.forward`前的pre_forward_hook中。
该函数需返回 含多模态信息的input_embeds。
"""
if not self.is_training:
return inputs
input_ids = inputs['input_ids']
input_features = inputs.get('input_features')
feature_attention_mask = inputs.get('feature_attention_mask')
base_model = self.get_base_model(model)
inputs_embeds = base_model.thinker.model.embed_tokens(input_ids)
thinker_config = model.config.thinker_config
# 辅助函数,用于处理text/image/video混合模态数据场景。(内部会创建dummy_image)
inputs_embeds = self._get_inputs_embeds_hf(inputs_embeds, inputs, model.thinker.visual, self.processor,
thinker_config)
# 含audio的混合模态数据场景
if input_features is None:
if is_deepspeed_enabled() and not is_deepspeed_zero3_enabled():
# 注意: 由于transformers实现中,经过audio部分模型层的次数与audio数量相关
# 因此zero3在不同进程audios数不同场景下会卡住(需修改transformers代码修复)。此场景请使用zero2。
input_features = input_ids.new_zeros([1, 128, 128], dtype=model.thinker.audio_tower.dtype)
feature_attention_mask = input_ids.new_ones([1, 128], dtype=torch.bool)
audio_res = model.thinker.get_audio_features(input_features, feature_attention_mask)
# 兼容transformers 5.0
if hasattr(audio_res, 'last_hidden_state'):
audio_embeds = audio_res.last_hidden_state
else:
audio_embeds = audio_res
inputs_embeds = inputs_embeds + audio_embeds.mean() * 0.
else:
audio_res = model.thinker.get_audio_features(input_features, feature_attention_mask)
# 兼容transformers 5.0
if hasattr(audio_res, 'last_hidden_state'):
audio_embeds = audio_res.last_hidden_state
else:
audio_embeds = audio_res
audio_mask = (input_ids == thinker_config.audio_token_index).unsqueeze(-1).expand_as(inputs_embeds)
audio_embeds = audio_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(audio_mask, audio_embeds)
return {'inputs_embeds': inputs_embeds}
def _get_position_ids(self, inputs: Dict[str, Any]):
"""辅助函数,用来获取mrope的position_ids"""
feature_attention_mask = inputs.get('feature_attention_mask')
if feature_attention_mask is not None:
audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
else:
audio_feature_lengths = None
video_second_per_grid = inputs.pop('video_second_per_grid', None)
input_ids = inputs['input_ids']
attention_mask = inputs.get('attention_mask')
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
position_ids, _ = self.model.thinker.get_rope_index(
input_ids,
inputs.get('image_grid_thw'),
inputs.get('video_grid_thw'),
attention_mask,
self.use_audio_in_video,
audio_feature_lengths,
video_second_per_grid,
)
return self._concat_text_position_ids(position_ids) # 第一维为text_position_ids
def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
    """The `collate_fn` handed to the dataloader."""
    collated = super()._data_collator(batch, padding_to=padding_to)
    if not self.padding_free and self.is_training:
        # In the padding_free/packing scenario position_ids are handled in packing_row instead.
        collated['position_ids'] = self._get_position_ids(collated)
    if 'position_ids' not in collated:
        return collated

    # Create `packed_seq_params` to support padding_free/packing & flash-attn.
    stacked_position_ids = collated['position_ids']
    text_position_ids = stacked_position_ids[0]
    collated['position_ids'] = stacked_position_ids[1:]
    collated['text_position_ids'] = text_position_ids
    # https://github.com/huggingface/transformers/pull/40194
    collated.update(get_packed_seq_params(text_position_ids))
    return collated
def _data_collator_mm_data(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Collate the multimodal part of `_data_collator` (compatible with padding_free/packing)."""
    merged = super()._data_collator_mm_data(batch)

    seconds_per_grid = self.gather_list(batch, 'video_second_per_grid')
    if seconds_per_grid:
        merged['video_second_per_grid'] = seconds_per_grid

    # Collect the audio tensors of the rows that actually carry them.
    audio_features = []
    audio_masks = []
    for row in batch:
        if row.get('input_features') is not None:
            audio_features.append(row['input_features'])
        if row.get('feature_attention_mask') is not None:
            audio_masks.append(row['feature_attention_mask'])

    if audio_features:
        merged['input_features'] = torch.concat(audio_features)
        merged['feature_attention_mask'] = torch.concat(audio_masks)
    return merged
def generate(self, model, *args, **kwargs):
    """`TransformersEngine` calls template.generate for text generation; it is overridden here to customize it."""
    video_grid_thw = kwargs.get('video_grid_thw')
    if video_grid_thw is not None:
        # Forward the template setting only when video inputs are present.
        kwargs.update(use_audio_in_video=self.use_audio_in_video)
    return super().generate(model, *args, **kwargs)
# Register the template under the name 'my_qwen2_5_omni' and bind it to `Qwen2_5OmniTemplate`.
register_template(
    TemplateMeta(
        'my_qwen2_5_omni',
        prefix=[],
        prompt=['<|im_start|>user\n{{QUERY}}<|im_end|>\n<|im_start|>assistant\n'],
        chat_sep=['<|im_end|>\n'],
        suffix=['<|im_end|>'],
        system_prefix=['<|im_start|>system\n{{SYSTEM}}<|im_end|>\n'],
        default_system='You are a helpful assistant.',
        stop_words=['<|endoftext|>'],
        agent_template='hermes',
        template_cls=Qwen2_5OmniTemplate,
    ))
if __name__ == '__main__':
    # Test and debug: encode one training sample and print what the template produced.
    model, processor = get_model_processor('Qwen/Qwen2.5-Omni-7B', model_type='my_qwen2_5_omni')
    template = get_template(processor, template_type='my_qwen2_5_omni')
    messages = [
        {'role': 'user', 'content': '描述视频<video>与图片<image>内容。'},
        {'role': 'assistant', 'content': '一个小孩和一只猫咪。'},
    ]
    data = {
        'messages': messages,
        'videos': ['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4'],
        'images': ['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png'],
    }
    template.set_mode('train')
    encoded = template.encode(data)
    for field in ('input_ids', 'labels'):
        print(field + ': ' + template.safe_decode(encoded[field]))
    print('keys: ' + str(encoded.keys()))
```
## 推理对齐
接下来,你需要进行TransformersEngine与transformers的推理对齐。通常你需要对齐`input_ids`以及输出内容。你可以编写以下测试函数:
```python
import os
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info
from modelscope import snapshot_download
from swift.infer_engine import TransformersEngine, InferRequest, RequestConfig
import requests
def infer_hf():
    """Run the reference inference with plain transformers.

    Returns:
        tuple: the prompt `input_ids` of the first sample as a list, and the decoded
        text of the newly generated tokens.

    Raises:
        requests.HTTPError: if downloading the demo video fails.
    """
    model_dir = snapshot_download('Qwen/Qwen2.5-Omni-7B')
    model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
        model_dir, torch_dtype="auto", device_map="auto", attn_implementation='flash_attention_2')
    processor = Qwen2_5OmniProcessor.from_pretrained(model_dir)
    # The video is read with decord, which does not support URLs yet, so download it to a local file first.
    resp = requests.get('https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4')
    # Fail fast on an HTTP error instead of saving an error page as the video file.
    resp.raise_for_status()
    with open('_baby.mp4', 'wb') as f:
        f.write(resp.content)

    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "video", "video": "_baby.mp4"},
                {"type": "image", "image": "http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png"},
                {"type": "text", "text": "描述视频和图像。"},
            ],
        },
    ]

    USE_AUDIO_IN_VIDEO = False
    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
    inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True,
                       use_audio_in_video=USE_AUDIO_IN_VIDEO)
    inputs = inputs.to(model.device).to(model.dtype)
    text_ids = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO, thinker_do_sample=False,
                              return_audio=False)
    # Decode only the tokens generated after the prompt.
    text = processor.batch_decode(
        text_ids[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    return inputs['input_ids'][0].tolist(), text[0]
def test_my_qwen2_5_omni():
    """Run the same request through `TransformersEngine` with the custom registration.

    Returns:
        tuple: the `input_ids` encoded by the engine's template, and the response text.
    """
    engine = TransformersEngine('Qwen/Qwen2.5-Omni-7B', model_type='my_qwen2_5_omni', attn_impl='flash_attention_2')
    user_message = {
        "role": "user",
        "content": "<video><image>描述视频和图像。",
    }
    infer_request = InferRequest(
        messages=[user_message],
        videos=["https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4"],
        images=["http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png"],
    )
    request_config = RequestConfig(temperature=0, max_tokens=512)
    input_ids = engine.template.encode(infer_request)['input_ids']
    first_response = engine.infer([infer_request], request_config)[0]
    answer = first_response.choices[0].message.content
    return input_ids, answer
if __name__ == '__main__':
    # Enable debug mode, which prints the input_ids and generate_ids of `TransformersEngine.infer`.
    os.environ['SWIFT_DEBUG'] = '1'
    hf_ids, hf_text = infer_hf()
    swift_ids, swift_text = test_my_qwen2_5_omni()
    # Check that both input_ids and the response are aligned.
    assert hf_ids == swift_ids
    assert hf_text == swift_text
```
## 开始训练
使用python代码训练,这通常更容易debug:
```python
from swift import sft_main, SftArguments
import os
if __name__ == '__main__':
    os.environ['MAX_PIXELS'] = '1003520'
    # Training configuration, grouped by purpose and handed to SftArguments in one go.
    train_kwargs = dict(
        # Model, template and data.
        model='Qwen/Qwen2.5-Omni-7B',
        dataset=['AI-ModelScope/LaTeX_OCR#5000'],
        model_type='my_qwen2_5_omni',
        template='my_qwen2_5_omni',
        load_from_cache_file=True,
        split_dataset_ratio=0.01,
        # Tuner and precision.
        tuner_type='lora',
        torch_dtype='bfloat16',
        attn_impl='flash_attn',
        padding_free=True,
        # Optimization.
        num_train_epochs=1,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=1e-4,
        lora_rank=8,
        lora_alpha=32,
        target_modules=['all-linear'],
        freeze_vit=True,
        freeze_aligner=True,
        gradient_accumulation_steps=1,
        # Evaluation, checkpointing and logging.
        eval_steps=50,
        save_steps=50,
        save_total_limit=2,
        logging_steps=5,
        max_length=2048,
        output_dir='output',
        warmup_ratio=0.05,
        dataloader_num_workers=4,
        dataset_num_proc=1,
    )
    sft_main(SftArguments(**train_kwargs))
```
使用命令行训练:
```shell
# LoRA fine-tuning of Qwen2.5-Omni-7B with the custom registration loaded via --external_plugins.
# Runs on 4 GPUs (CUDA_VISIBLE_DEVICES=0,1,2,3, NPROC_PER_NODE=4) with DeepSpeed zero2.
# Resource note from the original: 4 * 35GiB (presumably GPU memory per card - confirm).
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
NPROC_PER_NODE=4 \
VIDEO_MAX_PIXELS=50176 \
FPS_MAX_FRAMES=12 \
MAX_PIXELS=1003520 \
swift sft \
--model Qwen/Qwen2.5-Omni-7B \
--model_type my_qwen2_5_omni \
--template my_qwen2_5_omni \
--external_plugins 'examples/custom/my_qwen2_5_omni/my_register.py' \
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#2000' \
'AI-ModelScope/LaTeX_OCR:human_handwrite#2000' \
'speech_asr/speech_asr_aishell1_trainsets:validation#2000' \
'swift/VideoChatGPT:all#2000' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--tuner_type lora \
--torch_dtype bfloat16 \
--attn_impl flash_attn \
--padding_free true \
--packing true \
--num_train_epochs 3 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--freeze_vit true \
--freeze_aligner true \
--gradient_accumulation_steps 1 \
--eval_steps 50 \
--save_steps 50 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 4096 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--dataset_num_proc 1 \
--deepspeed zero2
```
训练后对验证集进行推理(环境变量请与训练时保持一致):
```shell
# Streamed inference with the trained adapter on a single GPU.
# --load_data_args true reuses the data arguments stored with the checkpoint;
# replace output/vx-xxx/checkpoint-xxx with the real checkpoint directory.
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
CUDA_VISIBLE_DEVICES=0 \
VIDEO_MAX_PIXELS=50176 \
FPS_MAX_FRAMES=12 \
MAX_PIXELS=1003520 \
swift infer \
--adapters output/vx-xxx/checkpoint-xxx \
--stream true \
--max_new_tokens 512 \
--load_data_args true
```
使用以下命令将训练权重推送到 ModelScope:
```shell
# Push the trained adapter to the hub.
# Replace the checkpoint path, '<your-model-id>' and '<your-sdk-token>' with your own values.
swift export \
--adapters output/vx-xxx/checkpoint-xxx \
--push_to_hub true \
--hub_model_id '<your-model-id>' \
--hub_token '<your-sdk-token>'
```
================================================
FILE: docs/source/BestPractices/Metax-support.md
================================================
# Metax支持
## 1. 在 Metax 平台上使用 Swift
你可以选择构建自己的镜像,也可以直接拉取已有的预构建镜像。本文以拉取预构建镜像为例,演示如何在 Metax 上使用 ms-swift。
### 1.1. 启动 ms-swift 容器
```bash
# Pull the prebuilt ms-swift image for Metax.
docker pull mx-devops-acr-cn-shanghai.cr.volces.com/opensource/public-ai-release/maca/ms-swift:3.10.3-maca.ai3.3.0.16-torch2.6-py310-ubuntu22.04-amd64
# You can adjust the --privileged option as needed and mount only specific GPU cards.
# For more information, see the official documentation: https://developer.metax-tech.com
# The Metax GPU devices must be mounted via --device: --device=/dev/dri --device=/dev/mxcd
docker run -it --net=host --uts=host --ipc=host --privileged=true --group-add video \
--shm-size 100gb --ulimit memlock=-1 \
--security-opt seccomp=unconfined --security-opt apparmor=unconfined \
--device=/dev/dri --device=/dev/mxcd \
-v /root/workspace:/external \
--name swift_test \
mx-devops-acr-cn-shanghai.cr.volces.com/opensource/public-ai-release/maca/ms-swift:3.10.3-maca.ai3.3.0.16-torch2.6-py310-ubuntu22.04-amd64
```
## 2. 环境检查
### 2.1. 检查 Metax GPU 是否可用
得益于与 CUDA 的兼容性,我们可以像使用 NVIDIA GPU 一样检查 Metax 设备是否可用:
```python
import torch
# Metax devices are queried through PyTorch's regular CUDA API.
print(torch.cuda.is_available())
# Expected output: True
```
### 2.2. 检查 GPU 之间的 P2P 连接拓扑
```bash
mx-smi topo -m
# output
=================== MetaX System Management Interface Log ===================
Timestamp : Wed Feb 11 16:37:10 2026
Attached GPUs : 8
Device link type matrix
GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 Node Affinity CPU Affinity
GPU0 X MX MX MX NODE NODE NODE NODE 0 0-31,64-95
GPU1 MX X MX MX NODE NODE NODE NODE 0 0-31,64-95
GPU2 MX MX X MX NODE NODE NODE NODE 0 0-31,64-95
GPU3 MX MX MX X NODE NODE NODE NODE 0 0-31,64-95
GPU4 NODE NODE NODE NODE X MX MX MX 0 0-31,64-95
GPU5 NODE NODE NODE NODE MX X MX MX 0 0-31,64-95
GPU6 NODE NODE NODE NODE MX MX X MX 0 0-31,64-95
GPU7 NODE NODE NODE NODE MX MX MX X 0 0-31,64-95
Legend:
X = Self
SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
PIX = Connection traversing at most a single PCIe bridge
MX = Connection traversing MetaXLink
ETH = Connection traversing Eth
NA = Connection type is unknown
```
### 2.3. 查看 GPU 状态
```bash
mx-smi
# output
=================== MetaX System Management Interface Log ===================
Timestamp : Wed Feb 11 09:55:49 2026
Attached GPUs : 8
+---------------------------------------------------------------------------------+
| MX-SMI 2.2.9 Kernel Mode Driver Version: 3.4.4 |
| MACA Version: 3.3.0.15 BIOS Version: 1.30.0.0 |
|------------------+-----------------+---------------------+----------------------|
| Board Name | GPU Persist-M | Bus-id | GPU-Util sGPU-M |
| Pwr:Usage/Cap | Temp Perf | Memory-Usage | GPU-State |
|==================+=================+=====================+======================|
| 0 MetaX C500 | 0 Off | 0000:0e:00.0 | 0% Disabled |
| 57W / 350W | 35C P0 | 826/65536 MiB | Available |
+------------------+-----------------+---------------------+----------------------+
| 1 MetaX C500 | 1 Off | 0000:0f:00.0 | 0% Disabled |
| 58W / 350W | 37C P0 | 826/65536 MiB | Available |
+------------------+-----------------+---------------------+----------------------+
| 2 MetaX C500 | 2 Off | 0000:10:00.0 | 0% Disabled |
| 58W / 350W | 36C P0 | 826/65536 MiB | Available |
+------------------+-----------------+---------------------+----------------------+
| 3 MetaX C500 | 3 Off | 0000:12:00.0 | 0% Disabled |
| 60W / 350W | 35C P0 | 826/65536 MiB | Available |
+------------------+-----------------+---------------------+----------------------+
| 4 MetaX C500 | 4 Off | 0000:35:00.0 | 0% Disabled |
| 57W / 350W | 33C P0 | 826/65536 MiB | Available |
+------------------+-----------------+---------------------+----------------------+
| 5 MetaX C500 | 5 Off | 0000:36:00.0 | 0% Disabled |
| 56W / 350W | 34C P0 | 826/65536 MiB | Available |
+------------------+-----------------+---------------------+----------------------+
| 6 MetaX C500 | 6 Off | 0000:37:00.0 | 0% Disabled |
| 55W / 350W | 34C P0 | 826/65536 MiB | Available |
+------------------+-----------------+---------------------+----------------------+
| 7 MetaX C500 | 7 Off | 0000:38:00.0 | 0% Disabled |
| 56W / 350W | 36C P0 | 826/65536 MiB | Available |
+------------------+-----------------+---------------------+----------------------+
+---------------------------------------------------------------------------------+
| Process: |
| GPU PID Process Name GPU Memory |
| Usage(MiB) |
|=================================================================================|
| no process found |
+---------------------------------------------------------------------------------+
```
## 3. 运行示例
我们支持直接使用社区版 Swift,同时在镜像中 /workspace 目录下提供了经过更多优化的版本。强烈建议优先使用该目录下的软件包。
### 3.1. 运行 Swift 示例
在大多数场景下,可直接运行 Swift 的训练示例:
```bash
# Assumes the ms-swift code is located under /workspace.
cd /workspace/ms-swift/
# Run the training example script shipped with the repository.
bash examples/train/full/train.sh
```
运行输出示例(节选):
```bash
# output:
{'loss': 1.47077751, 'grad_norm': 10.5625, 'learning_rate': 2e-06, 'token_acc': 0.65511727, 'epoch': 0.01, 'global_step/max_steps': '1/94', 'percentage': '1.06%', 'elapsed_time': '2s', 'remaining_time': '4m 28s', 'memory(GiB)': 4.87, 'train_speed(iter/s)': 0.345807}
{'loss': 1.58882141, 'grad_norm': 10.75, 'learning_rate': 1e-05, 'token_acc': 0.61763144, 'epoch': 0.05, 'global_step/max_steps': '5/94', 'percentage': '5.32%', 'elapsed_time': '10s', 'remaining_time': '3m 12s', 'memory(GiB)': 5.64, 'train_speed(iter/s)': 0.461462}
{'loss': 1.56617603, 'grad_norm': 12.8125, 'learning_rate': 9.92e-06, 'token_acc': 0.61519274, 'epoch': 0.11, 'global_step/max_steps': '10/94', 'percentage': '10.64%', 'elapsed_time': '20s', 'remaining_time': '2m 52s', 'memory(GiB)': 5.64, 'train_speed(iter/s)': 0.485796}
{'loss': 1.63347206, 'grad_norm': 13.6875, 'learning_rate': 9.69e-06, 'token_acc': 0.60373975, 'epoch': 0.16, 'global_step/max_steps': '15/94', 'percentage': '15.96%', 'elapsed_time': '30s', 'remaining_time': '2m 39s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.493855}
{'loss': 1.60613976, 'grad_norm': 11.0, 'learning_rate': 9.32e-06, 'token_acc': 0.59997221, 'epoch': 0.21, 'global_step/max_steps': '20/94', 'percentage': '21.28%', 'elapsed_time': '39s', 'remaining_time': '2m 27s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.500516}
{'loss': 1.45015478, 'grad_norm': 15.25, 'learning_rate': 8.8e-06, 'token_acc': 0.62373584, 'epoch': 0.27, 'global_step/max_steps': '25/94', 'percentage': '26.60%', 'elapsed_time': '49s', 'remaining_time': '2m 16s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.50548}
{'loss': 1.39427547, 'grad_norm': 13.9375, 'learning_rate': 8.18e-06, 'token_acc': 0.6357994, 'epoch': 0.32, 'global_step/max_steps': '30/94', 'percentage': '31.91%', 'elapsed_time': '59s', 'remaining_time': '2m 5s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.508409}
{'loss': 1.53672237, 'grad_norm': 11.125, 'learning_rate': 7.45e-06, 'token_acc': 0.61650612, 'epoch': 0.37, 'global_step/max_steps': '35/94', 'percentage': '37.23%', 'elapsed_time': '1m 8s', 'remaining_time': '1m 55s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.510425}
{'loss': 1.54039021, 'grad_norm': 13.8125, 'learning_rate': 6.65e-06, 'token_acc': 0.61613974, 'epoch': 0.43, 'global_step/max_steps': '40/94', 'percentage': '42.55%', 'elapsed_time': '1m 18s', 'remaining_time': '1m 45s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.512302}
{'loss': 1.40159426, 'grad_norm': 9.4375, 'learning_rate': 5.79e-06, 'token_acc': 0.64041773, 'epoch': 0.48, 'global_step/max_steps': '45/94', 'percentage': '47.87%', 'elapsed_time': '1m 27s', 'remaining_time': '1m 35s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.512983}
{'loss': 1.54977188, 'grad_norm': 11.9375, 'learning_rate': 4.91e-06, 'token_acc': 0.61078816, 'epoch': 0.53, 'global_step/max_steps': '50/94', 'percentage': '53.19%', 'elapsed_time': '1m 37s', 'remaining_time': '1m 25s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.514489}
{'loss': 1.6754509, 'grad_norm': 13.0625, 'learning_rate': 4.04e-06, 'token_acc': 0.58574393, 'epoch': 0.59, 'global_step/max_steps': '55/94', 'percentage': '58.51%', 'elapsed_time': '1m 46s', 'remaining_time': '1m 15s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.515752}
{'loss': 1.37204351, 'grad_norm': 9.25, 'learning_rate': 3.19e-06, 'token_acc': 0.6391937, 'epoch': 0.64, 'global_step/max_steps': '60/94', 'percentage': '63.83%', 'elapsed_time': '1m 56s', 'remaining_time': '1m 5s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.516829}
{'loss': 1.47697926, 'grad_norm': 11.375, 'learning_rate': 2.4e-06, 'token_acc': 0.62817259, 'epoch': 0.69, 'global_step/max_steps': '65/94', 'percentage': '69.15%', 'elapsed_time': '2m 5s', 'remaining_time': '55s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.517947}
{'loss': 1.4336628, 'grad_norm': 8.125, 'learning_rate': 1.69e-06, 'token_acc': 0.63453862, 'epoch': 0.75, 'global_step/max_steps': '70/94', 'percentage': '74.47%', 'elapsed_time': '2m 14s', 'remaining_time': '46s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.518833}
{'loss': 1.54315252, 'grad_norm': 9.625, 'learning_rate': 1.08e-06, 'token_acc': 0.60202073, 'epoch': 0.8, 'global_step/max_steps': '75/94', 'percentage': '79.79%', 'elapsed_time': '2m 24s', 'remaining_time': '36s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.519627}
{'loss': 1.47180223, 'grad_norm': 9.5625, 'learning_rate': 6e-07, 'token_acc': 0.62211501, 'epoch': 0.85, 'global_step/max_steps': '80/94', 'percentage': '85.11%', 'elapsed_time': '2m 33s', 'remaining_time': '26s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.520284}
{'loss': 1.44068375, 'grad_norm': 10.125, 'learning_rate': 2.5e-07, 'token_acc': 0.62673112, 'epoch': 0.91, 'global_step/max_steps': '85/94', 'percentage': '90.43%', 'elapsed_time': '2m 43s', 'remaining_time': '17s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.520331}
{'loss': 1.44893646, 'grad_norm': 8.375, 'learning_rate': 5e-08, 'token_acc': 0.63837478, 'epoch': 0.96, 'global_step/max_steps': '90/94', 'percentage': '95.74%', 'elapsed_time': '2m 52s', 'remaining_time': '7s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.520707}
{'train_runtime': 183.4332, 'train_samples_per_second': 8.177, 'train_steps_per_second': 0.512, 'train_loss': 1.50650934, 'token_acc': 0.6194337, 'epoch': 1.0, 'global_step/max_steps': '94/94', 'percentage': '100.00%', 'elapsed_time': '3m 3s', 'remaining_time': '0s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.512463}
Train: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 94/94 [03:03<00:00, 1.95s/it]
[INFO:swift] last_model_checkpoint: /workspace/ms-swift/output/v0-20260211-143035/checkpoint-94
[INFO:swift] best_model_checkpoint: None
[INFO:swift] images_dir: /workspace/ms-swift/output/v0-20260211-143035/images
[INFO:swift] End time of running main: 2026-02-11 14:34:09.521336
```
### 3.2. 使用 Megatron-LM 作为 Swift 后端
若希望使用 Megatron-LM 作为 Swift 的后端,需设置 `MEGATRON_LM_PATH` 环境变量:
```bash
# Point Swift at the Megatron-LM checkout to use as the backend.
export MEGATRON_LM_PATH=/workspace/Megatron-LM-0.15.0
cd /workspace/ms-swift
# Run the Megatron pretraining example script.
bash examples/megatron/pretrain.sh
```
### 3.3. 使用其他版本的 ms-swift
Metax 平台要求使用与 Maca 兼容的软件包。例如,编译可能依赖 torch2.8,因此需使用 torch2.8+maca3.3.x.x 版本。
默认情况下,安装会覆盖环境中已有的 PyTorch。因此,建议使用 --no-deps 参数进行安装:
```bash
git clone -b ${SWIFT_VERSION} https://github.com/modelscope/ms-swift.git
gitextract_9q3kx9l7/
├── .dev_scripts/
│ ├── build_docs.sh
│ ├── ci_container_test.sh
│ ├── dockerci.sh
│ └── dockerci_npu.sh
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── 1-bug-report.yml
│ │ ├── 2-feature-request.yml
│ │ ├── 3-question-discussion.yml
│ │ └── config.yml
│ ├── PULL_REQUEST_TEMPLATE.md
│ ├── SECURITY.md
│ └── workflows/
│ ├── citest.yaml
│ ├── citest_npu.yaml
│ ├── close_tale_issue.yaml
│ ├── lint.yaml
│ └── publish.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── .pre-commit-config_local.yaml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── CONTRIBUTING_CN.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── README_CN.md
├── docs/
│ ├── Makefile
│ ├── README.md
│ ├── make.bat
│ ├── source/
│ │ ├── .readthedocs.yaml
│ │ ├── BestPractices/
│ │ │ ├── Elastic.md
│ │ │ ├── Embedding.md
│ │ │ ├── GRPO-Code-Training.md
│ │ │ ├── GRPO-Multi-Modal-Training.md
│ │ │ ├── GRPO.md
│ │ │ ├── MLLM-Registration.md
│ │ │ ├── Metax-support.md
│ │ │ ├── More-Best-Practices.md
│ │ │ ├── NPU-support.md
│ │ │ ├── Qwen3-Best-Practice.md
│ │ │ ├── Qwen3-VL-Best-Practice.md
│ │ │ ├── Qwen3_5-Best-Practice.md
│ │ │ ├── Rapidly-Training-VL-model.md
│ │ │ └── Reranker.md
│ │ ├── Customization/
│ │ │ ├── Architecture.md
│ │ │ ├── Custom-dataset.md
│ │ │ └── Custom-model.md
│ │ ├── GetStarted/
│ │ │ ├── Quick-start.md
│ │ │ ├── SWIFT-installation.md
│ │ │ └── Web-UI.md
│ │ ├── Instruction/
│ │ │ ├── Agent-support.md
│ │ │ ├── Command-line-parameters.md
│ │ │ ├── Evaluation.md
│ │ │ ├── Export-and-push.md
│ │ │ ├── Frequently-asked-questions.md
│ │ │ ├── GKD.md
│ │ │ ├── GRPO/
│ │ │ │ ├── AdvancedResearch/
│ │ │ │ │ ├── CHORD.md
│ │ │ │ │ ├── CISPO.md
│ │ │ │ │ ├── DAPO.md
│ │ │ │ │ ├── GSPO.md
│ │ │ │ │ ├── REINFORCEPP.md
│ │ │ │ │ ├── RLOO.md
│ │ │ │ │ ├── SAPO.md
│ │ │ │ │ ├── deepeyes.md
│ │ │ │ │ ├── entropy_mask.md
│ │ │ │ │ ├── index.rst
│ │ │ │ │ ├── training_inference_mismatch.md
│ │ │ │ │ └── treepo.md
│ │ │ │ ├── DeveloperGuide/
│ │ │ │ │ ├── gym_env.md
│ │ │ │ │ ├── index.rst
│ │ │ │ │ ├── loss_types.md
│ │ │ │ │ ├── multi_task.md
│ │ │ │ │ ├── multi_turn.md
│ │ │ │ │ ├── reward_function.md
│ │ │ │ │ └── reward_model.md
│ │ │ │ ├── GetStarted/
│ │ │ │ │ ├── GRPO.md
│ │ │ │ │ └── index.rst
│ │ │ │ └── index.rst
│ │ │ ├── Inference-and-deployment.md
│ │ │ ├── Pre-training-and-Fine-tuning.md
│ │ │ ├── RLHF.md
│ │ │ ├── Ray.md
│ │ │ ├── Reinforced-Fine-tuning.md
│ │ │ ├── Sample.md
│ │ │ ├── Supported-models-and-datasets.md
│ │ │ └── Use-tuners.md
│ │ ├── Megatron-SWIFT/
│ │ │ ├── Ascend.md
│ │ │ ├── Command-line-parameters.md
│ │ │ ├── GKD.md
│ │ │ ├── GRPO.md
│ │ │ ├── LoRA-Training.md
│ │ │ ├── Mcore-Bridge.md
│ │ │ ├── Multimodal-Model.md
│ │ │ └── Quick-start.md
│ │ ├── _templates/
│ │ │ ├── autosummary/
│ │ │ │ └── class.rst
│ │ │ ├── classtemplate.rst
│ │ │ └── sobolengine.rst
│ │ ├── conf.py
│ │ └── index.rst
│ └── source_en/
│ ├── .readthedocs.yaml
│ ├── BestPractices/
│ │ ├── Elastic.md
│ │ ├── Embedding.md
│ │ ├── GRPO-Code-Training.md
│ │ ├── GRPO-Multi-Modal-Training.md
│ │ ├── GRPO.md
│ │ ├── MLLM-Registration.md
│ │ ├── Metax-support.md
│ │ ├── More-Best-Practices.md
│ │ ├── NPU-support.md
│ │ ├── Qwen3-Best-Practice.md
│ │ ├── Qwen3-VL-Best-Practice.md
│ │ ├── Qwen3_5-Best-Practice.md
│ │ ├── Rapidly-Training-VL-model.md
│ │ └── Reranker.md
│ ├── Customization/
│ │ ├── Architecture.md
│ │ ├── Custom-dataset.md
│ │ └── Custom-model.md
│ ├── GetStarted/
│ │ ├── Quick-start.md
│ │ ├── SWIFT-installation.md
│ │ └── Web-UI.md
│ ├── Instruction/
│ │ ├── Agent-support.md
│ │ ├── Command-line-parameters.md
│ │ ├── Evaluation.md
│ │ ├── Export-and-push.md
│ │ ├── Frequently-asked-questions.md
│ │ ├── GKD.md
│ │ ├── GRPO/
│ │ │ ├── AdvancedResearch/
│ │ │ │ ├── CHORD.md
│ │ │ │ ├── CISPO.md
│ │ │ │ ├── DAPO.md
│ │ │ │ ├── GSPO.md
│ │ │ │ ├── REINFORCEPP.md
│ │ │ │ ├── RLOO.md
│ │ │ │ ├── SAPO.md
│ │ │ │ ├── deepeyes.md
│ │ │ │ ├── entropy_mask.md
│ │ │ │ ├── index.rst
│ │ │ │ ├── training_inference_mismatch.md
│ │ │ │ └── treepo.md
│ │ │ ├── DeveloperGuide/
│ │ │ │ ├── gym_env.md
│ │ │ │ ├── index.rst
│ │ │ │ ├── loss_types.md
│ │ │ │ ├── multi_task.md
│ │ │ │ ├── multi_turn.md
│ │ │ │ ├── reward_function.md
│ │ │ │ └── reward_model.md
│ │ │ ├── GetStarted/
│ │ │ │ ├── GRPO.md
│ │ │ │ └── index.rst
│ │ │ └── index.rst
│ │ ├── Inference-and-deployment.md
│ │ ├── Pre-training-and-Fine-tuning.md
│ │ ├── RLHF.md
│ │ ├── Ray.md
│ │ ├── Reinforced-Fine-tuning.md
│ │ ├── Sample.md
│ │ ├── Supported-models-and-datasets.md
│ │ └── Use-tuners.md
│ ├── Megatron-SWIFT/
│ │ ├── Ascend.md
│ │ ├── Command-line-parameters.md
│ │ ├── GKD.md
│ │ ├── GRPO.md
│ │ ├── LoRA-Training.md
│ │ ├── Mcore-Bridge.md
│ │ ├── Multimodal-Model.md
│ │ └── Quick-start.md
│ ├── _templates/
│ │ ├── autosummary/
│ │ │ └── class.rst
│ │ ├── classtemplate.rst
│ │ └── sobolengine.rst
│ ├── conf.py
│ └── index.rst
├── examples/
│ ├── README.md
│ ├── app/
│ │ ├── base_url/
│ │ │ ├── demo.py
│ │ │ └── demo.sh
│ │ ├── llm/
│ │ │ ├── sglang.sh
│ │ │ └── vllm.sh
│ │ └── mllm.sh
│ ├── ascend/
│ │ ├── activation_cpu_offload/
│ │ │ ├── fsdp2.json
│ │ │ └── train.sh
│ │ ├── deploy/
│ │ │ └── vllm.sh
│ │ ├── infer/
│ │ │ └── vllm/
│ │ │ └── dp_tp.sh
│ │ ├── megatron/
│ │ │ └── train_sft_full.sh
│ │ ├── multi-node/
│ │ │ └── megatron/
│ │ │ ├── node1.sh
│ │ │ └── node2.sh
│ │ └── train/
│ │ ├── qwen3/
│ │ │ ├── qwen3_lora_deepspeed.sh
│ │ │ ├── qwen3_lora_fsdp/
│ │ │ │ ├── fsdp.json
│ │ │ │ └── train.sh
│ │ │ └── qwen3_lora_megatron.sh
│ │ ├── qwen3_next/
│ │ │ └── qwen3_next_megatron.sh
│ │ ├── qwen3_omni/
│ │ │ └── qwen3_omni_full_mindspeed.sh
│ │ └── qwen3_vl/
│ │ └── moe_full_mindspeed.sh
│ ├── custom/
│ │ ├── dataset.py
│ │ ├── infer.sh
│ │ ├── model.py
│ │ ├── model_hf.py
│ │ ├── my_qwen2_5_omni/
│ │ │ ├── my_register.py
│ │ │ ├── test_register.py
│ │ │ └── train.py
│ │ └── sft.sh
│ ├── deploy/
│ │ ├── README.md
│ │ ├── agent/
│ │ │ ├── client.py
│ │ │ └── server.sh
│ │ ├── bert/
│ │ │ ├── client.py
│ │ │ └── server.sh
│ │ ├── client/
│ │ │ ├── llm/
│ │ │ │ ├── base/
│ │ │ │ │ ├── openai_client.py
│ │ │ │ │ └── swift_client.py
│ │ │ │ └── chat/
│ │ │ │ ├── openai_client.py
│ │ │ │ └── swift_client.py
│ │ │ └── mllm/
│ │ │ ├── openai_client.py
│ │ │ └── swift_client.py
│ │ ├── embedding/
│ │ │ ├── client.py
│ │ │ └── server.sh
│ │ ├── lora/
│ │ │ ├── client.py
│ │ │ └── server.sh
│ │ ├── reranker/
│ │ │ ├── client.py
│ │ │ ├── client_generative.py
│ │ │ └── server.sh
│ │ ├── reward_model/
│ │ │ ├── client.py
│ │ │ └── server.sh
│ │ ├── seq_cls/
│ │ │ ├── client.py
│ │ │ └── server.sh
│ │ ├── sglang.sh
│ │ ├── vllm.sh
│ │ └── vllm_dp.sh
│ ├── eval/
│ │ ├── eval_url/
│ │ │ ├── demo.py
│ │ │ └── eval.sh
│ │ ├── llm/
│ │ │ ├── sglang.sh
│ │ │ └── vllm.sh
│ │ ├── train_eval/
│ │ │ └── train.sh
│ │ └── vlm/
│ │ └── eval.sh
│ ├── export/
│ │ ├── merge_lora.sh
│ │ ├── ollama.sh
│ │ ├── push_to_hub.sh
│ │ └── quantize/
│ │ ├── awq.sh
│ │ ├── bert/
│ │ │ ├── bnb.sh
│ │ │ └── gptq.sh
│ │ ├── bnb.sh
│ │ ├── fp8.sh
│ │ ├── gptq.sh
│ │ ├── gptq_v2.sh
│ │ ├── mllm/
│ │ │ ├── awq.sh
│ │ │ ├── bnb.sh
│ │ │ ├── fp8.sh
│ │ │ └── gptq.sh
│ │ ├── moe/
│ │ │ ├── awq.sh
│ │ │ ├── bnb.sh
│ │ │ ├── fp8.sh
│ │ │ └── gptq.sh
│ │ ├── omni/
│ │ │ └── gptq.sh
│ │ └── reward_model/
│ │ ├── bnb.sh
│ │ └── gptq.sh
│ ├── infer/
│ │ ├── cli_demo.sh
│ │ ├── demo.py
│ │ ├── demo_agent.py
│ │ ├── demo_bert.py
│ │ ├── demo_embedding.py
│ │ ├── demo_grounding.py
│ │ ├── demo_hf.py
│ │ ├── demo_lora.py
│ │ ├── demo_mllm.py
│ │ ├── demo_reranker.py
│ │ ├── demo_reward_model.py
│ │ ├── demo_vllm_reasoning_parser.py
│ │ ├── lmdeploy/
│ │ │ ├── batch_ddp.sh
│ │ │ └── mllm_tp.sh
│ │ ├── sglang/
│ │ │ ├── demo.sh
│ │ │ ├── distill_qwen3_235b.sh
│ │ │ ├── mtp.sh
│ │ │ └── tp.sh
│ │ ├── transformers/
│ │ │ ├── batch_ddp.sh
│ │ │ ├── bert.sh
│ │ │ ├── lora.sh
│ │ │ ├── mllm_device_map.sh
│ │ │ ├── prm.sh
│ │ │ └── reward_model.sh
│ │ └── vllm/
│ │ ├── dp_tp.sh
│ │ ├── mllm_ddp.sh
│ │ ├── mllm_tp.sh
│ │ └── mtp.sh
│ ├── megatron/
│ │ ├── base_to_chat.sh
│ │ ├── benchmark/
│ │ │ └── deepspeed.sh
│ │ ├── dense/
│ │ │ ├── 72b_offload.sh
│ │ │ └── qwen3_32b.sh
│ │ ├── embedding/
│ │ │ ├── qwen3_emb.sh
│ │ │ └── qwen3_vl_emb.sh
│ │ ├── export/
│ │ │ ├── full.sh
│ │ │ └── lora.sh
│ │ ├── fp8/
│ │ │ ├── benchmark.sh
│ │ │ ├── llm.sh
│ │ │ └── vlm.sh
│ │ ├── grpo/
│ │ │ ├── dense_colocate.sh
│ │ │ ├── dense_server.sh
│ │ │ ├── moe_colocate_full.sh
│ │ │ ├── moe_colocate_lora.sh
│ │ │ └── sapo.sh
│ │ ├── long_text.sh
│ │ ├── lora/
│ │ │ ├── dense.sh
│ │ │ ├── dpo.sh
│ │ │ ├── loss_scale.sh
│ │ │ ├── moe.sh
│ │ │ ├── mtp.sh
│ │ │ ├── new_special_tokens.sh
│ │ │ └── qwen3_235b.sh
│ │ ├── mcore_bridge/
│ │ │ ├── full/
│ │ │ │ ├── dense.sh
│ │ │ │ └── moe.sh
│ │ │ └── lora/
│ │ │ ├── moe.sh
│ │ │ ├── new_special_tokens.sh
│ │ │ └── seq_cls.sh
│ │ ├── moe/
│ │ │ ├── deepseek_v3.sh
│ │ │ ├── moe.sh
│ │ │ ├── qwen3_moe.sh
│ │ │ └── qwen3_moe_offload.sh
│ │ ├── multi-node/
│ │ │ ├── node1.sh
│ │ │ └── node2.sh
│ │ ├── multimodal/
│ │ │ ├── dense/
│ │ │ │ ├── dpo.sh
│ │ │ │ ├── full.sh
│ │ │ │ └── lora.sh
│ │ │ ├── lora_llm_vit_full/
│ │ │ │ └── sft.sh
│ │ │ ├── moe/
│ │ │ │ ├── full_dpo_offload.sh
│ │ │ │ └── lora.sh
│ │ │ └── omni/
│ │ │ ├── dense.sh
│ │ │ └── moe.sh
│ │ ├── pretrain.sh
│ │ ├── reranker/
│ │ │ ├── qwen3_reranker.sh
│ │ │ └── qwen3_vl_reranker.sh
│ │ ├── rlhf/
│ │ │ ├── dpo/
│ │ │ │ ├── dense.sh
│ │ │ │ ├── group_by_length.sh
│ │ │ │ ├── moe.sh
│ │ │ │ └── packing.sh
│ │ │ ├── gkd/
│ │ │ │ ├── dense.sh
│ │ │ │ ├── opsd.sh
│ │ │ │ └── teacher_server.sh
│ │ │ ├── kto/
│ │ │ │ ├── dense.sh
│ │ │ │ └── moe.sh
│ │ │ └── rm/
│ │ │ ├── dense.sh
│ │ │ └── moe.sh
│ │ ├── seq_cls/
│ │ │ ├── full.sh
│ │ │ └── lora/
│ │ │ ├── infer.sh
│ │ │ └── train.sh
│ │ └── sft.sh
│ ├── models/
│ │ ├── deepseek_ocr/
│ │ │ ├── infer.py
│ │ │ └── train.sh
│ │ ├── deepseek_vl2/
│ │ │ └── train.sh
│ │ ├── glm-4.6v/
│ │ │ ├── flash.sh
│ │ │ └── mcore.sh
│ │ ├── gpt_oss/
│ │ │ ├── internvl3_5_gpt.sh
│ │ │ ├── mcore.sh
│ │ │ └── train.sh
│ │ ├── hunyuan_ocr/
│ │ │ └── train.sh
│ │ ├── internvl3/
│ │ │ └── train.sh
│ │ ├── keye/
│ │ │ └── train.sh
│ │ ├── llama4/
│ │ │ └── mcore.sh
│ │ ├── minicpmv/
│ │ │ └── train.sh
│ │ ├── ovis2/
│ │ │ └── train.sh
│ │ ├── qwen3_5/
│ │ │ ├── mcore.sh
│ │ │ ├── mcore_full.sh
│ │ │ ├── mcore_grpo_moe.sh
│ │ │ ├── packing.sh
│ │ │ └── transformers.sh
│ │ ├── qwen3_next/
│ │ │ ├── mcore.sh
│ │ │ ├── mtp.sh
│ │ │ ├── non_padding_free.sh
│ │ │ └── transformers.sh
│ │ ├── qwen3_omni/
│ │ │ ├── transformers.sh
│ │ │ └── zero3.sh
│ │ └── qwen3_vl/
│ │ ├── mcore.sh
│ │ ├── mcore_full.sh
│ │ ├── mixed.sh
│ │ ├── transformers.sh
│ │ └── zero3.sh
│ ├── notebook/
│ │ ├── qwen2_5-self-cognition/
│ │ │ ├── infer.ipynb
│ │ │ ├── infer.sh
│ │ │ ├── self-cognition-sft.ipynb
│ │ │ └── sft.sh
│ │ ├── qwen2_5-vl-grounding/
│ │ │ └── zh.ipynb
│ │ └── qwen2vl-ocr/
│ │ ├── infer.ipynb
│ │ └── ocr-sft.ipynb
│ ├── sampler/
│ │ ├── distill/
│ │ │ ├── distill.sh
│ │ │ └── distill.yaml
│ │ └── sample/
│ │ ├── sample.sh
│ │ └── sampling.yaml
│ ├── train/
│ │ ├── agent/
│ │ │ ├── deepseek_r1.sh
│ │ │ ├── glm4.sh
│ │ │ ├── loss_scale/
│ │ │ │ ├── infer_lora.py
│ │ │ │ └── train.sh
│ │ │ └── qwen2_5.sh
│ │ ├── all_to_all/
│ │ │ ├── infer.sh
│ │ │ └── train.sh
│ │ ├── base_to_chat/
│ │ │ ├── full.sh
│ │ │ ├── lora.sh
│ │ │ └── lora2.sh
│ │ ├── cached_dataset/
│ │ │ ├── dpo.sh
│ │ │ ├── mcore.sh
│ │ │ ├── pretrained.sh
│ │ │ ├── reranker.sh
│ │ │ ├── seq_cls.sh
│ │ │ ├── sft.sh
│ │ │ └── vlm.sh
│ │ ├── early_stop/
│ │ │ └── lora_sft.sh
│ │ ├── embedding/
│ │ │ ├── qwen3/
│ │ │ │ ├── infer.py
│ │ │ │ ├── qwen3_emb.sh
│ │ │ │ └── qwen3_vl_emb.sh
│ │ │ └── train_gme.sh
│ │ ├── flash_attention_3/
│ │ │ ├── mcore.sh
│ │ │ └── transformers.sh
│ │ ├── full/
│ │ │ ├── dft.sh
│ │ │ ├── infer.sh
│ │ │ ├── qwen2_5_32b.sh
│ │ │ └── train.sh
│ │ ├── grpo/
│ │ │ ├── external/
│ │ │ │ ├── README.md
│ │ │ │ ├── agent.sh
│ │ │ │ ├── grpo_32b_full.sh
│ │ │ │ ├── grpo_7b.sh
│ │ │ │ ├── moe_full.sh
│ │ │ │ ├── moe_lora.sh
│ │ │ │ ├── vllm_gym.sh
│ │ │ │ └── vllm_multi_turn.sh
│ │ │ ├── internal/
│ │ │ │ ├── README.md
│ │ │ │ ├── chord.sh
│ │ │ │ ├── full_lmdeploy.sh
│ │ │ │ ├── gspo.sh
│ │ │ │ ├── moe_full.sh
│ │ │ │ ├── moe_lora.sh
│ │ │ │ ├── qlora.sh
│ │ │ │ ├── reinforce_plus_plus.sh
│ │ │ │ ├── rloo.sh
│ │ │ │ ├── sapo.sh
│ │ │ │ ├── transformers.sh
│ │ │ │ ├── vllm_72b_4gpu.sh
│ │ │ │ ├── vllm_lora_qwenvl72b.sh
│ │ │ │ ├── vllm_multi_turn.sh
│ │ │ │ └── vllm_vl7b.sh
│ │ │ ├── multi_node/
│ │ │ │ ├── Qwen2_5_32B_full.sh
│ │ │ │ ├── colocate_multi_node1.sh
│ │ │ │ ├── colocate_multi_node2.sh
│ │ │ │ ├── server_multi_node.sh
│ │ │ │ └── train_dlc.sh
│ │ │ ├── plugin/
│ │ │ │ ├── deepeyes/
│ │ │ │ │ ├── deepeyes.sh
│ │ │ │ │ └── deepeyes_plugin.py
│ │ │ │ ├── gsm8k/
│ │ │ │ │ ├── gsm8k.sh
│ │ │ │ │ └── gsm8k_plugin.py
│ │ │ │ ├── plugin.py
│ │ │ │ ├── run_external_reward_func.sh
│ │ │ │ ├── run_external_reward_model.sh
│ │ │ │ ├── run_external_scheduler.sh
│ │ │ │ └── treepo/
│ │ │ │ ├── tree_rollout.py
│ │ │ │ ├── tree_rollout.sh
│ │ │ │ └── tree_rollout_plugin.py
│ │ │ ├── prompt.txt
│ │ │ └── qwen2_5_omni/
│ │ │ ├── grpo.sh
│ │ │ └── infer.sh
│ │ ├── infer.sh
│ │ ├── liger/
│ │ │ └── sft.sh
│ │ ├── lora_sft.sh
│ │ ├── moe/
│ │ │ ├── llama4.sh
│ │ │ └── qwen3_moe.sh
│ │ ├── multi-gpu/
│ │ │ ├── ddp/
│ │ │ │ └── train.sh
│ │ │ ├── ddp_device_map/
│ │ │ │ └── train.sh
│ │ │ ├── deepspeed/
│ │ │ │ ├── train_zero2.sh
│ │ │ │ └── train_zero3.sh
│ │ │ ├── device_map/
│ │ │ │ └── train.sh
│ │ │ ├── fsdp2_lora/
│ │ │ │ ├── fsdp2.json
│ │ │ │ └── train.sh
│ │ │ └── fsdp_qlora/
│ │ │ ├── fsdp_offload.json
│ │ │ └── train.sh
│ │ ├── multi-node/
│ │ │ ├── accelerate/
│ │ │ │ ├── multi_node.yaml
│ │ │ │ ├── train_node1.sh
│ │ │ │ └── train_node2.sh
│ │ │ ├── deepspeed/
│ │ │ │ ├── README.md
│ │ │ │ ├── host.txt
│ │ │ │ └── train.sh
│ │ │ ├── dlc/
│ │ │ │ └── train.sh
│ │ │ ├── ray/
│ │ │ │ ├── sft.sh
│ │ │ │ └── sft.yaml
│ │ │ ├── swift/
│ │ │ │ ├── train_node1.sh
│ │ │ │ └── train_node2.sh
│ │ │ └── torchrun/
│ │ │ ├── train_node1.sh
│ │ │ └── train_node2.sh
│ │ ├── multimodal/
│ │ │ ├── audio.sh
│ │ │ ├── caption.sh
│ │ │ ├── grounding.sh
│ │ │ ├── infer.sh
│ │ │ ├── lora_llm_full_vit/
│ │ │ │ ├── infer.sh
│ │ │ │ ├── merge_lora.sh
│ │ │ │ ├── seq_cls.sh
│ │ │ │ └── sft.sh
│ │ │ ├── ocr.sh
│ │ │ ├── omni/
│ │ │ │ ├── infer.sh
│ │ │ │ └── sft.sh
│ │ │ ├── rlhf/
│ │ │ │ ├── dpo/
│ │ │ │ │ ├── full.sh
│ │ │ │ │ └── lora.sh
│ │ │ │ ├── gkd/
│ │ │ │ │ ├── fast.sh
│ │ │ │ │ └── full.sh
│ │ │ │ └── kto.sh
│ │ │ ├── video.sh
│ │ │ └── vit_gradient_checkpointing.sh
│ │ ├── new_special_tokens/
│ │ │ ├── infer.sh
│ │ │ ├── merge_lora.sh
│ │ │ ├── tokens.txt
│ │ │ └── train.sh
│ │ ├── on_policy_distillation.sh
│ │ ├── optimizer/
│ │ │ ├── muon.sh
│ │ │ └── muonclip.sh
│ │ ├── packing/
│ │ │ ├── dpo.sh
│ │ │ ├── dpo_vlm.sh
│ │ │ ├── liger_kernel.sh
│ │ │ ├── llm.sh
│ │ │ ├── qwen2_5_omni.sh
│ │ │ ├── qwen2_5_vl.sh
│ │ │ └── streaming.sh
│ │ ├── padding_free/
│ │ │ ├── dpo_vlm.sh
│ │ │ └── sft.sh
│ │ ├── plugins/
│ │ │ ├── loss_scale.sh
│ │ │ └── tuner_phi4_mm.sh
│ │ ├── predict_with_generate/
│ │ │ └── train.sh
│ │ ├── pretrain/
│ │ │ └── train.sh
│ │ ├── qlora/
│ │ │ ├── awq/
│ │ │ │ ├── merge_lora.sh
│ │ │ │ └── train.sh
│ │ │ ├── bnb/
│ │ │ │ ├── merge_lora.sh
│ │ │ │ └── train.sh
│ │ │ ├── gptq.sh
│ │ │ └── hqq.sh
│ │ ├── reranker/
│ │ │ ├── qwen3/
│ │ │ │ ├── infer.py
│ │ │ │ ├── qwen3_reranker.sh
│ │ │ │ └── qwen3_vl_reranker.sh
│ │ │ ├── train_generative_reranker.sh
│ │ │ ├── train_generative_reranker_listwise.sh
│ │ │ ├── train_reranker.sh
│ │ │ ├── train_reranker_auto_patch.sh
│ │ │ ├── train_reranker_listwise.sh
│ │ │ └── train_reranker_mm.sh
│ │ ├── rft/
│ │ │ ├── math.json
│ │ │ └── rft.py
│ │ ├── rlhf/
│ │ │ ├── README.md
│ │ │ ├── cpo.sh
│ │ │ ├── dpo/
│ │ │ │ ├── full.sh
│ │ │ │ └── lora.sh
│ │ │ ├── gkd/
│ │ │ │ ├── fast.sh
│ │ │ │ ├── full.sh
│ │ │ │ ├── teacher_server.sh
│ │ │ │ ├── think_model.sh
│ │ │ │ ├── vllm_colocate.sh
│ │ │ │ └── vllm_server.sh
│ │ │ ├── kto.sh
│ │ │ ├── mpo.sh
│ │ │ ├── opsd/
│ │ │ │ ├── opsd.sh
│ │ │ │ └── opsd_plugin.py
│ │ │ ├── orpo.sh
│ │ │ ├── ppo/
│ │ │ │ ├── full.sh
│ │ │ │ └── lora.sh
│ │ │ ├── rm.sh
│ │ │ └── simpo.sh
│ │ ├── seq_cls/
│ │ │ ├── bert/
│ │ │ │ ├── deploy.sh
│ │ │ │ ├── infer.sh
│ │ │ │ └── sft.sh
│ │ │ ├── multi_label/
│ │ │ │ ├── infer.py
│ │ │ │ ├── infer.sh
│ │ │ │ ├── sft.sh
│ │ │ │ └── vlm.sh
│ │ │ ├── qwen2_5/
│ │ │ │ ├── deploy.sh
│ │ │ │ ├── infer.sh
│ │ │ │ └── sft.sh
│ │ │ ├── qwen2_5_omni/
│ │ │ │ ├── infer.py
│ │ │ │ ├── infer.sh
│ │ │ │ └── sft.sh
│ │ │ └── regression/
│ │ │ ├── deploy.sh
│ │ │ ├── infer.sh
│ │ │ └── sft.sh
│ │ ├── sequence_parallel/
│ │ │ ├── sequence_parallel.sh
│ │ │ ├── sequence_parallel_512k.sh
│ │ │ ├── sequence_parallel_dpo.sh
│ │ │ ├── sequence_parallel_emb.sh
│ │ │ ├── sequence_parallel_grpo.sh
│ │ │ ├── sequence_parallel_reranker.sh
│ │ │ └── sequence_parallel_seq_cls.sh
│ │ ├── streaming/
│ │ │ ├── lazy_tokenize.sh
│ │ │ └── streaming.sh
│ │ ├── think_model/
│ │ │ ├── deepseek_r1.sh
│ │ │ ├── qwen3_demo1.sh
│ │ │ └── qwen3_demo2.sh
│ │ └── tuners/
│ │ ├── adalora/
│ │ │ └── train.sh
│ │ ├── adapter/
│ │ │ └── train.sh
│ │ ├── boft/
│ │ │ └── train.sh
│ │ ├── bone/
│ │ │ └── train.sh
│ │ ├── dora/
│ │ │ └── train.sh
│ │ ├── galore/
│ │ │ ├── train_galore.sh
│ │ │ └── train_qgalore.sh
│ │ ├── lisa/
│ │ │ └── train.sh
│ │ ├── llamapro/
│ │ │ └── train.sh
│ │ ├── longlora/
│ │ │ └── train.sh
│ │ ├── lora/
│ │ │ └── train.sh
│ │ ├── lora-ga/
│ │ │ └── train.sh
│ │ ├── neftune/
│ │ │ └── train.sh
│ │ ├── olora/
│ │ │ └── train.sh
│ │ ├── pissa/
│ │ │ └── train.sh
│ │ ├── qlora/
│ │ │ └── train.sh
│ │ ├── reft/
│ │ │ └── train.sh
│ │ └── unsloth/
│ │ └── train.sh
│ └── yaml/
│ ├── sft.sh
│ └── sft.yaml
├── requirements/
│ ├── docs.txt
│ ├── eval.txt
│ ├── framework.txt
│ ├── install_all.sh
│ ├── ray.txt
│ ├── swanlab.txt
│ └── tests.txt
├── requirements.txt
├── scripts/
│ ├── benchmark/
│ │ ├── config/
│ │ │ └── tuner.json
│ │ ├── exp.py
│ │ ├── exp_utils.py
│ │ └── generate_report.py
│ └── utils/
│ ├── plot_loss.py
│ ├── run_dataset_info.py
│ ├── run_model_info.py
│ ├── run_template.py
│ └── test_link_valid.py
├── setup.cfg
├── setup.py
├── swift/
│ ├── __init__.py
│ ├── agent_template/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── deepseek_v3_1.py
│ │ ├── extra.py
│ │ ├── glm4.py
│ │ ├── hermes.py
│ │ ├── llama.py
│ │ ├── mapping.py
│ │ ├── minimax_m2.py
│ │ ├── mistral.py
│ │ ├── qwen.py
│ │ ├── qwen3_coder.py
│ │ ├── react.py
│ │ ├── seed_oss.py
│ │ ├── toolbench.py
│ │ └── youtu.py
│ ├── arguments/
│ │ ├── __init__.py
│ │ ├── app_args.py
│ │ ├── base_args/
│ │ │ ├── __init__.py
│ │ │ ├── base_args.py
│ │ │ ├── data_args.py
│ │ │ ├── generation_args.py
│ │ │ ├── model_args.py
│ │ │ ├── quant_args.py
│ │ │ └── template_args.py
│ │ ├── deploy_args.py
│ │ ├── eval_args.py
│ │ ├── export_args.py
│ │ ├── infer_args.py
│ │ ├── merge_args.py
│ │ ├── pretrain_args.py
│ │ ├── rlhf_args.py
│ │ ├── sampling_args.py
│ │ ├── sft_args.py
│ │ ├── tuner_args.py
│ │ └── webui_args.py
│ ├── callbacks/
│ │ ├── __init__.py
│ │ ├── activation_cpu_offload.py
│ │ ├── adalora.py
│ │ ├── base.py
│ │ ├── deepspeed_elastic.py
│ │ ├── early_stop.py
│ │ ├── lisa.py
│ │ ├── mapping.py
│ │ └── perf_log.py
│ ├── cli/
│ │ ├── __init__.py
│ │ ├── _megatron/
│ │ │ ├── __init__.py
│ │ │ ├── export.py
│ │ │ ├── main.py
│ │ │ ├── pt.py
│ │ │ ├── rlhf.py
│ │ │ └── sft.py
│ │ ├── app.py
│ │ ├── deploy.py
│ │ ├── eval.py
│ │ ├── export.py
│ │ ├── infer.py
│ │ ├── main.py
│ │ ├── merge_lora.py
│ │ ├── pt.py
│ │ ├── rlhf.py
│ │ ├── rollout.py
│ │ ├── sample.py
│ │ ├── sft.py
│ │ ├── utils.py
│ │ └── web_ui.py
│ ├── config/
│ │ ├── fsdp2.json
│ │ ├── zero0.json
│ │ ├── zero1.json
│ │ ├── zero2.json
│ │ ├── zero2_offload.json
│ │ ├── zero3.json
│ │ └── zero3_offload.json
│ ├── dataloader/
│ │ ├── __init__.py
│ │ ├── dispatcher.py
│ │ └── shard.py
│ ├── dataset/
│ │ ├── __init__.py
│ │ ├── data/
│ │ │ └── dataset_info.json
│ │ ├── dataset/
│ │ │ ├── __init__.py
│ │ │ ├── llm.py
│ │ │ └── mllm.py
│ │ ├── dataset_meta.py
│ │ ├── dataset_syntax.py
│ │ ├── indexed_dataset.py
│ │ ├── loader.py
│ │ ├── media.py
│ │ ├── packing.py
│ │ ├── preprocessor/
│ │ │ ├── __init__.py
│ │ │ ├── core.py
│ │ │ └── extra.py
│ │ ├── register.py
│ │ └── utils.py
│ ├── hub/
│ │ ├── __init__.py
│ │ ├── constant.py
│ │ └── hub.py
│ ├── infer_engine/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── grpo_vllm_engine.py
│ │ ├── infer_client.py
│ │ ├── infer_engine.py
│ │ ├── lmdeploy_engine.py
│ │ ├── patch.py
│ │ ├── protocol.py
│ │ ├── sglang_engine.py
│ │ ├── transformers_engine.py
│ │ ├── utils.py
│ │ └── vllm_engine.py
│ ├── loss/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── causal_lm.py
│ │ ├── embedding.py
│ │ ├── mapping.py
│ │ └── reranker.py
│ ├── loss_scale/
│ │ ├── __init__.py
│ │ ├── agent.py
│ │ ├── base.py
│ │ ├── config/
│ │ │ ├── agentflan.json
│ │ │ ├── alpha_umi.json
│ │ │ ├── hermes.json
│ │ │ ├── ignore_empty_think.json
│ │ │ ├── qwen.json
│ │ │ └── react.json
│ │ ├── mapping.py
│ │ ├── other.py
│ │ └── utils.py
│ ├── megatron/
│ │ ├── __init__.py
│ │ ├── arguments/
│ │ │ ├── __init__.py
│ │ │ ├── export_args.py
│ │ │ ├── megatron_args.py
│ │ │ ├── megatron_base_args.py
│ │ │ ├── pretrain_args.py
│ │ │ ├── rlhf_args.py
│ │ │ └── sft_args.py
│ │ ├── callbacks/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── default_flow.py
│ │ │ ├── mapping.py
│ │ │ ├── print.py
│ │ │ ├── swanlab.py
│ │ │ ├── tensorboard.py
│ │ │ ├── utils.py
│ │ │ └── wandb.py
│ │ ├── convert.py
│ │ ├── init.py
│ │ ├── model/
│ │ │ ├── __init__.py
│ │ │ ├── constant.py
│ │ │ ├── gpt_bridge.py
│ │ │ ├── gpt_model.py
│ │ │ ├── gpts/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── glm4.py
│ │ │ │ ├── minimax_m2.py
│ │ │ │ ├── olmoe.py
│ │ │ │ ├── qwen3_emb.py
│ │ │ │ └── qwen3_next.py
│ │ │ ├── mm_gpt_model.py
│ │ │ ├── mm_gpts/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── glm.py
│ │ │ │ ├── internvl.py
│ │ │ │ ├── kimi_vl.py
│ │ │ │ ├── llama4.py
│ │ │ │ ├── qwen.py
│ │ │ │ ├── qwen3_5.py
│ │ │ │ ├── qwen3_5_gdn.py
│ │ │ │ ├── qwen3_vl.py
│ │ │ │ └── utils.py
│ │ │ ├── model_config.py
│ │ │ ├── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── gated_delta_net.py
│ │ │ │ └── gated_self_attention.py
│ │ │ ├── register.py
│ │ │ └── rope.py
│ │ ├── pipelines/
│ │ │ ├── __init__.py
│ │ │ ├── export/
│ │ │ │ ├── __init__.py
│ │ │ │ └── export.py
│ │ │ └── train/
│ │ │ ├── __init__.py
│ │ │ ├── pretrain.py
│ │ │ ├── rlhf.py
│ │ │ └── sft.py
│ │ ├── trainers/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── batch_sampler.py
│ │ │ ├── dpo_trainer.py
│ │ │ ├── embedding_trainer.py
│ │ │ ├── gkd_trainer.py
│ │ │ ├── grpo_trainer.py
│ │ │ ├── kto_trainer.py
│ │ │ ├── reranker_trainer.py
│ │ │ ├── reward_trainer.py
│ │ │ ├── rlhf_mixin.py
│ │ │ ├── rollout_mixin.py
│ │ │ ├── trainer.py
│ │ │ ├── utils.py
│ │ │ └── vocab_parallel_utils.py
│ │ ├── tuners/
│ │ │ ├── __init__.py
│ │ │ └── lora.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── convert_utils.py
│ │ ├── megatron_lm_utils.py
│ │ ├── parallel_utils.py
│ │ ├── patcher.py
│ │ └── utils.py
│ ├── metrics/
│ │ ├── __init__.py
│ │ ├── acc.py
│ │ ├── base.py
│ │ ├── embedding.py
│ │ ├── mapping.py
│ │ ├── nlg.py
│ │ ├── reranker.py
│ │ └── utils.py
│ ├── model/
│ │ ├── __init__.py
│ │ ├── constant.py
│ │ ├── model_arch.py
│ │ ├── model_meta.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── baai.py
│ │ │ ├── baichuan.py
│ │ │ ├── baidu.py
│ │ │ ├── bert.py
│ │ │ ├── codefuse.py
│ │ │ ├── deepseek.py
│ │ │ ├── gemma.py
│ │ │ ├── glm.py
│ │ │ ├── internlm.py
│ │ │ ├── llama.py
│ │ │ ├── llava.py
│ │ │ ├── llm.py
│ │ │ ├── mamba.py
│ │ │ ├── microsoft.py
│ │ │ ├── minicpm.py
│ │ │ ├── minimax.py
│ │ │ ├── mistral.py
│ │ │ ├── mllm.py
│ │ │ ├── moonshot.py
│ │ │ ├── mplug.py
│ │ │ ├── openbuddy.py
│ │ │ ├── qwen.py
│ │ │ ├── seed.py
│ │ │ ├── skywork.py
│ │ │ ├── stepfun.py
│ │ │ ├── telechat.py
│ │ │ ├── tencent.py
│ │ │ ├── valley.py
│ │ │ └── yi.py
│ │ ├── npu_patcher.py
│ │ ├── patcher.py
│ │ ├── register.py
│ │ └── utils.py
│ ├── optimizers/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── galore/
│ │ │ ├── __init__.py
│ │ │ ├── adafactor.py
│ │ │ ├── adamw.py
│ │ │ ├── adamw8bit.py
│ │ │ ├── galore_projector.py
│ │ │ └── utils.py
│ │ ├── lorap.py
│ │ ├── mapping.py
│ │ ├── multimodal.py
│ │ ├── muon.py
│ │ └── muonclip.py
│ ├── pipelines/
│ │ ├── __init__.py
│ │ ├── app/
│ │ │ ├── __init__.py
│ │ │ ├── app.py
│ │ │ ├── build_ui.py
│ │ │ └── locale.py
│ │ ├── base.py
│ │ ├── eval/
│ │ │ ├── __init__.py
│ │ │ ├── eval.py
│ │ │ └── utils.py
│ │ ├── export/
│ │ │ ├── __init__.py
│ │ │ ├── cached_dataset.py
│ │ │ ├── export.py
│ │ │ ├── merge_lora.py
│ │ │ ├── ollama.py
│ │ │ └── quant.py
│ │ ├── infer/
│ │ │ ├── __init__.py
│ │ │ ├── deploy.py
│ │ │ ├── infer.py
│ │ │ ├── rollout.py
│ │ │ └── utils.py
│ │ ├── sampling/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── distill_sampler.py
│ │ │ ├── sampling.py
│ │ │ ├── utils.py
│ │ │ └── vanilla_sampler.py
│ │ ├── train/
│ │ │ ├── __init__.py
│ │ │ ├── kto.py
│ │ │ ├── pretrain.py
│ │ │ ├── rlhf.py
│ │ │ ├── sft.py
│ │ │ └── tuner.py
│ │ └── utils.py
│ ├── ray/
│ │ ├── __init__.py
│ │ ├── arguments.py
│ │ ├── base.py
│ │ └── resource_manager.py
│ ├── rewards/
│ │ ├── __init__.py
│ │ ├── orm.py
│ │ ├── prm.py
│ │ └── rm_plugin.py
│ ├── rlhf_trainers/
│ │ ├── __init__.py
│ │ ├── args_mixin.py
│ │ ├── arguments.py
│ │ ├── cpo_trainer.py
│ │ ├── dpo_trainer.py
│ │ ├── gkd_trainer.py
│ │ ├── grpo_trainer.py
│ │ ├── kto_trainer.py
│ │ ├── orpo_trainer.py
│ │ ├── ppo_trainer.py
│ │ ├── reward_trainer.py
│ │ ├── rlhf_mixin.py
│ │ ├── rollout_mixin.py
│ │ ├── utils.py
│ │ └── vllm_client.py
│ ├── rollout/
│ │ ├── __init__.py
│ │ ├── gym_env.py
│ │ └── multi_turn.py
│ ├── sequence_parallel/
│ │ ├── __init__.py
│ │ ├── ulysses.py
│ │ ├── utils.py
│ │ └── zigzag_ring_attn.py
│ ├── template/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── constant.py
│ │ ├── grounding.py
│ │ ├── register.py
│ │ ├── template_inputs.py
│ │ ├── template_meta.py
│ │ ├── templates/
│ │ │ ├── __init__.py
│ │ │ ├── baai.py
│ │ │ ├── baidu.py
│ │ │ ├── bert.py
│ │ │ ├── deepseek.py
│ │ │ ├── dots.py
│ │ │ ├── gemma.py
│ │ │ ├── glm.py
│ │ │ ├── idefics3.py
│ │ │ ├── internlm.py
│ │ │ ├── internvl.py
│ │ │ ├── kwai.py
│ │ │ ├── llama.py
│ │ │ ├── llava.py
│ │ │ ├── llm.py
│ │ │ ├── megrez.py
│ │ │ ├── microsoft.py
│ │ │ ├── midashenglm.py
│ │ │ ├── minicpm.py
│ │ │ ├── minimax.py
│ │ │ ├── minimind.py
│ │ │ ├── mistral.py
│ │ │ ├── molmo.py
│ │ │ ├── moonshot.py
│ │ │ ├── mplug.py
│ │ │ ├── openbuddy.py
│ │ │ ├── pixtral.py
│ │ │ ├── qwen.py
│ │ │ ├── seed.py
│ │ │ ├── stepfun.py
│ │ │ ├── tencent.py
│ │ │ ├── utils.py
│ │ │ ├── valley.py
│ │ │ └── yi.py
│ │ ├── utils.py
│ │ └── vision_utils.py
│ ├── trainers/
│ │ ├── __init__.py
│ │ ├── arguments.py
│ │ ├── embedding_trainer.py
│ │ ├── mixin.py
│ │ ├── patcher.py
│ │ ├── reranker_trainer.py
│ │ ├── seq2seq_trainer.py
│ │ ├── trainer.py
│ │ ├── trainer_factory.py
│ │ └── utils.py
│ ├── tuner_plugin/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── dummy.py
│ │ ├── ia3.py
│ │ ├── lora_llm.py
│ │ └── mapping.py
│ ├── tuners/
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── base.py
│ │ ├── llamapro.py
│ │ ├── longlora/
│ │ │ ├── __init__.py
│ │ │ ├── llama.py
│ │ │ └── longlora.py
│ │ ├── lora.py
│ │ ├── lora_layers.py
│ │ ├── mapping.py
│ │ ├── neftune.py
│ │ ├── part.py
│ │ ├── peft.py
│ │ ├── prompt.py
│ │ ├── reft.py
│ │ ├── restuning.py
│ │ ├── restuning_components.py
│ │ ├── scetuning/
│ │ │ ├── __init__.py
│ │ │ ├── scetuning.py
│ │ │ └── scetuning_components.py
│ │ ├── side.py
│ │ └── utils.py
│ ├── ui/
│ │ ├── __init__.py
│ │ ├── app.py
│ │ ├── base.py
│ │ ├── llm_eval/
│ │ │ ├── __init__.py
│ │ │ ├── eval.py
│ │ │ ├── llm_eval.py
│ │ │ ├── model.py
│ │ │ └── runtime.py
│ │ ├── llm_export/
│ │ │ ├── __init__.py
│ │ │ ├── export.py
│ │ │ ├── llm_export.py
│ │ │ ├── model.py
│ │ │ └── runtime.py
│ │ ├── llm_grpo/
│ │ │ ├── __init__.py
│ │ │ ├── advanced.py
│ │ │ ├── dataset.py
│ │ │ ├── external_rollout.py
│ │ │ ├── external_runtime.py
│ │ │ ├── grpo_advanced.py
│ │ │ ├── hyper.py
│ │ │ ├── llm_grpo.py
│ │ │ ├── lora.py
│ │ │ ├── model.py
│ │ │ ├── optimizer.py
│ │ │ ├── quantization.py
│ │ │ ├── report_to.py
│ │ │ ├── reward.py
│ │ │ ├── rollout.py
│ │ │ ├── runtime.py
│ │ │ ├── save.py
│ │ │ ├── target.py
│ │ │ └── tuner.py
│ │ ├── llm_infer/
│ │ │ ├── __init__.py
│ │ │ ├── generate.py
│ │ │ ├── llm_infer.py
│ │ │ ├── model.py
│ │ │ └── runtime.py
│ │ ├── llm_rlhf/
│ │ │ ├── __init__.py
│ │ │ ├── advanced.py
│ │ │ ├── dataset.py
│ │ │ ├── hyper.py
│ │ │ ├── llm_rlhf.py
│ │ │ ├── lora.py
│ │ │ ├── model.py
│ │ │ ├── optimizer.py
│ │ │ ├── quantization.py
│ │ │ ├── report_to.py
│ │ │ ├── rlhf.py
│ │ │ ├── runtime.py
│ │ │ ├── save.py
│ │ │ ├── target.py
│ │ │ └── tuner.py
│ │ ├── llm_sample/
│ │ │ ├── __init__.py
│ │ │ ├── llm_sample.py
│ │ │ ├── model.py
│ │ │ ├── runtime.py
│ │ │ └── sample.py
│ │ └── llm_train/
│ │ ├── __init__.py
│ │ ├── advanced.py
│ │ ├── dataset.py
│ │ ├── hyper.py
│ │ ├── llm_train.py
│ │ ├── lora.py
│ │ ├── model.py
│ │ ├── optimizer.py
│ │ ├── quantization.py
│ │ ├── report_to.py
│ │ ├── runtime.py
│ │ ├── save.py
│ │ ├── self_cog.py
│ │ ├── target.py
│ │ ├── task.py
│ │ ├── tuner.py
│ │ └── utils.py
│ ├── utils/
│ │ ├── __init__.py
│ │ ├── constants.py
│ │ ├── dequantizer.py
│ │ ├── env.py
│ │ ├── hf_config.py
│ │ ├── hub_utils.py
│ │ ├── import_utils.py
│ │ ├── io_utils.py
│ │ ├── logger.py
│ │ ├── np_utils.py
│ │ ├── processor_utils.py
│ │ ├── safetensors.py
│ │ ├── shutdown_manager.py
│ │ ├── tb_utils.py
│ │ ├── torch_utils.py
│ │ ├── transformers_utils.py
│ │ └── utils.py
│ └── version.py
└── tests/
├── __init__.py
├── app/
│ └── test_app.py
├── deploy/
│ ├── test_dataset.py
│ └── test_logprobs.py
├── eval/
│ └── test_eval.py
├── export/
│ └── test_quant.py
├── general/
│ ├── test_arch.py
│ ├── test_dataset.py
│ ├── test_model.py
│ ├── test_stream.py
│ └── test_template.py
├── hub/
│ ├── __init__.py
│ └── test_check_model.py
├── infer/
│ ├── test_agent.py
│ ├── test_infer.py
│ ├── test_logprobs.py
│ ├── test_main.py
│ ├── test_max_memory.py
│ ├── test_mllm.py
│ └── test_sglang.py
├── llm/
│ ├── __init__.py
│ ├── config/
│ │ ├── infer.json
│ │ └── sft.json
│ ├── data/
│ │ ├── alpaca.csv
│ │ ├── alpaca.jsonl
│ │ ├── alpaca2.csv
│ │ ├── chatml.jsonl
│ │ ├── conversations.jsonl
│ │ ├── multi_modal_1.jsonl
│ │ ├── multi_modal_2.jsonl
│ │ ├── multi_modal_3.jsonl
│ │ ├── sharegpt.jsonl
│ │ ├── swift_multi.json
│ │ ├── swift_multi.jsonl
│ │ ├── swift_pre.csv
│ │ ├── swift_pre.jsonl
│ │ ├── swift_single.csv
│ │ └── swift_single.jsonl
│ ├── test_custom.py
│ ├── test_dataset.py
│ ├── test_ollama_export.py
│ ├── test_run.py
│ ├── test_template.py
│ ├── test_utils.py
│ └── test_web_ui.py
├── megatron/
│ ├── export/
│ │ └── test_export.py
│ ├── test_align/
│ │ ├── test_llm.py
│ │ └── test_mllm.py
│ ├── test_embedding.py
│ ├── test_export.py
│ ├── test_gkd.py
│ ├── test_grpo.py
│ ├── test_kto.py
│ ├── test_lora.py
│ ├── test_rlhf.py
│ └── test_train.py
├── model_tag.py
├── models/
│ ├── test_flash_attn.py
│ ├── test_llm.py
│ └── test_mllm.py
├── run.py
├── run_config.yaml
├── sample/
│ └── test_client.py
├── test_align/
│ ├── test_cls.py
│ ├── test_lmdeploy_vlm.py
│ ├── test_padding_side.py
│ ├── test_rlhf_loss.py
│ ├── test_template/
│ │ ├── test_agent.py
│ │ ├── test_audio.py
│ │ ├── test_gene.py
│ │ ├── test_llm.py
│ │ ├── test_template.py
│ │ ├── test_tool.py
│ │ ├── test_video.py
│ │ └── test_vision.py
│ └── test_vllm_vlm.py
├── test_utils.py
├── train/
│ ├── test_channel.py
│ ├── test_cls.py
│ ├── test_embedding.py
│ ├── test_export_cached_dataset.py
│ ├── test_freeze.py
│ ├── test_gkd.py
│ ├── test_grounding.py
│ ├── test_grpo.py
│ ├── test_kto.py
│ ├── test_liger.py
│ ├── test_multilabel.py
│ ├── test_packing.py
│ ├── test_ppo.py
│ ├── test_pt.py
│ ├── test_resume_from_checkpoint.py
│ ├── test_rlhf.py
│ ├── test_sample.py
│ ├── test_sft.py
│ ├── test_train_eval.py
│ ├── test_vit_lr.py
│ └── test_vllm_importance_sampling_basic.py
├── tuners/
│ ├── __init__.py
│ ├── test_extra_state_dict.py
│ ├── test_merged_linear.py
│ ├── test_neft.py
│ ├── test_peft.py
│ ├── test_scetuning.py
│ ├── test_swift_base.py
│ ├── test_swift_device_map.py
│ └── test_swift_restuning.py
└── utils/
├── __init__.py
├── test_async_rewards.py
├── test_file_utils.py
├── test_io_utils.py
├── test_rewards.py
├── test_split_str_parts_by.py
└── test_torch_utils.py
Showing preview only (434K chars total). Download the full file or copy to clipboard to get everything.
SYMBOL INDEX (4974 symbols across 527 files)
FILE: docs/source/conf.py
function get_version (line 29) | def get_version():
FILE: docs/source_en/conf.py
function get_version (line 29) | def get_version():
FILE: examples/custom/dataset.py
class CustomPreprocessor (line 7) | class CustomPreprocessor(ResponsePreprocessor):
method preprocess (line 13) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
FILE: examples/custom/model_hf.py
class MyModelLoader (line 22) | class MyModelLoader(ModelLoader):
method get_config (line 24) | def get_config(self, model_dir: str) -> PretrainedConfig:
method get_processor (line 27) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
method get_model (line 30) | def get_model(self, model_dir: str, config: PretrainedConfig, processo...
FILE: examples/custom/my_qwen2_5_omni/my_register.py
class Qwen2_5OmniLoader (line 35) | class Qwen2_5OmniLoader(ModelLoader):
method get_config (line 37) | def get_config(self, model_dir: str) -> PretrainedConfig:
method get_processor (line 45) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
method get_model (line 54) | def get_model(self, model_dir: str, config: PretrainedConfig, processo...
class Qwen2_5OmniTemplate (line 105) | class Qwen2_5OmniTemplate(Template):
method init_processor (line 115) | def init_processor(self, processor) -> None:
method replace_tag (line 129) | def replace_tag(self, media_type: Literal['image', 'video', 'audio'], ...
method replace_ref (line 160) | def replace_ref(self, ref: str, index: int, inputs: StdTemplateInputs)...
method replace_bbox (line 167) | def replace_bbox(self, bbox: List[int], index: int, inputs: StdTemplat...
method packing_row (line 174) | def packing_row(self, row: List[Dict[str, Any]]) -> Dict[str, Any]:
method _get_new_tokens_use_audio_in_video (line 187) | def _get_new_tokens_use_audio_in_video(self, i, *, video_grid_thw, vid...
method _encode (line 214) | def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
method _post_encode (line 306) | def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
method _get_position_ids (line 358) | def _get_position_ids(self, inputs: Dict[str, Any]):
method _data_collator (line 381) | def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: O...
method _data_collator_mm_data (line 396) | def _data_collator_mm_data(self, batch: List[Dict[str, Any]]) -> Dict[...
method generate (line 412) | def generate(self, model, *args, **kwargs):
FILE: examples/custom/my_qwen2_5_omni/test_register.py
function infer_hf (line 13) | def infer_hf():
function test_my_qwen2_5_omni (line 63) | def test_my_qwen2_5_omni():
FILE: examples/deploy/agent/client.py
function get_infer_request (line 8) | def get_infer_request():
function infer (line 31) | def infer(client, model: str, messages, tools):
function infer_stream (line 49) | def infer_stream(client, model: str, messages, tools):
FILE: examples/deploy/bert/client.py
function infer_batch (line 6) | def infer_batch(engine: InferClient, infer_requests: List[InferRequest]):
FILE: examples/deploy/client/llm/base/openai_client.py
function infer (line 8) | def infer(client, model: str, messages):
function run_client (line 21) | def run_client(host: str = '127.0.0.1', port: int = 8000):
FILE: examples/deploy/client/llm/base/swift_client.py
function infer_batch (line 8) | def infer_batch(engine: 'InferEngine', infer_requests: List['InferReques...
function run_client (line 18) | def run_client(host: str = '127.0.0.1', port: int = 8000):
FILE: examples/deploy/client/llm/chat/openai_client.py
function infer (line 8) | def infer(client, model: str, messages):
function infer_stream (line 18) | def infer_stream(client, model: str, messages):
function run_client (line 28) | def run_client(host: str = '127.0.0.1', port: int = 8000):
FILE: examples/deploy/client/llm/chat/swift_client.py
function infer_batch (line 8) | def infer_batch(engine: 'InferEngine', infer_requests: List['InferReques...
function infer_stream (line 25) | def infer_stream(engine: 'InferEngine', infer_request: 'InferRequest'):
function run_client (line 39) | def run_client(host: str = '127.0.0.1', port: int = 8000):
FILE: examples/deploy/client/mllm/openai_client.py
function infer (line 9) | def infer(client, model: str, messages):
function infer_stream (line 19) | def infer_stream(client, model: str, messages):
function get_message (line 29) | def get_message(mm_type: Literal['text', 'image', 'video', 'audio']):
function run_client (line 79) | def run_client(host: str = '127.0.0.1', port: int = 8000):
FILE: examples/deploy/client/mllm/swift_client.py
function infer_batch (line 8) | def infer_batch(engine: 'InferEngine', infer_requests: List['InferReques...
function infer_stream (line 18) | def infer_stream(engine: 'InferEngine', infer_request: 'InferRequest'):
function get_message (line 32) | def get_message(mm_type: Literal['text', 'image', 'video', 'audio']):
function get_data (line 86) | def get_data(mm_type: Literal['text', 'image', 'video', 'audio']):
function run_client (line 105) | def run_client(host: str = '127.0.0.1', port: int = 8000):
FILE: examples/deploy/embedding/client.py
function infer (line 8) | def infer(client, model: str, messages):
function run_client (line 22) | def run_client(host: str = '127.0.0.1', port: int = 8000):
FILE: examples/deploy/lora/client.py
function infer_multilora (line 4) | def infer_multilora(engine: InferClient, infer_request: InferRequest):
FILE: examples/deploy/reranker/client.py
function infer (line 8) | def infer(client, model: str, messages):
function run_client (line 16) | def run_client(host: str = '127.0.0.1', port: int = 8000):
FILE: examples/deploy/reranker/client_generative.py
function infer (line 8) | def infer(client, model: str, messages):
function run_client (line 16) | def run_client(host: str = '127.0.0.1', port: int = 8000):
FILE: examples/deploy/seq_cls/client.py
function infer (line 8) | def infer(client, model: str, messages):
function run_client (line 16) | def run_client(host: str = '127.0.0.1', port: int = 8000):
FILE: examples/infer/demo.py
function infer_batch (line 8) | def infer_batch(engine: 'InferEngine', infer_requests: List['InferReques...
function infer_stream (line 19) | def infer_stream(engine: 'InferEngine', infer_request: 'InferRequest'):
FILE: examples/infer/demo_agent.py
function infer (line 8) | def infer(engine: 'InferEngine', infer_request: 'InferRequest'):
function infer_stream (line 26) | def infer_stream(engine: 'InferEngine', infer_request: 'InferRequest'):
function get_infer_request (line 53) | def get_infer_request():
function infer_continue_generate (line 79) | def infer_continue_generate(engine):
FILE: examples/infer/demo_bert.py
function infer_batch (line 9) | def infer_batch(engine: 'InferEngine', infer_requests: List['InferReques...
FILE: examples/infer/demo_embedding.py
function run_qwen3_emb (line 6) | def run_qwen3_emb():
function run_qwen3_vl_emb (line 33) | def run_qwen3_vl_emb():
FILE: examples/infer/demo_grounding.py
function draw_bbox_qwen2_vl (line 9) | def draw_bbox_qwen2_vl(image, response, norm_bbox: Literal['norm1000', '...
function infer_grounding (line 21) | def infer_grounding():
FILE: examples/infer/demo_hf.py
function infer_hf (line 1) | def infer_hf():
function infer_swift (line 33) | def infer_swift():
FILE: examples/infer/demo_lora.py
function infer_multilora (line 7) | def infer_multilora(infer_request: 'InferRequest', infer_backend: Litera...
function infer_lora (line 37) | def infer_lora(infer_request: 'InferRequest'):
FILE: examples/infer/demo_mllm.py
function infer_batch (line 8) | def infer_batch(engine: 'InferEngine', infer_requests: List['InferReques...
function infer_stream (line 19) | def infer_stream(engine: 'InferEngine', infer_request: 'InferRequest'):
function get_message (line 33) | def get_message(mm_type: Literal['text', 'image', 'video', 'audio']):
function get_data (line 80) | def get_data(mm_type: Literal['text', 'image', 'video', 'audio']):
FILE: examples/infer/demo_reranker.py
function run_qwen3_reranker (line 6) | def run_qwen3_reranker():
function run_qwen3_vl_reranker (line 29) | def run_qwen3_vl_reranker():
FILE: examples/infer/demo_reward_model.py
function infer_batch (line 8) | def infer_batch(engine: 'InferEngine', infer_requests: List['InferReques...
FILE: examples/infer/demo_vllm_reasoning_parser.py
function main (line 10) | def main(engine: VllmEngine):
function streaming_example (line 43) | def streaming_example(engine: VllmEngine):
FILE: examples/train/agent/loss_scale/infer_lora.py
function infer (line 8) | def infer(engine: 'InferEngine', infer_request: 'InferRequest'):
function infer_stream (line 26) | def infer_stream(engine: 'InferEngine', infer_request: 'InferRequest'):
function get_infer_request (line 53) | def get_infer_request():
FILE: examples/train/embedding/qwen3/infer.py
function run_qwen3_emb (line 9) | def run_qwen3_emb():
FILE: examples/train/grpo/plugin/deepeyes/deepeyes_plugin.py
function extract_answer (line 63) | def extract_answer(action_string: str) -> Dict[str, any]:
function extract_action (line 68) | def extract_action(action_string: str) -> Dict[str, Any]:
function get_chat_template (line 73) | def get_chat_template():
function get_gpt4_score_ICE (line 82) | def get_gpt4_score_ICE():
function get_prompt (line 135) | def get_prompt(predict_str, ground_truth, question):
function load_pil_image (line 151) | def load_pil_image(img):
function rule_math_verify (line 182) | def rule_math_verify(ground_truth, model_answer):
class DeepEyesReward (line 188) | class DeepEyesReward(ORM):
method __init__ (line 190) | def __init__(self, args, **kwargs):
method __call__ (line 202) | def __call__(self, completions, reward_model, extra_info, data_source,...
method compute_score (line 220) | def compute_score(self, predict_str: str, ground_truth: str, extra_inf...
method compute_score_math (line 290) | def compute_score_math(self, predict_str: str, ground_truth: str, extr...
class VisualToolBoxScheduler (line 348) | class VisualToolBoxScheduler(MultiTurnScheduler):
method __init__ (line 353) | def __init__(self, infer_engine=None, max_turns=None, *args, **kwargs):
method check_finished (line 356) | def check_finished(self, infer_request, response_choice, current_turn):
method step (line 370) | def step(self, infer_request, response_choice, current_turn):
method validate_bbox (line 408) | def validate_bbox(self, left, top, right, bottom):
method maybe_resize_bbox (line 417) | def maybe_resize_bbox(self, bbox, origin_width, origin_height):
FILE: examples/train/grpo/plugin/gsm8k/gsm8k_plugin.py
class GSM8KAccuracy (line 7) | class GSM8KAccuracy(ORM):
method extract_answer (line 10) | def extract_answer(text: str) -> str:
method __call__ (line 23) | def __call__(self, completions, solution, **kwargs) -> List[float]:
class GSM8KFormat (line 38) | class GSM8KFormat(ORM):
method __call__ (line 40) | def __call__(self, completions, **kwargs) -> List[float]:
FILE: examples/train/grpo/plugin/plugin.py
class CountdownORM (line 40) | class CountdownORM(ORM):
method __call__ (line 42) | def __call__(self, completions, target, nums, **kwargs) -> List[float]:
class MultiModalAccuracyORM (line 95) | class MultiModalAccuracyORM(ORM):
method __call__ (line 97) | def __call__(self, completions, solution, **kwargs) -> List[float]:
class MultiTurnThinkingTips (line 142) | class MultiTurnThinkingTips(ORM):
method __init__ (line 157) | def __init__(self, args=None, **kwargs):
method __call__ (line 162) | def __call__(self, completions, **kwargs) -> List[float]:
class CodeReward (line 185) | class CodeReward(ORM):
method __init__ (line 187) | def __init__(self, args=None, **kwargs):
method extract_code (line 197) | def extract_code(completion: str, language: str) -> str:
method run_async_from_sync (line 203) | def run_async_from_sync(self, scripts: List[str], languages: List[str]...
method run_async (line 217) | async def run_async(self, scripts: List[str], languages: List[str]) ->...
method run_script (line 238) | async def run_script(self, sbx, script: str, language: str) -> float:
method __call__ (line 249) | def __call__(self, completions, **kwargs) -> List[float]:
class CodeFormat (line 310) | class CodeFormat(ORM):
method __call__ (line 312) | def __call__(self, completions, **kwargs) -> List[float]:
class CodeRewardByJudge0 (line 326) | class CodeRewardByJudge0(ORM):
method __init__ (line 373) | def __init__(self, args, **kwargs):
method extract_code (line 384) | def extract_code(completion: str, language: str) -> str:
method get_language_id (line 391) | def get_language_id(cls, language):
method _evaluate_code (line 396) | async def _evaluate_code(self, code, test_cases, language_id):
method run_async_from_sync (line 426) | def run_async_from_sync(self):
method run_async (line 435) | async def run_async(self):
method __call__ (line 444) | def __call__(self, completions, **kwargs) -> List[float]:
class AsyncGenRMReward (line 463) | class AsyncGenRMReward(AsyncORM):
method __init__ (line 494) | def __init__(self, args, **kwargs):
method _build_eval_prompt (line 531) | def _build_eval_prompt(self, question: str, completion: str) -> str:
method _extract_score (line 544) | def _extract_score(self, response: str) -> float:
method _score_single (line 562) | async def _score_single(self, session, question: str, completion: str)...
method __call__ (line 602) | async def __call__(self, completions, messages, **kwargs) -> List[float]:
class ToolUseFormatReward (line 642) | class ToolUseFormatReward(ORM):
method __init__ (line 644) | def __init__(self, args=None, **kwargs):
method __call__ (line 649) | def __call__(self, completions, solution, **kwargs) -> List[float]:
class ToolUseLengthReward (line 706) | class ToolUseLengthReward(ORM):
method __init__ (line 708) | def __init__(self, args=None, **kwargs):
method __call__ (line 714) | def __call__(self, completions, solution, **kwargs):
class ToolUseCorrectnessReward (line 746) | class ToolUseCorrectnessReward(ORM):
method __init__ (line 748) | def __init__(self, args=None, **kwargs):
method match_score (line 757) | def match_score(self, list1, list2):
method compute_tool_call_reward (line 776) | def compute_tool_call_reward(self, gt_tools, pd_tools, max_possible_re...
method __call__ (line 838) | def __call__(self, completions, solution, **kwargs):
class CustomizedRMPlugin (line 914) | class CustomizedRMPlugin:
method __init__ (line 922) | def __init__(self, model, template):
method __call__ (line 926) | def __call__(self, inputs, **kwargs):
class QwenLongPlugin (line 934) | class QwenLongPlugin(DefaultRMPlugin):
method __init__ (line 940) | def __init__(self, model, template, accuracy_orm=None):
method __call__ (line 965) | def __call__(self, inputs, **kwargs):
method prepare_rm_inputs (line 981) | def prepare_rm_inputs(self, inputs: List[Dict], completions, ground_tr...
method extract_reward (line 1002) | def extract_reward(model_output: str) -> float:
method compute_rewards (line 1017) | def compute_rewards(self, results: List[ChatCompletionResponse]) -> Li...
class ToolCallScheduler (line 1076) | class ToolCallScheduler(MultiTurnScheduler):
method __init__ (line 1079) | def __init__(self, *args, **kwargs):
method _calculator_tool (line 1086) | def _calculator_tool(self, expression: str) -> str:
method _extract_tool_calls (line 1157) | def _extract_tool_calls(self, text: str):
method _execute_tools (line 1170) | def _execute_tools(self, tool_calls):
method check_finished (line 1185) | def check_finished(self, infer_request: 'RolloutInferRequest', respons...
method step (line 1194) | def step(self, infer_request: 'RolloutInferRequest', response_choice: ...
class CustomEnv (line 1225) | class CustomEnv(Env):
class CustomCtxManager (line 1232) | class CustomCtxManager(ContextManager):
FILE: examples/train/grpo/plugin/treepo/tree_rollout.py
class SampleStatus (line 12) | class SampleStatus(Enum):
class FinishedReason (line 20) | class FinishedReason(Enum):
class DataSampleTree (line 27) | class DataSampleTree:
method root_node (line 51) | def root_node(self):
method depth (line 55) | def depth(self):
method response_num (line 59) | def response_num(self):
method response_truncate (line 62) | def response_truncate(self, truncate_len: int):
method extend_response (line 75) | def extend_response(self, choice: ChatCompletionResponseChoice):
method extend_response_text (line 85) | def extend_response_text(self, response_text: str):
method extend_logprobs (line 88) | def extend_logprobs(self, logprobs: List[float]):
function _repeat_list_interleave (line 92) | def _repeat_list_interleave(any_list, repeat_times):
function _increment_tree_idx_depth (line 97) | def _increment_tree_idx_depth(
function extract_last_boxed (line 106) | def extract_last_boxed(text):
class AbstractDivergence (line 115) | class AbstractDivergence:
method calc_weights (line 118) | def calc_weights(cls, root_idx, samples_to_go_deeper, **kwargs) -> Lis...
method allocate_with_weights (line 122) | def allocate_with_weights(cls, weights, budget, max_divergence) -> Lis...
method apply (line 164) | def apply(cls, root_idx, samples_to_go_deeper, divergence_budget, max_...
class LogProbDivergence (line 183) | class LogProbDivergence(AbstractDivergence):
method calc_weights (line 186) | def calc_weights(cls, root_idx, samples_to_go_deeper, **kwargs) -> Lis...
class AvgDivergence (line 204) | class AvgDivergence(AbstractDivergence):
method calc_weights (line 207) | def calc_weights(cls, root_idx, samples_to_go_deeper, **kwargs) -> Lis...
FILE: examples/train/grpo/plugin/treepo/tree_rollout_plugin.py
class TreeRolloutScheduler (line 15) | class TreeRolloutScheduler(MultiTurnScheduler):
method __init__ (line 40) | def __init__(self, infer_engine=None, max_turns=None, *args, **kwargs):
method async_infer (line 50) | async def async_infer(self,
method run (line 85) | async def run(self, infer_request: Union[List[RolloutInferRequest], Ro...
method step (line 189) | def step(self, sample: DataSampleTree, **kwargs):
method check_finished (line 205) | def check_finished(self, sample: DataSampleTree, output: ChatCompletio...
method roll_back_to_divergence (line 225) | def roll_back_to_divergence(
FILE: examples/train/reranker/qwen3/infer.py
function run_qwen3_reranker (line 7) | def run_qwen3_reranker():
FILE: examples/train/rft/rft.py
function do_sample (line 16) | def do_sample(model: str, model_type: str, dataset: List[str], iter: int):
function do_train (line 101) | def do_train(model: str, model_type: str, datasets: List[str], iter, cmd...
function do_eval (line 146) | def do_eval(model, model_type: str, iter):
function replace_math_dataset (line 180) | def replace_math_dataset():
function main (line 193) | def main():
FILE: examples/train/rlhf/opsd/opsd_plugin.py
class OpenThoughtsOPSDPreprocessor (line 21) | class OpenThoughtsOPSDPreprocessor(RowPreprocessor):
method preprocess (line 28) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
FILE: scripts/benchmark/exp.py
function parse_args (line 12) | def parse_args():
function llm_exp (line 33) | def llm_exp():
FILE: scripts/benchmark/exp_utils.py
class Experiment (line 18) | class Experiment:
method __init__ (line 46) | def __init__(self,
method load (line 70) | def load(self, _json):
method priority (line 80) | def priority(self):
method to_dict (line 83) | def to_dict(self):
class ExpManager (line 90) | class ExpManager:
method __init__ (line 94) | def __init__(self):
method assert_gpu_not_overlap (line 97) | def assert_gpu_not_overlap(self):
method run (line 105) | def run(self, exp: Experiment):
method _build_eval_cmd (line 148) | def _build_eval_cmd(self, exp: Experiment):
method _build_cmd (line 174) | def _build_cmd(self, exp: Experiment):
method _find_free_gpu (line 226) | def _find_free_gpu(self, n):
method prepare_experiments (line 236) | def prepare_experiments(self, args: Any):
method _get_metric (line 273) | def _get_metric(exp: Experiment):
method write_record (line 309) | def write_record(exp: Experiment):
method _poll (line 315) | def _poll(self):
method begin (line 350) | def begin(self, args: Any):
function find_all_config (line 380) | def find_all_config(dir_or_file: str):
FILE: scripts/benchmark/generate_report.py
class ModelOutput (line 13) | class ModelOutput:
method tuner_hyper_params (line 64) | def tuner_hyper_params(self):
method hyper_parameters (line 94) | def hyper_parameters(self):
method train_speed (line 101) | def train_speed(self):
method infer_speed (line 108) | def infer_speed(self):
function generate_sft_report (line 114) | def generate_sft_report(outputs: List[ModelOutput]):
function generate_export_report (line 213) | def generate_export_report(outputs: List[ModelOutput]):
function parse_output (line 285) | def parse_output(file):
function generate_reports (line 409) | def generate_reports():
FILE: scripts/utils/run_dataset_info.py
function get_cache_mapping (line 13) | def get_cache_mapping(fpath):
function get_dataset_id (line 33) | def get_dataset_id(key):
function run_dataset (line 40) | def run_dataset(key, template, cache_mapping):
function write_dataset_info (line 79) | def write_dataset_info() -> None:
FILE: scripts/utils/run_model_info.py
function get_url_suffix (line 8) | def get_url_suffix(model_id):
function get_cache_mapping (line 14) | def get_cache_mapping(fpath):
function get_model_info_table (line 32) | def get_model_info_table():
FILE: scripts/utils/test_link_valid.py
function check_link (line 10) | def check_link(url):
function extract_links_from_md (line 18) | def extract_links_from_md(file_path):
function check_links_in_folder (line 25) | def check_links_in_folder(folder_path):
FILE: setup.py
function readme (line 9) | def readme():
function get_version (line 18) | def get_version():
function parse_requirements (line 24) | def parse_requirements(fname='requirements.txt', with_version=True):
FILE: swift/agent_template/base.py
class AgentKeyword (line 20) | class AgentKeyword:
class ToolDesc (line 27) | class ToolDesc:
class ReactCompatMixin (line 35) | class ReactCompatMixin:
method _split_action_action_input (line 45) | def _split_action_action_input(response: str, keyword: AgentKeyword) -...
method get_toolcall (line 60) | def get_toolcall(self, response: str) -> List[Function]:
method _format_tool_responses (line 76) | def _format_tool_responses(
method _parse_tool_call (line 113) | def _parse_tool_call(content) -> Dict[str, Any]:
method _format_tool_calls (line 123) | def _format_tool_calls(self, tool_call_messages) -> str:
class BaseAgentTemplate (line 143) | class BaseAgentTemplate(ReactCompatMixin, ABC):
method _get_tool_name (line 157) | def _get_tool_name(tool):
method unwrap_tool (line 161) | def unwrap_tool(tool):
method wrap_tool (line 168) | def wrap_tool(tool):
method _parse_tool (line 175) | def _parse_tool(tool, lang: Literal['zh', 'en']) -> ToolDesc:
method _parse_json (line 196) | def _parse_json(json_str: str) -> Optional[Any]:
method _format_tools (line 218) | def _format_tools(self,
FILE: swift/agent_template/deepseek_v3_1.py
class DeepSeekV31AgentTemplate (line 11) | class DeepSeekV31AgentTemplate(BaseAgentTemplate):
method get_toolcall (line 13) | def get_toolcall(self, response: str) -> List[Function]:
method _get_tool_responses (line 30) | def _get_tool_responses(self, tool_messages):
method _get_tool_calls (line 34) | def _get_tool_calls(self, tool_calls: List[str]):
method _format_tool_responses (line 37) | def _format_tool_responses(
method _format_tools (line 48) | def _format_tools(self, tools: List[Union[str, dict]], system: Optiona...
method _format_tool_calls (line 80) | def _format_tool_calls(self, tool_call_messages):
FILE: swift/agent_template/extra.py
class ReactGRPOAgentTemplate (line 7) | class ReactGRPOAgentTemplate(BaseAgentTemplate):
method _format_tools (line 9) | def _format_tools(self, tools: List[Union[str, dict]], system: Optiona...
FILE: swift/agent_template/glm4.py
class ChatGLM4AgentTemplate (line 11) | class ChatGLM4AgentTemplate(BaseAgentTemplate):
method _find_function_call (line 15) | def _find_function_call(single_content: str) -> Optional[Function]:
method get_toolcall (line 24) | def get_toolcall(self, response: str) -> List[Function]:
method _format_tools (line 36) | def _format_tools(self, tools: List[Union[str, dict]], system: Optiona...
method _format_tool_responses (line 48) | def _format_tool_responses(
method _format_tool_calls (line 65) | def _format_tool_calls(self, tool_call_messages) -> str:
class GLM4AgentTemplate (line 73) | class GLM4AgentTemplate(ChatGLM4AgentTemplate):
class GLM4_5AgentTemplate (line 77) | class GLM4_5AgentTemplate(BaseAgentTemplate):
method _find_function_call (line 81) | def _find_function_call(single_content: str) -> Optional[Function]:
method get_toolcall (line 94) | def get_toolcall(self, response: str) -> List[Function]:
method _format_tools (line 106) | def _format_tools(self, tools: List[Union[str, dict]], system: Optiona...
method _format_tool_responses (line 133) | def _format_tool_responses(
method _format_tool_calls (line 155) | def _format_tool_calls(self, tool_call_messages) -> str:
class GLM4_7AgentTemplate (line 171) | class GLM4_7AgentTemplate(GLM4_5AgentTemplate):
FILE: swift/agent_template/hermes.py
class HermesAgentTemplate (line 11) | class HermesAgentTemplate(BaseAgentTemplate):
method get_toolcall (line 13) | def get_toolcall(self, response: str) -> List[Function]:
method _get_tool_responses (line 25) | def _get_tool_responses(self, tool_messages):
method _get_tool_calls (line 32) | def _get_tool_calls(self, tool_calls: List[str]):
method _format_tool_responses (line 35) | def _format_tool_responses(
method _format_tools (line 57) | def _format_tools(self, tools: List[Union[str, dict]], system: Optiona...
method _format_tool_calls (line 76) | def _format_tool_calls(self, tool_call_messages):
class HunyuanHermesAgentTemplate (line 84) | class HunyuanHermesAgentTemplate(HermesAgentTemplate):
method get_toolcall (line 86) | def get_toolcall(self, response: str) -> List[Function]:
method _get_tool_responses (line 97) | def _get_tool_responses(self, tool_messages):
method _get_tool_calls (line 105) | def _get_tool_calls(self, tool_calls: List[str]):
method _format_tools (line 109) | def _format_tools(self, tools: List[Union[str, dict]], system: Optiona...
FILE: swift/agent_template/llama.py
class Llama3AgentTemplate (line 11) | class Llama3AgentTemplate(BaseAgentTemplate):
method get_toolcall (line 17) | def get_toolcall(self, response: str) -> List[Function]:
method _format_tool_responses (line 31) | def _format_tool_responses(
method _format_tools (line 46) | def _format_tools(self, tools: List[Union[str, dict]], system: Optiona...
method _format_tool_calls (line 60) | def _format_tool_calls(self, tool_call_messages) -> str:
class Llama4AgentTemplate (line 69) | class Llama4AgentTemplate(Llama3AgentTemplate):
FILE: swift/agent_template/minimax_m2.py
class MinimaxM2AgentTemplate (line 11) | class MinimaxM2AgentTemplate(BaseAgentTemplate):
method get_toolcall (line 23) | def get_toolcall(self, response: str) -> List[Function]:
method _format_tool_responses (line 62) | def _format_tool_responses(
method _format_tools (line 106) | def _format_tools(self, tools: List[Union[str, dict]], system: Optiona...
method _format_tool_calls (line 138) | def _format_tool_calls(self, tool_call_messages):
FILE: swift/agent_template/mistral.py
class MistralAgentTemplate (line 11) | class MistralAgentTemplate(BaseAgentTemplate):
method get_toolcall (line 13) | def get_toolcall(self, response: str) -> List[Function]:
method _format_tool_responses (line 32) | def _format_tool_responses(
method _format_tools (line 55) | def _format_tools(self, tools: List[Union[str, dict]], system: Optiona...
method _format_tool_calls (line 60) | def _format_tool_calls(self, tool_call_messages):
FILE: swift/agent_template/qwen.py
class QwenEnAgentTemplate (line 13) | class QwenEnAgentTemplate(BaseAgentTemplate):
method _get_tool_names_descs (line 16) | def _get_tool_names_descs(self, tools):
method _format_tools (line 27) | def _format_tools(self, tools: List[Union[str, dict]], system: Optiona...
class QwenZhAgentTemplate (line 46) | class QwenZhAgentTemplate(BaseAgentTemplate):
method _get_tool_names_descs (line 49) | def _get_tool_names_descs(self, tools):
method _format_tools (line 60) | def _format_tools(self, tools: List[Union[str, dict]], system: Optiona...
class QwenEnParallelAgentTemplate (line 79) | class QwenEnParallelAgentTemplate(QwenEnAgentTemplate):
method _format_tools (line 81) | def _format_tools(self, tools: List[Union[str, dict]], system: Optiona...
class QwenZhParallelAgentTemplate (line 108) | class QwenZhParallelAgentTemplate(QwenZhAgentTemplate):
method _format_tools (line 110) | def _format_tools(self, tools: List[Union[str, dict]], system: Optiona...
FILE: swift/agent_template/qwen3_coder.py
function render_extra_keys (line 10) | def render_extra_keys(obj, handled_keys):
class Qwen3CoderAgentTemplate (line 33) | class Qwen3CoderAgentTemplate(HermesAgentTemplate):
method _find_function_call (line 36) | def _find_function_call(single_content: str) -> Optional[Function]:
method get_toolcall (line 62) | def get_toolcall(self, response: str) -> List[Function]:
method _format_tools (line 75) | def _format_tools(self, tools: List[Union[str, dict]], system: Optiona...
method _format_tool_calls (line 134) | def _format_tool_calls(self, tool_call_messages):
method _get_tool_responses (line 158) | def _get_tool_responses(self, tool_messages):
class Qwen3_5AgentTemplate (line 166) | class Qwen3_5AgentTemplate(Qwen3CoderAgentTemplate):
method _format_tools (line 168) | def _format_tools(self, tools: List[Union[str, dict]], system: Optiona...
method _get_tool_responses (line 178) | def _get_tool_responses(self, tool_messages):
FILE: swift/agent_template/react.py
class ReactEnAgentTemplate (line 7) | class ReactEnAgentTemplate(BaseAgentTemplate):
method _format_tools (line 9) | def _format_tools(self, tools: List[Union[str, dict]], system: Optiona...
class ReactZnAgentTemplate (line 39) | class ReactZnAgentTemplate(BaseAgentTemplate):
method _format_tools (line 41) | def _format_tools(self, tools: List[Union[str, dict]], system: Optiona...
FILE: swift/agent_template/seed_oss.py
class SeedAgentTemplate (line 11) | class SeedAgentTemplate(BaseAgentTemplate):
method _py_type (line 26) | def _py_type(t: str) -> str:
method get_toolcall (line 29) | def get_toolcall(self, response: str) -> List[Function]:
method _get_tool_responses (line 47) | def _get_tool_responses(self, tool_messages: List[dict]) -> str:
method _format_tool_responses (line 51) | def _format_tool_responses(
method _build_tool_def_string (line 63) | def _build_tool_def_string(self, tool: dict) -> str:
method _format_tools (line 104) | def _format_tools(self, tools: List[Union[str, dict]], system: Optiona...
method _format_tool_calls (line 136) | def _format_tool_calls(self, tool_call_messages: List[dict]) -> str:
FILE: swift/agent_template/toolbench.py
class ToolBenchAgentTemplate (line 8) | class ToolBenchAgentTemplate(BaseAgentTemplate):
method _format_tools (line 10) | def _format_tools(self, tools: List[Union[str, dict]], system: Optiona...
FILE: swift/agent_template/youtu.py
class YoutuAgentTemplate (line 11) | class YoutuAgentTemplate(HermesAgentTemplate):
method _get_tool_responses (line 19) | def _get_tool_responses(self, tool_messages):
method _format_tool_responses (line 26) | def _format_tool_responses(
method _format_tools (line 49) | def _format_tools(self, tools: List[Union[str, dict]], system: Optiona...
method _format_tool_calls (line 61) | def _format_tool_calls(self, tool_call_messages):
FILE: swift/arguments/app_args.py
class AppArguments (line 15) | class AppArguments(WebUIArguments, DeployArguments):
method _init_torch_dtype (line 41) | def _init_torch_dtype(self) -> None:
method __post_init__ (line 48) | def __post_init__(self):
FILE: swift/arguments/base_args/base_args.py
function get_supported_tuners (line 25) | def get_supported_tuners():
class BaseArguments (line 31) | class BaseArguments(GenerationArguments, QuantizeArguments, DataArgument...
method _prepare_training_args (line 100) | def _prepare_training_args(self, training_args: Dict[str, Any]) -> None:
method _init_lazy_tokenize (line 103) | def _init_lazy_tokenize(self):
method _import_external_plugins (line 119) | def _import_external_plugins(self):
method _check_is_adapter (line 135) | def _check_is_adapter(adapter_dir: str) -> bool:
method _init_adapters (line 142) | def _init_adapters(self):
method __post_init__ (line 149) | def __post_init__(self):
method _init_model_kwargs (line 181) | def _init_model_kwargs(self):
method is_adapter (line 189) | def is_adapter(self) -> bool:
method supported_tuners (line 193) | def supported_tuners(self):
method adapters_can_be_merged (line 197) | def adapters_can_be_merged(self):
method from_pretrained (line 201) | def from_pretrained(cls, checkpoint_dir: str):
method _init_ckpt_dir (line 212) | def _init_ckpt_dir(self, adapters=None):
method load_args_from_ckpt (line 222) | def load_args_from_ckpt(self) -> None:
method save_args (line 279) | def save_args(self, output_dir=None) -> None:
method _init_device (line 288) | def _init_device(self):
method get_template (line 292) | def get_template(self, processor: Optional[Processor] = None, **kwargs...
method get_model_processor (line 304) | def get_model_processor(self,
FILE: swift/arguments/base_args/data_args.py
class DataArguments (line 12) | class DataArguments:
method _init_custom_dataset_info (line 99) | def _init_custom_dataset_info(self):
method __post_init__ (line 106) | def __post_init__(self):
method _init_val_dataset_exists (line 120) | def _init_val_dataset_exists(self):
method get_dataset_kwargs (line 124) | def get_dataset_kwargs(self):
FILE: swift/arguments/base_args/generation_args.py
class GenerationArguments (line 12) | class GenerationArguments:
method _init_stream (line 57) | def _init_stream(self):
method get_request_config (line 61) | def get_request_config(self):
FILE: swift/arguments/base_args/model_args.py
class ModelArguments (line 17) | class ModelArguments:
method _init_device_map (line 93) | def _init_device_map(self):
method _init_max_memory (line 104) | def _init_max_memory(self):
method _init_torch_dtype (line 118) | def _init_torch_dtype(self) -> None:
method _init_mixed_precision (line 128) | def _init_mixed_precision(self):
method _init_rope_scaling (line 142) | def _init_rope_scaling(self):
method _init_model_info (line 189) | def _init_model_info(self) -> torch.dtype:
method _init_new_special_tokens (line 203) | def _init_new_special_tokens(self):
method __post_init__ (line 217) | def __post_init__(self):
method get_model_kwargs (line 226) | def get_model_kwargs(self):
FILE: swift/arguments/base_args/quant_args.py
class QuantizeArguments (line 11) | class QuantizeArguments:
method get_quantization_config (line 45) | def get_quantization_config(self):
method get_modules_to_not_convert (line 99) | def get_modules_to_not_convert(self):
method __post_init__ (line 116) | def __post_init__(self):
FILE: swift/arguments/base_args/template_args.py
class TemplateArguments (line 13) | class TemplateArguments:
method __post_init__ (line 137) | def __post_init__(self):
method get_template_kwargs (line 155) | def get_template_kwargs(self):
FILE: swift/arguments/deploy_args.py
class DeployArguments (line 13) | class DeployArguments(InferArguments):
method __post_init__ (line 51) | def __post_init__(self):
method _init_adapters (line 58) | def _init_adapters(self):
method _init_ckpt_dir (line 75) | def _init_ckpt_dir(self, adapters=None):
method _init_stream (line 78) | def _init_stream(self):
class RolloutArguments (line 83) | class RolloutArguments(DeployArguments):
method __post_init__ (line 111) | def __post_init__(self):
method _set_default_engine_type (line 117) | def _set_default_engine_type(self):
method _check_args (line 127) | def _check_args(self):
method _check_device_count (line 138) | def _check_device_count(self):
FILE: swift/arguments/eval_args.py
class EvalArguments (line 15) | class EvalArguments(DeployArguments):
method __post_init__ (line 65) | def __post_init__(self):
method _init_eval_url (line 75) | def _init_eval_url(self):
method list_eval_dataset (line 81) | def list_eval_dataset(eval_backend=None):
method _init_eval_dataset (line 99) | def _init_eval_dataset(self):
method _init_result_path (line 117) | def _init_result_path(self, folder_name: str) -> None:
method _init_torch_dtype (line 125) | def _init_torch_dtype(self) -> None:
FILE: swift/arguments/export_args.py
class ExportArguments (line 16) | class ExportArguments(MergeArguments, BaseArguments):
method load_args_from_ckpt (line 85) | def load_args_from_ckpt(self) -> None:
method _init_output_dir (line 90) | def _init_output_dir(self):
method __post_init__ (line 120) | def __post_init__(self):
FILE: swift/arguments/infer_args.py
class LmdeployArguments (line 17) | class LmdeployArguments:
method get_lmdeploy_engine_kwargs (line 37) | def get_lmdeploy_engine_kwargs(self):
class SglangArguments (line 51) | class SglangArguments:
method get_sglang_engine_kwargs (line 103) | def get_sglang_engine_kwargs(self):
class InferArguments (line 128) | class InferArguments(MergeArguments, LmdeployArguments, SglangArguments,...
method _get_result_path (line 166) | def _get_result_path(self, folder_name: str) -> str:
method _init_result_path (line 174) | def _init_result_path(self, folder_name: str) -> None:
method _init_stream (line 184) | def _init_stream(self):
method _init_ddp (line 193) | def _init_ddp(self):
method __post_init__ (line 203) | def __post_init__(self) -> None:
method _init_vllm_async_engine (line 216) | def _init_vllm_async_engine(self):
FILE: swift/arguments/merge_args.py
class MergeArguments (line 10) | class MergeArguments:
FILE: swift/arguments/pretrain_args.py
class PretrainArguments (line 8) | class PretrainArguments(SftArguments):
FILE: swift/arguments/rlhf_args.py
class RewardModelArguments (line 16) | class RewardModelArguments:
class TeacherModelArguments (line 37) | class TeacherModelArguments:
class PPOArguments (line 85) | class PPOArguments:
class GRPOArguments (line 121) | class GRPOArguments(GRPOArgumentsMixin):
class RLHFArguments (line 166) | class RLHFArguments(TeacherModelArguments, GRPOArguments, PPOArguments, ...
method _prepare_training_args (line 262) | def _prepare_training_args(self, training_args: Dict[str, Any]) -> None:
method __post_init__ (line 266) | def __post_init__(self):
method _process_loss_type (line 306) | def _process_loss_type(self):
method _init_grpo (line 330) | def _init_grpo(self):
method _init_rollout (line 378) | def _init_rollout(self):
method _init_padding_side (line 404) | def _init_padding_side(self):
method _init_max_completion_length (line 409) | def _init_max_completion_length(self):
method _init_metric_for_best_model (line 413) | def _init_metric_for_best_model(self):
method _init_simpo (line 419) | def _init_simpo(self):
method _init_rm (line 429) | def _init_rm(self):
method _init_external_vllm (line 434) | def _init_external_vllm(self):
method _set_default (line 451) | def _set_default(self):
method _check_grpo (line 469) | def _check_grpo(self):
method _external_vllm_warning (line 507) | def _external_vllm_warning(self):
method _check_padding_free (line 517) | def _check_padding_free(self):
method _check_sequence_parallel (line 526) | def _check_sequence_parallel(self):
method _init_teacher_deepspeed (line 534) | def _init_teacher_deepspeed(self):
method _check_gkd (line 556) | def _check_gkd(self):
FILE: swift/arguments/sampling_args.py
class SamplingArguments (line 15) | class SamplingArguments(BaseArguments):
method _init_model_info (line 86) | def _init_model_info(self):
method __post_init__ (line 95) | def __post_init__(self):
FILE: swift/arguments/sft_args.py
class SwanlabArguments (line 17) | class SwanlabArguments:
method _init_swanlab (line 64) | def _init_swanlab(self):
class SftArguments (line 123) | class SftArguments(SwanlabArguments, TunerArguments, BaseArguments, Seq2...
method _check_padding_free (line 184) | def _check_padding_free(self):
method __post_init__ (line 195) | def __post_init__(self) -> None:
method _init_override (line 235) | def _init_override(self):
method _init_deepspeed (line 246) | def _init_deepspeed(self):
method _init_fsdp (line 277) | def _init_fsdp(self):
method _check_fsdp2_compatibility (line 324) | def _check_fsdp2_compatibility(self):
method _handle_pai_compat (line 354) | def _handle_pai_compat(self) -> None:
method _add_version (line 366) | def _add_version(self):
method _init_output_dir (line 385) | def _init_output_dir(self):
method _init_eval_strategy (line 390) | def _init_eval_strategy(self):
method _init_metric (line 402) | def _init_metric(self):
FILE: swift/arguments/tuner_args.py
class TunerArguments (line 12) | class TunerArguments:
method __post_init__ (line 197) | def __post_init__(self):
method _init_multimodal_full (line 204) | def _init_multimodal_full(self):
FILE: swift/arguments/webui_args.py
class WebUIArguments (line 6) | class WebUIArguments:
FILE: swift/callbacks/activation_cpu_offload.py
function is_torch_npu_available (line 16) | def is_torch_npu_available() -> bool:
function _get_unique_tensor_key (line 30) | def _get_unique_tensor_key(tensor):
function get_device_name (line 35) | def get_device_name() -> str:
class FSDPParameterFilter (line 50) | class FSDPParameterFilter:
method __init__ (line 52) | def __init__(self):
method __call__ (line 55) | def __call__(self, tensor):
method update_model_parameters (line 58) | def update_model_parameters(self, model):
function get_torch_device (line 65) | def get_torch_device() -> Any:
class CpuOffloadHookWithOffloadHandler (line 78) | class CpuOffloadHookWithOffloadHandler:
method __init__ (line 86) | def __init__(
method __enter__ (line 97) | def __enter__(self):
method __exit__ (line 101) | def __exit__(self, *args: Any):
method on_save_for_backward (line 105) | def on_save_for_backward(self, tensor: torch.Tensor) -> Any:
method on_get_saved_tensor (line 109) | def on_get_saved_tensor(self, saved_state: Any) -> torch.Tensor:
class OffloadHandler (line 114) | class OffloadHandler:
method __init__ (line 117) | def __init__(self) -> None:
method tensor_push (line 120) | def tensor_push(self, tensor: torch.Tensor, **kwargs) -> Any:
method tensor_pop (line 126) | def tensor_pop(self, tensor_tag: Any, **kwargs):
class GroupCommitFunction (line 133) | class GroupCommitFunction(torch.autograd.Function):
method forward (line 141) | def forward(ctx, tensor, cpu_offload_handler):
method backward (line 149) | def backward(ctx, grad_output):
class SynchronizedGroupOffloadHandler (line 159) | class SynchronizedGroupOffloadHandler(OffloadHandler):
method __init__ (line 165) | def __init__(self, num_offload_group, tensor_need_offloading_checker=(...
method groupid_reset (line 173) | def groupid_reset(self):
method on_group_commit_forward (line 183) | def on_group_commit_forward(self):
method on_group_commit_backward (line 189) | def on_group_commit_backward(self):
method offload (line 195) | def offload(src_tensor, pin_memory=True):
method reload (line 220) | def reload(state, non_blocking=None):
method tensor_push (line 227) | def tensor_push(self, tensor: torch.Tensor, **kwargs):
method tensor_pop (line 242) | def tensor_pop(self, tensor_tag, **kwargs):
class AsyncDoubleBufferGroupOffloadHandler (line 253) | class AsyncDoubleBufferGroupOffloadHandler(SynchronizedGroupOffloadHandl...
method __init__ (line 260) | def __init__(
method tensor_push (line 295) | def tensor_push(self, tensor: torch.Tensor, **kwargs) -> Any:
method tensor_pop (line 317) | def tensor_pop(self, tensor_tag, **kwargs):
method bulk_offload_group (line 330) | def bulk_offload_group(self, group_to_offload):
method synchronize_on_group_commit_forward (line 351) | def synchronize_on_group_commit_forward(self, current_group):
method on_group_commit_forward (line 379) | def on_group_commit_forward(self):
method bulk_reload_group (line 387) | def bulk_reload_group(self, group_to_reload):
method on_group_commit_backward (line 405) | def on_group_commit_backward(self):
function get_activation_offload_context (line 430) | def get_activation_offload_context(num_layers: int = 1,
class ActivationHandler (line 448) | class ActivationHandler:
method __init__ (line 450) | def __init__(self, offload_ctx, sync_func, tensor_filter, enable_ckpt):
method pre_forward (line 461) | def pre_forward(self, module):
method post_forward (line 466) | def post_forward(self, module):
method _pack_kwargs (line 470) | def _pack_kwargs(self, *args, **kwargs):
method _unpack_kwargs (line 479) | def _unpack_kwargs(self, flat_args, kwarg_keys):
method _ckpt_forward (line 487) | def _ckpt_forward(self, forward_method, *args, **kwargs):
method forward (line 501) | def forward(self, module, forward_method, *args, **kwargs):
method wrap_module_forward_method (line 517) | def wrap_module_forward_method(self, module):
function enable_activation_offloading (line 531) | def enable_activation_offloading(model, strategy, enable_ckpt=False):
class ActivationCpuOffloadCallBack (line 592) | class ActivationCpuOffloadCallBack(TrainerCallback):
method __init__ (line 594) | def __init__(self, args: TrainingArguments, trainer):
method on_train_begin (line 597) | def on_train_begin(self, args: TrainingArguments, state: TrainerState,...
FILE: swift/callbacks/adalora.py
class AdaloraCallback (line 11) | class AdaloraCallback(TrainerCallback):
method __init__ (line 13) | def __init__(self, args: 'TrainingArguments', trainer: 'Trainer'):
method on_train_begin (line 19) | def on_train_begin(self, _args, state, control, **kwargs):
method on_step_end (line 30) | def on_step_end(self, _args, state, control, **kwargs):
FILE: swift/callbacks/base.py
class TrainerCallback (line 9) | class TrainerCallback(HfTrainerCallback):
method __init__ (line 11) | def __init__(self, args: 'TrainingArguments', trainer: 'Trainer'):
FILE: swift/callbacks/deepspeed_elastic.py
class DeepspeedElasticCallback (line 10) | class DeepspeedElasticCallback(TrainerCallback):
method __init__ (line 12) | def __init__(self, args=None, trainer=None):
method on_init_end (line 16) | def on_init_end(self, args: TrainingArguments, state: TrainerState, co...
class GracefulExitCallback (line 49) | class GracefulExitCallback(TrainerCallback):
method __init__ (line 51) | def __init__(self, args=None, trainer=None):
method on_step_end (line 59) | def on_step_end(self, args, state, control, **kwargs):
method on_save (line 77) | def on_save(self, args, state, control, **kwargs):
FILE: swift/callbacks/early_stop.py
class EarlyStopCallback (line 15) | class EarlyStopCallback(TrainerCallback):
method __init__ (line 18) | def __init__(self, args: 'TrainingArguments', trainer: 'Trainer'):
method on_save (line 24) | def on_save(self, args: 'TrainingArguments', state: TrainerState, cont...
FILE: swift/callbacks/lisa.py
class LISACallback (line 12) | class LISACallback(TrainerCallback):
method __init__ (line 14) | def __init__(self, args: 'TrainingArguments', trainer: 'Trainer'):
method freeze_all_layers (line 36) | def freeze_all_layers(self):
method on_step_begin (line 42) | def on_step_begin(self, args, state, control, **kwargs):
method switch_active_layers (line 47) | def switch_active_layers(self):
FILE: swift/callbacks/perf_log.py
class PerfMetricsLogCallback (line 35) | class PerfMetricsLogCallback(TrainerCallback):
method __init__ (line 38) | def __init__(self, args: 'TrainingArguments', trainer: 'Trainer'):
method on_init_end (line 44) | def on_init_end(self, args: 'TrainingArguments', state: TrainerState, ...
method on_step_begin (line 62) | def on_step_begin(self, args: 'TrainingArguments', state: TrainerState...
method on_step_end (line 65) | def on_step_end(self, args: 'TrainingArguments', state: TrainerState, ...
method on_log (line 68) | def on_log(self, args: 'TrainingArguments', state: TrainerState, contr...
method _estimate_device_tflops_by_dtype (line 77) | def _estimate_device_tflops_by_dtype(device: torch.device, dtype: torc...
method _retrieve_flops_from_map (line 121) | def _retrieve_flops_from_map(device):
FILE: swift/cli/_megatron/main.py
function cli_main (line 17) | def cli_main():
FILE: swift/cli/main.py
function use_torchrun (line 29) | def use_torchrun() -> bool:
function get_torchrun_args (line 37) | def get_torchrun_args() -> Optional[List[str]]:
function prepare_config_args (line 49) | def prepare_config_args(argv):
function cli_main (line 84) | def cli_main(route_mapping: Optional[Dict[str, str]] = None, is_megatron...
FILE: swift/cli/merge_lora.py
class SwiftMergeLoRA (line 6) | class SwiftMergeLoRA(SwiftPipeline):
method run (line 10) | def run(self):
FILE: swift/cli/sft.py
function try_init_unsloth (line 4) | def try_init_unsloth():
FILE: swift/cli/utils.py
function try_use_single_device_mode (line 5) | def try_use_single_device_mode():
FILE: swift/dataloader/dispatcher.py
class DataLoaderDispatcher (line 8) | class DataLoaderDispatcher:
method __init__ (line 10) | def __init__(self, base_dataloader, device=None, skip_batches: int = 0):
method rank (line 16) | def rank(self):
method world_size (line 20) | def world_size(self):
method group (line 24) | def group(self):
method _scatter_object_list (line 27) | def _scatter_object_list(self, inputs):
method _skip_batches (line 35) | def _skip_batches(self, base_iter):
method __iter__ (line 40) | def __iter__(self):
FILE: swift/dataloader/shard.py
class BatchSamplerShard (line 10) | class BatchSamplerShard:
method __init__ (line 12) | def __init__(
method rank (line 38) | def rank(self):
method world_size (line 42) | def world_size(self):
method __iter__ (line 45) | def __iter__(self):
method set_epoch (line 70) | def set_epoch(self, epoch: int):
method __len__ (line 73) | def __len__(self) -> int:
class DataLoaderShard (line 80) | class DataLoaderShard(DataLoader):
method __init__ (line 82) | def __init__(self, dataset, device=None, **dataloader_params):
method set_epoch (line 86) | def set_epoch(self, epoch: int):
method __iter__ (line 92) | def __iter__(self):
FILE: swift/dataset/dataset/llm.py
class AlpacaZhPreprocessor (line 15) | class AlpacaZhPreprocessor(AlpacaPreprocessor):
method concat_inst_input (line 18) | def concat_inst_input(cls, instruction, input_):
class LongAlpacaPreprocessor (line 33) | class LongAlpacaPreprocessor(AlpacaPreprocessor):
method preprocess (line 35) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class RuozhibaPreprocessor (line 53) | class RuozhibaPreprocessor(RowPreprocessor):
method preprocess (line 55) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class MathTrnPreprocessor (line 77) | class MathTrnPreprocessor(ResponsePreprocessor):
method preprocess (line 79) | def preprocess(self, row):
function _repair_ms_bench (line 93) | def _repair_ms_bench(messages: str) -> Optional[List[Dict[str, str]]]:
function _repair_agent_messages (line 115) | def _repair_agent_messages(messages: List[Dict[str, str]], use_mini: boo...
class FireflyPreprocessor (line 159) | class FireflyPreprocessor(ResponsePreprocessor):
method preprocess (line 167) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class SyntheticText2SqlPreprocessor (line 214) | class SyntheticText2SqlPreprocessor(ResponsePreprocessor):
method preprocess (line 216) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
function _repair_toolbench (line 234) | def _repair_toolbench(conversations: List[Dict[str, str]]) -> List[Dict[...
class BlossomMathPreprocessor (line 250) | class BlossomMathPreprocessor(ResponsePreprocessor):
method preprocess (line 252) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class TigerBotLawPreprocessor (line 276) | class TigerBotLawPreprocessor(ResponsePreprocessor):
method preprocess (line 278) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class LeetcodePythonPreprocessor (line 305) | class LeetcodePythonPreprocessor(ResponsePreprocessor):
method preprocess (line 307) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class StsbPreprocessor (line 325) | class StsbPreprocessor(RowPreprocessor):
method __init__ (line 327) | def __init__(self, sim_threshold: Optional[float] = None):
method preprocess (line 331) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class StsbGeneratePreprocessor (line 349) | class StsbGeneratePreprocessor(ResponsePreprocessor):
method preprocess (line 355) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class StsbRegressionPreprocessor (line 362) | class StsbRegressionPreprocessor(StsbGeneratePreprocessor):
method preprocess (line 364) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class MTEBRerankPreprocessor (line 386) | class MTEBRerankPreprocessor(RowPreprocessor):
method preprocess (line 388) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
function _repair_conversations_agent_instruct (line 417) | def _repair_conversations_agent_instruct(s: str) -> List[Dict[str, Any]]:
class MultiRoleAgentPreprocessor (line 432) | class MultiRoleAgentPreprocessor(RowPreprocessor):
method preprocess (line 434) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class CoundownTaskPreprocessor (line 495) | class CoundownTaskPreprocessor(ResponsePreprocessor):
method preprocess (line 497) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class HC3Preprocessor (line 516) | class HC3Preprocessor(ResponsePreprocessor):
method preprocess (line 523) | def preprocess(self, row):
class HC3ClsPreprocessor (line 532) | class HC3ClsPreprocessor(HC3Preprocessor):
method preprocess (line 534) | def preprocess(self, row):
class DureaderPreprocessor (line 591) | class DureaderPreprocessor(RowPreprocessor):
method preprocess (line 593) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class HHRLHFPreprocessor (line 618) | class HHRLHFPreprocessor(RowPreprocessor):
method _to_messages (line 621) | def _to_messages(data):
method preprocess (line 628) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class XlamFunctionCallingPreprocessor (line 653) | class XlamFunctionCallingPreprocessor(RowPreprocessor):
method preprocess (line 655) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class XlamFunctionCallingGRPOPreprocessor (line 663) | class XlamFunctionCallingGRPOPreprocessor(ResponsePreprocessor):
method preprocess (line 665) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class HHRLHFCNPreprocessor (line 689) | class HHRLHFCNPreprocessor(MessagesPreprocessor):
method preprocess (line 691) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
function repair_conversations (line 706) | def repair_conversations(s: Union[str, Any]) -> Any:
class EmojiPreprocessr (line 724) | class EmojiPreprocessr(ResponsePreprocessor):
method preprocess (line 726) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class GuanacoPreprocessor (line 762) | class GuanacoPreprocessor(RowPreprocessor):
method preprocess (line 764) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class FunctionCallChatmlPreprocessor (line 805) | class FunctionCallChatmlPreprocessor(MessagesPreprocessor):
method preprocess (line 807) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class Dolly15kPreprocessor (line 826) | class Dolly15kPreprocessor(RowPreprocessor):
method preprocess (line 828) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class OrpoDPOMix40kPreprocessor (line 857) | class OrpoDPOMix40kPreprocessor(MessagesPreprocessor):
method preprocess (line 859) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class SelfCognitionPreprocessor (line 882) | class SelfCognitionPreprocessor(ResponsePreprocessor):
method __init__ (line 884) | def __init__(self, *args, query_suffix: str = '', response_prefix: str...
method set_name_author (line 891) | def set_name_author(self, name, author):
method preprocess (line 895) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
FILE: swift/dataset/dataset/mllm.py
class ShareGPT4oPreprocessor (line 16) | class ShareGPT4oPreprocessor(MessagesPreprocessor):
method preprocess (line 18) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
method prepare_dataset (line 29) | def prepare_dataset(self, dataset):
class GPT4vDataset (line 51) | class GPT4vDataset(ResponsePreprocessor):
method preprocess (line 53) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class GarbagePreprocessor (line 84) | class GarbagePreprocessor(ResponsePreprocessor):
method preprocess (line 86) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class SA1BPairedCaptionPreprocessor (line 102) | class SA1BPairedCaptionPreprocessor(RowPreprocessor):
method preprocess (line 104) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class SA1BDenseCaptionPreprocessor (line 129) | class SA1BDenseCaptionPreprocessor(RowPreprocessor):
method preprocess (line 134) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class COCO2014Preprocess (line 161) | class COCO2014Preprocess(ResponsePreprocessor):
method preprocess (line 163) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class MantisPreprocessor (line 185) | class MantisPreprocessor(MessagesPreprocessor):
method __init__ (line 187) | def __init__(self, *, subset: str, columns: Optional[Dict[str, str]] =...
method prepare_dataset (line 191) | def prepare_dataset(self, dataset: HfDataset) -> HfDataset:
method preprocess (line 201) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class LLaVADataPreprocessor (line 231) | class LLaVADataPreprocessor(MessagesPreprocessor):
method prepare_dataset (line 233) | def prepare_dataset(self, dataset):
method preprocess (line 239) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class PixelProsePreprocessor (line 276) | class PixelProsePreprocessor(RowPreprocessor):
method preprocess (line 278) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class AIShell1Preprocessor (line 309) | class AIShell1Preprocessor(ResponsePreprocessor):
method preprocess (line 311) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class EmoSchemaPreprocessor (line 330) | class EmoSchemaPreprocessor(ResponsePreprocessor):
method prepare_dataset (line 332) | def prepare_dataset(self, dataset: HfDataset) -> HfDataset:
method preprocess (line 344) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class EmoSchemaClsPreprocessor (line 362) | class EmoSchemaClsPreprocessor(EmoSchemaPreprocessor):
method preprocess (line 364) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
function _generate_url_list (line 388) | def _generate_url_list(_url, _range):
class LLaVAVideo178KPreprocessor (line 395) | class LLaVAVideo178KPreprocessor(MessagesPreprocessor):
method __init__ (line 397) | def __init__(self, *, subset: str, columns: Optional[Dict[str, str]] =...
method prepare_dataset (line 448) | def prepare_dataset(self, dataset: HfDataset) -> HfDataset:
method preprocess (line 453) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class MovieChat1KPreprocessor (line 483) | class MovieChat1KPreprocessor(ResponsePreprocessor):
method prepare_dataset (line 485) | def prepare_dataset(self, dataset: HfDataset) -> HfDataset:
method preprocess (line 511) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class VideoChatGPTPreprocessor (line 531) | class VideoChatGPTPreprocessor(ResponsePreprocessor):
method prepare_dataset (line 533) | def prepare_dataset(self, dataset: HfDataset) -> HfDataset:
method preprocess (line 542) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
function preprocess_mind2web (line 567) | def preprocess_mind2web(dataset, **kwargs):
class ShareGPT4VPreprocessor (line 698) | class ShareGPT4VPreprocessor(MessagesPreprocessor):
method prepare_dataset (line 700) | def prepare_dataset(self, dataset):
method preprocess (line 715) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class TextCapsPreprocessor (line 751) | class TextCapsPreprocessor(ResponsePreprocessor):
method preprocess (line 753) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class TextCapsEmbPreprocessor (line 760) | class TextCapsEmbPreprocessor(RowPreprocessor):
method preprocess (line 762) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class TextCapsReRankPreprocessor (line 778) | class TextCapsReRankPreprocessor(RowPreprocessor):
method __init__ (line 780) | def __init__(self,
method prepare_dataset (line 791) | def prepare_dataset(self, dataset):
method preprocess (line 811) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class RefCOCOPreprocessor (line 866) | class RefCOCOPreprocessor(ResponsePreprocessor, GroundingMixin):
method __init__ (line 869) | def __init__(self, task_type, **kwargs):
method prepare_dataset (line 873) | def prepare_dataset(self, dataset):
method preprocess (line 879) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class LLaVAInstructPreprocessor (line 944) | class LLaVAInstructPreprocessor(MessagesPreprocessor):
method prepare_dataset (line 946) | def prepare_dataset(self, dataset):
method preprocess (line 952) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class LLaVAPretrainPreprocessor (line 983) | class LLaVAPretrainPreprocessor(MessagesPreprocessor):
method prepare_dataset (line 985) | def prepare_dataset(self, dataset):
method preprocess (line 997) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class OcrvqaPreprocessor (line 1041) | class OcrvqaPreprocessor(RowPreprocessor):
method preprocess (line 1043) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class ScienceQAPreprocessor (line 1067) | class ScienceQAPreprocessor(RowPreprocessor):
method preprocess (line 1069) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class GritPreprocessor (line 1086) | class GritPreprocessor(RowPreprocessor, GroundingMixin):
method __init__ (line 1088) | def __init__(self, task_type, **kwargs):
method has_overlap (line 1093) | def has_overlap(start_ends):
method replace_intervals_with_tags (line 1100) | def replace_intervals_with_tags(response, start_ends):
method preprocess (line 1110) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class GQAPreprocessor (line 1170) | class GQAPreprocessor(RowPreprocessor):
method prepare_dataset (line 1172) | def prepare_dataset(self, dataset):
method preprocess (line 1176) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class CocoPreprocessor (line 1202) | class CocoPreprocessor(ResponsePreprocessor):
method preprocess (line 1214) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class LLaVAMixSFTPreprocessor (line 1231) | class LLaVAMixSFTPreprocessor(RowPreprocessor):
method preprocess (line 1233) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class LatexocrPreprocessor (line 1260) | class LatexocrPreprocessor(ResponsePreprocessor):
method preprocess (line 1262) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class CapchaImagesPreprocessor (line 1278) | class CapchaImagesPreprocessor(ResponsePreprocessor):
method preprocess (line 1280) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class ClevrPreprocessor (line 1293) | class ClevrPreprocessor(ResponsePreprocessor):
method preprocess (line 1295) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class Voc2007MultilabelPreprocessor (line 1311) | class Voc2007MultilabelPreprocessor(ResponsePreprocessor):
method preprocess (line 1315) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class Geometry3KPreprocessor (line 1330) | class Geometry3KPreprocessor(ResponsePreprocessor):
method preprocess (line 1332) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
FILE: swift/dataset/dataset_meta.py
class SubsetDataset (line 27) | class SubsetDataset:
method __post_init__ (line 40) | def __post_init__(self):
method set_default (line 44) | def set_default(self, dataset_meta: 'DatasetMeta') -> 'SubsetDataset':
class BaseDatasetLoader (line 53) | class BaseDatasetLoader(ABC):
method load (line 56) | def load(
method download_ms_dataset (line 66) | def download_ms_dataset(ms_dataset_id: str, files: List[str], force_do...
method concat_datasets (line 96) | def concat_datasets(datasets: List[HfDataset]) -> Optional[HfDataset]:
method interleave_datasets (line 104) | def interleave_datasets(datasets, *args, **kwargs):
method shuffle_dataset (line 112) | def shuffle_dataset(dataset, seed: int, buffer_size: int = 1000):
method post_process (line 120) | def post_process(
class DatasetMeta (line 174) | class DatasetMeta:
method __post_init__ (line 193) | def __post_init__(self):
FILE: swift/dataset/dataset_syntax.py
class DatasetSyntax (line 14) | class DatasetSyntax:
method __post_init__ (line 20) | def __post_init__(self):
method get_raw (line 26) | def get_raw(self):
method _safe_split (line 32) | def _safe_split(s: str,
method parse (line 56) | def parse(cls, dataset: str) -> 'DatasetSyntax':
method get_dataset_meta (line 81) | def get_dataset_meta(self, use_hf: bool):
method _get_dataset_meta_mapping (line 92) | def _get_dataset_meta_mapping() -> Dict[Tuple[str, str], DatasetMeta]:
method get_dataset_name (line 108) | def get_dataset_name(dataset_id: str) -> str:
method _get_matched_dataset_meta (line 120) | def _get_matched_dataset_meta(self, dataset_meta_mapping):
FILE: swift/dataset/indexed_dataset.py
class IndexedDatasetBuilder (line 13) | class IndexedDatasetBuilder:
method __init__ (line 16) | def __init__(self, dataset_name: str):
method _write_worker (line 30) | def _write_worker(self):
method add_items (line 52) | def add_items(self, items: List[Any]) -> None:
method finalize (line 58) | def finalize(self):
class BinReader (line 73) | class BinReader:
method __init__ (line 75) | def __init__(self, bin_path: str):
method read_buffer (line 84) | def read_buffer(self, offset: int, size: int) -> bytes:
method __del__ (line 89) | def __del__(self):
class IndexedDataset (line 95) | class IndexedDataset(Dataset):
method get_cache_dir (line 100) | def get_cache_dir(dataset_name: str):
method __init__ (line 107) | def __init__(self, dataset_name: str):
method __getitem__ (line 122) | def __getitem__(self, index: int):
method __len__ (line 131) | def __len__(self):
FILE: swift/dataset/loader.py
class DatasetLoader (line 21) | class DatasetLoader(BaseDatasetLoader):
method __init__ (line 23) | def __init__(
method _load_dataset_path (line 43) | def _load_dataset_path(
method _load_repo_dataset (line 64) | def _load_repo_dataset(
method _select_subsets (line 132) | def _select_subsets(subsets: List[str], dataset_meta: DatasetMeta) -> ...
method load (line 151) | def load(
function init_self_cognition_preprocessor (line 179) | def init_self_cognition_preprocessor(
function load_dataset (line 206) | def load_dataset(
FILE: swift/dataset/media.py
class MediaResource (line 13) | class MediaResource:
method get_url (line 27) | def get_url(media_type):
method download (line 33) | def download(media_type_or_url: Union[str, List[str]],
method move_directory_contents (line 57) | def move_directory_contents(src_dir, dst_dir):
method _safe_download (line 74) | def _safe_download(media_type: Union[str, List[str]],
method safe_save (line 121) | def safe_save(image, file_name, folder, format='JPEG'):
FILE: swift/dataset/packing.py
function calculate_matched_group (line 16) | def calculate_matched_group(sequences, packing_length: int, is_finished:...
class PackingDataset (line 29) | class PackingDataset(Dataset):
method __init__ (line 32) | def __init__(
method create_packed_idx (line 89) | def create_packed_idx(self, rank, offset, lengths):
method __getitem__ (line 104) | def __getitem__(self, index):
method __len__ (line 109) | def __len__(self):
class IterablePackingDataset (line 113) | class IterablePackingDataset(IterableDataset):
method __init__ (line 115) | def __init__(
method _processor (line 145) | def _processor(self):
method _put_data_in_queue (line 156) | def _put_data_in_queue(self, iterator) -> int:
method _fetch_data_out_queue (line 165) | def _fetch_data_out_queue(self, last_res, num_samples):
method cyclic_iter (line 177) | def cyclic_iter(iterable):
method __iter__ (line 182) | def __iter__(self):
FILE: swift/dataset/preprocessor/core.py
class RowPreprocessor (line 25) | class RowPreprocessor:
method __init__ (line 36) | def __init__(self,
method _check_messages (line 61) | def _check_messages(row: Dict[str, Any]) -> None:
method _cast_mm_data (line 79) | def _cast_mm_data(row: Dict[str, Any]) -> None:
method _check_rejected_response (line 102) | def _check_rejected_response(row: Dict[str, Any]) -> None:
method preprocess (line 110) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
method prepare_dataset (line 113) | def prepare_dataset(self, dataset: DATASET_TYPE) -> DATASET_TYPE:
method batched_to_rows (line 117) | def batched_to_rows(batched_row: Dict[str, Any]):
method rows_to_batched (line 123) | def rows_to_batched(rows: List[Dict[str, Any]]):
method _remove_prefix_keys (line 136) | def _remove_prefix_keys(row, prefix: str):
method _check_objects (line 145) | def _check_objects(row):
method batched_preprocess (line 167) | def batched_preprocess(self, batched_row: Dict[str, Any], *, strict: b...
method get_features_dataset (line 210) | def get_features_dataset(dataset: DATASET_TYPE) -> DATASET_TYPE:
method safe_rename_columns (line 217) | def safe_rename_columns(dataset, columns):
method remove_useless_columns (line 237) | def remove_useless_columns(dataset: DATASET_TYPE) -> DATASET_TYPE:
method _patch_arrow_writer (line 247) | def _patch_arrow_writer():
method _cast_pil_image (line 284) | def _cast_pil_image(self, dataset):
method __call__ (line 291) | def __call__(
class ResponsePreprocessor (line 356) | class ResponsePreprocessor(RowPreprocessor):
method __init__ (line 359) | def __init__(self, *, columns: Optional[Dict[str, str]] = None, **kwar...
method preprocess (line 372) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class AlpacaPreprocessor (line 394) | class AlpacaPreprocessor(ResponsePreprocessor):
method concat_inst_input (line 397) | def concat_inst_input(cls, instruction, input_):
method preprocess (line 405) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
function default_repair_messages (line 415) | def default_repair_messages(s: Union[str, Any]) -> Any:
class MessagesPreprocessor (line 421) | class MessagesPreprocessor(RowPreprocessor):
method __init__ (line 423) | def __init__(
method _is_sharegpt_format (line 461) | def _is_sharegpt_format(message: Dict[str, str]) -> bool:
method sharegpt_to_messages (line 466) | def sharegpt_to_messages(self, messages: List[Dict[str, str]], system:...
method to_std_messages (line 479) | def to_std_messages(self, messages: List[Dict[str, str]], system: Opti...
method _to_std_key (line 496) | def _to_std_key(messages: List[Dict[str, str]], std_key: str, optional...
method preprocess (line 502) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class ClsPreprocessor (line 523) | class ClsPreprocessor(ResponsePreprocessor):
method preprocess (line 525) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class AutoPreprocessor (line 531) | class AutoPreprocessor:
method __init__ (line 533) | def __init__(self, *, columns: Optional[Dict[str, str]] = None, **kwar...
method _get_preprocessor (line 537) | def _get_preprocessor(self, dataset: DATASET_TYPE) -> RowPreprocessor:
method __call__ (line 546) | def __call__(
FILE: swift/dataset/preprocessor/extra.py
class GroundingMixin (line 8) | class GroundingMixin:
method construct_grounding_prompt (line 47) | def construct_grounding_prompt(self):
class TextGenerationPreprocessor (line 55) | class TextGenerationPreprocessor(ResponsePreprocessor):
method __init__ (line 57) | def __init__(self,
method preprocess (line 67) | def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
class ClsGenerationPreprocessor (line 72) | class ClsGenerationPreprocessor(ResponsePreprocessor):
method __init__ (line 74) | def __init__(self,
method preprocess (line 100) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
FILE: swift/dataset/register.py
function get_dataset_list (line 14) | def get_dataset_list():
function register_dataset (line 26) | def register_dataset(dataset_meta: DatasetMeta, *, exist_ok: bool = Fals...
function _preprocess_d_info (line 43) | def _preprocess_d_info(d_info: Dict[str, Any], *, base_dir: Optional[str...
function _register_d_info (line 72) | def _register_d_info(d_info: Dict[str, Any], *, base_dir: Optional[str] ...
function register_dataset_info (line 84) | def register_dataset_info(dataset_info: Union[str, List[str], None] = No...
FILE: swift/dataset/utils.py
function sample_dataset (line 18) | def sample_dataset(
class LazyLLMDataset (line 57) | class LazyLLMDataset(Dataset):
method __init__ (line 60) | def __init__(self,
method __getitem__ (line 85) | def __getitem__(self, idx: int) -> Dict[str, Any]:
method __len__ (line 111) | def __len__(self) -> int:
class EncodePreprocessor (line 115) | class EncodePreprocessor(RowPreprocessor):
method __init__ (line 117) | def __init__(self, template: 'Template'):
method preprocess (line 121) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
class AddLengthPreprocessor (line 125) | class AddLengthPreprocessor(EncodePreprocessor):
method preprocess (line 127) | def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
function get_temporary_cache_files_directory (line 136) | def get_temporary_cache_files_directory(prefix=None):
FILE: swift/hub/hub.py
class HubOperation (line 23) | class HubOperation:
method patch_hub (line 27) | def patch_hub(cls):
method try_login (line 31) | def try_login(cls, token: Optional[str] = None) -> bool:
method create_model_repo (line 43) | def create_model_repo(cls, repo_id: str, token: Optional[str] = None, ...
method push_to_hub (line 54) | def push_to_hub(cls,
method load_dataset (line 81) | def load_dataset(cls,
method download_model (line 102) | def download_model(cls,
class MSHub (line 124) | class MSHub(HubOperation):
method create_repo (line 128) | def create_repo(repo_id: str,
method upload_folder (line 150) | def upload_folder(
method patch_hub (line 175) | def patch_hub(cls):
method try_login (line 194) | def try_login(cls, token: Optional[str] = None) -> bool:
method create_model_repo (line 205) | def create_model_repo(cls, repo_id: str, token: Optional[str] = None, ...
method push_to_hub (line 244) | def push_to_hub(cls,
method load_dataset (line 283) | def load_dataset(cls,
method download_model (line 311) | def download_model(cls,
method add_patterns_to_file (line 325) | def add_patterns_to_file(repo,
method add_patterns_to_gitignore (line 365) | def add_patterns_to_gitignore(repo, patterns: List[str], commit_messag...
method add_patterns_to_gitattributes (line 369) | def add_patterns_to_gitattributes(repo, patterns: List[str], commit_me...
class HFHub (line 382) | class HFHub(HubOperation):
method try_login (line 385) | def try_login(cls, token: Optional[str] = None) -> bool:
method create_model_repo (line 389) | def create_model_repo(cls, repo_id: str, token: Optional[str] = None, ...
method push_to_hub (line 393) | def push_to_hub(cls,
method load_dataset (line 419) | def load_dataset(cls,
method download_model (line 442) | def download_model(cls,
function get_hub (line 459) | def get_hub(use_hf: Optional[bool] = None):
FILE: swift/infer_engine/base.py
class BaseInferEngine (line 9) | class BaseInferEngine(ABC):
method infer (line 12) | def infer(self,
method infer_async (line 39) | async def infer_async(self,
FILE: swift/infer_engine/grpo_vllm_engine.py
class GRPOVllmEngine (line 23) | class GRPOVllmEngine(VllmEngine):
method infer (line 25) | def infer(
method async_infer (line 61) | async def async_infer(self,
method _batch_infer_stream (line 85) | async def _batch_infer_stream(self,
method _create_chat_completion_response (line 107) | def _create_chat_completion_response(self, result, inputs, request_con...
method _add_adapter (line 142) | def _add_adapter(self, adapter_request: Optional[Union[AdapterRequest,...
FILE: swift/infer_engine/infer_client.py
class InferClient (line 16) | class InferClient(InferEngine):
method __init__ (line 18) | def __init__(self,
method models (line 44) | def models(self):
method get_model_list (line 53) | def get_model_list(self) -> ModelList:
method _get_request_kwargs (line 59) | def _get_request_kwargs(self) -> Dict[str, Any]:
method get_model_list_async (line 67) | async def get_model_list_async(self) -> ModelList:
method infer (line 74) | def infer(
method _prepare_request_data (line 100) | def _prepare_request_data(model: str, infer_request: InferRequest, req...
method _parse_stream_data (line 112) | def _parse_stream_data(data: bytes) -> Optional[str]:
method infer_async (line 120) | async def infer_async(
FILE: swift/infer_engine/infer_engine.py
class InferEngine (line 21) | class InferEngine(BaseInferEngine, ProcessorMixin):
method __init__ (line 23) | def __init__(self, template: Template):
method _get_template (line 37) | def _get_template(self, processor: Processor, template_type: Optional[...
method _get_stop_words (line 48) | def _get_stop_words(self, stop_words: List[Union[str, List[int], None]...
method _get_stop_token_ids (line 60) | def _get_stop_token_ids(self, stop_words: List[Union[str, List[int], N...
method async_iter_to_iter (line 79) | def async_iter_to_iter(self, async_iter, prog_bar, metrics) -> Iterator:
method batch_run (line 111) | async def batch_run(tasks):
method _batch_infer_stream (line 114) | def _batch_infer_stream(
method _get_usage_info (line 147) | def _get_usage_info(num_prompt_tokens: int, num_generated_tokens: int)...
method _update_usage_info (line 155) | def _update_usage_info(origin_use_info: UsageInfo, num_generated_token...
method _update_metrics (line 163) | def _update_metrics(result, metrics: Optional[List[Metric]] = None):
method infer (line 176) | def infer(self,
method _get_toolcall (line 190) | def _get_toolcall(self, response: str) -> Optional[List[ChatCompletion...
method _get_num_tokens (line 199) | def _get_num_tokens(inputs: Dict[str, Any]) -> int:
method set_default_max_tokens (line 210) | def set_default_max_tokens(self, request_config: RequestConfig, inputs...
method _get_logprobs (line 228) | def _get_logprobs(self,
method _get_finish_reason (line 253) | def _get_finish_reason(max_tokens: int, completion_tokens: int, is_fin...
method thread_run (line 264) | def thread_run(target, args=(), kwargs=None):
method safe_asyncio_run (line 283) | def safe_asyncio_run(coro):
method _batch_encode (line 290) | def _batch_encode(self, infer_requests: List[InferRequest], strict: bo...
method _add_error_list (line 311) | def _add_error_list(outputs, error_list):
FILE: swift/infer_engine/lmdeploy_engine.py
class LmdeployEngine (line 38) | class LmdeployEngine(InferEngine):
method __init__ (line 40) | def __init__(
method _get_processor (line 92) | def _get_processor(self):
method _prepare_engine_kwargs (line 102) | def _prepare_engine_kwargs(self, engine_kwargs):
method _patch_pipeline (line 128) | def _patch_pipeline(self):
method _prepare_engine (line 140) | def _prepare_engine(self):
method _load_generation_config (line 145) | def _load_generation_config(self):
method _add_stop_words (line 161) | def _add_stop_words(self, generation_config: LmdeployGenerationConfig,...
method _prepare_generation_config (line 168) | def _prepare_generation_config(self, request_config: RequestConfig) ->...
method _infer_stream_async (line 191) | async def _infer_stream_async(
method _infer_full_async (line 240) | async def _infer_full_async(
method infer_async (line 293) | async def infer_async(self,
method _batch_infer_stream (line 333) | def _batch_infer_stream(self, *args, **kwargs):
method infer (line 340) | def infer(
FILE: swift/infer_engine/patch.py
function patch_auto_tokenizer (line 8) | def patch_auto_tokenizer(tokenizer: PreTrainedTokenizerBase):
function patch_auto_config (line 23) | def patch_auto_config(config: PretrainedConfig):
FILE: swift/infer_engine/protocol.py
class InferRequest (line 19) | class InferRequest:
method __post_init__ (line 69) | def __post_init__(self):
method remove_response (line 77) | def remove_response(messages) -> Optional[str]:
method _to_printable (line 81) | def _to_printable(obj, key: Optional[str] = None):
method to_printable (line 96) | def to_printable(self):
class RolloutInferRequest (line 101) | class RolloutInferRequest(InferRequest):
function random_uuid (line 134) | def random_uuid() -> str:
class Model (line 139) | class Model:
class ModelList (line 148) | class ModelList:
class RequestConfig (line 154) | class RequestConfig:
method __post_init__ (line 186) | def __post_init__(self):
class CompletionRequestMixin (line 192) | class CompletionRequestMixin:
class EmbeddingRequestMixin (line 198) | class EmbeddingRequestMixin:
class ChatCompletionRequestMixin (line 205) | class ChatCompletionRequestMixin:
method __post_init__ (line 211) | def __post_init__(self):
class MultiModalRequestMixin (line 227) | class MultiModalRequestMixin:
method to_base64 (line 234) | def to_base64(mm_data: Union[str, Image.Image, bytes]) -> str:
method __post_init__ (line 253) | def __post_init__(self):
class CompletionRequest (line 264) | class CompletionRequest(RequestConfig, MultiModalRequestMixin, Completio...
method __post_init__ (line 266) | def __post_init__(self):
class EmbeddingRequest (line 272) | class EmbeddingRequest(RequestConfig, MultiModalRequestMixin, EmbeddingR...
method __post_init__ (line 274) | def __post_init__(self):
method parse (line 278) | def parse(self) -> Tuple['InferRequest', 'RequestConfig']:
class ChatCompletionRequest (line 289) | class ChatCompletionRequest(RequestConfig, MultiModalRequestMixin, ChatC...
method __post_init__ (line 291) | def __post_init__(self):
method convert_to_base64 (line 297) | def convert_to_base64(self):
method parse (line 332) | def parse(self) -> Tuple['InferRequest', 'RequestConfig']:
method from_cmpl_request (line 342) | def from_cmpl_request(cls, cmpl_request: Union[CompletionRequest, Embe...
class UsageInfo (line 355) | class UsageInfo:
class Function (line 362) | class Function:
method __post_init__ (line 366) | def __post_init__(self):
class ChatCompletionMessageToolCall (line 374) | class ChatCompletionMessageToolCall:
class ChatMessage (line 381) | class ChatMessage:
class ChatCompletionResponseChoice (line 389) | class ChatCompletionResponseChoice:
method to_cmpl_choice (line 396) | def to_cmpl_choice(self) -> 'CompletionResponseChoice':
class EmbeddingResponseData (line 403) | class EmbeddingResponseData:
class EmbeddingResponse (line 410) | class EmbeddingResponse:
class CompletionResponseChoice (line 420) | class CompletionResponseChoice:
class ChatCompletionResponse (line 428) | class ChatCompletionResponse:
method to_cmpl_response (line 438) | def to_cmpl_response(self) -> 'CompletionResponse':
class RolloutOutput (line 445) | class RolloutOutput(BaseModel):
method _wrap_flat_list (line 483) | def _wrap_flat_list(cls, v):
method model_post_init (line 488) | def model_post_init(self, __context):
method mminfo_to_serializable (line 493) | def mminfo_to_serializable(self):
class CompletionResponse (line 506) | class CompletionResponse:
class DeltaMessage (line 516) | class DeltaMessage:
class ChatCompletionResponseStreamChoice (line 524) | class ChatCompletionResponseStreamChoice:
method to_cmpl_choice (line 530) | def to_cmpl_choice(self) -> 'CompletionResponseStreamChoice':
class CompletionResponseStreamChoice (line 537) | class CompletionResponseStreamChoice:
class ChatCompletionStreamResponse (line 545) | class ChatCompletionStreamResponse:
method to_cmpl_response (line 553) | def to_cmpl_response(self) -> 'CompletionStreamResponse':
class CompletionStreamResponse (line 561) | class CompletionStreamResponse:
class InitCommunicatorRequest (line 570) | class InitCommunicatorRequest(BaseModel):
class UpdateWeightsRequest (line 576) | class UpdateWeightsRequest(BaseModel):
FILE: swift/infer_engine/sglang_engine.py
class SglangEngine (line 27) | class SglangEngine(InferEngine):
method __init__ (line 29) | def __init__(
method _get_processor (line 103) | def _get_processor(self):
method _prepare_server_args (line 114) | def _prepare_server_args(self, engine_kwargs):
method _load_generation_config (line 152) | def _load_generation_config(self) -> None:
method _prepare_generation_config (line 166) | def _prepare_generation_config(self, request_config: RequestConfig) ->...
method _add_stop_words (line 179) | def _add_stop_words(self, generation_config: Dict[str, Any], request_c...
method _create_chat_completion_response (line 184) | def _create_chat_completion_response(self, output, inputs, return_deta...
method infer (line 212) | def infer(
method infer_async (line 222) | async def infer_async(self,
method _infer_embedding_async (line 249) | async def _infer_embedding_async(self, inputs: Dict[str, Any], **kwarg...
method _infer_full_async (line 262) | async def _infer_full_async(self, inputs: Dict[str, Any], generation_c...
method _infer_stream_async (line 269) | async def _infer_stream_async(self, inputs: Dict[str, Any], generation...
method _create_chat_completion_stream_response (line 281) | def _create_chat_completion_stream_response(self, output, infer_stream...
FILE: swift/infer_engine/transformers_engine.py
class _GenerationConfig (line 32) | class _GenerationConfig(GenerationConfig):
method __repr__ (line 34) | def __repr__(self) -> str:
class TransformersEngine (line 44) | class TransformersEngine(InferEngine):
method __init__ (line 46) | def __init__(
method _get_model_processor (line 104) | def _get_model_processor(self, model_id_or_path, **kwargs):
method _start_infer_worker (line 120) | def _start_infer_worker(self):
method _fetch_infer_requests (line 124) | def _fetch_infer_requests(self):
method _infer_worker (line 147) | def _infer_worker(self):
method _add_adapter (line 169) | def _add_adapter(self, adapter_path: str, adapter_name: Optional[str] ...
method _prepare_generation_config (line 172) | def _prepare_generation_config(self, request_config: RequestConfig) ->...
method _add_stop_words (line 180) | def _add_stop_words(self, generation_config: _GenerationConfig, reques...
method preprocess_logits (line 186) | def preprocess_logits(batched_logits: Optional[List[torch.Tensor]], ba...
method _update_batched_logprobs (line 205) | def _update_batched_logprobs(batched_logprobs: List[torch.Tensor], log...
method _infer_stream (line 218) | def _infer_stream(self, inputs: Dict[str, Any], *, generation_config: ...
method _get_adapter_names (line 318) | def _get_adapter_names(self, adapter_request: Optional[AdapterRequest]...
method _infer_forward (line 329) | def _infer_forward(self, inputs: Dict[str, Any], adapter_request: Opti...
method _infer_full (line 387) | def _infer_full(self, inputs: Dict[str, Any], *, generation_config: Ge...
method infer_async (line 455) | async def infer_async(
method _infer (line 490) | def _infer(
method infer (line 551) | def infer(
FILE: swift/infer_engine/utils.py
class AdapterRequest (line 24) | class AdapterRequest:
class InferTools (line 29) | class InferTools:
method _is_chinese_char (line 32) | def _is_chinese_char(cp: int) -> bool:
class InferStreamer (line 43) | class InferStreamer(InferTools):
method __init__ (line 45) | def __init__(self, template, **decode_kwargs):
method _align_blank_suffix (line 55) | def _align_blank_suffix(self, response: str) -> str:
method _get_response (line 66) | def _get_response(self, response: str, is_finished: bool, token_len: i...
method get_printable_text (line 87) | def get_printable_text(self, raw_tokens: List[int], is_finished: bool)...
class StreamerMixin (line 97) | class StreamerMixin:
method __init__ (line 99) | def __init__(self):
method __iter__ (line 102) | def __iter__(self):
method __next__ (line 105) | def __next__(self) -> torch.Tensor:
class TokensIteratorStreamer (line 113) | class TokensIteratorStreamer(StreamerMixin, BaseStreamer):
method put (line 115) | def put(self, value: torch.Tensor) -> None:
method end (line 118) | def end(self) -> None:
class LogitsStreamer (line 122) | class LogitsStreamer(LogitsProcessor):
method __init__ (line 124) | def __init__(self):
method __call__ (line 127) | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTen...
function _set_generation_config_default_value (line 132) | def _set_generation_config_default_value(model_generation_config: Genera...
function prepare_generation_config (line 143) | def prepare_generation_config(model_generation_config: Optional[Generati...
function patch_lmdeploy (line 177) | def patch_lmdeploy(load_weights=False):
function patch_npu_vllm (line 359) | def patch_npu_vllm(vllm_device: str):
function patch_vllm_memory_leak (line 377) | def patch_vllm_memory_leak():
FILE: swift/infer_engine/vllm_engine.py
class VllmEngine (line 51) | class VllmEngine(InferEngine):
method __init__ (line 53) | def __init__(
method _get_processor (line 168) | def _get_processor(self):
method _prepare_engine (line 180) | def _prepare_engine(self) -> None:
method _prepare_engine_kwargs (line 186) | def _prepare_engine_kwargs(self, max_model_len, engine_kwargs) -> None:
method _prepare_reasoning_parser (line 257) | def _prepare_reasoning_parser(self, reasoning_parser: Optional[str]) -...
method _fix_vllm_bug (line 275) | def _fix_vllm_bug(self) -> None:
method _load_generation_config (line 291) | def _load_generation_config(self) -> None:
method _add_stop_words (line 310) | def _add_stop_words(self, generation_config: SamplingParams, request_c...
method _version_ge (line 318) | def _version_ge(base_version: str):
method _add_adapter (line 324) | def _add_adapter(self, adapter_request: Optional[AdapterRequest] = None):
method _add_request (line 337) | def _add_request(self,
method _get_logprobs (line 397) | def _get_logprobs(self,
method _prepare_generation_config (line 410) | def _prepare_generation_config(self, request_config: RequestConfig) ->...
method inner_model (line 459) | def inner_model(self):
method inner_model_executor (line 463) | def inner_model_executor(self):
method _infer_stream_async (line 466) | async def _infer_stream_async(
method _create_chat_completion_stream_response (line 484) | def _create_chat_completion_stream_response(self, result, request_conf...
method _create_embedding_response (line 554) | def _create_embedding_response(self, result, generation_config, reques...
method _create_chat_completion_response (line 561) | def _create_chat_completion_response(
method _create_seq_cls_response (line 616) | def _create_seq_cls_response(
method _infer_full_async (line 653) | async def _infer_full_async(
method _batch_infer_stream (line 674) | def _batch_infer_stream(self, *args, **kwargs):
method infer (line 679) | def infer(
method infer_async (line 758) | async def infer_async(
method patch_remove_log (line 793) | def patch_remove_log():
FILE: swift/loss/base.py
class BaseLoss (line 10) | class BaseLoss(ABC):
method __init__ (line 23) | def __init__(self, args: 'TrainingArguments', trainer: 'Trainer'):
method __call__ (line 36) | def __call__(self, outputs, labels, *, num_items_in_batch=None, loss_s...
FILE: swift/loss/causal_lm.py
class CustomCrossEntropyLoss (line 5) | class CustomCrossEntropyLoss(BaseLoss):
method __call__ (line 7) | def __call__(self, outputs, labels, *, num_items_in_batch=None, loss_s...
FILE: swift/loss/embedding.py
class SiameseDistanceMetric (line 19) | class SiameseDistanceMetric(Enum):
function _parse_pair_sentence (line 27) | def _parse_pair_sentence(outputs):
class CosineSimilarityLoss (line 45) | class CosineSimilarityLoss(BaseLoss):
method __call__ (line 47) | def __call__(self, outputs, labels, **kwargs) -> torch.Tensor:
class ContrastiveLoss (line 56) | class ContrastiveLoss(BaseLoss):
method __call__ (line 58) | def __call__(self, outputs, labels, **kwargs) -> torch.Tensor:
class OnlineContrastiveLoss (line 68) | class OnlineContrastiveLoss(BaseLoss):
method __call__ (line 70) | def __call__(self, outputs, labels, **kwargs) -> torch.Tensor:
function _parse_multi_negative_sentences (line 88) | def _parse_multi_negative_sentences(sentences, labels, hard_negatives=No...
class InfonceLoss (line 113) | class InfonceLoss(BaseLoss):
method __call__ (line 115) | def __call__(self, outputs, labels, **kwargs) -> torch.Tensor:
FILE: swift/loss/reranker.py
class PointwiseRerankerLoss (line 12) | class PointwiseRerankerLoss(BaseLoss):
method __call__ (line 14) | def __call__(self, outputs, labels, **kwargs) -> torch.Tensor:
class ListwiseRerankerLoss (line 23) | class ListwiseRerankerLoss(BaseLoss):
method __call__ (line 25) | def __call__(self, outputs, labels, **kwargs):
FILE: swift/loss_scale/agent.py
class AgentFlanLossScale (line 9) | class AgentFlanLossScale(ConfigLossScale):
method get_loss_scale (line 13) | def get_loss_scale(self, context: str, *, query: Optional[str] = None):
class REACTLossScale (line 19) | class REACTLossScale(ConfigLossScale):
class QwenLossScale (line 23) | class QwenLossScale(ConfigLossScale):
class HermesLossScale (line 27) | class HermesLossScale(ConfigLossScale):
class AlphaUmiLossScale (line 31) | class AlphaUmiLossScale(ConfigLossScale):
FILE: swift/loss_scale/base.py
class LossScale (line 12) | class LossScale:
method __init__ (line 34) | def __init__(self, base_strategy: Literal['default', 'last_round', 'al...
method get_loss_scale (line 48) | def get_loss_scale(self, context: str, **kwargs) -> Tuple[List[str], L...
method __call__ (line 66) | def __call__(self, context_list: List[str], context_types: List[Contex...
method is_loss_scale_binary (line 120) | def is_loss_scale_binary(self):
class ConfigLossScale (line 125) | class ConfigLossScale(LossScale):
method __init__ (line 141) | def __init__(self, base_strategy: Literal['default', 'last_round', 'al...
method is_loss_scale_binary (line 160) | def is_loss_scale_binary(self):
method get_loss_scale (line 167) | def get_loss_scale(self, context: str, *, query: Optional[str] = None):
FILE: swift/loss_scale/mapping.py
function get_loss_scale (line 19) | def get_loss_scale(loss_scale: str) -> LossScale:
FILE: swift/loss_scale/other.py
class IgnoreEmptyThinkLossScale (line 5) | class IgnoreEmptyThinkLossScale(ConfigLossScale):
FILE: swift/loss_scale/utils.py
function calculate_loss_scale (line 7) | def calculate_loss_scale(query: str,
FILE: swift/megatron/arguments/export_args.py
class MegatronExportArguments (line 14) | class MegatronExportArguments(MegatronBaseArguments):
method _init_output_dir (line 22) | def _init_output_dir(self):
method _init_megatron_args (line 37) | def _init_megatron_args(self):
method _init_convert (line 55) | def _init_convert(self):
FILE: swift/megatron/arguments/megatron_args.py
class RLHFMegatronArgumentsMixin (line 25) | class RLHFMegatronArgumentsMixin:
method _init_kto (line 182) | def _init_kto(self):
method __post_init__ (line 189) | def __post_init__(self):
method _init_grpo (line 235) | def _init_grpo(self):
class MegatronTunerMixin (line 322) | class MegatronTunerMixin:
method __post_init__ (line 346) | def __post_init__(self):
class MegatronArguments (line 354) | class MegatronArguments(RLHFMegatronArgumentsMixin, MegatronTunerMixin):
method load_args_config (line 560) | def load_args_config(ckpt_dir: Optional[str]) -> Dict[str, Any]:
method _set_default (line 579) | def _set_default(self):
method _init_mixed_precision (line 594) | def _init_mixed_precision(self):
method __post_init__ (line 601) | def __post_init__(self):
method _init_teacher_model (line 715) | def _init_teacher_model(self):
method _init_vpp_size (line 726) | def _init_vpp_size(self):
method _load_adapter_config (line 746) | def _load_adapter_config(self):
method init_iters (line 765) | def init_iters(self, train_dataset, val_dataset):
method _init_multimodal_full (line 804) | def _init_multimodal_full(self):
method _map_dtype (line 824) | def _map_dtype(self):
method _init_weigh_decay (line 843) | def _init_weigh_decay(self):
FILE: swift/megatron/arguments/megatron_base_args.py
class MegatronBaseArguments (line 13) | class MegatronBaseArguments(MegatronArguments, BaseArguments):
method _init_megatron_args (line 15) | def _init_megatron_args(self):
method __post_init__ (line 18) | def __post_init__(self):
FILE: swift/megatron/arguments/pretrain_args.py
class MegatronPretrainArguments (line 4) | class MegatronPretrainArguments(MegatronSftArguments):
FILE: swift/megatron/arguments/rlhf_args.py
class MegatronRLHFArguments (line 9) | class MegatronRLHFArguments(MegatronSftArguments):
method __post_init__ (line 16) | def __post_init__(self):
method _init_truncation_strategy (line 23) | def _init_truncation_strategy(self):
FILE: swift/megatron/arguments/sft_args.py
class MegatronSftArguments (line 13) | class MegatronSftArguments(MegatronBaseArguments):
method _init_output_dir (line 18) | def _init_output_dir(self):
method _init_ckpt_dir (line 28) | def _init_ckpt_dir(self, adapters=None):
method _init_megatron_args (line 38) | def _init_megatron_args(self):
method __post_init__ (line 42) | def __post_init__(self):
FILE: swift/megatron/callbacks/base.py
class MegatronCallback (line 8) | class MegatronCallback:
method __init__ (line 10) | def __init__(self, trainer: 'BaseMegatronTrainer'):
method on_train_begin (line 15) | def on_train_begin(self):
method on_train_end (line 18) | def on_train_end(self):
method on_step_begin (line 21) | def on_step_begin(self):
method on_step_end (line 24) | def on_step_end(self):
method on_log (line 27) | def on_log(self, logs):
method on_eval_begin (line 30) | def on_eval_begin(self):
method on_eval_end (line 33) | def on_eval_end(self):
method on_eval_step (line 36) | def on_eval_step(self):
method on_save (line 39) | def on_save(self, output_dir):
FILE: swift/megatron/callbacks/default_flow.py
class DefaultFlowCallback (line 7) | class DefaultFlowCallback(MegatronCallback):
method on_train_begin (line 9) | def on_train_begin(self):
method on_step_end (line 15) | def on_step_end(self):
method on_eval_begin (line 35) | def on_eval_begin(self):
method on_eval_end (line 40) | def on_eval_end(self):
FILE: swift/megatron/callbacks/print.py
class PrintCallback (line 14) | class PrintCallback(MegatronCallback):
method __init__ (line 16) | def __init__(self, trainer):
method on_train_begin (line 23) | def on_train_begin(self):
method on_train_end (line 34) | def on_train_end(self):
method on_step_end (line 38) | def on_step_end(self):
method on_eval_begin (line 43) | def on_eval_begin(self):
method on_eval_end (line 47) | def on_eval_end(self):
method on_eval_step (line 51) | def on_eval_step(self):
method on_log (line 54) | def on_log(self, logs):
FILE: swift/megatron/callbacks/swanlab.py
class SwanlabCallback (line 9) | class SwanlabCallback(MegatronCallback):
method __init__ (line 11) | def __init__(self, trainer):
method setup (line 21) | def setup(self):
method on_log (line 32) | def on_log(self, logs):
FILE: swift/megatron/callbacks/tensorboard.py
class TensorboardCallback (line 7) | class TensorboardCallback(MegatronCallback):
method __init__ (line 9) | def __init__(self, trainer):
method on_log (line 23) | def on_log(self, logs):
method on_train_end (line 29) | def on_train_end(self):
FILE: swift/megatron/callbacks/utils.py
function rewrite_logs (line 4) | def rewrite_logs(logs):
FILE: swift/megatron/callbacks/wandb.py
class WandbCallback (line 9) | class WandbCallback(MegatronCallback):
method __init__ (line 11) | def __init__(self, trainer):
method setup (line 21) | def setup(self):
method on_log (line 28) | def on_log(self, logs):
FILE: swift/megatron/convert.py
function convert_hf2mcore (line 31) | def convert_hf2mcore(args: ExportArguments) -> None:
function convert_mcore2hf (line 65) | def convert_mcore2hf(args: ExportArguments) -> None:
FILE: swift/megatron/init.py
function _patch_transformer_engine (line 26) | def _patch_transformer_engine():
function _patch__batched_p2p_ops (line 45) | def _patch__batched_p2p_ops():
function _patch_mla_attention (line 57) | def _patch_mla_attention():
function _patch_peft_BaseTuner (line 357) | def _patch_peft_BaseTuner():
function _patch_TEGroupedLinear (line 375) | def _patch_TEGroupedLinear():
function _patch_mtp (line 389) | def _patch_mtp():
function _patch_peft_ModulesToSaveWrapper (line 487) | def _patch_peft_ModulesToSaveWrapper():
function _patch_TransformerLayer (line 525) | def _patch_TransformerLayer():
function _patch_compile_helpers (line 557) | def _patch_compile_helpers():
function _patch_flash_attn (line 568) | def _patch_flash_attn():
function _patch_torch_FileSystemReader (line 575) | def _patch_torch_FileSystemReader():
function _patch_validate_non_overlapping_shards_metadata (line 618) | def _patch_validate_non_overlapping_shards_metadata():
function _patch_TELinear (line 636) | def _patch_TELinear():
function _patch__write_item (line 646) | def _patch__write_item():
function _patch_mrope (line 665) | def _patch_mrope():
function _patch_unified_memory (line 778) | def _patch_unified_memory():
function _patch_dsa (line 804) | def _patch_dsa():
function init_megatron_env (line 955) | def init_megatron_env():
FILE: swift/megatron/model/constant.py
class LLMMegatronModelType (line 2) | class LLMMegatronModelType:
class MLLMMegatronModelType (line 12) | class MLLMMegatronModelType:
class MegatronModelType (line 29) | class MegatronModelType(LLMMegatronModelType, MLLMMegatronModelType):
FILE: swift/megatron/model/gpt_bridge.py
class GPTBridge (line 34) | class GPTBridge:
method __init__ (line 44) | def __init__(self, args, attr_prefix: Literal['', 'teacher_'] = ''):
method get_hf_mlp_prefix (line 115) | def get_hf_mlp_prefix(self, layer_idx):
method _get_hf_mlp (line 121) | def _get_hf_mlp(self, layer_idx):
method _init_meta_hf_model (line 124) | def _init_meta_hf_model(self):
method _get_tp_split_dim (line 129) | def _get_tp_split_dim(self, mg_key: Optional[str]) -> Optional[int]:
method _split_tp (line 180) | def _split_tp(self, hf_weight, tp_dim, is_expert, is_embedding: bool):
method _set_weight (line 196) | def _set_weight(
method _copy_scale_inv (line 240) | def _copy_scale_inv(tensor, scale_inv):
method fp8_quantizer (line 251) | def fp8_quantizer(self):
method _is_fp8_param (line 259) | def _is_fp8_param(param):
method _set_module (line 266) | def _set_module(self, mg_module, hf_state_dict, hf_prefix: str, to_mco...
method _all_gather_tp (line 326) | def _all_gather_tp(self, tensor, tp_dim, is_expert):
method _broadcast_ep_pp (line 353) | def _broadcast_ep_pp(self, tensor, is_expert):
method _get_weight (line 380) | def _get_weight(
method _set_state_dict (line 440) | def _set_state_dict(self,
method _remove_prefix (line 517) | def _remove_prefix(state_dict, prefix: str):
method _add_prefix (line 523) | def _add_prefix(state_dict, prefix: str):
method _filter_prefix (line 529) | def _filter_prefix(state_dict, prefix: str):
method _is_moe (line 535) | def _is_moe(state_dict):
method _set_attn_state (line 541) | def _set_attn_state(self, mg_attn, hf_state_dict, hf_prefix: str, laye...
method _set_qk_layernorm (line 667) | def _set_qk_layernorm(self, mg_attn, hf_attn, hf_state_dict, to_mcore):
method get_e_score_correction_bias_key (line 673) | def get_e_score_correction_bias_key(self, hf_mlp):
method _set_moe_state (line 680) | def _set_moe_state(
method _get_hf_grouped (line 746) | def _get_hf_grouped(self):
method _get_transpose (line 755) | def _get_transpose(self):
method _set_mlp_state (line 761) | def _set_mlp_state(
method _set_indexer (line 1270) | def _set_indexer(self, mg_indexer, hf_state_dict, hf_prefix: str, to_m...
method _set_linear_attn_state (line 1286) | def _set_linear_attn_state(self, mg_attn, hf_state_dict, hf_prefix: st...
method _set_mla_attn_state (line 1412) | def _set_mla_attn_state(
method _set_layer_attn (line 1452) | def _set_layer_attn(self, mg_layer, hf_state_dict, layer_idx: int, to_...
method _set_layer_mlp (line 1463) | def _set_layer_mlp(self, mg_layer, hf_state_dict, layer_idx: int, to_m...
method _set_layer_state (line 1478) | def _set_layer_state(self, mg_layer, hf_state_dict, hf_prefix: str, la...
method _convert_pre_process (line 1492) | def _convert_pre_process(self, mg_model, hf_state_dict, hf_prefix: str...
method _convert_post_process (line 1509) | def _convert_post_process(self, mg_model, hf_state_dict, hf_prefix: st...
method _convert_hf_state_dict (line 1532) | def _convert_hf_state_dict(self, hf_state_dict, to_mcore):
method _convert (line 1544) | def _convert(self, mg_models, hf_state_dict, hf_prefix: str, to_mcore:...
method _convert_mtp_extra (line 1621) | def _convert_mtp_extra(self, mtp_layer, hf_state_dict, to_mcore, origi...
method _convert_mtp_layer (line 1626) | def _convert_mtp_layer(self, lm_model, hf_state_dict, hf_prefix: str, ...
method load_weights (line 1663) | def load_weights(self, mg_models, hf_model_dir: str, is_peft_format: b...
method export_weights (line 1686) | def export_weights(self,
method save_weights (line 1725) | def save_weights(self,
class MultimodalGPTBridge (line 1818) | class MultimodalGPTBridge(GPTBridge):
FILE: swift/megatron/model/gpt_model.py
class OutputLayerLinear (line 36) | class OutputLayerLinear(TELinear):
method forward (line 38) | def forward(self, hidden_states, *args, **kwargs):
method sharded_state_dict (line 41) | def sharded_state_dict(
class GPTModel (line 55) | class GPTModel(McoreGPTModel):
method __init__ (line 58) | def __init__(
method _patch_apply_rotary_pos_emb (line 151) | def _patch_apply_rotary_pos_emb(self):
method _preprocess (line 191) | def _preprocess(
method forward (line 278) | def forward(
method _postprocess (line 355) | def _postprocess(
method get_input_tensor (line 515) | def get_input_tensor(self):
FILE: swift/megatron/model/gpts/glm4.py
class Glm4SelfAttention (line 22) | class Glm4SelfAttention(SelfAttention):
method __init__ (line 24) | def __init__(
method forward (line 38) | def forward(self, hidden_states, *args, **kwargs):
class Glm4MLP (line 45) | class Glm4MLP(MLP):
method __init__ (line 47) | def __init__(
method forward (line 61) | def forward(self, hidden_states, *args, **kwargs):
method sharded_state_dict (line 67) | def sharded_state_dict(self,
class Glm4Bridge (line 84) | class Glm4Bridge(GPTBridge):
method _set_layer_attn (line 86) | def _set_layer_attn(self, mg_layer, hf_state_dict, layer_idx: int, to_...
class Glm4Loader (line 95) | class Glm4Loader(MegatronModelLoader):
method get_transformer_layer_spec (line 97) | def get_transformer_layer_spec(self, vp_stage: Optional[int] = None):
FILE: swift/megatron/model/gpts/minimax_m2.py
class MinimaxM2SelfAttention (line 22) | class MinimaxM2SelfAttention(SelfAttention):
method __init__ (line 24) | def __init__(
method get_query_key_value_tensors (line 51) | def get_query_key_value_tensors(self, *_args, **kwargs):
class MinimaxM2Bridge (line 69) | class MinimaxM2Bridge(GPTBridge):
method _set_qk_layernorm (line 71) | def _set_qk_layernorm(self, mg_attn, hf_attn, hf_state_dict, to_mcore):
method get_hf_mlp_prefix (line 75) | def get_hf_mlp_prefix(self, layer_idx):
method get_e_score_correction_bias_key (line 78) | def get_e_score_correction_bias_key(self, hf_mlp):
method _set_moe_state (line 81) | def _set_moe_state(
class MinimaxM2Loader (line 103) | class MinimaxM2Loader(MegatronModelLoader):
method get_transformer_layer_spec (line 105) | def get_transformer_layer_spec(self, vp_stage: Optional[int] = None):
FILE: swift/megatron/model/gpts/olmoe.py
class OLMoESelfAttention (line 25) | class OLMoESelfAttention(SelfAttentionBase):
method __init__ (line 27) | def __init__(self, config: MegatronModelConfig, submodules: SelfAttent...
method get_query_key_value_tensors (line 42) | def get_query_key_value_tensors(self, hidden_states, key_value_states=...
function get_olmoe_decoder_block_spec (line 75) | def get_olmoe_decoder_block_spec(
class OLMoEBridge (line 113) | class OLMoEBridge(GPTBridge):
method _set_attn_state (line 115) | def _set_attn_state(self, mg_attn, hf_state_dict, hf_prefix: str, laye...
class OlMoELoader (line 218) | class OlMoELoader(MegatronModelLoader):
method get_transformer_layer_spec (line 220) | def get_transformer_layer_spec(self, vp_stage: Optional[int] = None):
FILE: swift/megatron/model/gpts/qwen3_emb.py
class Qwen3EmbBridge (line 8) | class Qwen3EmbBridge(GPTBridge):
method _convert_hf_state_dict (line 10) | def _convert_hf_state_dict(self, hf_state_dict, to_mcore):
FILE: swift/megatron/model/gpts/qwen3_next.py
class Qwen3NextRMSNorm (line 62) | class Qwen3NextRMSNorm(torch.nn.Module):
method __init__ (line 71) | def __init__(self, config: MegatronModelConfig, hidden_size: int, eps:...
method _norm (line 78) | def _norm(self, x):
method forward (line 81) | def forward(self, hidden_states):
class Qwen3NextSelfAttention (line 89) | class Qwen3NextSelfAttention(SelfAttention):
method __init__ (line 91) | def __init__(self, config: MegatronModelConfig, submodules: SelfAttent...
method forward (line 133) | def forward(
method get_query_key_value_tensors (line 385) | def get_query_key_value_tensors(self, hidden_states, key_value_states=...
class Qwen3NextGatedDeltaNet (line 457) | class Qwen3NextGatedDeltaNet(_HuggingFaceModule, _Qwen3NextGatedDeltaNet):
method __init__ (line 459) | def __init__(self, config: MegatronModelConfig, submodules: SelfAttent...
method forward (line 467) | def forward(self, hidden_states: torch.Tensor, **kwargs):
class Qwen3NextBridge (line 504) | class Qwen3NextBridge(GPTBridge):
method _set_layer_attn (line 510) | def _set_layer_attn(self, mg_layer, hf_state_dict, layer_idx: int, to_...
method _set_layer_mlp (line 520) | def _set_layer_mlp(self, mg_layer, hf_state_dict, layer_idx: int, to_m...
method _convert_mtp_extra (line 531) | def _convert_mtp_extra(self, mtp_layer, hf_state_dict, to_mcore, origi...
class Qwen3NextLoader (line 541) | class Qwen3NextLoader(MegatronModelLoader):
method get_transformer_layer_spec (line 544) | def get_transformer_layer_spec(self, vp_stage: Optional[int] = None):
method get_mtp_block_spec (line 590) | def get_mtp_block_spec(self, *args, **kwargs):
FILE: swift/megatron/model/mm_gpt_model.py
class MultimodalGPTModel (line 20) | class MultimodalGPTModel(MegatronModule):
method __init__ (line 22) | def __init__(self,
method _patch_word_embeddings (line 47) | def _patch_word_embeddings(self, kwargs):
method forward (line 81) | def forward(
method set_input_tensor (line 114) | def set_input_tensor(self, input_tensor: torch.Tensor) -> None:
method get_input_tensor (line 117) | def get_input_tensor(self):
method shared_embedding_or_output_weight (line 120) | def shared_embedding_or_output_weight(self) -> torch.Tensor:
FILE: swift/megatron/model/mm_gpts/glm.py
class Glm4vVit (line 11) | class Glm4vVit(HuggingFaceModule):
method __init__ (line 16) | def __init__(self, config):
method get_inputs_embeds (line 20) | def get_inputs_embeds(self, inputs_embeds, **kwargs):
class Glm4vBridge (line 35) | class Glm4vBridge(Glm4Bridge, MultimodalGPTBridge):
FILE: swift/megatron/model/mm_gpts/internvl.py
class Internvl3Bridge (line 11) | class Internvl3Bridge(GPTBridge):
method _init_meta_hf_model (line 18) | def _init_meta_hf_model(self):
class Internvl3Vit (line 25) | class Internvl3Vit(HuggingFaceModule):
method __init__ (line 30) | def __init__(self, config):
method get_inputs_embeds (line 46) | def get_inputs_embeds(self, inputs_embeds, **kwargs):
class InternvlHfBridge (line 75) | class InternvlHfBridge(MultimodalGPTBridge):
class InternvlHfVit (line 84) | class InternvlHfVit(HuggingFaceModule):
method __init__ (line 89) | def __init__(self, config):
method get_inputs_embeds (line 105) | def get_inputs_embeds(self, inputs_embeds, **kwargs):
FILE: swift/megatron/model/mm_gpts/kimi_vl.py
class KimiVLBridge (line 13) | class KimiVLBridge(MultimodalGPTBridge):
class KimiVLVit (line 21) | class KimiVLVit(HuggingFaceModule):
method __init__ (line 26) | def __init__(self, config):
method get_inputs_embeds (line 31) | def get_inputs_embeds(self, inputs_embeds, **kwargs):
FILE: swift/megatron/model/mm_gpts/llama4.py
class Llama4Vit (line 19) | class Llama4Vit(HuggingFaceModule):
method __init__ (line 24) | def __init__(self, config):
method get_inputs_embeds (line 28) | def get_inputs_embeds(self, inputs_embeds, **kwargs):
class Llama4Bridge (line 51) | class Llama4Bridge(GPTBridge):
class Llama4Loader (line 59) | class Llama4Loader(MegatronModelLoader):
method get_transformer_layer_spec (line 61) | def get_transformer_layer_spec(self, vp_stage: Optional[int] = None):
FILE: swift/megatron/model/mm_gpts/qwen.py
class Qwen2_5VL_Vit (line 14) | class Qwen2_5VL_Vit(HuggingFaceModule):
method __init__ (line 20) | def __init__(self, config):
method get_inputs_embeds (line 35) | def get_inputs_embeds(self, inputs_embeds, **kwargs):
class Qwen2_5VLBridge (line 39) | class Qwen2_5VLBridge(MultimodalGPTBridge):
class Qwen2VL_Vit (line 60) | class Qwen2VL_Vit(Qwen2_5VL_Vit):
class Qwen2_5OmniBridge (line 75) | class Qwen2_5OmniBridge(GPTBridge):
class Qwen2_5Omni_Vit (line 83) | class Qwen2_5Omni_Vit(HuggingFaceModule):
method __init__ (line 89) | def __init__(self, config):
method prepare_model (line 93) | def prepare_model(self, hf_model):
method get_inputs_embeds (line 97) | def get_inputs_embeds(self, inputs_embeds, **kwargs):
class Ovis2_5Bridge (line 138) | class Ovis2_5Bridge(GPTBridge):
class Ovis2_5Vit (line 146) | class Ovis2_5Vit(HuggingFaceModule):
method __init__ (line 151) | def __init__(self, config):
method get_inputs_embeds (line 157) | def get_inputs_embeds(self, inputs_embeds, **kwargs):
FILE: swift/megatron/model/mm_gpts/qwen3_5.py
class Qwen3_5MoeGatedDeltaNet (line 24) | class Qwen3_5MoeGatedDeltaNet(_HuggingFaceModule, _Qwen3_5MoeGatedDeltaN...
method __init__ (line 26) | def __init__(self, config: TransformerConfig, submodules: SelfAttentio...
method forward (line 37) | def forward(self, hidden_states: torch.Tensor, **kwargs):
class Qwen3_5Vit (line 74) | class Qwen3_5Vit(HuggingFaceModule):
method __init__ (line 79) | def __init__(self, config):
method get_inputs_embeds (line 84) | def get_inputs_embeds(self, inputs_embeds, **kwargs):
class Qwen3_5Bridge (line 88) | class Qwen3_5Bridge(Qwen3NextBridge):
class Qwen3_5Loader (line 94) | class Qwen3_5Loader(Qwen3NextLoader):
FILE: swift/megatron/model/mm_gpts/qwen3_5_gdn.py
class Qwen3_5Bridge (line 14) | class Qwen3_5Bridge(MultimodalGPTBridge):
method _set_layer_attn (line 17) | def _set_layer_attn(self, mg_layer, hf_state_dict, layer_idx: int, to_...
method _convert_mtp_extra (line 31) | def _convert_mtp_extra(self, mtp_layer, hf_state_dict, to_mcore, origi...
class Qwen3_5Loader (line 35) | class Qwen3_5Loader(MegatronModelLoader):
method get_transformer_layer_spec (line 37) | def get_transformer_layer_spec(self, vp_stage: Optional[int] = None):
method build_model (line 49) | def build_model(
FILE: swift/megatron/model/mm_gpts/qwen3_vl.py
class Qwen3Omni_Vit (line 35) | class Qwen3Omni_Vit(HuggingFaceModule):
method __init__ (line 43) | def __init__(self, config):
method prepare_model (line 47) | def prepare_model(self, hf_model):
method _get_inputs_embeds (line 51) | def _get_inputs_embeds(self, inputs_embeds, inputs, visual, processor,...
method get_inputs_embeds (line 154) | def get_inputs_embeds(self, inputs_embeds, **kwargs):
class Qwen3VLTransformerBlock (line 185) | class Qwen3VLTransformerBlock(gpt_model.TransformerBlock):
method _checkpointed_forward (line 188) | def _checkpointed_forward(
method forward (line 298) | def forward(
method _deepstack_process (line 449) | def _deepstack_process(self, hidden_states: torch.Tensor, visual_pos_m...
class Qwen3OmniBridge (line 460) | class Qwen3OmniBridge(GPTBridge):
class Qwen3VL_Vit (line 468) | class Qwen3VL_Vit(HuggingFaceModule):
method __init__ (line 473) | def __init__(self, config):
method get_inputs_embeds (line 478) | def get_inputs_embeds(self, inputs_embeds, **kwargs):
class Qwen3VLLoader (line 483) | class Qwen3VLLoader(MegatronModelLoader):
method _patch_transformer_block (line 485) | def _patch_transformer_block(self):
method __init__ (line 491) | def __init__(self, args, hf_config):
FILE: swift/megatron/model/mm_gpts/utils.py
function patch_hf_initialize_weight (line 14) | def patch_hf_initialize_weight():
function patch_device_map_meta (line 29) | def patch_device_map_meta(model_cls):
class HuggingFaceModule (line 44) | class HuggingFaceModule(_HuggingFaceModule, ABC):
method __init__ (line 47) | def __init__(self, config, ignore_init_model_cls=None):
method prepare_model (line 68) | def prepare_model(self, hf_model):
method get_inputs_embeds (line 72) | def get_inputs_embeds(self, inputs_embeds, **kwargs):
FILE: swift/megatron/model/model_config.py
function _eval_pattern (line 17) | def _eval_pattern(pattern):
function no_rope_freq_type (line 29) | def no_rope_freq_type(x):
function linear_attn_freq_type (line 53) | def linear_attn_freq_type(x):
function moe_freq_type (line 72) | def moe_freq_type(x):
function tuple_type (line 99) | def tuple_type(x):
class MegatronModelConfig (line 113) | class MegatronModelConfig(TransformerConfig):
method _augment_mindspeed_defaults (line 213) | def _augment_mindspeed_defaults(self):
method __post_init__ (line 249) | def __post_init__(self):
method _format_config (line 289) | def _format_config(self):
method _check_npu (line 305) | def _check_npu(self):
function _convert_config (line 379) | def _convert_config(config, _internal_call=False) -> Dict[str, Any]:
function convert_hf_config (line 414) | def convert_hf_config(config) -> Dict[str, Any]:
function _check_attention_backend (line 541) | def _check_attention_backend(args, config):
function _check_padding_free (line 548) | def _check_padding_free(args, config):
function get_mcore_model_config (line 566) | def get_mcore_model_config(args, hf_config):
FILE: swift/megatron/model/modules/gated_delta_net.py
class GatedDeltaNet (line 24) | class GatedDeltaNet(_GatedDeltaNet):
method forward (line 26) | def forward(
FILE: swift/megatron/model/modules/gated_self_attention.py
class GatedSelfAttention (line 13) | class GatedSelfAttention(SelfAttention):
method get_query_key_value_tensors (line 15) | def get_query_key_value_tensors(self, hidden_states, key_value_states=...
FILE: swift/megatron/model/register.py
class MegatronModelMeta (line 28) | class MegatronModelMeta:
method __post_init__ (line 37) | def __post_init__(self):
function register_megatron_model (line 44) | def register_megatron_model(megatron_model_meta: MegatronModelMeta, *, e...
function get_megatron_model_meta (line 57) | def get_megatron_model_meta(model_type: str) -> Optional[MegatronModelMe...
class MegatronModelLoader (line 69) | class MegatronModelLoader:
method __init__ (line 72) | def __init__(self, args, hf_config):
method _replace_spec_dsa (line 81) | def _replace_spec_dsa(self, layer_spec):
method get_transformer_layer_spec (line 95) | def get_transformer_layer_spec(self, vp_stage: Optional[int] = None):
method _get_transformer_layer_spec (line 107) | def _get_transformer_layer_spec(self):
method get_mtp_block_spec (line 119) | def get_mtp_block_spec(self, transformer_layer_spec, vp_stage: Optiona...
method _set_shared_expert_gate (line 131) | def _set_shared_expert_gate(self, transformer_layer_spec):
method build_model (line 139) | def build_model(
function get_mcore_model (line 161) | def get_mcore_model(args, hf_config):
FILE: swift/megatron/model/rope.py
class DummyConfig (line 20) | class DummyConfig(RotaryEmbeddingConfigMixin):
method __init__ (line 22) | def __init__(self, **kwargs):
function _get_dummy_config (line 27) | def _get_dummy_config(config):
function _compute_default_rope_parameters (line 53) | def _compute_default_rope_parameters(
function _get_rope_type (line 100) | def _get_rope_type(rope_scaling: Optional[Dict[str, Any]]):
function get_rope_inv_freq (line 109) | def get_rope_inv_freq(config, seq_len=None):
function longrope_frequency_update (line 121) | def longrope_frequency_update(config, model, inv_freq, seq_len: int):
function dynamic_frequency_update (line 138) | def dynamic_frequency_update(config, model, inv_freq, seq_len: int):
function dynamic_rope_update (line 155) | def dynamic_rope_update(model, inv_freq, seq_len: int):
function _compute_dynamic_alpha_ntk_parameters (line 166) | def _compute_dynamic_alpha_ntk_parameters(
FILE: swift/megatron/pipelines/export/export.py
class MegatronExport (line 18) | class MegatronExport(SwiftPipeline):
method run (line 22) | def run(self):
method convert_mcore2hf (line 30) | def convert_mcore2hf(self) -> None:
method convert_hf2mcore (line 80) | def convert_hf2mcore(self) -> None:
function megatron_export_main (line 127) | def megatron_export_main(args: Optional[Union[List[str], MegatronExportA...
FILE: swift/megatron/pipelines/train/pretrain.py
class MegatronPretrain (line 11) | class MegatronPretrain(MegatronSft):
function megatron_pretrain_main (line 16) | def megatron_pretrain_main(args: Optional[Union[List[str], MegatronPretr...
FILE: swift/megatron/pipelines/train/rlhf.py
class MegatronRLHF (line 13) | class MegatronRLHF(MegatronSft):
method prepare_trainer (line 17) | def prepare_trainer(self):
method _prepare_template (line 36) | def _prepare_template(self) -> None:
method _get_dataset (line 41) | def _get_dataset(self):
method _prepare_vllm_client (line 48) | def _prepare_vllm_client(self):
function megatron_rlhf_main (line 72) | def megatron_rlhf_main(args: Optional[Union[List[str], MegatronRLHFArgum...
FILE: swift/megatron/pipelines/train/sft.py
class MegatronSft (line 22) | class MegatronSft(SwiftSft):
method prepare_trainer (line 26) | def prepare_trainer(self):
method _set_seed (line 35) | def _set_seed(self):
method __init__ (line 38) | def __init__(self, args: Optional[Union[List[str], MegatronSftArgument...
method run (line 61) | def run(self):
function megatron_sft_main (line 87) | def megatron_sft_main(args: Optional[Union[List[str], MegatronSftArgumen...
FILE: swift/megatron/trainers/base.py
class BaseMegatronTrainer (line 53) | class BaseMegatronTrainer(ABC):
method __init__ (line 55) | def __init__(self, args, template: Template):
method _load_checkpoint (line 100) | def _load_checkpoint(self):
method call_event (line 112) | def call_event(self, event, **kwargs):
method on_log (line 116) | def on_log(self, logs, prefix=''):
method _log_callback (line 123) | def _log_callback(self, logs, n_steps):
method prepare_model (line 177) | def prepare_model(self):
method _prepare_peft_model (line 184) | def _prepare_peft_model(self, models):
method get_optimizer_and_scheduler (line 194) | def get_optimizer_and_scheduler(self):
method _get_data_collator (line 210) | def _get_data_collator(self):
method cyclic_iter (line 217) | def cyclic_iter(self, iterable, use_origin_cyclic: bool = False):
method _get_param_groups_mcore_016 (line 242) | def _get_param_groups_mcore_016(
method _get_param_groups (line 260) | def _get_param_groups(
method _patch_get_param_groups (line 416) | def _patch_get_param_groups(self):
method _load_iteration (line 429) | def _load_iteration(self):
method _prepare_vit_gradient_checkpointing (line 456) | def _prepare_vit_gradient_checkpointing(self, model):
method _initialize_embedding (line 471) | def _initialize_embedding(model):
method _all_reduce_metric (line 492) | def _all_reduce_metric(self,
method merge_lora_adapters (line 502) | def merge_lora_adapters(self, adapter_name='default'):
method unmerge_lora_adapters (line 511) | def unmerge_lora_adapters(self):
method copy_path (line 521) | def copy_path(src_path: str, tgt_path: str):
method _prepare_data_iterator (line 535) | def _prepare_data_iterator(self, train_dataset, val_dataset=None, use_...
method train (line 543) | def train(self, train_dataset, val_dataset):
method _determine_best_metric (line 654) | def _determine_best_metric(self, metrics) -> bool:
method save_checkpoint (line 672) | def save_checkpoint(self):
method _rotate_checkpoints (line 727) | def _rotate_checkpoints(self, output_dir: str):
method _sorted_checkpoints (line 744) | def _sorted_checkpoints(self, output_dir: str):
method training_log (line 760) | def training_log(self, metrics, grad_norm):
method evaluate (line 768) | def evaluate(self, val_data_iterator):
method compute_eval_metrics (line 794) | def compute_eval_metrics(self, metrics):
method _replace_data_iterator (line 801) | def _replace_data_iterator(self, data_iterator):
method train_step (line 804) | def train_step(self, train_data_iterator):
method _aggregated_metrics (line 830) | def _aggregated_metrics(self, metrics, total_metrics):
method _prepare_dataloader (line 851) | def _prepare_dataloader(self, train_dataset, val_dataset=None):
method _create_dataloader (line 882) | def _create_dataloader(self, dataset, batch_sampler):
method forward_step (line 897) | def forward_step(self, data_iterator, model):
method _prepare_batch (line 900) | def _prepare_batch(self, data, vp_stage=None, num_samples=None):
method get_batch (line 916) | def get_batch(self, data_iterator, vp_stage=None):
method _collect_config_info (line 920) | def _collect_config_info(self) -> Dict[str, str]:
method get_last_tokens (line 941) | def get_last_tokens(self, output_tensor, packed_seq_params=None, atten...
FILE: swift/megatron/trainers/batch_sampler.py
class MegatronPretrainingSampler (line 10) | class MegatronPretrainingSampler:
method __init__ (line 12) | def __init__(self,
method __len__ (line 40) | def __len__(self):
method get_start_end_idx (line 43) | def get_start_end_idx(self):
method __iter__ (line 48) | def __iter__(self):
class MegatronPretrainingRandomSampler (line 65) | class MegatronPretrainingRandomSampler:
method __init__ (line 67) | def __init__(
method __len__ (line 110) | def __len__(self):
method __iter__ (line 113) | def __iter__(self):
FILE: swift/megatron/trainers/dpo_trainer.py
class DummyDPOTrainer (line 15) | class DummyDPOTrainer(DPOTrainer):
method __init__ (line 17) | def __init__(self, args):
class MegatronDPOTrainer (line 29) | class MegatronDPOTrainer(MegatronRLHFTrainer):
method __init__ (line 31) | def __init__(self, args, template):
method loss_func (line 35) | def loss_func(self, output_tensor: torch.Tensor, *, labels: torch.Tens...
method forward_step (line 78) | def forward_step(self, data_iterator, model):
FILE: swift/megatron/trainers/embedding_trainer.py
class MegatronEmbeddingTrainer (line 13) | class MegatronEmbeddingTrainer(BaseMegatronTrainer):
method __init__ (line 15) | def __init__(self, args, template):
method loss_func (line 25) | def loss_func(self, output_tensor: torch.Tensor, *, labels: torch.Tens...
method forward_step (line 35) | def forward_step(self, data_iterator, model):
FILE: swift/megatron/trainers/gkd_trainer.py
class DataSource (line 27) | class DataSource(str, Enum):
class MegatronGKDTrainer (line 34) | class MegatronGKDTrainer(MegatronRolloutMixin, MegatronRLHFTrainer):
method __init__ (line 36) | def __init__(self, args: MegatronArguments, template, **kwargs):
method train (line 77) | def train(self, train_dataset, val_dataset):
method prepare_model (line 82) | def prepare_model(self):
method _offload_teacher_models (line 116) | def _offload_teacher_models(self):
method _load_teacher_models_to_gpu (line 121) | def _load_teacher_models_to_gpu(self):
method load_teacher_model_context (line 127) | def load_teacher_model_context(self):
method _template_context (line 147) | def _template_context(self, template: Template, max_length: Optional[i...
method _build_opsd_teacher_data (line 156) | def _build_opsd_teacher_data(self, inputs: List[Dict]) -> Optional[Lis...
method _encode_batch (line 172) | def _encode_batch(self, batch: List[Dict]) -> Dict[str, torch.Tensor]:
method _get_random_num (line 185) | def _get_random_num(self) -> float:
method _determine_data_source (line 198) | def _determine_data_source(self) -> DataSource:
method _init_resample_data_iterator (line 230) | def _init_resample_data_iterator(self, train_dataset):
method resample_encode_failed_inputs (line 258) | def resample_encode_failed_inputs(self, inputs: List[Dict], max_resamp...
method _compute_teacher_logits (line 297) | def _compute_teacher_logits(self, encoded_batches: List[Dict], vp_stag...
method _compute_teacher_logits_local (line 303) | def _compute_teacher_logits_local(self, encoded_batches: List[Dict], v...
method _compute_teacher_logits_from_api (line 331) | def _compute_teacher_logits_from_api(self, encoded_batches: List[Dict]...
method _replace_data_iterator (line 362) | def _replace_data_iterator(self, data_iterator):
method _align_vocab_size (line 412) | def _align_vocab_size(
method generalized_jsd_loss (line 446) | def generalized_jsd_loss(
method _vocab_parallel_topk (line 545) | def _vocab_parallel_topk(self, logits: torch.Tensor, k: int) -> tuple:
method _tp_gather_topk (line 582) | def _tp_gather_topk(self, logits: torch.Tensor, indices: torch.Tensor)...
method _jsd_topk (line 609) | def _jsd_topk(self, student_logits, teacher_topk_logprobs, teacher_top...
method loss_func (line 642) | def loss_func(self,
method forward_step (line 714) | def forward_step(self, data_iterator, model):
FILE: swift/megatron/trainers/grpo_trainer.py
class MegatronGRPOTrainer (line 44) | class MegatronGRPOTrainer(MegatronRolloutMixin, MegatronRLHFTrainer):
method __init__ (line 46) | def __init__(self, args: MegatronArguments, template: Template, **kwar...
method train (line 59) | def train(self, train_dataset, val_dataset):
method _init_grpo_params (line 64) | def _init_grpo_params(self):
method _init_rollout_engine (line 120) | def _init_rollout_engine(self):
method _prepare_rewards (line 136) | def _prepare_rewards(self):
method _prepare_scheduler (line 192) | def _prepare_scheduler(self):
method _init_resample_data_iterator (line 209) | def _init_resample_data_iterator(self, train_dataset):
method _replace_data_iterator (line 238) | def _replace_data_iterator(self, data_iterator):
method _batch_encode (line 255) | def _batch_encode(self, infer_requests: List[Dict], template: Template...
method _get_encoded_batch (line 273) | def _get_encoded_batch(self, encoded_list, rollout_batch, template):
method _generate_and_score_completions (line 365) | def _generate_and_score_completions(self, batch):
method _generate_completions (line 444) | def _generate_completions(self, batch):
method _rollout (line 492) | def _rollout(self, batch) -> List[RolloutOutput]:
method postprocess_rollout_data (line 507) | def postprocess_rollout_data(self, batch, outputs):
method _get_request_config (line 565) | def _get_request_config(self) -> RequestConfig:
method _server_rollout (line 581) | def _server_rollout(self,
method _score_completions (line 616) | def _score_completions(self, inputs: DataType) -> torch.Tensor:
method _compute_rewards_per_func (line 630) | def _compute_rewards_per_func(self, batch: DataType) -> torch.Tensor:
method _compute_advantages (line 694) | def _compute_advantages(self,
method _dynamic_sampling (line 854) | def _dynamic_sampling(self, rollout_batch: DataType,
method _maybe_compute_logps (line 930) | def _maybe_compute_logps(self, batch: Dict[str, Any]) -> Dict[str, Any]:
method _compute_kl_from_batches (line 969) | def _compute_kl_from_batches(self, mini_batch_data: List[Dict[str, Any...
method _disable_maxlength_template_context (line 1006) | def _disable_maxlength_template_context(self, template: Template):
method _maybe_replace_response_token (line 1015) | def _maybe_replace_response_token(self, batch):
method on_policy (line 1029) | def on_policy(self):
method forward_step (line 1033) | def forward_step(self, data_iterator, model):
method loss_func (line 1122) | def loss_func(self, output_tensor: torch.Tensor, data: Dict[str, Any]):
method model_forward (line 1412) | def model_forward(self, model, data_iterator, no_grad=True, per_token=...
method inputs2requests (line 1444) | def inputs2requests(self, inputs: Union[DataType, List[RolloutInferReq...
method _preprocess_inputs (line 1512) | def _preprocess_inputs(self, inputs: DataType) -> DataType:
method resample_encode_failed_inputs (line 1520) | def resample_encode_failed_inputs(self, inputs: DataType, max_resample...
method _add_prompt_id_to_inputs (line 1580) | def _add_prompt_id_to_inputs(self, inputs: DataType) -> DataType:
method get_num_iters_per_step (line 1602) | def get_num_iters_per_step(self):
method get_local_rollout_batch (line 1628) | def get_local_rollout_batch(self, batch):
method _template_context (line 1645) | def _template_context(self, template: Template):
method _prepare_metrics (line 1654) | def _prepare_metrics(self):
method _apply_chat_template_to_messages_list (line 1671) | def _apply_chat_template_to_messages_list(self, messages_list: DataType):
method _set_inputs_system (line 1680) | def _set_inputs_system(self, batch: DataType) -> DataType:
method _compute_sequence_level_ratios (line 1717) | def _compute_sequence_level_ratios(self, is_ratio: torch.Tensor, compl...
method _apply_rollout_importance_sampling (line 1734) | def _apply_rollout_importance_sampling(self, rollout_log_ratio: torch....
method _compute_off_policy_sequence_mask (line 1785) | def _compute_off_policy_sequence_mask(
method _compute_rollout_offpolicy_metrics (line 1827) | def _compute_rollout_offpolicy_metrics(
method _compute_is_correction_metrics (line 1911) | def _compute_is_correction_metrics(
method _prepare_model_inputs (line 1970) | def _prepare_model_inputs(self, inputs: 'DataType') -> Dict[str, Any]:
method _collect_config_info (line 1980) | def _collect_config_info(self) -> Dict[str, str]:
FILE: swift/megatron/trainers/kto_trainer.py
class DummyKTOTrainer (line 15) | class DummyKTOTrainer(KTOTrainer):
method gather_for_metrics (line 18) | def gather_for_metrics(self, input_data, *args, **kwargs):
method __init__ (line 27) | def __init__(self, args):
class MegatronKTOTrainer (line 37) | class MegatronKTOTrainer(MegatronRLHFTrainer):
method __init__ (line 39) | def __init__(self, args, template):
method _kto_get_logps (line 43) | def _kto_get_logps(self, output_tensor, data, is_KL: bool, is_ref: boo...
method _get_kto_length (line 50) | def _get_kto_length(self, data: Dict[str, Any]) -> int:
method loss_func (line 56) | def loss_func(self, output_tensor, *, data, kl_data, label):
method _get_input_tensor (line 100) | def _get_input_tensor(input_tensor, is_KL: bool, is_ref: bool, length:...
method forward_step (line 112) | def forward_step(self, data_iterator, model):
method _prepare_batch (line 157) | def _prepare_batch(self, data, vp_stage=None, num_samples=None):
method _log_callback (line 170) | def _log_callback(self, logs, n_steps):
FILE: swift/megatron/trainers/reranker_trainer.py
class MegatronRerankerTrainer (line 15) | class MegatronRerankerTrainer(BaseMegatronTrainer):
method __init__ (line 17) | def __init__(self, args, template):
method _get_listwise_reranker_preds (line 28) | def _get_listwise_reranker_preds(logits, labels):
method loss_func (line 39) | def loss_func(self, output_tensor: torch.Tensor, *, labels: torch.Tens...
method prepare_model (line 56) | def prepare_model(self):
method forward_step (line 62) | def forward_step(self, data_iterator, model):
FILE: swift/megatron/trainers/reward_trainer.py
class MegatronRewardTrainer (line 12) | class MegatronRewardTrainer(MegatronRLHFTrainer):
method __init__ (line 14) | def __init__(self, args, template):
method loss_func (line 18) | def loss_func(self, output_tensor, *, data):
method forward_step (line 45) | def forward_step(self, data_iterator, model):
FILE: swift/megatron/trainers/rlhf_mixin.py
class MegatronRLHFTrainer (line 17) | class MegatronRLHFTrainer(BaseMegatronTrainer):
method _load_checkpoint (line 19) | def _load_checkpoint(self):
method prepare_model (line 27) | def prepare_model(self):
method _get_data_collator (line 44) | def _get_data_collator(self):
method null_ref_context (line 50) | def null_ref_context(self):
method get_logps (line 70) | def get_logps(self, output_tensor, labels, packed_seq_params, num_samp...
method _postprocess_packed_tensor_cp (line 94) | def _postprocess_packed_tensor_cp(self, tensor, packed_seq_params, num...
FILE: swift/megatron/trainers/rollout_mixin.py
function create_rollout_group (line 32) | def create_rollout_group(trainer) -> torch.distributed.ProcessGroup:
class MegatronRolloutMixin (line 102) | class MegatronRolloutMixin:
method _init_rollout_params (line 104) | def _init_rollout_params(self):
method _get_rollout_group (line 136) | def _get_rollout_group(self):
method _get_local_rollout_batch (line 140) | def _get_local_rollout_batch(self, batch: List[Dict]) -> List[Dict]:
method _gather_rollout_results (line 174) | def _gather_rollout_results(self, local_batch: List[Dict]) -> List[Dict]:
method _init_rollout_engine (line 186) | def _init_rollout_engine(self):
method _prepare_vllm_engine (line 229) | def _prepare_vllm_engine(self):
method _move_model_to_vllm (line 276) | def _move_model_to_vllm(self):
method _export_and_load_weights (line 296) | def _export_and_load_weights(self):
method _load_weights_to_server_in_buckets (line 311) | def _load_weights_to_server_in_buckets(self, weight_iterator):
method _sync_bucket_to_server (line 332) | def _sync_bucket_to_server(self, bucket_params: List[Tuple[str, torch....
method _generate_completions (line 346) | def _generate_completions(self, batch: DataType) -> DataType:
method _rollout (line 391) | def _rollout(self, batch: DataType) -> List[RolloutOutput]:
method _get_request_config (line 401) | def _get_request_config(self) -> RequestConfig:
method _server_rollout (line 412) | def _server_rollout(self, inputs: DataType, request_config: RequestCon...
method _colocate_rollout (line 438) | def _colocate_rollout(self, batch: DataType, request_config: RequestCo...
method _preprocess_rollout_inputs (line 465) | def _preprocess_rollout_inputs(self, inputs: DataType) -> DataType:
method _set_inputs_system (line 474) | def _set_inputs_system(self, inputs: DataType) -> DataType:
method _inputs_to_requests (line 486) | def _inputs_to_requests(self, inputs: DataType) -> List[RolloutInferRe...
method _postprocess_rollout_outputs (line 529) | def _postprocess_rollout_outputs(self, inputs: DataType, outputs: List...
method offload_context (line 569) | def offload_context(self):
FILE: swift/megatron/trainers/trainer.py
class MegatronTrainer (line 18) | class MegatronTrainer(BaseMegatronTrainer):
method seq_cls_loss_func (line 20) | def seq_cls_loss_func(self, output_tensor, *, labels: torch.Tensor, pa...
method loss_func (line 50) | def loss_func(self,
method _compute_channel_loss (line 87) | def _compute_channel_loss(self, losses, loss_mask, channels, packed_se...
method forward_step (line 115) | def forward_step(self, data_iterator, model):
FILE: swift/megatron/trainers/utils.py
function get_batch_on_this_pp_rank (line 26) | def get_batch_on_this_pp_rank(args, data, vp_stage=None):
function get_packed_seq_params (line 49) | def get_packed_seq_params(position_ids: torch.Tensor) -> PackedSeqParams:
function get_batch_on_this_cp_rank (line 65) | def get_batch_on_this_cp_rank(args, batch: Dict[str, Any]):
function gather (line 95) | def gather(tensor, group: Optional[torch.distributed.ProcessGroup] = None):
function gather_object (line 105) | def gather_object(object: Any, group: Optional[torch.distributed.Process...
function load_megatron_model_to_gpu (line 116) | def load_megatron_model_to_gpu(models, load_grad=True):
function offload_megatron_model_to_cpu (line 143) | def offload_megatron_model_to_cpu(models):
function load_megatron_copy_params (line 179) | def load_megatron_copy_params(optimizers):
function offload_megatron_copy_params (line 220) | def offload_megatron_copy_params(optimizers):
function load_megatron_optimizer (line 261) | def load_megatron_optimizer(optimizers):
function offload_megatron_optimizer (line 285) | def offload_megatron_optimizer(optimizers):
function log_gpu_memory (line 304) | def log_gpu_memory(prefix: str = '', info_once: bool = False):
class TrainerState (line 314) | class TrainerState:
method global_step (line 330) | def global_step(self) -> int:
class MegatronDataLoaderDispatcher (line 334) | class MegatronDataLoaderDispatcher(DataLoaderDispatcher):
method group (line 337) | def group(self):
function build_streaming_dataloader (line 341) | def build_streaming_dataloader(args, dataset, collate_fn):
FILE: swift/megatron/trainers/vocab_parallel_utils.py
function vocab_parallel_log_softmax (line 19) | def vocab_parallel_log_softmax(logits: torch.Tensor) -> torch.Tensor:
function vocab_parallel_entropy (line 56) | def vocab_parallel_entropy(log_probs: torch.Tensor, chunk_size: int = 51...
function vocab_parallel_kl_div (line 105) | def vocab_parallel_kl_div(input_log_probs: torch.Tensor, target_log_prob...
function vocab_parallel_gather_logps (line 134) | def vocab_parallel_gather_logps(
function compute_logps_and_entropy_from_logits (line 196) | def compute_logps_and_entropy_from_logits(
FILE: swift/megatron/tuners/lora.py
class LoraParallelLinear (line 36) | class LoraParallelLinear(MegatronModule, LoraLayer):
method __init__ (line 38) | def __init__(
method update_layer (line 85) | def update_layer(self, adapter_name, r, *, lora_alpha, lora_dropout, i...
method _get_rng_context (line 224) | def _get_rng_context(self, lora):
method reset_lora_parameters (line 233) | def reset_lora_parameters(self, adapter_name, init_lora_weights):
method _patch_router_gating (line 267) | def _patch_router_gating(self):
method forward (line 298) | def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any):
method sharded_state_dict (line 372) | def sharded_state_dict(
method get_delta_weights (line 400) | def get_delta_weights(self, adapter) -> List[torch.Tensor]:
method merge (line 423) | def merge(self, safe_merge: bool = False, adapter_names: Optional[list...
method unmerge (line 475) | def unmerge(self) -> None:
function dispatch_megatron (line 510) | def dispatch_megatron(
FILE: swift/megatron/utils/convert_utils.py
function _test_params_sum (line 18) | def _test_params_sum(model):
function _find_modules (line 38) | def _find_modules(model, recurse: bool = True, prefix='', ignore_modules...
function _model_cpu_forward_context (line 60) | def _model_cpu_forward_context(modules,
function get_examples (line 95) | def get_examples(is_multimodal: bool) -> Dict[str, Any]:
function broadcast_mg_logits (line 149) | def broadcast_mg_logits(mg_logits=None, src_rank=None):
function test_convert_precision (line 172) | def test_convert_precision(args, hf_model, mg_model, template, test_conv...
FILE: swift/megatron/utils/megatron_lm_utils.py
function _patch_megatron_timeout (line 40) | def _patch_megatron_timeout(distributed_timeout_minutes):
function _initialize_mpu (line 57) | def _initialize_mpu(args):
function set_random_seed (line 85) | def set_random_seed(
function initialize_megatron (line 107) | def initialize_megatron(args):
function _get_rng_state (line 121) | def _get_rng_state():
function _generate_state_dict (line 145) | def _generate_state_dict(args,
function _filter_adapter_state_dict (line 180) | def _filter_adapter_state_dict(state_dict, is_peft_format: bool, adapter...
function _preprocess_common_before_consistancy_check (line 216) | def _preprocess_common_before_consistancy_check(common_state_dict):
function get_sharded_sd_metadata (line 226) | def get_sharded_sd_metadata(args):
function save_mcore_checkpoint (line 246) | def save_mcore_checkpoint(
function init_persistent_async_worker (line 325) | def init_persistent_async_worker():
function schedule_async_save (line 332) | def schedule_async_save(async_request: AsyncRequest):
function maybe_finalize_async_save (line 341) | def maybe_finalize_async_save(args, blocking: bool = False, terminate=Fa...
function is_empty_async_queue (line 360) | def is_empty_async_queue() -> bool:
function _load_iteration (line 365) | def _load_iteration(tracker_path: str):
function load_mcore_checkpoint (line 378) | def load_mcore_checkpoint(args,
function wrap_model (line 496) | def wrap_model(args, models, wrap_with_ddp: bool = True):
function get_optimizer_param_scheduler (line 552) | def get_optimizer_param_scheduler(args, optimizer):
function unwrap_model (line 585) | def unwrap_model(models, module_instances=None):
function should_disable_forward_pre_hook (line 610) | def should_disable_forward_pre_hook(args):
function enable_forward_pre_hook (line 615) | def enable_forward_pre_hook(model_chunks):
function disable_forward_pre_hook (line 621) | def disable_forward_pre_hook(model_chunks, param_sync=True):
function initialize_tp_communicators (line 627) | def initialize_tp_communicators(args, config):
function warmup_jit_function (line 657) | def warmup_jit_function(config, args):
FILE: swift/megatron/utils/parallel_utils.py
function reduce_max_stat_across_model_parallel_group (line 7) | def reduce_max_stat_across_model_parallel_group(stat: float) -> float:
function logical_and_across_model_parallel_group (line 20) | def logical_and_across_model_parallel_group(input: bool) -> bool:
function split_cp_inputs (line 30) | def split_cp_inputs(inputs: torch.Tensor, cu_seqlens: Optional[torch.Ten...
FILE: swift/megatron/utils/patcher.py
function patch_torch_dist_shard (line 12) | def patch_torch_dist_shard(thread_count):
function patch_merge_fn (line 22) | def patch_merge_fn(state_dict_model):
FILE: swift/megatron/utils/utils.py
function find_all_linears (line 30) | def find_all_linears(model, extra_layers=None):
function find_router (line 41) | def find_router(model):
function find_embedding (line 45) | def find_embedding(model):
function get_multimodal_target_regex (line 49) | def get_multimodal_target_regex(
function get_target_modules (line 100) | def get_target_modules(args, model):
function get_modules_to_save (line 137) | def get_modules_to_save(args, model):
function set_linear_is_expert (line 147) | def set_linear_is_expert(model):
function _patch_deepcopy (line 155) | def _patch_deepcopy():
function prepare_adapter (line 183) | def prepare_adapter(args, model):
function _prepare_full_vit (line 219) | def _prepare_full_vit(args, model):
function prepare_mcore_model (line 229) | def prepare_mcore_model(args, model):
function tuners_sharded_state_dict (line 246) | def tuners_sharded_state_dict(
function copy_original_module_weight (line 269) | def copy_original_module_weight(model):
function copy_ref_adapter_weight (line 279) | def copy_ref_adapter_weight(model, ref_adapter_name: str):
function forward_step_helper (line 297) | def forward_step_helper(args, model, inputs, dtype=None):
function get_padding_to (line 329) | def get_padding_to(args):
function get_local_layer_specs (line 346) | def get_local_layer_specs(config, layer_specs, vp_stage=None):
FILE: swift/metrics/acc.py
function compute_acc (line 10) | def compute_acc(preds,
class AccMetrics (line 44) | class AccMetrics(EvalMetrics):
method compute_metrics (line 46) | def compute_metrics(self, eval_prediction: EvalPrediction) -> Dict[str...
method preprocess_logits_for_metrics (line 56) | def preprocess_logits_for_metrics(self, logits: torch.Tensor, labels: ...
FILE: swift/metrics/base.py
class EvalMetrics (line 11) | class EvalMetrics(ABC):
method __init__ (line 13) | def __init__(self, args: 'TrainingArguments', trainer: 'Trainer'):
method compute_metrics (line 18) | def compute_metrics(self, eval_prediction: EvalPrediction) -> Dict[str...
method preprocess_logits_for_metrics (line 21) | def preprocess_logits_for_metrics(self, logits: torch.Tensor, labels: ...
FILE: swift/metrics/embedding.py
class EmbedddingMetricMixin (line 14) | class EmbedddingMetricMixin(Metric):
method __init__ (line 16) | def __init__(self):
method update (line 21) | def update(self, last_hidden_state, labels):
method compute (line 25) | def compute(self):
class PairedMetrics (line 31) | class PairedMetrics(EvalMetrics, EmbedddingMetricMixin):
method __init__ (line 33) | def __init__(self, *args, **kwargs):
method compute_metrics (line 37) | def compute_metrics(self, eval_prediction: EvalPrediction) -> Dict[str...
method _calculate_metrics (line 42) | def _calculate_metrics(self, predictions, labels):
class InfonceMetrics (line 76) | class InfonceMetrics(EvalMetrics, EmbedddingMetricMixin):
method __init__ (line 78) | def __init__(self, *args, **kwargs):
method compute_metrics (line 82) | def compute_metrics(self, eval_prediction: EvalPrediction) -> Dict[str...
method _calculate_metrics (line 87) | def _calculate_metrics(self, predictions, labels):
FILE: swift/metrics/nlg.py
function compute_rouge_bleu (line 12) | def compute_rouge_bleu(preds: List[str], labels: List[str]):
class NlgMetrics (line 33) | class NlgMetrics(EvalMetrics):
method compute_metrics (line 35) | def compute_metrics(self, eval_prediction: EvalPrediction) -> Dict[str...
FILE: swift/metrics/reranker.py
class RerankerMetrics (line 10) | class RerankerMetrics(EvalMetrics, Metric):
method __init__ (line 12) | def __init__(self, *args, **kwargs):
method update (line 18) | def update(self, logits, labels):
method compute (line 22) | def compute(self):
method compute_metrics (line 27) | def compute_metrics(self, eval_prediction: EvalPrediction) -> Dict[str...
method _calculate_metrics (line 30) | def _calculate_metrics(self, logits, labels):
FILE: swift/metrics/utils.py
class Metric (line 13) | class Metric(ABC):
method __init__ (line 15) | def __init__(self):
method add_state (line 19) | def add_state(self, name: str, default=None, default_factory=None) -> ...
method reset (line 31) | def reset(self):
method update (line 38) | def update(self, *args, **kwargs):
method compute (line 42) | def compute(self):
class InferStats (line 46) | class InferStats(Metric):
method __init__ (line 48) | def __init__(self):
method update (line 54) | def update(self, output):
method compute (line 59) | def compute(self):
class MeanMetric (line 73) | class MeanMetric(Metric):
method __init__ (line 75) | def __init__(self, nan_value=0, device=None, group=None):
method update (line 85) | def update(self, state: torch.Tensor):
method compute (line 102) | def compute(self):
FILE: swift/model/constant.py
class LLMModelType (line 6) | class LLMModelType:
class BertModelType (line 122) | class BertModelType:
class RMModelType (line 128) | class RMModelType:
class MLLMModelType (line 136) | class MLLMModelType:
class ModelType (line 265) | class ModelType(LLMModelType, MLLMModelType, BertModelType, RMModelType):
method get_model_name_list (line 268) | def get_model_name_list(cls) -> List[str]:
FILE: swift/model/model_arch.py
class LLMModelArch (line 10) | class LLMModelArch:
class MLLMModelArch (line 27) | class MLLMModelArch:
class ModelArch (line 93) | class ModelArch(LLMModelArch, MLLMModelArch):
class ModelKeys (line 100) | class ModelKeys:
class MultiModelKeys (line 127) | class MultiModelKeys(ModelKeys):
method __post_init__ (line 134) | def __post_init__(self):
function register_model_arch (line 146) | def register_model_arch(model_arch: ModelKeys, *, exist_ok: bool = False...
function get_model_arch (line 764) | def get_model_arch(arch_name: Optional[str]) -> Optional[MultiModelKeys]:
FILE: swift/model/model_meta.py
class Model (line 21) | class Model:
class ModelGroup (line 31) | class ModelGroup:
method __post_init__ (line 40) | def __post_init__(self):
class BaseModelLoader (line 45) | class BaseModelLoader(ABC):
method __init__ (line 48) | def __init__(self, model_info, model_meta, *args, **kwargs):
method load (line 52) | def load(self) -> Tuple[Optional[PreTrainedModel], PreTrainedTokenizer...
class ModelMeta (line 57) | class ModelMeta:
method __post_init__ (line 81) | def __post_init__(self):
method get_matched_model_group (line 96) | def get_matched_model_group(self, model_name: str) -> Optional[ModelGr...
method check_requires (line 105) | def check_requires(self, model_info=None):
class ModelInfo (line 125) | class ModelInfo:
method __post_init__ (line 141) | def __post_init__(self):
function get_model_name (line 145) | def get_model_name(model_id_or_path: str) -> Optional[str]:
function get_matched_model_meta (line 161) | def get_matched_model_meta(model_id_or_path: str) -> Optional[ModelMeta]:
function _get_arch_mapping (line 173) | def _get_arch_mapping():
function get_matched_model_types (line 186) | def get_matched_model_types(architectures: Optional[List[str]]) -> List[...
function _read_args_json_model_type (line 195) | def _read_args_json_model_type(model_dir):
function _get_model_info (line 203) | def _get_model_info(model_dir: str, model_type: Optional[str], quantizat...
function get_model_info_meta (line 246) | def get_model_info_meta(
FILE: swift/model/models/baai.py
class Emu3GenLoader (line 15) | class Emu3GenLoader(ModelLoader):
method get_processor (line 17) | def get_processor(self, model_dir, config) -> Processor:
method get_model (line 33) | def get_model(self, model_dir: str, config, processor, model_kwargs):
class Emu3ChatLoader (line 54) | class Emu3ChatLoader(ModelLoader):
method get_processor (line 56) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
class BgeRerankerLoader (line 92) | class BgeRerankerLoader(ModelLoader):
method get_model (line 94) | def get_model(self, *args, **kwargs) -> PreTrainedModel:
FILE: swift/model/models/baichuan.py
class BaichuanLoader (line 17) | class BaichuanLoader(ModelLoader):
method get_model (line 19) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class BaichuanM1Loader (line 46) | class BaichuanM1Loader(BaichuanLoader):
method get_model (line 48) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
function patch_baichuan2_lm_head_forward (line 76) | def patch_baichuan2_lm_head_forward(self, hidden_states: Tensor) -> Tensor:
class Baichuan2Loader (line 89) | class Baichuan2Loader(ModelLoader):
method get_model (line 91) | def get_model(self, model_dir: str, config, *args, **kwargs) -> PreTra...
FILE: swift/model/models/baidu.py
class ErnieVLLoader (line 57) | class ErnieVLLoader(ModelLoader):
method get_model (line 59) | def get_model(self, model_dir: str, config, processor, model_kwargs) -...
class PaddleOCR1_5Loader (line 88) | class PaddleOCR1_5Loader(ModelLoader):
method get_config (line 90) | def get_config(self, model_dir: str) -> PretrainedConfig:
method get_model (line 95) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
FILE: swift/model/models/bert.py
class ModernBertLoader (line 14) | class ModernBertLoader(ModelLoader):
method get_model (line 16) | def get_model(self, model_dir: str, config, *args, **kwargs) -> PreTra...
class GTEBertLoader (line 36) | class GTEBertLoader(ModelLoader):
method get_model (line 38) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class GTEBertReranker (line 63) | class GTEBertReranker(ModelLoader):
method get_model (line 65) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
FILE: swift/model/models/codefuse.py
class CodeLlamaLoader (line 41) | class CodeLlamaLoader(ModelLoader):
method get_processor (line 43) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
FILE: swift/model/models/deepseek.py
class DeepseekLoader (line 16) | class DeepseekLoader(ModelLoader):
method get_model (line 18) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class DeepseekV32Loader (line 129) | class DeepseekV32Loader(ModelLoader):
method get_config (line 131) | def get_config(self, model_dir: str):
method get_model (line 138) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class DeepseekVLLoader (line 169) | class DeepseekVLLoader(ModelLoader):
method get_config (line 171) | def get_config(self, model_dir: str):
method _get_model (line 186) | def _get_model(self, model_dir: str, llm_prefix, *args, **kwargs) -> P...
method get_model (line 195) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class DeepseekJanusLoader (line 216) | class DeepseekJanusLoader(DeepseekVLLoader):
method get_model (line 218) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
method get_config (line 221) | def get_config(self, model_dir: str):
class DeepseekVL2Loader (line 261) | class DeepseekVL2Loader(DeepseekVLLoader):
method get_config (line 263) | def get_config(self, model_dir: str):
method get_model (line 278) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class DeepseekOCRLoader (line 300) | class DeepseekOCRLoader(ModelLoader):
method get_model (line 303) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
method get_processor (line 312) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
class DeepseekOCR2Loader (line 326) | class DeepseekOCR2Loader(DeepseekOCRLoader):
FILE: swift/model/models/gemma.py
class PaligemmaVisionLoader (line 12) | class PaligemmaVisionLoader(ModelLoader):
method get_model (line 14) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class Gemma3TextLoader (line 94) | class Gemma3TextLoader(ModelLoader):
method get_config (line 96) | def get_config(self, model_dir):
class Gemma3VisionLoader (line 122) | class Gemma3VisionLoader(ModelLoader):
method get_config (line 124) | def get_config(self, model_dir):
method get_model (line 129) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class Gemma3nLoader (line 159) | class Gemma3nLoader(ModelLoader):
method get_model (line 161) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
FILE: swift/model/models/glm.py
function remove_property (line 22) | def remove_property(tokenizer_cls: Type[PreTrainedTokenizerBase], tokeni...
function _patch_tokenizer (line 28) | def _patch_tokenizer(tokenizer):
class ChatGLMLoader (line 43) | class ChatGLMLoader(ModelLoader):
method get_model (line 45) | def get_model(self, model_dir: str, config, processor, model_kwargs) -...
method get_processor (line 59) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
class ChatGLM4Loader (line 110) | class ChatGLM4Loader(ChatGLMLoader):
method get_processor (line 112) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
class ChatGLM4vLoader (line 174) | class ChatGLM4vLoader(ChatGLMLoader):
method get_model (line 176) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
method get_processor (line 190) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
class GLM4vLoader (line 220) | class GLM4vLoader(ModelLoader):
method get_model (line 222) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class CogVLMLoader (line 265) | class CogVLMLoader(ModelLoader):
method get_model (line 267) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
method get_processor (line 273) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
class CogVLM2Loader (line 317) | class CogVLM2Loader(ModelLoader):
method get_model (line 319) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class GLMEdgeVLoader (line 377) | class GLMEdgeVLoader(ModelLoader):
method get_processor (line 379) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
class Glm4vMoeLoader (line 440) | class Glm4vMoeLoader(ModelLoader):
method get_model (line 442) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class GLMOCRLoader (line 472) | class GLMOCRLoader(ModelLoader):
method get_model (line 474) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
FILE: swift/model/models/internlm.py
class InternVLLoader (line 88) | class InternVLLoader(ModelLoader):
method get_processor (line 90) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
method get_model (line 94) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class Interns1Loader (line 374) | class Interns1Loader(ModelLoader):
method get_model (line 376) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class InternVLHfLoader (line 397) | class InternVLHfLoader(Interns1Loader):
method get_model (line 399) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class Xcomposer2Loader (line 474) | class Xcomposer2Loader(ModelLoader):
method get_model (line 477) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class Xcomposer2_4khdLoader (line 517) | class Xcomposer2_4khdLoader(Xcomposer2Loader):
class Xcomposer2_5Loader (line 537) | class Xcomposer2_5Loader(Xcomposer2Loader):
FILE: swift/model/models/llama.py
class LlamaLoader (line 14) | class LlamaLoader(ModelLoader):
method get_config (line 16) | def get_config(self, model_dir):
class Llama3_2VisionLoader (line 242) | class Llama3_2VisionLoader(ModelLoader):
method get_model (line 244) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class Llama4Loader (line 270) | class Llama4Loader(ModelLoader):
method get_model (line 272) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class Llama3OmniLoader (line 301) | class Llama3OmniLoader(ModelLoader):
method get_model (line 303) | def get_model(self, model_dir: str, config, processor, model_kwargs) -...
FILE: swift/model/models/llava.py
class LlavaLlamaHfLoader (line 15) | class LlavaLlamaHfLoader(ModelLoader):
method get_config (line 17) | def get_config(self, model_dir: str):
method get_model (line 22) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
function _patch_llava (line 45) | def _patch_llava(model):
class LlavahfLoader (line 61) | class LlavahfLoader(ModelLoader):
method get_model (line 63) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class LlavaOnevisionHfLoader (line 87) | class LlavaOnevisionHfLoader(ModelLoader):
method get_model (line 89) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class LlavaNextHfLoader (line 114) | class LlavaNextHfLoader(ModelLoader):
method get_model (line 116) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class LlavaNextYiHfLoader (line 205) | class LlavaNextYiHfLoader(LlavaNextHfLoader):
method get_config (line 207) | def get_config(self, model_dir: str) -> PretrainedConfig:
class LlavaNextVideoHfLoader (line 230) | class LlavaNextVideoHfLoader(ModelLoader):
method get_model (line 232) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class LlavaNextVideoYiHfLoader (line 257) | class LlavaNextVideoYiHfLoader(LlavaNextVideoHfLoader):
method get_config (line 259) | def get_config(self, model_dir: str) -> PretrainedConfig:
class LlavaLoader (line 283) | class LlavaLoader(ModelLoader):
method get_config (line 286) | def get_config(self, model_dir: str):
method get_model (line 306) | def get_model(self, model_dir: str, config, processor, model_kwargs) -...
class Llama3LlavaNextLoader (line 340) | class Llama3LlavaNextLoader(LlavaLoader):
class LlavaMistralLoader (line 361) | class LlavaMistralLoader(LlavaLoader):
class LlavaLlamaLoader (line 382) | class LlavaLlamaLoader(LlavaLoader):
class LlavaNextQwenLoader (line 401) | class LlavaNextQwenLoader(LlavaLoader):
class LlavaOnevisionLoader (line 421) | class LlavaOnevisionLoader(ModelLoader):
method get_config (line 423) | def get_config(self, model_dir: str) -> PretrainedConfig:
method get_model (line 428) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
FILE: swift/model/models/llm.py
class GrokLoader (line 15) | class GrokLoader(ModelLoader):
method get_processor (line 17) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
class PolyLMLoader (line 36) | class PolyLMLoader(ModelLoader):
method get_processor (line 38) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
class YuanLoader (line 58) | class YuanLoader(ModelLoader):
method get_processor (line 60) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
FILE: swift/model/models/mamba.py
class MambaLoader (line 13) | class MambaLoader(ModelLoader):
method get_model (line 15) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
FILE: swift/model/models/microsoft.py
class Phi3VisionLoader (line 16) | class Phi3VisionLoader(ModelLoader):
method get_processor (line 19) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
method get_model (line 25) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class Phi4MultimodalLoader (line 49) | class Phi4MultimodalLoader(ModelLoader):
method get_processor (line 51) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
method get_model (line 63) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class FlorenceLoader (line 84) | class FlorenceLoader(ModelLoader):
method get_model (line 86) | def get_model(self, model_dir: str, config, processor, model_kwargs) -...
class Phi3SmallLoader (line 117) | class Phi3SmallLoader(ModelLoader):
method get_model (line 119) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
FILE: swift/model/models/minicpm.py
function _patch_minicpmv_device_map (line 33) | def _patch_minicpmv_device_map(model) -> None:
class MiniCPMVLoader (line 58) | class MiniCPMVLoader(ModelLoader):
method get_model (line 60) | def get_model(self, model_dir: str, config, processor, model_kwargs) -...
class MiniCPMV2Loader (line 90) | class MiniCPMV2Loader(MiniCPMVLoader):
method get_model (line 92) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class MiniCPMO2Loader (line 133) | class MiniCPMO2Loader(MiniCPMV2Loader):
method get_model (line 135) | def get_model(self, model_dir: str, config, *args, **kwargs) -> PreTra...
FILE: swift/model/models/minimax.py
class MiniMaxVLLoader (line 17) | class MiniMaxVLLoader(ModelLoader):
method get_model (line 19) | def get_model(self, model_dir: str, config, processor, model_kwargs) -...
method get_processor (line 66) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
class MinimaxTextLoader (line 92) | class MinimaxTextLoader(ModelLoader):
method get_model (line 94) | def get_model(self, model_dir: str, config, processor, model_kwargs) -...
FILE: swift/model/models/mistral.py
class DevstralLoader (line 119) | class DevstralLoader(ModelLoader):
method get_processor (line 121) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
class Mistral3Loader (line 142) | class Mistral3Loader(ModelLoader):
method get_model (line 144) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class Mistral3_2506Loader (line 189) | class Mistral3_2506Loader(Mistral3Loader):
method get_processor (line 191) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
FILE: swift/model/models/mllm.py
class Idefics3Loader (line 20) | class Idefics3Loader(ModelLoader):
method get_model (line 22) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class PixtralLoader (line 45) | class PixtralLoader(ModelLoader):
method get_model (line 47) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class MolMoeLoader (line 70) | class MolMoeLoader(ModelLoader):
method get_model (line 72) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class MolmoLoader (line 107) | class MolmoLoader(ModelLoader):
method get_model (line 109) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class MegrezOmniLoader (line 136) | class MegrezOmniLoader(ModelLoader):
method get_model (line 138) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
method _get_model_processor (line 148) | def _get_model_processor(self, model_dir, config):
class JinaRerankerM0Loader (line 185) | class JinaRerankerM0Loader(ModelLoader):
method get_model (line 187) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class KeyeVLLoader (line 269) | class KeyeVLLoader(ModelLoader):
method get_processor (line 271) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
class DotsOCRLoader (line 312) | class DotsOCRLoader(ModelLoader):
method get_model (line 314) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class Sail2VLLoader (line 334) | class Sail2VLLoader(ModelLoader):
method get_model (line 336) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
FILE: swift/model/models/moonshot.py
class KimiVLLoader (line 13) | class KimiVLLoader(ModelLoader):
method get_model (line 15) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
FILE: swift/model/models/mplug.py
class MplugOwl2Loader (line 21) | class MplugOwl2Loader(ModelLoader):
method _get_model (line 23) | def _get_model(self, model_dir: str, vocab_size, *args, **kwargs) -> P...
method get_model (line 38) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
method get_processor (line 41) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
class MplugOwl2_1Loader (line 59) | class MplugOwl2_1Loader(QwenLoader, MplugOwl2Loader):
method get_model (line 61) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class MplugOwl3Loader (line 77) | class MplugOwl3Loader(ModelLoader):
method get_model (line 79) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
method _get_model_processor (line 104) | def _get_model_processor(self, model_dir, config):
class DocOwl2Loader (line 142) | class DocOwl2Loader(ModelLoader):
method _get_model_processor (line 144) | def _get_model_processor(self, model_dir, config):
FILE: swift/model/models/qwen.py
class QwenLoader (line 30) | class QwenLoader(ModelLoader):
method get_model (line 32) | def get_model(self, model_dir: str, config, processor, model_kwargs) -...
method _update_attn_impl (line 51) | def _update_attn_impl(self, config):
method get_processor (line 55) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
function _qwen_vl_audio_decode (line 115) | def _qwen_vl_audio_decode(self, *args, skip_special_tokens=False, **kwar...
function fix_qwen_inplace_bug (line 125) | def fix_qwen_inplace_bug(model) -> None:
class QwenAudioLoader (line 133) | class QwenAudioLoader(QwenLoader):
method get_model (line 135) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
method get_processor (line 140) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
function _qwen_vl_visual_block_forward (line 169) | def _qwen_vl_visual_block_forward(
class QwenVLLoader (line 185) | class QwenVLLoader(QwenLoader):
method get_model (line 187) | def get_model(self, model_dir: str, config, processor, model_kwargs) -...
method get_processor (line 217) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
function _get_new_read_video_func (line 639) | def _get_new_read_video_func(read_video_func, read_backend):
function patch_qwen_vl_utils (line 659) | def patch_qwen_vl_utils(vision_process):
function compat_qwen_vl_utils (line 709) | def compat_qwen_vl_utils(image_patch_size: int):
class Qwen2VLLoader (line 724) | class Qwen2VLLoader(ModelLoader):
method get_model (line 726) | def get_model(self, model_dir: str, config, processor, model_kwargs) -...
method _check_qwen_vl_utils (line 734) | def _check_qwen_vl_utils(self):
method get_processor (line 745) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
class Qwen2_5VLLoader (line 803) | class Qwen2_5VLLoader(Qwen2VLLoader):
method get_model (line 805) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
function patch_Qwen3VLMoeTextExperts_dtype (line 838) | def patch_Qwen3VLMoeTextExperts_dtype():
function _forward_qwen3_vl_or_qwen3_omni (line 852) | def _forward_qwen3_vl_or_qwen3_omni(
function _patch_deepstack_process (line 935) | def _patch_deepstack_process(model):
function _compat_qwen3_vl_mixed_data (line 958) | def _compat_qwen3_vl_mixed_data(model, processor, is_moe: bool = False):
class Qwen3VLLoader (line 1052) | class Qwen3VLLoader(Qwen2VLLoader):
method _check_qwen_vl_utils (line 1054) | def _check_qwen_vl_utils(self):
method get_model (line 1058) | def get_model(self, model_dir: str, config, processor, model_kwargs) -...
class Qwen3VLMoeLoader (line 1095) | class Qwen3VLMoeLoader(Qwen3VLLoader):
method get_model (line 1097) | def get_model(self, model_dir: str, config, processor, model_kwargs) -...
class Qwen3_5MoeLoader (line 1125) | class Qwen3_5MoeLoader(Qwen3VLLoader):
method get_model (line 1127) | def get_model(self, model_dir: str, config, processor, model_kwargs) -...
class Qwen3_5Loader (line 1157) | class Qwen3_5Loader(Qwen3VLLoader):
method get_model (line 1159) | def get_model(self, model_dir: str, config, processor, model_kwargs) -...
class Qwen2_5OmniLoader (line 1193) | class Qwen2_5OmniLoader(ModelLoader):
method _check_qwen_omni_utils (line 1195) | def _check_qwen_omni_utils(self):
method get_config (line 1204) | def get_config(self, model_dir):
method get_model (line 1214) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
method get_processor (line 1225) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
function _compat_qwen3_omni_mixed_data (line 1253) | def _compat_qwen3_omni_mixed_data(model, processor):
class Qwen3OmniLoader (line 1407) | class Qwen3OmniLoader(ModelLoader):
method _check_qwen_omni_utils (line 1409) | def _check_qwen_omni_utils(self):
method get_config (line 1413) | def get_config(self, model_dir: str):
method get_model (line 1423) | def get_model(self, model_dir: str, config, processor, model_kwargs) -...
method get_processor (line 1436) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
class MidashengLMLoader (line 1464) | class MidashengLMLoader(ModelLoader):
method get_model (line 1466) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class Qwen2AudioLoader (line 1487) | class Qwen2AudioLoader(ModelLoader):
method get_model (line 1489) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class OvisLoader (line 1512) | class OvisLoader(ModelLoader):
method get_processor (line 1514) | def get_processor(self, model_dir, config) -> Processor:
method get_model (line 1518) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class Ovis2_5Loader (line 1596) | class Ovis2_5Loader(ModelLoader):
method get_model (line 1598) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class Qwen3VLEmbLoader (line 1689) | class Qwen3VLEmbLoader(Qwen3VLLoader):
method _check_qwen_vl_utils (line 1691) | def _check_qwen_vl_utils(self):
class Qwen3VLRerankerLoader (line 1714) | class Qwen3VLRerankerLoader(Qwen3VLLoader):
method _check_qwen_vl_utils (line 1716) | def _check_qwen_vl_utils(self):
FILE: swift/model/models/skywork.py
class SkyworkLoader (line 14) | class SkyworkLoader(ModelLoader):
method get_processor (line 16) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
FILE: swift/model/models/stepfun.py
class GotOCR2Loader (line 15) | class GotOCR2Loader(ModelLoader):
method get_model (line 17) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class GotOCR2HfLoader (line 36) | class GotOCR2HfLoader(ModelLoader):
method get_model (line 38) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class StepAudioLoader (line 58) | class StepAudioLoader(ModelLoader):
method get_model (line 60) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
function _patch_step_audio2_mini (line 91) | def _patch_step_audio2_mini(model):
class StepAudio2MiniLoader (line 109) | class StepAudio2MiniLoader(ModelLoader):
method get_model (line 111) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
class Step3VLLoader (line 133) | class Step3VLLoader(ModelLoader):
method get_config (line 135) | def get_config(self, model_dir: str) -> PretrainedConfig:
method get_processor (line 140) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
method get_model (line 144) | def get_model(self, model_dir: str, config: PretrainedConfig, processo...
FILE: swift/model/models/telechat.py
class TeleChatLoader (line 12) | class TeleChatLoader(ModelLoader):
method get_model (line 14) | def get_model(self, model_dir: str, config, processor, **kwargs) -> Pr...
FILE: swift/model/models/tencent.py
class HunyuanVLLoader (line 12) | class HunyuanVLLoader(ModelLoader):
method get_config (line 14) | def get_config(self, model_dir: str):
method get_model (line 18) | def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
FILE: swift/model/models/valley.py
class ValleyLoader (line 15) | class ValleyLoader(ModelLoader):
method get_config (line 17) | def get_config(self, model_dir: str):
method get_model (line 27) | def get_model(self, model_dir: str, config, processor, model_kwargs) -...
FILE: swift/model/models/yi.py
class YiVLLoader (line 17) | class YiVLLoader(ModelLoader):
method get_config (line 19) | def get_config(self, model_dir: str) -> PretrainedConfig:
method get_processor (line 33) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
method get_model (line 36) | def get_model(self, model_dir: str, config, processor, **kwargs) -> Pr...
FILE: swift/model/npu_patcher.py
function _set_default_hccl_connect_timeout_for_npu (line 25) | def _set_default_hccl_connect_timeout_for_npu() -> None:
class NPUCastError (line 36) | class NPUCastError(RuntimeError):
function _get_first_parameter (line 40) | def _get_first_parameter(module: torch.nn.Module) -> torch.nn.Parameter ...
function _needs_fp32_cast_for_npu (line 46) | def _needs_fp32_cast_for_npu(
function _cast_to_fp32 (line 60) | def _cast_to_fp32(module: torch.nn.Module) -> torch.nn.Module:
function wrapped_fsdp2_prepare_model (line 81) | def wrapped_fsdp2_prepare_model(
function wrapped_prepare_fsdp2 (line 101) | def wrapped_prepare_fsdp2(
class NpuRMSNorm (line 117) | class NpuRMSNorm(nn.Module):
method __init__ (line 119) | def __init__(self, hidden_size, eps=1e-6):
method forward (line 124) | def forward(self, hidden_states):
method extra_repr (line 127) | def extra_repr(self):
function npu_apply_rotary_pos_emb (line 131) | def npu_apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueez...
function npu_swiglu_forward (line 140) | def npu_swiglu_forward(self, hidden_state):
class NpuGmmFunction (line 145) | class NpuGmmFunction(torch.autograd.Function):
method forward (line 148) | def forward(ctx, x, weight, group_list, split_size):
method backward (line 157) | def backward(ctx, grad_outputs):
function npu_moe_block_forward (line 172) | def npu_moe_block_forward(self, hidden_states: torch.Tensor) -> torch.Te...
class GmmFunction (line 236) | class GmmFunction(torch.autograd.Function):
method forward (line 239) | def forward(ctx, x, weight, group_list):
method backward (line 252) | def backward(ctx, grad_output):
class NpuMoeFused (line 275) | class NpuMoeFused:
method npu_moe_experts_forward (line 278) | def npu_moe_experts_forward(self, hidden_states: torch.Tensor, routing...
method npu_moe_sparse_block_forward (line 293) | def npu_moe_sparse_block_forward(self, hidden_states: torch.Tensor) ->...
function _setattr_path (line 306) | def _setattr_path(root: Any, path: str, value: Any) -> None:
function _apply_patch_map (line 314) | def _apply_patch_map(root: Any, patch_map: dict[str, Any]) -> None:
FILE: swift/model/patcher.py
function patch_fixed_float_dtype (line 29) | def patch_fixed_float_dtype(module: torch.nn.Module, dtype):
function patch_fixed_device (line 42) | def patch_fixed_device(module: torch.nn.Module, device):
function patch_output_clone (line 55) | def patch_output_clone(module: torch.nn.Module):
function patch_get_input_embeddings (line 64) | def patch_get_input_embeddings(model, embedding_keys: str):
function patch_output_normalizer (line 72) | def patch_output_normalizer(module: torch.nn.Module, model_meta):
function patch_output_to_input_device (line 102) | def patch_output_to_input_device(module: torch.nn.Module):
function patch_device_map (line 117) | def patch_device_map():
function patch_ignore_check_imports (line 134) | def patch_ignore_check_imports():
function get_lm_head_model (line 148) | def get_lm_head_model(model, model_meta=None, lm_heads=None):
function transformers_seq_cls_forward (line 168) | def transformers_seq_cls_forward(self, *args, origin_forward, padding_si...
function _patch_sequence_classification (line 243) | def _patch_sequence_classification(model, model_meta):
function patch_automodel_for_sequence_classification (line 270) | def patch_automodel_for_sequence_classification(model_info=None,
function patch_automodel (line 370) | def patch_automodel(model_info, model_meta, auto_model_cls, return_dummy...
function _get_max_memory (line 398) | def _get_max_memory(device_ids: List[int]) -> Dict[Union[int, str], int]:
function _sync_max_memory (line 416) | def _sync_max_memory(max_memory: Dict[Union[int, str], int]) -> Dict[Uni...
function patch_mp_ddp (line 436) | def patch_mp_ddp():
function patch_get_dynamic_module (line 477) | def patch_get_dynamic_module():
function patch_tp_plan (line 492) | def patch_tp_plan(load_model: bool):
function revert_padding_free (line 505) | def revert_padding_free(outputs: Dict[str, Any], inputs: Dict[str, Any],...
function gather_sequence_parallel_outputs (line 560) | def gather_sequence_parallel_outputs(
function patch_attach_align_device_hook_on_blocks (line 586) | def patch_attach_align_device_hook_on_blocks():
function patch_module_forward (line 600) | def patch_module_forward(module, new_forward):
FILE: swift/model/register.py
function register_model (line 31) | def register_model(model_meta: ModelMeta, *, exist_ok: bool = False) -> ...
function load_by_unsloth (line 45) | def load_by_unsloth(args):
function _patch_awq_compat (line 98) | def _patch_awq_compat(model_info):
function _set_property (line 120) | def _set_property(model, key):
function fix_do_sample_warning (line 133) | def fix_do_sample_warning(generation_config: GenerationConfig) -> None:
function get_model_list (line 143) | def get_model_list() -> List[str]:
class ModelLoader (line 160) | class ModelLoader(BaseModelLoader):
method __init__ (line 162) | def __init__(
method _postprocess_config (line 213) | def _postprocess_config(self, config):
method get_config (line 244) | def get_config(self, model_dir: str) -> PretrainedConfig:
method _get_tokenizer (line 248) | def _get_tokenizer(self, processor):
method get_processor (line 256) | def get_processor(self, model_dir: str, config: PretrainedConfig) -> P...
method get_model (line 266) | def get_model(self, model_dir: str, config: PretrainedConfig, processo...
method _patch_generative_reranker (line 325) | def _patch_generative_reranker(self, model, processor):
method _postprocess_model (line 334) | def _postprocess_model(self, model_dir, model):
method _add_new_special_tokens (line 349) | def _add_new_special_tokens(self, model, processor, config):
method _postprocess_processor (line 365) | def _postprocess_processor(self, processor: Processor):
method _compat_transformers5 (line 380) | def _compat_transformers5(self, model):
method _update_attn_impl (line 385) | def _update_attn_impl(self, config):
method _deepspeed_set_z3_leaf_modules (line 388) | def _deepspeed_set_z3_leaf_modules(self, model, z3_leaf_modules):
method _init_generation_config (line 442) | def _init_generation_config(self, model, model_dir):
method _get_model_processor (line 451) | def _get_model_processor(self, model_dir, config):
method load (line 458) | def load(self) -> Tuple[Optional[PreTrainedModel], Processor]:
class SentenceTransformersLoader (line 472) | class SentenceTransformersLoader(ModelLoader):
method get_model (line 474) | def get_model(self, model_dir: str, config, processor, model_kwargs) -...
class RewardModelLoader (line 493) | class RewardModelLoader(ModelLoader):
method get_model (line 495) | def get_model(self, model_dir: str, config, processor, model_kwargs) -...
function get_model_processor (line 501) | def get_model_processor(
function get_processor (line 618) | def get_processor(
FILE: swift/model/utils.py
class AttnImpl (line 26) | class AttnImpl:
method to_use_flash_attn (line 31) | def to_use_flash_attn(attn_impl: Optional[str], auto_value: _T = None)...
method update_attn_impl (line 37) | def update_attn_impl(config: PretrainedConfig,
function get_llm_model (line 55) | def get_llm_model(model: torch.nn.Module, model_meta=None, inner_backbon...
function use_submodel_func (line 90) | def use_submodel_func(model, submodel_name: str, func_list: Optional[Lis...
class InitModelStrategy (line 124) | class InitModelStrategy:
method is_uninitialized (line 127) | def is_uninitialized(param: torch.Tensor) -> bool:
method constant_init (line 153) | def constant_init(param: torch.Tensor, c: float = 0) -> None:
method uniform_init (line 157) | def uniform_init(param: torch.Tensor, a: float = -0.1, b: float = 0.1)...
method normal_init (line 161) | def normal_init(param: torch.Tensor, mean: float = 0.0, std: float = 0...
method _init_high_dim (line 165) | def _init_high_dim(param: torch.Tensor, init_func, *args, **kwargs) ->...
method xavier_uniform_init (line 173) | def xavier_uniform_init(param: torch.Tensor) -> None:
method xavier_normal_init (line 177) | def xavier_normal_init(param: torch.Tensor) -> None:
method kaiming_uniform_init (line 181) | def kaiming_uniform_init(param: torch.Tensor) -> None:
method kaiming_normal_init (line 186) | def kaiming_normal_init(param: torch.Tensor) -> None:
method orthogonal_init (line 190) | def orthogonal_init(param: torch.Tensor) -> None:
method init_parameters (line 205) | def init_parameters(model: nn.Module, init_strategy: str) -> None:
function get_default_device_map (line 224) | def get_default_device_map():
function get_default_torch_dtype (line 240) | def get_default_torch_dtype(torch_dtype: Optional[torch.dtype]):
function _patch_conv3d (line 261) | def _patch_conv3d():
function save_checkpoint (line 289) | def save_checkpoint(model: Optional[PreTrainedModel],
function get_ckpt_dir (line 328) | def get_ckpt_dir(model_dir: str, adapters_dir: Optional[List[str]]) -> str:
FILE: swift/optimizers/base.py
class OptimizerCallback (line 13) | class OptimizerCallback:
method __init__ (line 28) | def __init__(self, args: 'TrainingArguments', trainer: 'Trainer'):
method create_optimizer_and_scheduler (line 32) | def create_optimizer_and_scheduler(self, num_training_steps: int) -> N...
method create_optimizer (line 50) | def create_optimizer(self) -> Optimizer:
method create_scheduler (line 53) | def create_scheduler(self, num_training_steps: int, optimizer: Optimiz...
FILE: swift/optimizers/galore/adafactor.py
class Adafactor (line 11) | class Adafactor(Optimizer):
method __init__ (line 96) | def __init__(
method _get_lr (line 129) | def _get_lr(param_group, param_state):
method _get_options (line 140) | def _get_options(param_group, param_shape):
method _rms (line 146) | def _rms(tensor):
method _approx_sq_grad (line 150) | def _approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col):
method step (line 158) | def step(self, closure=None):
FILE: swift/optimizers/galore/adamw.py
class AdamW (line 13) | class AdamW(Optimizer):
method __init__ (line 35) | def __init__(
method step (line 58) | def step(self, closure: Callable = None):
FILE: swift/optimizers/galore/adamw8bit.py
class AdamW8bit (line 9) | class AdamW8bit(Optimizer2State):
method __init__ (line 11) | def __init__(self,
method step (line 39) | def step(self, closure=None):
FILE: swift/optimizers/galore/galore_projector.py
class GaLoreProjector (line 6) | class GaLoreProjector:
method __init__ (line 8) | def __init__(self, rank, verbose=False, update_proj_gap=200, scale=1.0...
method project (line 16) | def project(self, full_rank_grad, iter):
method project_back (line 51) | def project_back(self, low_rank_grad):
method get_orthogonal_matrix (line 73) | def get_orthogonal_matrix(self, weights, rank, type):
FILE: swift/optimizers/galore/utils.py
class GaLoreConfig (line 27) | class GaLoreConfig:
class GaloreOptimizerWrapper (line 59) | class GaloreOptimizerWrapper(Optimizer):
method __init__ (line 61) | def __init__(self, optimizers: Dict[Any, Optimizer]):
method zero_grad (line 65) | def zero_grad(self, *args, **kwargs) -> None:
method step (line 69) | def step(self, *args, **kwargs) -> None:
class GaloreSchedulerWrapper (line 74) | class GaloreSchedulerWrapper(LRScheduler):
method __init__ (line 76) | def __init__(self, lr_schedulers: Dict[Any, LRScheduler]):
method step (line 79) | def step(self, *args, **kwargs) -> None:
function _create_optimizer_and_scheduler (line 85) | def _create_optimizer_and_scheduler(model: nn.Module, args: 'TrainingArg...
function get_optimizer (line 175) | def get_optimizer(args: 'TrainingArguments', config: GaLoreConfig) -> Tu...
class GaloreOptimizerCallback (line 222) | class GaloreOptimizerCallback(OptimizerCallback):
method create_optimizer_and_scheduler (line 224) | def create_optimizer_and_scheduler(self, num_training_steps: int):
FILE: swift/optimizers/lorap.py
class LorapOptimizerCallback (line 7) | class LorapOptimizerCallback(OptimizerCallback):
method create_optimizer (line 9) | def create_optimizer(self) -> Optimizer:
FILE: swift/optimizers/multimodal.py
function get_param_startswith (line 13) | def get_param_startswith(model,
class MultimodalOptimizerCallback (line 43) | class MultimodalOptimizerCallback(OptimizerCallback):
method create_optimizer (line 45) | def create_optimizer(self):
FILE: swift/optimizers/muon.py
class MuonOptimizerCallback (line 8) | class MuonOptimizerCallback(OptimizerCallback):
method create_optimizer (line 10) | def create_optimizer(self):
FILE: swift/optimizers/muonclip.py
class _MaxLogitsTracker (line 15) | class _MaxLogitsTracker:
method _get_and_reset (line 38) | def _get_and_reset(cls) -> Optional[float]:
method _update (line 44) | def _update(cls, v: float):
method enable_softmax (line 52) | def enable_softmax(cls):
method enable_sdpa (line 86) | def enable_sdpa(cls):
method enable_flash_attn (line 123) | def enable_flash_attn(cls):
method enable_all (line 170) | def enable_all(cls):
method consume (line 179) | def consume(cls) -> Optional[float]:
class MuonClip (line 183) | class MuonClip(Optimizer):
method __init__ (line 192) | def __init__(
method newton_schulz (line 219) | def newton_schulz(G: torch.Tensor, steps: int = 5, eps: float = 1e-7) ...
method _is_qk_w
Condensed preview — 1228 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (7,854K chars).
[
{
"path": ".dev_scripts/build_docs.sh",
"chars": 166,
"preview": "pip install -r requirements/docs.txt\ncd docs\nrm -rf build\n\n# update api rst\n#rm -rf source/api/\n#sphinx-apidoc --module-"
},
{
"path": ".dev_scripts/ci_container_test.sh",
"chars": 1844,
"preview": "if [ \"$MODELSCOPE_SDK_DEBUG\" == \"True\" ]; then\n # pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/s"
},
{
"path": ".dev_scripts/dockerci.sh",
"chars": 3695,
"preview": "#!/bin/bash\nMODELSCOPE_CACHE_DIR_IN_CONTAINER=/modelscope_cache\nCODE_DIR=$PWD\nCODE_DIR_IN_CONTAINER=/ms-swift\nMODELSCOPE"
},
{
"path": ".dev_scripts/dockerci_npu.sh",
"chars": 1666,
"preview": "#!/bin/bash\nMODELSCOPE_CACHE_DIR=/modelscope_cache\nCODE_DIR=$PWD\nMODELSCOPE_SDK_DEBUG=True\necho \"$USER\"\ngpus='0,1 2,3'\ni"
},
{
"path": ".github/ISSUE_TEMPLATE/1-bug-report.yml",
"chars": 1395,
"preview": "name: \"🐛 Bug Report\"\ndescription: Create a bug report to help us improve ms-swift\nlabels: [\"bug\"]\n\nbody:\n - type: markd"
},
{
"path": ".github/ISSUE_TEMPLATE/2-feature-request.yml",
"chars": 1110,
"preview": "name: \"🚀 Feature Request\"\ndescription: Submit a request for a new feature\nlabels: [\"enhancement\"]\n\nbody:\n - type: markd"
},
{
"path": ".github/ISSUE_TEMPLATE/3-question-discussion.yml",
"chars": 815,
"preview": "name: \"🤔 Question & Discussion\"\ndescription: Create an issue for questions and discussions\nlabels: [\"question\"]\n\nbody:\n "
},
{
"path": ".github/ISSUE_TEMPLATE/config.yml",
"chars": 28,
"preview": "blank_issues_enabled: false\n"
},
{
"path": ".github/PULL_REQUEST_TEMPLATE.md",
"chars": 241,
"preview": "# PR type\n- [ ] Bug Fix\n- [ ] New Feature\n- [ ] Document Updates\n- [ ] More Models or Datasets Support\n\n# PR information"
},
{
"path": ".github/SECURITY.md",
"chars": 369,
"preview": "# Reporting Security Issues\n\nUsually security issues of a deep learning project come from non-standard 3rd packages or c"
},
{
"path": ".github/workflows/citest.yaml",
"chars": 2074,
"preview": "name: citest\n\non:\n push:\n branches:\n - master\n - \"release/**\"\n paths-ignore:\n - \"setup.*\"\n - "
},
{
"path": ".github/workflows/citest_npu.yaml",
"chars": 2398,
"preview": "name: citest-npu\n\non:\n push:\n branches:\n - master\n - \"release/**\"\n paths-ignore:\n - \"setup.*\"\n "
},
{
"path": ".github/workflows/close_tale_issue.yaml",
"chars": 725,
"preview": "name: Close Stale Issues\non:\n schedule:\n - cron: '0 0 * * *'\n workflow_dispatch:\n\njobs:\n close-stale:\n runs-on:"
},
{
"path": ".github/workflows/lint.yaml",
"chars": 486,
"preview": "name: Lint test\n\non: [push, pull_request]\n\nconcurrency:\n group: ${{ github.workflow }}-${{ github.ref }}\n cancel-in-pr"
},
{
"path": ".github/workflows/publish.yaml",
"chars": 761,
"preview": "name: release\n\non:\n push:\n tags:\n - 'v**'\n\nconcurrency:\n group: ${{ github.workflow }}-${{ github.ref }}-publi"
},
{
"path": ".gitignore",
"chars": 1660,
"preview": "# Byte-compiled / optimized / DLL files\ntmp\n*.ttf\n__pycache__/\n*.py[cod]\n*$py.class\ntest.py\n# C extensions\n*.so\n\n# Distr"
},
{
"path": ".pre-commit-config.yaml",
"chars": 625,
"preview": "repos:\n - repo: https://github.com/pycqa/flake8.git\n rev: 7.3.0\n hooks:\n - id: flake8\n - repo: https://gith"
},
{
"path": ".pre-commit-config_local.yaml",
"chars": 591,
"preview": "repos:\n - repo: /home/admin/pre-commit/flake8\n rev: 7.3.0\n hooks:\n - id: flake8\n - repo: /home/admin/pre-co"
},
{
"path": "CODE_OF_CONDUCT.md",
"chars": 5486,
"preview": "# Contributor Covenant Code of Conduct\n\n## Our Pledge\n\nWe as members, contributors, and leaders pledge to make participa"
},
{
"path": "CONTRIBUTING.md",
"chars": 4356,
"preview": "# Contributor Guide\n\n_Welcome to offer PRs, bug reports, documentation supplements or other types of contributions to SW"
},
{
"path": "CONTRIBUTING_CN.md",
"chars": 1750,
"preview": "# 贡献者指引\n\n*欢迎帮SWIFT提供Feature PR、Bug反馈、文档补充或其他类型的贡献!*\n\n## 目录\n\n- [代码规约](#-代码规约)\n- [贡献流程](#-贡献流程)\n- [资源支持](#-资源支持)\n\n## 📖 代码规"
},
{
"path": "LICENSE",
"chars": 11357,
"preview": " Apache License\n Version 2.0, January 2004\n "
},
{
"path": "MANIFEST.in",
"chars": 37,
"preview": "recursive-include requirements *.txt\n"
},
{
"path": "Makefile",
"chars": 359,
"preview": "WHL_BUILD_DIR :=package\nDOC_BUILD_DIR :=docs/build/\n\n# default rule\ndefault: whl docs\n\n.PHONY: docs\ndocs:\n\tbash .dev_scr"
},
{
"path": "README.md",
"chars": 34674,
"preview": "# SWIFT (Scalable lightWeight Infrastructure for Fine-Tuning)\n\n<p align=\"center\">\n <br>\n <img src=\"asset/banner.pn"
},
{
"path": "README_CN.md",
"chars": 23513,
"preview": "# SWIFT (Scalable lightWeight Infrastructure for Fine-Tuning)\n\n<p align=\"center\">\n <br>\n <img src=\"asset/banner.pn"
},
{
"path": "docs/Makefile",
"chars": 638,
"preview": "# Minimal makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line, and also\n# from the "
},
{
"path": "docs/README.md",
"chars": 1482,
"preview": "## maintain docs\n1. build docs\n ```shell\n # in root directory:\n make docs\n ```\n\n2. doc string format\n\n We"
},
{
"path": "docs/make.bat",
"chars": 764,
"preview": "@ECHO OFF\n\npushd %~dp0\n\nREM Command file for Sphinx documentation\n\nif \"%SPHINXBUILD%\" == \"\" (\n\tset SPHINXBUILD=sphinx-bu"
},
{
"path": "docs/source/.readthedocs.yaml",
"chars": 770,
"preview": "# .readthedocs.yaml\n# Read the Docs configuration file\n# See https://docs.readthedocs.io/en/stable/config-file/v2.html f"
},
{
"path": "docs/source/BestPractices/Elastic.md",
"chars": 6287,
"preview": "# Elastic\n\n\n\n## 安装依赖\n\n集群部署K8S,并在集群中部署DLrover,[DLRover](https://github.com/intelligent-machine-learning/dlrover),\n`pip in"
},
{
"path": "docs/source/BestPractices/Embedding.md",
"chars": 8792,
"preview": "# Embedding训练\n\nSWIFT已经支持Embedding模型的训练,包括纯文本和多模态两个类型。目前已经支持的模型有:\n\n1. modernbert embedding模型\n - [ModelScope](https://mo"
},
{
"path": "docs/source/BestPractices/GRPO-Code-Training.md",
"chars": 6955,
"preview": "# GRPO代码训练\n本文档介绍如何使用GRPO对模型进行代码训练\n\n模型:[Qwen/Qwen2.5-7B-Instruct](https://www.modelscope.cn/models/Qwen/Qwen2.5-VL-7B-Ins"
},
{
"path": "docs/source/BestPractices/GRPO-Multi-Modal-Training.md",
"chars": 13348,
"preview": "# 多模态GRPO完整实验流程\n本文介绍如何使用SWIFT GRPO进行多模态模型和任务的训练。目标是对多个多模态任务进行训练,提升任务精度,任务定义和训练参数等参考了 [R1-V](https://github.com/Deep-Agen"
},
{
"path": "docs/source/BestPractices/GRPO.md",
"chars": 12568,
"preview": "# GRPO完整实验流程\n\n本文从较为简单的数学任务 Coundown Game 出发,从数据集定义、奖励函数定义和GRPO训练几个步骤介绍完整的GRPO训练流程。任务定义和训练参数等参考了 [mini-deepseek-r1](https"
},
{
"path": "docs/source/BestPractices/MLLM-Registration.md",
"chars": 28218,
"preview": "\n# 注册多模态模型最佳实践\n\n本文将介绍如何在ms-swift中注册多模态模型,并成功推理和训练。本文将以Qwen2.5-Omni为例子,注册新的model_type和template `my_qwen2_5_omni`,并支持文本、图片"
},
{
"path": "docs/source/BestPractices/Metax-support.md",
"chars": 17580,
"preview": "# Metax支持\n\n## 1. 在 Metax 平台上使用 Swift\n你可以选择构建自己的镜像,也可以直接拉取已有的预构建镜像。本文以拉取预构建镜像为例,演示如何在 Metax 上使用 ms-swift。\n### 1.1. 启动 ms-"
},
{
"path": "docs/source/BestPractices/More-Best-Practices.md",
"chars": 343,
"preview": "\n# 更多最佳实践\n\n- [Qwen2.5自我认知微调](https://github.com/modelscope/ms-swift/tree/main/examples/notebook/qwen2_5-self-cognition)\n"
},
{
"path": "docs/source/BestPractices/NPU-support.md",
"chars": 13900,
"preview": "# NPU支持\n\n我们在 ms-swift 上增加了对昇腾 NPU 的支持,用户可以在昇腾 NPU 上进行模型的微调和推理。\n\n本文档介绍了如何在昇腾 NPU 上进行环境准备、模型微调、推理和部署。\n\n## 安装\n\n基础环境准备:\n\n| s"
},
{
"path": "docs/source/BestPractices/Qwen3-Best-Practice.md",
"chars": 13247,
"preview": "# Qwen3最佳实践\n\n讨论区:[issue 4030](https://github.com/modelscope/ms-swift/issues/4030)\n\nQwen文档: [https://qwen.readthedocs.io/"
},
{
"path": "docs/source/BestPractices/Qwen3-VL-Best-Practice.md",
"chars": 14710,
"preview": "\n# Qwen3-VL最佳实践\n\n## 环境准备\n\n在开始推理和训练之前,请确保您的环境已准备就绪。\n\n```shell\npip install \"transformers>=4.57\" \"qwen_vl_utils>=0.0.14\"\n\np"
},
{
"path": "docs/source/BestPractices/Qwen3_5-Best-Practice.md",
"chars": 18287,
"preview": "# Qwen3.5 最佳实践\n\nms-swift 支持使用transformers/Megatron后端对[Qwen3.5](https://github.com/QwenLM/Qwen3.5) Dense/Moe模型进行训练。Qwen3."
},
{
"path": "docs/source/BestPractices/Rapidly-Training-VL-model.md",
"chars": 6101,
"preview": "# 快速训练VL模型\n\n本文档提供从零开始快速训练视觉语言(Vision-Language, VL)模型的最佳实践。\n\n涉及的模型链接:\n- [Qwen2.5-VL-7B-Instruct](https://www.modelscope.c"
},
{
"path": "docs/source/BestPractices/Reranker.md",
"chars": 5231,
"preview": "# Reranker训练\n\nSWIFT已经支持Reranker模型的训练,目前已经支持的模型有:\n\n1. modernbert reranker模型\n - [ModelScope](https://www.modelscope.cn/m"
},
{
"path": "docs/source/Customization/Architecture.md",
"chars": 10495,
"preview": "# 架构介绍\n\nms-swift 4.0 采用模块化设计,各功能模块分布在一级目录下,便于开发者进行自定义扩展。本文档将详细介绍各模块的功能及自定义方法。\n\n## Agent Template\n\nagent模板的mapping文件可以参考["
},
{
"path": "docs/source/Customization/Custom-dataset.md",
"chars": 22534,
"preview": "# 自定义数据集\n\n自定义数据集的接入方法有三种,对预处理函数的控制能力逐渐加强,但接入难度逐步增加。例如,方案一最为方便,但对预处理函数的控制能力最弱,需要预先对数据集进行转换,传入特定格式的数据集:\n1. 【推荐】直接使用命令行传参的方"
},
{
"path": "docs/source/Customization/Custom-model.md",
"chars": 3017,
"preview": "# 自定义模型\n\nms-swift内置的模型,你可以直接通过指定model_id或者model_path来使用:`--model <model_id_or_path>`。ms-swift会根据model_id/model_path的后缀和`"
},
{
"path": "docs/source/GetStarted/Quick-start.md",
"chars": 4459,
"preview": "# 快速开始\n\n🍲 ms-swift是魔搭社区提供的大模型与多模态大模型微调部署框架,现已支持600+纯文本大模型与400+多模态大模型的训练(预训练、微调、人类对齐)、推理、评测、量化与部署。其中大模型包括:Qwen3、Qwen3.5、I"
},
{
"path": "docs/source/GetStarted/SWIFT-installation.md",
"chars": 8437,
"preview": "# SWIFT安装\n\n## Wheel包安装\n\n可以使用pip进行安装:\n\n```shell\n# 推荐\npip install 'ms-swift' -U\n# 使用评测\npip install 'ms-swift[eval]' -U\n# 全"
},
{
"path": "docs/source/GetStarted/Web-UI.md",
"chars": 1106,
"preview": "# Web-UI\n\n目前SWIFT已经支持了界面化的训练和推理,参数支持和脚本训练相同。在安装SWIFT后,使用如下命令:\n\n```shell\nswift web-ui --lang zh\n# or en\nswift web-ui --la"
},
{
"path": "docs/source/Instruction/Agent-support.md",
"chars": 10638,
"preview": "# Agent支持\n\n## 数据集格式\n\nms-swift 使用 agent-template 实现了Agent数据格式与模型的解耦:基于统一的数据集格式,可以灵活切换不同模型进行训练,无需修改数据。\n\n纯文本Agent和多模态Agent的"
},
{
"path": "docs/source/Instruction/Command-line-parameters.md",
"chars": 58594,
"preview": "# 命令行参数\n\n命令行参数的介绍会分为基本参数,原子参数、集成参数和特定模型参数。**命令行最终使用的参数列表为集成参数。集成参数继承自基本参数和一些原子参数**。特定模型参数是针对于具体模型的参数,可以通过`--model_kwargs"
},
{
"path": "docs/source/Instruction/Evaluation.md",
"chars": 9974,
"preview": "# 评测\n\nSWIFT支持了eval(评测)能力,用于对原始模型和训练后的模型给出标准化的评测指标。\n\n## 能力介绍\n\nSWIFT的eval能力使用了魔搭社区[评测框架EvalScope](https://github.com/model"
},
{
"path": "docs/source/Instruction/Export-and-push.md",
"chars": 2253,
"preview": "# 导出与推送\n\n\n## Merge LoRA\n\n- 查看[这里](https://github.com/modelscope/ms-swift/blob/main/examples/export/merge_lora.sh)。\n\n## 量"
},
{
"path": "docs/source/Instruction/Frequently-asked-questions.md",
"chars": 24748,
"preview": "# 常见问题整理\n\n下面是SWIFT使用过程中遇到的一些常见问题。\n\n## 训练\n\nSWIFT支持的训练方法包括预训练、指令监督微调、偏好学习、GRPO、Embedding、Reranker、序列分类任务等,详见[主页](https://g"
},
{
"path": "docs/source/Instruction/GKD.md",
"chars": 9908,
"preview": "# GKD\n\nGKD(Generalized Knowledge Distillation,广义知识蒸馏)训练算法由论文 [On-Policy Distillation of Language Models: Learning from S"
},
{
"path": "docs/source/Instruction/GRPO/AdvancedResearch/CHORD.md",
"chars": 1813,
"preview": "# On-Policy RL Meets Off-Policy Experts: Harmonizing SFT and RL via Dynamic Weighting (CHORD)\n\n**版本依赖**:ms-swift>=3.9\n\n本"
},
{
"path": "docs/source/Instruction/GRPO/AdvancedResearch/CISPO.md",
"chars": 1968,
"preview": "# Clipped Importance Sampling Policy Optimization (CISPO)\n\n**版本依赖**:ms-swift>=3.11\n\nClipped Importance Sampling Policy O"
},
{
"path": "docs/source/Instruction/GRPO/AdvancedResearch/DAPO.md",
"chars": 2471,
"preview": "# DAPO: An Open-Source LLM Reinforcement Learning System at Scale\n\n\n\n[Decoupled Clip and Dynamic sAmpling Policy Optimiz"
},
{
"path": "docs/source/Instruction/GRPO/AdvancedResearch/GSPO.md",
"chars": 2617,
"preview": "# Group Sequence Policy Optimization\n\n**版本依赖**:ms-swift>=3.7\n\n[Group Sequence Policy Optimization](https://arxiv.org/abs"
},
{
"path": "docs/source/Instruction/GRPO/AdvancedResearch/REINFORCEPP.md",
"chars": 2176,
"preview": "# REINFORCE++: An Efficient RLHF Algorithm with Robustness to Both Prompt and Reward Models\n\n**版本依赖**:ms-swift>=3.10\n\n[R"
},
{
"path": "docs/source/Instruction/GRPO/AdvancedResearch/RLOO.md",
"chars": 2370,
"preview": "# REINFORCE Leave-One-Out (RLOO)\n\n**版本依赖**:ms-swift>=3.10\n\n[REINFORCE Leave-One-Out (RLOO)](https://arxiv.org/abs/2402.1"
},
{
"path": "docs/source/Instruction/GRPO/AdvancedResearch/SAPO.md",
"chars": 2370,
"preview": "# Soft Adaptive Policy Optimization (SAPO)\n\n**版本依赖**:ms-swift>=3.11\n\n[Soft Adaptive Policy Optimization (SAPO)](https://"
},
{
"path": "docs/source/Instruction/GRPO/AdvancedResearch/deepeyes.md",
"chars": 3305,
"preview": "# DeepEyes: Incentivizing \"Thinking with Images\" via Reinforcement Learning\n\n**版本依赖**:ms-swift>=3.7\n\n## 原理介绍\n\n[DeepEyes论"
},
{
"path": "docs/source/Instruction/GRPO/AdvancedResearch/entropy_mask.md",
"chars": 1418,
"preview": "# Beyond the 80/20 Rule: High-Entropy Minority Tokens Drive Effective Reinforcement Learning for LLM Reasoning\n\n**版本依赖**"
},
{
"path": "docs/source/Instruction/GRPO/AdvancedResearch/index.rst",
"chars": 231,
"preview": "Advanced Research\n===============\n.. toctree::\n :maxdepth: 1\n\n entropy_mask.md\n CISPO.md\n DAPO.md\n deepeyes.md"
},
{
"path": "docs/source/Instruction/GRPO/AdvancedResearch/training_inference_mismatch.md",
"chars": 6417,
"preview": "# Training-Inference-Mismatch\n\n**版本依赖**:ms-swift>=3.11\n\n**TL;DR**: GRPO 引入 vLLM 加速采样过程的同时,也引入了训练-推理不一致(Training-Inferenc"
},
{
"path": "docs/source/Instruction/GRPO/AdvancedResearch/treepo.md",
"chars": 3163,
"preview": "# TreePO: Bridging the Gap of Policy Optimization and Efficacy and Inference Efficiency with Heuristic Tree-based Modeli"
},
{
"path": "docs/source/Instruction/GRPO/DeveloperGuide/gym_env.md",
"chars": 6397,
"preview": "# GYM环境训练\n\n**注意** GYM环境训练逻辑已在 ms-swift 3.8 中进行重构,如果您的 ms-swift 版本低于该版本,请参考对应版本的文档。\n\n## Gym接口\n\nGYM源自于[OpenAI Gym](https:/"
},
{
"path": "docs/source/Instruction/GRPO/DeveloperGuide/index.rst",
"chars": 168,
"preview": "Developer Guide\n===============\n.. toctree::\n :maxdepth: 1\n\n loss_types.md\n multi_turn.md\n multi_task.md\n rewa"
},
{
"path": "docs/source/Instruction/GRPO/DeveloperGuide/loss_types.md",
"chars": 2715,
"preview": "# Loss Types\n\nGRPO训练支持多种不同的loss类型,主要区别在于归一化的维度和梯度处理方式上有所不同。\n\n## 损失函数\n\ntoken 级别上,GRPO 训练使用以下损失函数\n\n$$\\mathcal{L}_{i,t} = -"
},
{
"path": "docs/source/Instruction/GRPO/DeveloperGuide/multi_task.md",
"chars": 1672,
"preview": "# 多任务训练\n我们可以在数据集中添加一个用于标识任务类型的列,并在奖励函数/奖励模型插件中根据任务类型进行判断,从而实现多任务训练。假设数据集中包含数学和编程任务,比如:\n\n```\n {\"query\": \"Solve the equ"
},
{
"path": "docs/source/Instruction/GRPO/DeveloperGuide/multi_turn.md",
"chars": 10420,
"preview": "# 多轮训练\n\n**注意** 多轮训练逻辑已在 ms-swift 3.8 中进行重构,如果您的 ms-swift 版本低于该版本,请参考对应版本的文档。\n\n在强化学习训练场景中,模型采样可能需要与环境进行多轮交互(如工具调用)。这种交互式训"
},
{
"path": "docs/source/Instruction/GRPO/DeveloperGuide/reward_function.md",
"chars": 5434,
"preview": "# 奖励函数\n## 自定义奖励函数\n奖励函数接受模型生成的文本 completions 其他数据集中的列以及训练器状态作为参数(kwargs)进行打分, 其中[训练器状态](https://huggingface.co/docs/trans"
},
{
"path": "docs/source/Instruction/GRPO/DeveloperGuide/reward_model.md",
"chars": 4741,
"preview": "# 奖励模型\n\n默认情况下,奖励模型是指具有分类头数值输出的模型,通常称为输出奖励模型(ORM)。这些模型会对其他模型的输出进行评分,从而生成一个标量值,表示模型响应的质量。\n\n我们可以通过使用参数 `reward_models` 来加载具"
},
{
"path": "docs/source/Instruction/GRPO/GetStarted/GRPO.md",
"chars": 11419,
"preview": "# GRPO\n\nGRPOTrainer在ms-swift3.5进行了代码重构,如果你使用的swift版本<3.5, 请参考[stable文档](https://github.com/modelscope/ms-swift/blob/v3.4"
},
{
"path": "docs/source/Instruction/GRPO/GetStarted/index.rst",
"chars": 69,
"preview": "Get Started\n===============\n.. toctree::\n :maxdepth: 1\n\n GRPO.md\n"
},
{
"path": "docs/source/Instruction/GRPO/index.rst",
"chars": 280,
"preview": "GRPO\n===============\n.. toctree::\n :maxdepth: 2\n :caption: Get Started\n\n GetStarted/index.rst\n\n.. toctree::\n :ma"
},
{
"path": "docs/source/Instruction/Inference-and-deployment.md",
"chars": 14472,
"preview": "# 推理和部署\n\n以下为swift支持的推理引擎以及接入部分的相应能力,三种推理加速引擎为SWIFT的推理、部署、评测模块提供推理加速:\n\n| 推理加速引擎 | OpenAI API | 多模态 | 量化模型 | 多LoRA | QLoR"
},
{
"path": "docs/source/Instruction/Pre-training-and-Fine-tuning.md",
"chars": 17820,
"preview": "# 预训练与微调\n\n训练能力:\n\n| 方法 | 全参数 | LoRA "
},
{
"path": "docs/source/Instruction/RLHF.md",
"chars": 4425,
"preview": "# 人类对齐\n\n本文档提供了各种人类偏好对齐算法的训练脚本。若您希望深入了解更详尽的算法信息及其选择方法,请参考[文档](https://github.com/modelscope/modelscope-classroom/blob/mai"
},
{
"path": "docs/source/Instruction/Ray.md",
"chars": 4288,
"preview": "# ray的支持\n\nSWIFT已经支持使用ray来进行多卡或多节点训练。已有功能中对ray的支持情况如下:\n\n| 功能 | 支持ray | 例子 "
},
{
"path": "docs/source/Instruction/Reinforced-Fine-tuning.md",
"chars": 3343,
"preview": "# 强化微调\n\n强化微调是目前模型训练非常重要的功能之一,它本身的实现是多种多样的,SWIFT目前已经支持了强化微调所需要的原子能力,如采样、强化学习和微调。目前我们提供了拒绝采样微调的一个具体示例,可以查看[这里](https://git"
},
{
"path": "docs/source/Instruction/Sample.md",
"chars": 2572,
"preview": "# 采样\n\n采样是SWIFT新支持的重要能力之一,这部分可以理解为`test-time compute`的落地实现。同时,该能力对RFT(强化微调)的实现也至关重要。\n\n## 能力介绍\n\nSWIFT的sample能力可以使用下面的例子进行:"
},
{
"path": "docs/source/Instruction/Supported-models-and-datasets.md",
"chars": 295601,
"preview": "# 支持的模型和数据集\n\n> [!WARNING]\n> 本文档适配swift4.0.0,若要查看swift3.x模型对应的model_type和template,请查看[v3.12文档](https://swift.readthedocs."
},
{
"path": "docs/source/Instruction/Use-tuners.md",
"chars": 5741,
"preview": "# 使用Tuners\n\ntuner是指附加在模型上的额外结构部分,用于减少训练参数量或者提高训练精度。目前SWIFT支持的tuners有:\n\n- LoRA: [LORA: LOW-RANK ADAPTATION OF LARGE LANGU"
},
{
"path": "docs/source/Megatron-SWIFT/Ascend.md",
"chars": 6439,
"preview": "# Ascend NPU\n\n关于Megatron-SWIFT在Ascend NPU上的环境准备,请参考[NPU最佳实践](../BestPractices/NPU-support.md)。\n\n## NPU 性能数据采集\n\nNPU性能采集通过"
},
{
"path": "docs/source/Megatron-SWIFT/Command-line-parameters.md",
"chars": 29296,
"preview": "# 命令行参数\n\n## Megatron参数\n\n**训练参数**:\n- 🔥micro_batch_size: 每个device的批次大小,默认为1。\n- 🔥global_batch_size: 总批次大小,等价于`micro_batch_s"
},
{
"path": "docs/source/Megatron-SWIFT/GKD.md",
"chars": 2381,
"preview": "# GKD\n\n**版本依赖**:ms-swift >= 3.12\n\n如果你是首次使用 GKD,请先参考 [GKD文档](../Instruction/GKD.md)。\n\nGKD(Generalized Knowledge Distillat"
},
{
"path": "docs/source/Megatron-SWIFT/GRPO.md",
"chars": 2553,
"preview": "# GRPO\n\n**版本依赖**:ms-swift >= 3.11\n\n如果你是首次使用 GRPO,请先参考 [GRPO文档](../Instruction/GRPO/GetStarted/GRPO.md)。\n\nMegatron GRPO 当"
},
{
"path": "docs/source/Megatron-SWIFT/LoRA-Training.md",
"chars": 5926,
"preview": "# LoRA训练\n\nQwen3-235B-A22B-Instruct-250718 单机8卡H20 LoRA训练的最佳实践参考:[https://github.com/modelscope/ms-swift/pull/5033](https"
},
{
"path": "docs/source/Megatron-SWIFT/Mcore-Bridge.md",
"chars": 11332,
"preview": "# Mcore Bridge\n\nMegatron 以其卓越的训练速度和丰富的并行技术而著称,但也因此带来了较高的使用门槛。因此mcore-bridge 应运而生,旨在让 Megatron 训练像 transformers 一样简单易用。通过"
},
{
"path": "docs/source/Megatron-SWIFT/Multimodal-Model.md",
"chars": 5240,
"preview": "# 多模态模型\n\nms-swift引入了Megatron的并行技术来加速多模态大模型的训练。目前支持Qwen3-VL, Qwen3-Omni, InternVL3.5, GLM4.5v, Kimi-VL等模型的CPT/SFT/GRPO/DP"
},
{
"path": "docs/source/Megatron-SWIFT/Quick-start.md",
"chars": 9852,
"preview": "\n# 快速开始\n\nms-swift引入了Megatron的并行技术来加速大模型的训练,包括数据并行、张量并行、流水线并行、序列并行,上下文并行,专家并行。支持Qwen3、Qwen3.5、Deepseek-R1、GLM4.5、GPT-OSS等"
},
{
"path": "docs/source/_templates/autosummary/class.rst",
"chars": 184,
"preview": ".. currentmodule:: {{ module }}\n\n\n{{ name | underline}}\n\n.. autoclass:: {{ name }}\n :inherited-members:\n :members:"
},
{
"path": "docs/source/_templates/classtemplate.rst",
"chars": 243,
"preview": ".. currentmodule:: {{ module }}\n\n\n{{ name | underline}}\n\n.. autoclass:: {{ name }}\n :members:\n :special-members: _"
},
{
"path": "docs/source/_templates/sobolengine.rst",
"chars": 245,
"preview": ".. currentmodule:: {{ module }}\n\n\n{{ name | underline}}\n\n.. autoclass:: {{ name }}\n :members:\n :exclude-members: M"
},
{
"path": "docs/source/conf.py",
"chars": 4197,
"preview": "# Configuration file for the Sphinx documentation builder.\n#\n# This file only contains a selection of the most common op"
},
{
"path": "docs/source/index.rst",
"chars": 2024,
"preview": ".. swift documentation file,\n You can adapt this file completely to your liking, but it should at least\n contain the"
},
{
"path": "docs/source_en/.readthedocs.yaml",
"chars": 773,
"preview": "# .readthedocs.yaml\n# Read the Docs configuration file\n# See https://docs.readthedocs.io/en/stable/config-file/v2.html f"
},
{
"path": "docs/source_en/BestPractices/Elastic.md",
"chars": 7273,
"preview": "# Elastic\n\n\n## Installing Dependencies\n\nDeploy a K8S cluster and deploy [DLRover](https://github.com/intelligent-machine"
},
{
"path": "docs/source_en/BestPractices/Embedding.md",
"chars": 12350,
"preview": "# Embedding Training\n\nSWIFT has already supported the training of embedding models, including both pure text and multimo"
},
{
"path": "docs/source_en/BestPractices/GRPO-Code-Training.md",
"chars": 7763,
"preview": "# Code Training with GRPO\nThis document explains how to use GRPO to train models for code tasks.\n\n\nModel: [Qwen/Qwen2.5-"
},
{
"path": "docs/source_en/BestPractices/GRPO-Multi-Modal-Training.md",
"chars": 15984,
"preview": "# Complete Multimodal GRPO Experiment Workflow\n\nThis document explains how to use SWIFT GRPO for training multimodal mod"
},
{
"path": "docs/source_en/BestPractices/GRPO.md",
"chars": 15836,
"preview": "# Complete GRPO Experiment Process\n\nThis article starts with the relatively simple mathematical task \"Countdown Game\" an"
},
{
"path": "docs/source_en/BestPractices/MLLM-Registration.md",
"chars": 31560,
"preview": "\n# Best Practices for Registering Multimodal Models\n\nThis document introduces how to register a multimodal model in ms-s"
},
{
"path": "docs/source_en/BestPractices/Metax-support.md",
"chars": 18747,
"preview": "# Metax Support\n\n## 1. use swift with Metax\nyou can either build an image or pull an existing one. Here, we demonstrate "
},
{
"path": "docs/source_en/BestPractices/More-Best-Practices.md",
"chars": 376,
"preview": "\n# More Best Practices\n\n- [Qwen2.5 self-cognition SFT](https://github.com/modelscope/ms-swift/tree/main/examples/noteboo"
},
{
"path": "docs/source_en/BestPractices/NPU-support.md",
"chars": 15882,
"preview": "# NPU Support\n\nWe add Ascend NPU support in ms-swift, so you can fine-tune and run inference on Ascend NPUs.\n\nThis docum"
},
{
"path": "docs/source_en/BestPractices/Qwen3-Best-Practice.md",
"chars": 16837,
"preview": "# Qwen3 Best Practices\nDiscussion: [issue 4030](https://github.com/modelscope/ms-swift/issues/4030)\n\nQwen Documentation:"
},
{
"path": "docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md",
"chars": 16749,
"preview": "# Qwen3-VL Best Practices\n## Environment Setup\nBefore starting inference and training, please ensure your environment is"
},
{
"path": "docs/source_en/BestPractices/Qwen3_5-Best-Practice.md",
"chars": 21454,
"preview": "# Qwen3.5 Best Practices\n\nms-swift supports training [Qwen3.5](https://github.com/QwenLM/Qwen3.5) Dense/MoE models using"
},
{
"path": "docs/source_en/BestPractices/Rapidly-Training-VL-model.md",
"chars": 7429,
"preview": "# Best Practices for Rapidly Training Vision-Language (VL) Models\n\nThis document provides best practices for quickly tra"
},
{
"path": "docs/source_en/BestPractices/Reranker.md",
"chars": 7760,
"preview": "# Reranker Training\n\nSWIFT supports Reranker model training. Currently supported models include:\n\n1. modernbert reranker"
},
{
"path": "docs/source_en/Customization/Architecture.md",
"chars": 15658,
"preview": "# Architecture Introduction\n\nms-swift 4.0 adopts a modular design, with functional modules distributed in first-level di"
},
{
"path": "docs/source_en/Customization/Custom-dataset.md",
"chars": 34514,
"preview": "# Custom Dataset\n\nThere are three methods for accessing custom datasets, each offering progressively greater control ove"
},
{
"path": "docs/source_en/Customization/Custom-model.md",
"chars": 5577,
"preview": "# Custom Model\n\nThe models built into ms-swift can be used directly by specifying either `model_id` or `model_path`: `--"
},
{
"path": "docs/source_en/GetStarted/Quick-start.md",
"chars": 7828,
"preview": "# Quick Start\n\n🍲 **ms-swift** is a large model and multimodal large model fine-tuning and deployment framework provided "
},
{
"path": "docs/source_en/GetStarted/SWIFT-installation.md",
"chars": 9574,
"preview": "# SWIFT Installation\n\n## Wheel Packages Installation\n\nYou can install it using pip:\n\n```shell\n# recommend\npip install 'm"
},
{
"path": "docs/source_en/GetStarted/Web-UI.md",
"chars": 2566,
"preview": "# Web-UI\n\nCurrently, SWIFT supports interface-based training and inference, with parameter support similar to script tra"
},
{
"path": "docs/source_en/Instruction/Agent-support.md",
"chars": 15475,
"preview": "# Agent Support\n\n\n## Dataset Format\n\nms-swift leverages agent-template to decouple Agent data formats from model impleme"
},
{
"path": "docs/source_en/Instruction/Command-line-parameters.md",
"chars": 108769,
"preview": "# Command Line Parameters\n\nThe command-line arguments will be introduced in four categories: basic arguments, atomic arg"
},
{
"path": "docs/source_en/Instruction/Evaluation.md",
"chars": 13234,
"preview": "# Evaluation\n\nSWIFT supports eval (evaluation) capabilities to provide standardized evaluation metrics for both raw mode"
},
{
"path": "docs/source_en/Instruction/Export-and-push.md",
"chars": 3831,
"preview": "# Export and Push\n\n## Merge LoRA\n\n- See [here](https://github.com/modelscope/ms-swift/blob/main/examples/export/merge_lo"
},
{
"path": "docs/source_en/Instruction/Frequently-asked-questions.md",
"chars": 46617,
"preview": "# Frequently-asked-questions\n\nHere are some common questions encountered during the use of SWIFT.\n\n## Training\n\nSWIFT su"
},
{
"path": "docs/source_en/Instruction/GKD.md",
"chars": 15156,
"preview": "# GKD\n\nGKD (Generalized Knowledge Distillation) training algorithm is proposed in the paper [On-Policy Distillation of L"
},
{
"path": "docs/source_en/Instruction/GRPO/AdvancedResearch/CHORD.md",
"chars": 2930,
"preview": "# On-Policy RL Meets Off-Policy Experts: Harmonizing SFT and RL via Dynamic Weighting (CHORD)\n\n**Version Requirement**: "
},
{
"path": "docs/source_en/Instruction/GRPO/AdvancedResearch/CISPO.md",
"chars": 3147,
"preview": "# Clipped Importance Sampling Policy Optimization (CISPO)\n\n**Version requirement**: ms-swift>=3.11\n\nClipped Importance S"
},
{
"path": "docs/source_en/Instruction/GRPO/AdvancedResearch/DAPO.md",
"chars": 4295,
"preview": "# DAPO: An Open-Source LLM Reinforcement Learning System at Scale\n\n[Decoupled Clip and Dynamic sAmpling Policy Optimizat"
},
{
"path": "docs/source_en/Instruction/GRPO/AdvancedResearch/GSPO.md",
"chars": 3707,
"preview": "# Group Sequence Policy Optimization (GSPO)\n\n**Version Requirement**: ms-swift>=3.8\n\nIn [Group Sequence Policy Optimizat"
},
{
"path": "docs/source_en/Instruction/GRPO/AdvancedResearch/REINFORCEPP.md",
"chars": 3711,
"preview": "# REINFORCE++: An Efficient RLHF Algorithm with Robustness to Both Prompt and Reward Models\n\n**Version requirement**: ms"
},
{
"path": "docs/source_en/Instruction/GRPO/AdvancedResearch/RLOO.md",
"chars": 4024,
"preview": "# REINFORCE Leave-One-Out (RLOO)\n\n**Version requirement**: ms-swift>=3.10\n\n[REINFORCE Leave-One-Out (RLOO)](https://arxi"
},
{
"path": "docs/source_en/Instruction/GRPO/AdvancedResearch/SAPO.md",
"chars": 4112,
"preview": "# Soft Adaptive Policy Optimization (SAPO)\n\n**Version Requirement**: ms-swift>=3.11\n\n[Soft Adaptive Policy Optimization "
},
{
"path": "docs/source_en/Instruction/GRPO/AdvancedResearch/deepeyes.md",
"chars": 5161,
"preview": "# DeepEyes: Incentivizing \"Thinking with Images\" via Reinforcement Learning\n\n**Version Requirement**: ms-swift>=3.7\n\n## "
},
{
"path": "docs/source_en/Instruction/GRPO/AdvancedResearch/entropy_mask.md",
"chars": 2800,
"preview": "# Beyond the 80/20 Rule: High-Entropy Minority Tokens Drive Effective Reinforcement Learning for LLM Reasoning\n\n**Versio"
},
{
"path": "docs/source_en/Instruction/GRPO/AdvancedResearch/index.rst",
"chars": 231,
"preview": "Advanced Research\n===============\n.. toctree::\n :maxdepth: 1\n\n entropy_mask.md\n CISPO.md\n DAPO.md\n deepeyes.md"
},
{
"path": "docs/source_en/Instruction/GRPO/AdvancedResearch/training_inference_mismatch.md",
"chars": 10021,
"preview": "# Training-Inference-Mismatch\n\n**Version Requirement**: ms-swift>=3.11\n\n**TL;DR**: While GRPO introduces vLLM to acceler"
},
{
"path": "docs/source_en/Instruction/GRPO/AdvancedResearch/treepo.md",
"chars": 3922,
"preview": "# TreePO: Bridging the Gap of Policy Optimization and Efficacy and Inference Efficiency with Heuristic Tree-based Modeli"
},
{
"path": "docs/source_en/Instruction/GRPO/DeveloperGuide/gym_env.md",
"chars": 8409,
"preview": "# GYM Environment Training\n\nNote: The GYM environment training logic was refactored in ms-swift 3.8. If you are using an"
},
{
"path": "docs/source_en/Instruction/GRPO/DeveloperGuide/index.rst",
"chars": 168,
"preview": "Developer Guide\n===============\n.. toctree::\n :maxdepth: 1\n\n loss_types.md\n multi_turn.md\n multi_task.md\n rewa"
},
{
"path": "docs/source_en/Instruction/GRPO/DeveloperGuide/loss_types.md",
"chars": 4266,
"preview": "# Loss Types\n\nGRPO training supports multiple loss types, with the main differences being the normalization dimension an"
},
{
"path": "docs/source_en/Instruction/GRPO/DeveloperGuide/multi_task.md",
"chars": 2284,
"preview": "# Multi-Task Training\nWe can add a column to the dataset that indicates the task type, and then use this information in "
},
{
"path": "docs/source_en/Instruction/GRPO/DeveloperGuide/multi_turn.md",
"chars": 14422,
"preview": "# Multi-turn Training\n\n**Note** The multi-turn training logic was refactored in ms-swift 3.8.\nIf your ms-swift version i"
},
{
"path": "docs/source_en/Instruction/GRPO/DeveloperGuide/reward_function.md",
"chars": 8881,
"preview": "# Reward Function\n## Custom Reward Function\nThe reward function takes as arguments (via kwargs) the model-generated comp"
},
{
"path": "docs/source_en/Instruction/GRPO/DeveloperGuide/reward_model.md",
"chars": 6862,
"preview": "# Reward Model\n\nBy default, a reward model refers to a model with a classification head that outputs numeric values, usu"
},
{
"path": "docs/source_en/Instruction/GRPO/GetStarted/GRPO.md",
"chars": 17068,
"preview": "# GRPO\n\nGRPOTrainer underwent a code refactoring in ms-swift3.5. If you are using a swift version < 3.5, please refer to"
},
{
"path": "docs/source_en/Instruction/GRPO/GetStarted/index.rst",
"chars": 69,
"preview": "Get Started\n===============\n.. toctree::\n :maxdepth: 1\n\n GRPO.md\n"
},
{
"path": "docs/source_en/Instruction/GRPO/index.rst",
"chars": 280,
"preview": "GRPO\n===============\n.. toctree::\n :maxdepth: 2\n :caption: Get Started\n\n GetStarted/index.rst\n\n.. toctree::\n :ma"
},
{
"path": "docs/source_en/Instruction/Inference-and-deployment.md",
"chars": 19514,
"preview": "# Inference and Deployment\n\nBelow are the inference engines supported by Swift along with their corresponding capabiliti"
},
{
"path": "docs/source_en/Instruction/Pre-training-and-Fine-tuning.md",
"chars": 27653,
"preview": "# Pre-training and Fine-tuning\nTraining Capability:\n\n| Method | Fu"
},
{
"path": "docs/source_en/Instruction/RLHF.md",
"chars": 8112,
"preview": "# RLHF\n\nThis document provides training scripts for various human preference alignment algorithms. If you want to learn "
},
{
"path": "docs/source_en/Instruction/Ray.md",
"chars": 6207,
"preview": "# Ray Support\n\nSWIFT already supports using Ray for multi-GPU or multi-node training. The support status for Ray in exis"
},
{
"path": "docs/source_en/Instruction/Reinforced-Fine-tuning.md",
"chars": 7692,
"preview": "# Reinforced Fine-Tuning\n\nReinforced fine-tuning is one of the most important functionalities in current model training,"
},
{
"path": "docs/source_en/Instruction/Sample.md",
"chars": 4583,
"preview": "# Sampling\n\nSampling is one of the newly supported key capabilities of SWIFT. This feature can be understood as the prac"
},
{
"path": "docs/source_en/Instruction/Supported-models-and-datasets.md",
"chars": 296341,
"preview": "# Supported Models and Datasets\n\n> [!WARNING]\n> This document is compatible with swift 4.0.0. To view the model_type and"
},
{
"path": "docs/source_en/Instruction/Use-tuners.md",
"chars": 9082,
"preview": "# Using Tuners\n\nTuners refer to additional structural components attached to a model, aimed at reducing the number of tr"
},
{
"path": "docs/source_en/Megatron-SWIFT/Ascend.md",
"chars": 7641,
"preview": "# Ascend NPU\n\nFor environment preparation of Megatron-SWIFT on Ascend NPU, please refer to [NPU Best Practices](../BestP"
},
{
"path": "docs/source_en/Megatron-SWIFT/Command-line-parameters.md",
"chars": 55817,
"preview": "# Command Line Arguments\n\n## Megatron Parameters\n\n**Training Parameters**:\n\n- 🔥micro_batch_size: Batch size per device, "
},
{
"path": "docs/source_en/Megatron-SWIFT/GKD.md",
"chars": 3952,
"preview": "# GKD\n\n**Version Requirement**: ms-swift >= 3.12\n\nIf you are new to GKD, please refer to the [GKD Documentation](../Inst"
},
{
"path": "docs/source_en/Megatron-SWIFT/GRPO.md",
"chars": 3947,
"preview": "# Megatron GRPO\n\n**Version Requirement**: ms-swift >= 3.11\n\nIf you are new to GRPO, please refer to the [GRPO documentat"
},
{
"path": "docs/source_en/Megatron-SWIFT/LoRA-Training.md",
"chars": 7074,
"preview": "# LoRA Training\n\nBest practice reference for single-node 8xH20 LoRA training with Qwen3-235B-A22B-Instruct-250718: https"
},
{
"path": "docs/source_en/Megatron-SWIFT/Mcore-Bridge.md",
"chars": 13969,
"preview": "# Mcore Bridge\n\nMegatron is renowned for its excellent training speed and rich parallelism techniques, but this also bri"
},
{
"path": "docs/source_en/Megatron-SWIFT/Multimodal-Model.md",
"chars": 5921,
"preview": "# Multimodal Models\n\nms-swift introduces Megatron's parallelization techniques to accelerate the training of large multi"
},
{
"path": "docs/source_en/Megatron-SWIFT/Quick-start.md",
"chars": 13510,
"preview": "# Quick Start\n\nms-swift incorporates Megatron's parallelization techniques to accelerate the training of large models, i"
},
{
"path": "docs/source_en/_templates/autosummary/class.rst",
"chars": 184,
"preview": ".. currentmodule:: {{ module }}\n\n\n{{ name | underline}}\n\n.. autoclass:: {{ name }}\n :inherited-members:\n :members:"
},
{
"path": "docs/source_en/_templates/classtemplate.rst",
"chars": 243,
"preview": ".. currentmodule:: {{ module }}\n\n\n{{ name | underline}}\n\n.. autoclass:: {{ name }}\n :members:\n :special-members: _"
},
{
"path": "docs/source_en/_templates/sobolengine.rst",
"chars": 245,
"preview": ".. currentmodule:: {{ module }}\n\n\n{{ name | underline}}\n\n.. autoclass:: {{ name }}\n :members:\n :exclude-members: M"
},
{
"path": "docs/source_en/conf.py",
"chars": 4200,
"preview": "# Configuration file for the Sphinx documentation builder.\n#\n# This file only contains a selection of the most common op"
},
{
"path": "docs/source_en/index.rst",
"chars": 2025,
"preview": ".. swift documentation file,\n You can adapt this file completely to your liking, but it should at least\n contain the"
},
{
"path": "examples/README.md",
"chars": 383,
"preview": "# Instructions\n\nThe example provides instructions for using SWIFT for training, inference, deployment, evaluation, and q"
},
{
"path": "examples/app/base_url/demo.py",
"chars": 613,
"preview": "# Copyright (c) ModelScope Contributors. All rights reserved.\nimport os\n\nos.environ['CUDA_VISIBLE_DEVICES'] = '0'\n\nif __"
},
{
"path": "examples/app/base_url/demo.sh",
"chars": 225,
"preview": "# You need to have a deployed model or api service first\nCUDA_VISIBLE_DEVICES=0 swift app \\\n --model '<model_name>' \\"
},
{
"path": "examples/app/llm/sglang.sh",
"chars": 215,
"preview": "# test_env: pip install \"sglang[all]==0.4.6.*\" -U\nCUDA_VISIBLE_DEVICES=0 swift app \\\n --model Qwen/Qwen2.5-7B-Instruc"
},
{
"path": "examples/app/llm/vllm.sh",
"chars": 235,
"preview": "CUDA_VISIBLE_DEVICES=0 swift app \\\n --model Qwen/Qwen2.5-7B-Instruct \\\n --stream true \\\n --infer_backend vllm \\"
},
{
"path": "examples/app/mllm.sh",
"chars": 366,
"preview": "CUDA_VISIBLE_DEVICES=0 \\\nMAX_PIXELS=1003520 \\\nVIDEO_MAX_PIXELS=50176 \\\nFPS_MAX_FRAMES=12 \\\nswift app \\\n --model Qwen/"
},
{
"path": "examples/ascend/activation_cpu_offload/fsdp2.json",
"chars": 2222,
"preview": "{\n \"_description\": \"FSDP2 configuration for distributed training (PyTorch native FSDP v2)\",\n \"_requires\": \"torch>="
},
{
"path": "examples/ascend/activation_cpu_offload/train.sh",
"chars": 6301,
"preview": "#!/bin/bash\nASCEND_RT_VISIBLE_DEVICES=0,1 \\\nNPROC_PER_NODE=2 \\\nswift sft \\\n --model 'Qwen/Qwen3-8B' \\\n --tuner_typ"
},
{
"path": "examples/ascend/deploy/vllm.sh",
"chars": 153,
"preview": "ASCEND_RT_VISIBLE_DEVICES=0 swift deploy \\\n --model Qwen/Qwen2.5-7B-Instruct \\\n --infer_backend vllm \\\n --serve"
},
{
"path": "examples/ascend/infer/vllm/dp_tp.sh",
"chars": 347,
"preview": "NPROC_PER_NODE=4 \\\nASCEND_RT_VISIBLE_DEVICES=0,1,2,3 \\\nswift infer \\\n --model Qwen/Qwen3-8B \\\n --infer_backend vll"
},
{
"path": "examples/ascend/megatron/train_sft_full.sh",
"chars": 1054,
"preview": "PYTORCH_NPU_ALLOC_CONF='expandable_segments:True' \\\nNPROC_PER_NODE=4 \\\nmegatron sft \\\n --model Qwen/Qwen3-4B \\\n --"
},
{
"path": "examples/ascend/multi-node/megatron/node1.sh",
"chars": 935,
"preview": "# Atlas A2 * 2 nodes * 8 cards per node\n\nASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \\\nNNODES=2 \\\nNODE_RANK=0 \\\nMASTER_ADD"
},
{
"path": "examples/ascend/multi-node/megatron/node2.sh",
"chars": 941,
"preview": "# Atlas A2 * 2 nodes * 8 cards per node\n\nASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \\\nNNODES=2 \\\nNODE_RANK=1 \\\nMASTER_ADD"
},
{
"path": "examples/ascend/train/qwen3/qwen3_lora_deepspeed.sh",
"chars": 916,
"preview": "# hardware: Atlas 900 A2\nexport TASK_QUEUE_ENABLE=2\nexport CPU_AFFINITY_CONF=2\nnproc_per_node=8\nASCEND_RT_VISIBLE_DEVICE"
},
{
"path": "examples/ascend/train/qwen3/qwen3_lora_fsdp/fsdp.json",
"chars": 782,
"preview": "{\n \"compute_environment\": \"LOCAL_MACHINE\",\n \"debug\": false,\n \"distributed_type\": \"FSDP\",\n \"downcast_bf16\": \"no\",\n \""
},
{
"path": "examples/ascend/train/qwen3/qwen3_lora_fsdp/train.sh",
"chars": 1105,
"preview": "# hardware: Atlas 900 A2\n# For NPU, in Transformers versions 5.0 and above, it is recommended to disable\n# cpu_ram_effic"
},
{
"path": "examples/ascend/train/qwen3/qwen3_lora_megatron.sh",
"chars": 1141,
"preview": "NPROC_PER_NODE=2 \\\nASCEND_RT_VISIBLE_DEVICES=0,1 \\\nmegatron sft \\\n --model Qwen/Qwen3-8B \\\n --save_safetensors tru"
},
{
"path": "examples/ascend/train/qwen3_next/qwen3_next_megatron.sh",
"chars": 1217,
"preview": "export TASK_QUEUE_ENABLE=2\nNPROC_PER_NODE=8 \\\nASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \\\nmegatron sft \\\n --model Qwe"
},
{
"path": "examples/ascend/train/qwen3_omni/qwen3_omni_full_mindspeed.sh",
"chars": 1625,
"preview": "# 16 * 64GiB Ascend A3\n# Modified from https://github.com/modelscope/ms-swift/blob/main/examples/megatron/multimodal/omn"
},
{
"path": "examples/ascend/train/qwen3_vl/moe_full_mindspeed.sh",
"chars": 1796,
"preview": "# 16 * 64GiB Ascend A3\n# Modified from https://github.com/modelscope/ms-swift/blob/main/examples/models/qwen3_vl/mcore_f"
},
{
"path": "examples/custom/dataset.py",
"chars": 969,
"preview": "# Copyright (c) ModelScope Contributors. All rights reserved.\nfrom typing import Any, Dict, Optional\n\nfrom swift.dataset"
},
{
"path": "examples/custom/infer.sh",
"chars": 251,
"preview": "# sh examples/custom/infer.sh\nCUDA_VISIBLE_DEVICES=0 \\\nswift infer \\\n --adapters output/vx-xxx/checkpoint-xxx \\\n -"
},
{
"path": "examples/custom/model.py",
"chars": 1498,
"preview": "# Copyright (c) ModelScope Contributors. All rights reserved.\nfrom swift.infer_engine import InferRequest, RequestConfig"
},
{
"path": "examples/custom/model_hf.py",
"chars": 2484,
"preview": "# Copyright (c) ModelScope Contributors. All rights reserved.\n\"\"\"\nHere is another way to register the model, by customiz"
},
{
"path": "examples/custom/my_qwen2_5_omni/my_register.py",
"chars": 23815,
"preview": "import torch\nfrom functools import partial\nfrom transformers import PretrainedConfig, PreTrainedModel\nfrom transformers."
},
{
"path": "examples/custom/my_qwen2_5_omni/test_register.py",
"chars": 3417,
"preview": "import os\nimport requests\nimport sys\nfrom modelscope import snapshot_download\nfrom qwen_omni_utils import process_mm_inf"
},
{
"path": "examples/custom/my_qwen2_5_omni/train.py",
"chars": 1272,
"preview": "import os\nimport sys\n\nfrom swift import SftArguments, sft_main\n\nsys.path.append('examples/custom/my_qwen2_5_omni')\n\nif _"
},
{
"path": "examples/custom/sft.sh",
"chars": 780,
"preview": "# sh examples/custom/sft.sh\nCUDA_VISIBLE_DEVICES=0 \\\nswift sft \\\n --external_plugins examples/custom/dataset.py \\\n "
},
{
"path": "examples/deploy/README.md",
"chars": 301,
"preview": "Please refer to the examples in [examples/infer](../../infer/) and change `swift infer` to `swift deploy` to start the s"
},
{
"path": "examples/deploy/agent/client.py",
"chars": 3191,
"preview": "# Copyright (c) ModelScope Contributors. All rights reserved.\nimport os\nfrom openai import OpenAI\n\nos.environ['CUDA_VISI"
},
{
"path": "examples/deploy/agent/server.sh",
"chars": 278,
"preview": "CUDA_VISIBLE_DEVICES=0 swift deploy \\\n --model Qwen/Qwen2.5-7B-Instruct \\\n --infer_backend vllm \\\n --vllm_gpu_m"
}
]
// ... and 1028 more files (download for full content)
About this extraction
This page contains the full source code of the modelscope/ms-swift GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 1228 files (7.2 MB), approximately 1.9M tokens, and a symbol index with 4974 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.