Repository: modelscope/ms-swift
Branch: main
Commit: fe928a9f1464
Files: 1228
Total size: 7.2 MB
Directory structure:
gitextract_9q3kx9l7/
├── .dev_scripts/
│ ├── build_docs.sh
│ ├── ci_container_test.sh
│ ├── dockerci.sh
│ └── dockerci_npu.sh
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── 1-bug-report.yml
│ │ ├── 2-feature-request.yml
│ │ ├── 3-question-discussion.yml
│ │ └── config.yml
│ ├── PULL_REQUEST_TEMPLATE.md
│ ├── SECURITY.md
│ └── workflows/
│ ├── citest.yaml
│ ├── citest_npu.yaml
│ ├── close_tale_issue.yaml
│ ├── lint.yaml
│ └── publish.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── .pre-commit-config_local.yaml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── CONTRIBUTING_CN.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── README_CN.md
├── docs/
│ ├── Makefile
│ ├── README.md
│ ├── make.bat
│ ├── source/
│ │ ├── .readthedocs.yaml
│ │ ├── BestPractices/
│ │ │ ├── Elastic.md
│ │ │ ├── Embedding.md
│ │ │ ├── GRPO-Code-Training.md
│ │ │ ├── GRPO-Multi-Modal-Training.md
│ │ │ ├── GRPO.md
│ │ │ ├── MLLM-Registration.md
│ │ │ ├── Metax-support.md
│ │ │ ├── More-Best-Practices.md
│ │ │ ├── NPU-support.md
│ │ │ ├── Qwen3-Best-Practice.md
│ │ │ ├── Qwen3-VL-Best-Practice.md
│ │ │ ├── Qwen3_5-Best-Practice.md
│ │ │ ├── Rapidly-Training-VL-model.md
│ │ │ └── Reranker.md
│ │ ├── Customization/
│ │ │ ├── Architecture.md
│ │ │ ├── Custom-dataset.md
│ │ │ └── Custom-model.md
│ │ ├── GetStarted/
│ │ │ ├── Quick-start.md
│ │ │ ├── SWIFT-installation.md
│ │ │ └── Web-UI.md
│ │ ├── Instruction/
│ │ │ ├── Agent-support.md
│ │ │ ├── Command-line-parameters.md
│ │ │ ├── Evaluation.md
│ │ │ ├── Export-and-push.md
│ │ │ ├── Frequently-asked-questions.md
│ │ │ ├── GKD.md
│ │ │ ├── GRPO/
│ │ │ │ ├── AdvancedResearch/
│ │ │ │ │ ├── CHORD.md
│ │ │ │ │ ├── CISPO.md
│ │ │ │ │ ├── DAPO.md
│ │ │ │ │ ├── GSPO.md
│ │ │ │ │ ├── REINFORCEPP.md
│ │ │ │ │ ├── RLOO.md
│ │ │ │ │ ├── SAPO.md
│ │ │ │ │ ├── deepeyes.md
│ │ │ │ │ ├── entropy_mask.md
│ │ │ │ │ ├── index.rst
│ │ │ │ │ ├── training_inference_mismatch.md
│ │ │ │ │ └── treepo.md
│ │ │ │ ├── DeveloperGuide/
│ │ │ │ │ ├── gym_env.md
│ │ │ │ │ ├── index.rst
│ │ │ │ │ ├── loss_types.md
│ │ │ │ │ ├── multi_task.md
│ │ │ │ │ ├── multi_turn.md
│ │ │ │ │ ├── reward_function.md
│ │ │ │ │ └── reward_model.md
│ │ │ │ ├── GetStarted/
│ │ │ │ │ ├── GRPO.md
│ │ │ │ │ └── index.rst
│ │ │ │ └── index.rst
│ │ │ ├── Inference-and-deployment.md
│ │ │ ├── Pre-training-and-Fine-tuning.md
│ │ │ ├── RLHF.md
│ │ │ ├── Ray.md
│ │ │ ├── Reinforced-Fine-tuning.md
│ │ │ ├── Sample.md
│ │ │ ├── Supported-models-and-datasets.md
│ │ │ └── Use-tuners.md
│ │ ├── Megatron-SWIFT/
│ │ │ ├── Ascend.md
│ │ │ ├── Command-line-parameters.md
│ │ │ ├── GKD.md
│ │ │ ├── GRPO.md
│ │ │ ├── LoRA-Training.md
│ │ │ ├── Mcore-Bridge.md
│ │ │ ├── Multimodal-Model.md
│ │ │ └── Quick-start.md
│ │ ├── _templates/
│ │ │ ├── autosummary/
│ │ │ │ └── class.rst
│ │ │ ├── classtemplate.rst
│ │ │ └── sobolengine.rst
│ │ ├── conf.py
│ │ └── index.rst
│ └── source_en/
│ ├── .readthedocs.yaml
│ ├── BestPractices/
│ │ ├── Elastic.md
│ │ ├── Embedding.md
│ │ ├── GRPO-Code-Training.md
│ │ ├── GRPO-Multi-Modal-Training.md
│ │ ├── GRPO.md
│ │ ├── MLLM-Registration.md
│ │ ├── Metax-support.md
│ │ ├── More-Best-Practices.md
│ │ ├── NPU-support.md
│ │ ├── Qwen3-Best-Practice.md
│ │ ├── Qwen3-VL-Best-Practice.md
│ │ ├── Qwen3_5-Best-Practice.md
│ │ ├── Rapidly-Training-VL-model.md
│ │ └── Reranker.md
│ ├── Customization/
│ │ ├── Architecture.md
│ │ ├── Custom-dataset.md
│ │ └── Custom-model.md
│ ├── GetStarted/
│ │ ├── Quick-start.md
│ │ ├── SWIFT-installation.md
│ │ └── Web-UI.md
│ ├── Instruction/
│ │ ├── Agent-support.md
│ │ ├── Command-line-parameters.md
│ │ ├── Evaluation.md
│ │ ├── Export-and-push.md
│ │ ├── Frequently-asked-questions.md
│ │ ├── GKD.md
│ │ ├── GRPO/
│ │ │ ├── AdvancedResearch/
│ │ │ │ ├── CHORD.md
│ │ │ │ ├── CISPO.md
│ │ │ │ ├── DAPO.md
│ │ │ │ ├── GSPO.md
│ │ │ │ ├── REINFORCEPP.md
│ │ │ │ ├── RLOO.md
│ │ │ │ ├── SAPO.md
│ │ │ │ ├── deepeyes.md
│ │ │ │ ├── entropy_mask.md
│ │ │ │ ├── index.rst
│ │ │ │ ├── training_inference_mismatch.md
│ │ │ │ └── treepo.md
│ │ │ ├── DeveloperGuide/
│ │ │ │ ├── gym_env.md
│ │ │ │ ├── index.rst
│ │ │ │ ├── loss_types.md
│ │ │ │ ├── multi_task.md
│ │ │ │ ├── multi_turn.md
│ │ │ │ ├── reward_function.md
│ │ │ │ └── reward_model.md
│ │ │ ├── GetStarted/
│ │ │ │ ├── GRPO.md
│ │ │ │ └── index.rst
│ │ │ └── index.rst
│ │ ├── Inference-and-deployment.md
│ │ ├── Pre-training-and-Fine-tuning.md
│ │ ├── RLHF.md
│ │ ├── Ray.md
│ │ ├── Reinforced-Fine-tuning.md
│ │ ├── Sample.md
│ │ ├── Supported-models-and-datasets.md
│ │ └── Use-tuners.md
│ ├── Megatron-SWIFT/
│ │ ├── Ascend.md
│ │ ├── Command-line-parameters.md
│ │ ├── GKD.md
│ │ ├── GRPO.md
│ │ ├── LoRA-Training.md
│ │ ├── Mcore-Bridge.md
│ │ ├── Multimodal-Model.md
│ │ └── Quick-start.md
│ ├── _templates/
│ │ ├── autosummary/
│ │ │ └── class.rst
│ │ ├── classtemplate.rst
│ │ └── sobolengine.rst
│ ├── conf.py
│ └── index.rst
├── examples/
│ ├── README.md
│ ├── app/
│ │ ├── base_url/
│ │ │ ├── demo.py
│ │ │ └── demo.sh
│ │ ├── llm/
│ │ │ ├── sglang.sh
│ │ │ └── vllm.sh
│ │ └── mllm.sh
│ ├── ascend/
│ │ ├── activation_cpu_offload/
│ │ │ ├── fsdp2.json
│ │ │ └── train.sh
│ │ ├── deploy/
│ │ │ └── vllm.sh
│ │ ├── infer/
│ │ │ └── vllm/
│ │ │ └── dp_tp.sh
│ │ ├── megatron/
│ │ │ └── train_sft_full.sh
│ │ ├── multi-node/
│ │ │ └── megatron/
│ │ │ ├── node1.sh
│ │ │ └── node2.sh
│ │ └── train/
│ │ ├── qwen3/
│ │ │ ├── qwen3_lora_deepspeed.sh
│ │ │ ├── qwen3_lora_fsdp/
│ │ │ │ ├── fsdp.json
│ │ │ │ └── train.sh
│ │ │ └── qwen3_lora_megatron.sh
│ │ ├── qwen3_next/
│ │ │ └── qwen3_next_megatron.sh
│ │ ├── qwen3_omni/
│ │ │ └── qwen3_omni_full_mindspeed.sh
│ │ └── qwen3_vl/
│ │ └── moe_full_mindspeed.sh
│ ├── custom/
│ │ ├── dataset.py
│ │ ├── infer.sh
│ │ ├── model.py
│ │ ├── model_hf.py
│ │ ├── my_qwen2_5_omni/
│ │ │ ├── my_register.py
│ │ │ ├── test_register.py
│ │ │ └── train.py
│ │ └── sft.sh
│ ├── deploy/
│ │ ├── README.md
│ │ ├── agent/
│ │ │ ├── client.py
│ │ │ └── server.sh
│ │ ├── bert/
│ │ │ ├── client.py
│ │ │ └── server.sh
│ │ ├── client/
│ │ │ ├── llm/
│ │ │ │ ├── base/
│ │ │ │ │ ├── openai_client.py
│ │ │ │ │ └── swift_client.py
│ │ │ │ └── chat/
│ │ │ │ ├── openai_client.py
│ │ │ │ └── swift_client.py
│ │ │ └── mllm/
│ │ │ ├── openai_client.py
│ │ │ └── swift_client.py
│ │ ├── embedding/
│ │ │ ├── client.py
│ │ │ └── server.sh
│ │ ├── lora/
│ │ │ ├── client.py
│ │ │ └── server.sh
│ │ ├── reranker/
│ │ │ ├── client.py
│ │ │ ├── client_generative.py
│ │ │ └── server.sh
│ │ ├── reward_model/
│ │ │ ├── client.py
│ │ │ └── server.sh
│ │ ├── seq_cls/
│ │ │ ├── client.py
│ │ │ └── server.sh
│ │ ├── sglang.sh
│ │ ├── vllm.sh
│ │ └── vllm_dp.sh
│ ├── eval/
│ │ ├── eval_url/
│ │ │ ├── demo.py
│ │ │ └── eval.sh
│ │ ├── llm/
│ │ │ ├── sglang.sh
│ │ │ └── vllm.sh
│ │ ├── train_eval/
│ │ │ └── train.sh
│ │ └── vlm/
│ │ └── eval.sh
│ ├── export/
│ │ ├── merge_lora.sh
│ │ ├── ollama.sh
│ │ ├── push_to_hub.sh
│ │ └── quantize/
│ │ ├── awq.sh
│ │ ├── bert/
│ │ │ ├── bnb.sh
│ │ │ └── gptq.sh
│ │ ├── bnb.sh
│ │ ├── fp8.sh
│ │ ├── gptq.sh
│ │ ├── gptq_v2.sh
│ │ ├── mllm/
│ │ │ ├── awq.sh
│ │ │ ├── bnb.sh
│ │ │ ├── fp8.sh
│ │ │ └── gptq.sh
│ │ ├── moe/
│ │ │ ├── awq.sh
│ │ │ ├── bnb.sh
│ │ │ ├── fp8.sh
│ │ │ └── gptq.sh
│ │ ├── omni/
│ │ │ └── gptq.sh
│ │ └── reward_model/
│ │ ├── bnb.sh
│ │ └── gptq.sh
│ ├── infer/
│ │ ├── cli_demo.sh
│ │ ├── demo.py
│ │ ├── demo_agent.py
│ │ ├── demo_bert.py
│ │ ├── demo_embedding.py
│ │ ├── demo_grounding.py
│ │ ├── demo_hf.py
│ │ ├── demo_lora.py
│ │ ├── demo_mllm.py
│ │ ├── demo_reranker.py
│ │ ├── demo_reward_model.py
│ │ ├── demo_vllm_reasoning_parser.py
│ │ ├── lmdeploy/
│ │ │ ├── batch_ddp.sh
│ │ │ └── mllm_tp.sh
│ │ ├── sglang/
│ │ │ ├── demo.sh
│ │ │ ├── distill_qwen3_235b.sh
│ │ │ ├── mtp.sh
│ │ │ └── tp.sh
│ │ ├── transformers/
│ │ │ ├── batch_ddp.sh
│ │ │ ├── bert.sh
│ │ │ ├── lora.sh
│ │ │ ├── mllm_device_map.sh
│ │ │ ├── prm.sh
│ │ │ └── reward_model.sh
│ │ └── vllm/
│ │ ├── dp_tp.sh
│ │ ├── mllm_ddp.sh
│ │ ├── mllm_tp.sh
│ │ └── mtp.sh
│ ├── megatron/
│ │ ├── base_to_chat.sh
│ │ ├── benchmark/
│ │ │ └── deepspeed.sh
│ │ ├── dense/
│ │ │ ├── 72b_offload.sh
│ │ │ └── qwen3_32b.sh
│ │ ├── embedding/
│ │ │ ├── qwen3_emb.sh
│ │ │ └── qwen3_vl_emb.sh
│ │ ├── export/
│ │ │ ├── full.sh
│ │ │ └── lora.sh
│ │ ├── fp8/
│ │ │ ├── benchmark.sh
│ │ │ ├── llm.sh
│ │ │ └── vlm.sh
│ │ ├── grpo/
│ │ │ ├── dense_colocate.sh
│ │ │ ├── dense_server.sh
│ │ │ ├── moe_colocate_full.sh
│ │ │ ├── moe_colocate_lora.sh
│ │ │ └── sapo.sh
│ │ ├── long_text.sh
│ │ ├── lora/
│ │ │ ├── dense.sh
│ │ │ ├── dpo.sh
│ │ │ ├── loss_scale.sh
│ │ │ ├── moe.sh
│ │ │ ├── mtp.sh
│ │ │ ├── new_special_tokens.sh
│ │ │ └── qwen3_235b.sh
│ │ ├── mcore_bridge/
│ │ │ ├── full/
│ │ │ │ ├── dense.sh
│ │ │ │ └── moe.sh
│ │ │ └── lora/
│ │ │ ├── moe.sh
│ │ │ ├── new_special_tokens.sh
│ │ │ └── seq_cls.sh
│ │ ├── moe/
│ │ │ ├── deepseek_v3.sh
│ │ │ ├── moe.sh
│ │ │ ├── qwen3_moe.sh
│ │ │ └── qwen3_moe_offload.sh
│ │ ├── multi-node/
│ │ │ ├── node1.sh
│ │ │ └── node2.sh
│ │ ├── multimodal/
│ │ │ ├── dense/
│ │ │ │ ├── dpo.sh
│ │ │ │ ├── full.sh
│ │ │ │ └── lora.sh
│ │ │ ├── lora_llm_vit_full/
│ │ │ │ └── sft.sh
│ │ │ ├── moe/
│ │ │ │ ├── full_dpo_offload.sh
│ │ │ │ └── lora.sh
│ │ │ └── omni/
│ │ │ ├── dense.sh
│ │ │ └── moe.sh
│ │ ├── pretrain.sh
│ │ ├── reranker/
│ │ │ ├── qwen3_reranker.sh
│ │ │ └── qwen3_vl_reranker.sh
│ │ ├── rlhf/
│ │ │ ├── dpo/
│ │ │ │ ├── dense.sh
│ │ │ │ ├── group_by_length.sh
│ │ │ │ ├── moe.sh
│ │ │ │ └── packing.sh
│ │ │ ├── gkd/
│ │ │ │ ├── dense.sh
│ │ │ │ ├── opsd.sh
│ │ │ │ └── teacher_server.sh
│ │ │ ├── kto/
│ │ │ │ ├── dense.sh
│ │ │ │ └── moe.sh
│ │ │ └── rm/
│ │ │ ├── dense.sh
│ │ │ └── moe.sh
│ │ ├── seq_cls/
│ │ │ ├── full.sh
│ │ │ └── lora/
│ │ │ ├── infer.sh
│ │ │ └── train.sh
│ │ └── sft.sh
│ ├── models/
│ │ ├── deepseek_ocr/
│ │ │ ├── infer.py
│ │ │ └── train.sh
│ │ ├── deepseek_vl2/
│ │ │ └── train.sh
│ │ ├── glm-4.6v/
│ │ │ ├── flash.sh
│ │ │ └── mcore.sh
│ │ ├── gpt_oss/
│ │ │ ├── internvl3_5_gpt.sh
│ │ │ ├── mcore.sh
│ │ │ └── train.sh
│ │ ├── hunyuan_ocr/
│ │ │ └── train.sh
│ │ ├── internvl3/
│ │ │ └── train.sh
│ │ ├── keye/
│ │ │ └── train.sh
│ │ ├── llama4/
│ │ │ └── mcore.sh
│ │ ├── minicpmv/
│ │ │ └── train.sh
│ │ ├── ovis2/
│ │ │ └── train.sh
│ │ ├── qwen3_5/
│ │ │ ├── mcore.sh
│ │ │ ├── mcore_full.sh
│ │ │ ├── mcore_grpo_moe.sh
│ │ │ ├── packing.sh
│ │ │ └── transformers.sh
│ │ ├── qwen3_next/
│ │ │ ├── mcore.sh
│ │ │ ├── mtp.sh
│ │ │ ├── non_padding_free.sh
│ │ │ └── transformers.sh
│ │ ├── qwen3_omni/
│ │ │ ├── transformers.sh
│ │ │ └── zero3.sh
│ │ └── qwen3_vl/
│ │ ├── mcore.sh
│ │ ├── mcore_full.sh
│ │ ├── mixed.sh
│ │ ├── transformers.sh
│ │ └── zero3.sh
│ ├── notebook/
│ │ ├── qwen2_5-self-cognition/
│ │ │ ├── infer.ipynb
│ │ │ ├── infer.sh
│ │ │ ├── self-cognition-sft.ipynb
│ │ │ └── sft.sh
│ │ ├── qwen2_5-vl-grounding/
│ │ │ └── zh.ipynb
│ │ └── qwen2vl-ocr/
│ │ ├── infer.ipynb
│ │ └── ocr-sft.ipynb
│ ├── sampler/
│ │ ├── distill/
│ │ │ ├── distill.sh
│ │ │ └── distill.yaml
│ │ └── sample/
│ │ ├── sample.sh
│ │ └── sampling.yaml
│ ├── train/
│ │ ├── agent/
│ │ │ ├── deepseek_r1.sh
│ │ │ ├── glm4.sh
│ │ │ ├── loss_scale/
│ │ │ │ ├── infer_lora.py
│ │ │ │ └── train.sh
│ │ │ └── qwen2_5.sh
│ │ ├── all_to_all/
│ │ │ ├── infer.sh
│ │ │ └── train.sh
│ │ ├── base_to_chat/
│ │ │ ├── full.sh
│ │ │ ├── lora.sh
│ │ │ └── lora2.sh
│ │ ├── cached_dataset/
│ │ │ ├── dpo.sh
│ │ │ ├── mcore.sh
│ │ │ ├── pretrained.sh
│ │ │ ├── reranker.sh
│ │ │ ├── seq_cls.sh
│ │ │ ├── sft.sh
│ │ │ └── vlm.sh
│ │ ├── early_stop/
│ │ │ └── lora_sft.sh
│ │ ├── embedding/
│ │ │ ├── qwen3/
│ │ │ │ ├── infer.py
│ │ │ │ ├── qwen3_emb.sh
│ │ │ │ └── qwen3_vl_emb.sh
│ │ │ └── train_gme.sh
│ │ ├── flash_attention_3/
│ │ │ ├── mcore.sh
│ │ │ └── transformers.sh
│ │ ├── full/
│ │ │ ├── dft.sh
│ │ │ ├── infer.sh
│ │ │ ├── qwen2_5_32b.sh
│ │ │ └── train.sh
│ │ ├── grpo/
│ │ │ ├── external/
│ │ │ │ ├── README.md
│ │ │ │ ├── agent.sh
│ │ │ │ ├── grpo_32b_full.sh
│ │ │ │ ├── grpo_7b.sh
│ │ │ │ ├── moe_full.sh
│ │ │ │ ├── moe_lora.sh
│ │ │ │ ├── vllm_gym.sh
│ │ │ │ └── vllm_multi_turn.sh
│ │ │ ├── internal/
│ │ │ │ ├── README.md
│ │ │ │ ├── chord.sh
│ │ │ │ ├── full_lmdeploy.sh
│ │ │ │ ├── gspo.sh
│ │ │ │ ├── moe_full.sh
│ │ │ │ ├── moe_lora.sh
│ │ │ │ ├── qlora.sh
│ │ │ │ ├── reinforce_plus_plus.sh
│ │ │ │ ├── rloo.sh
│ │ │ │ ├── sapo.sh
│ │ │ │ ├── transformers.sh
│ │ │ │ ├── vllm_72b_4gpu.sh
│ │ │ │ ├── vllm_lora_qwenvl72b.sh
│ │ │ │ ├── vllm_multi_turn.sh
│ │ │ │ └── vllm_vl7b.sh
│ │ │ ├── multi_node/
│ │ │ │ ├── Qwen2_5_32B_full.sh
│ │ │ │ ├── colocate_multi_node1.sh
│ │ │ │ ├── colocate_multi_node2.sh
│ │ │ │ ├── server_multi_node.sh
│ │ │ │ └── train_dlc.sh
│ │ │ ├── plugin/
│ │ │ │ ├── deepeyes/
│ │ │ │ │ ├── deepeyes.sh
│ │ │ │ │ └── deepeyes_plugin.py
│ │ │ │ ├── gsm8k/
│ │ │ │ │ ├── gsm8k.sh
│ │ │ │ │ └── gsm8k_plugin.py
│ │ │ │ ├── plugin.py
│ │ │ │ ├── run_external_reward_func.sh
│ │ │ │ ├── run_external_reward_model.sh
│ │ │ │ ├── run_external_scheduler.sh
│ │ │ │ └── treepo/
│ │ │ │ ├── tree_rollout.py
│ │ │ │ ├── tree_rollout.sh
│ │ │ │ └── tree_rollout_plugin.py
│ │ │ ├── prompt.txt
│ │ │ └── qwen2_5_omni/
│ │ │ ├── grpo.sh
│ │ │ └── infer.sh
│ │ ├── infer.sh
│ │ ├── liger/
│ │ │ └── sft.sh
│ │ ├── lora_sft.sh
│ │ ├── moe/
│ │ │ ├── llama4.sh
│ │ │ └── qwen3_moe.sh
│ │ ├── multi-gpu/
│ │ │ ├── ddp/
│ │ │ │ └── train.sh
│ │ │ ├── ddp_device_map/
│ │ │ │ └── train.sh
│ │ │ ├── deepspeed/
│ │ │ │ ├── train_zero2.sh
│ │ │ │ └── train_zero3.sh
│ │ │ ├── device_map/
│ │ │ │ └── train.sh
│ │ │ ├── fsdp2_lora/
│ │ │ │ ├── fsdp2.json
│ │ │ │ └── train.sh
│ │ │ └── fsdp_qlora/
│ │ │ ├── fsdp_offload.json
│ │ │ └── train.sh
│ │ ├── multi-node/
│ │ │ ├── accelerate/
│ │ │ │ ├── multi_node.yaml
│ │ │ │ ├── train_node1.sh
│ │ │ │ └── train_node2.sh
│ │ │ ├── deepspeed/
│ │ │ │ ├── README.md
│ │ │ │ ├── host.txt
│ │ │ │ └── train.sh
│ │ │ ├── dlc/
│ │ │ │ └── train.sh
│ │ │ ├── ray/
│ │ │ │ ├── sft.sh
│ │ │ │ └── sft.yaml
│ │ │ ├── swift/
│ │ │ │ ├── train_node1.sh
│ │ │ │ └── train_node2.sh
│ │ │ └── torchrun/
│ │ │ ├── train_node1.sh
│ │ │ └── train_node2.sh
│ │ ├── multimodal/
│ │ │ ├── audio.sh
│ │ │ ├── caption.sh
│ │ │ ├── grounding.sh
│ │ │ ├── infer.sh
│ │ │ ├── lora_llm_full_vit/
│ │ │ │ ├── infer.sh
│ │ │ │ ├── merge_lora.sh
│ │ │ │ ├── seq_cls.sh
│ │ │ │ └── sft.sh
│ │ │ ├── ocr.sh
│ │ │ ├── omni/
│ │ │ │ ├── infer.sh
│ │ │ │ └── sft.sh
│ │ │ ├── rlhf/
│ │ │ │ ├── dpo/
│ │ │ │ │ ├── full.sh
│ │ │ │ │ └── lora.sh
│ │ │ │ ├── gkd/
│ │ │ │ │ ├── fast.sh
│ │ │ │ │ └── full.sh
│ │ │ │ └── kto.sh
│ │ │ ├── video.sh
│ │ │ └── vit_gradient_checkpointing.sh
│ │ ├── new_special_tokens/
│ │ │ ├── infer.sh
│ │ │ ├── merge_lora.sh
│ │ │ ├── tokens.txt
│ │ │ └── train.sh
│ │ ├── on_policy_distillation.sh
│ │ ├── optimizer/
│ │ │ ├── muon.sh
│ │ │ └── muonclip.sh
│ │ ├── packing/
│ │ │ ├── dpo.sh
│ │ │ ├── dpo_vlm.sh
│ │ │ ├── liger_kernel.sh
│ │ │ ├── llm.sh
│ │ │ ├── qwen2_5_omni.sh
│ │ │ ├── qwen2_5_vl.sh
│ │ │ └── streaming.sh
│ │ ├── padding_free/
│ │ │ ├── dpo_vlm.sh
│ │ │ └── sft.sh
│ │ ├── plugins/
│ │ │ ├── loss_scale.sh
│ │ │ └── tuner_phi4_mm.sh
│ │ ├── predict_with_generate/
│ │ │ └── train.sh
│ │ ├── pretrain/
│ │ │ └── train.sh
│ │ ├── qlora/
│ │ │ ├── awq/
│ │ │ │ ├── merge_lora.sh
│ │ │ │ └── train.sh
│ │ │ ├── bnb/
│ │ │ │ ├── merge_lora.sh
│ │ │ │ └── train.sh
│ │ │ ├── gptq.sh
│ │ │ └── hqq.sh
│ │ ├── reranker/
│ │ │ ├── qwen3/
│ │ │ │ ├── infer.py
│ │ │ │ ├── qwen3_reranker.sh
│ │ │ │ └── qwen3_vl_reranker.sh
│ │ │ ├── train_generative_reranker.sh
│ │ │ ├── train_generative_reranker_listwise.sh
│ │ │ ├── train_reranker.sh
│ │ │ ├── train_reranker_auto_patch.sh
│ │ │ ├── train_reranker_listwise.sh
│ │ │ └── train_reranker_mm.sh
│ │ ├── rft/
│ │ │ ├── math.json
│ │ │ └── rft.py
│ │ ├── rlhf/
│ │ │ ├── README.md
│ │ │ ├── cpo.sh
│ │ │ ├── dpo/
│ │ │ │ ├── full.sh
│ │ │ │ └── lora.sh
│ │ │ ├── gkd/
│ │ │ │ ├── fast.sh
│ │ │ │ ├── full.sh
│ │ │ │ ├── teacher_server.sh
│ │ │ │ ├── think_model.sh
│ │ │ │ ├── vllm_colocate.sh
│ │ │ │ └── vllm_server.sh
│ │ │ ├── kto.sh
│ │ │ ├── mpo.sh
│ │ │ ├── opsd/
│ │ │ │ ├── opsd.sh
│ │ │ │ └── opsd_plugin.py
│ │ │ ├── orpo.sh
│ │ │ ├── ppo/
│ │ │ │ ├── full.sh
│ │ │ │ └── lora.sh
│ │ │ ├── rm.sh
│ │ │ └── simpo.sh
│ │ ├── seq_cls/
│ │ │ ├── bert/
│ │ │ │ ├── deploy.sh
│ │ │ │ ├── infer.sh
│ │ │ │ └── sft.sh
│ │ │ ├── multi_label/
│ │ │ │ ├── infer.py
│ │ │ │ ├── infer.sh
│ │ │ │ ├── sft.sh
│ │ │ │ └── vlm.sh
│ │ │ ├── qwen2_5/
│ │ │ │ ├── deploy.sh
│ │ │ │ ├── infer.sh
│ │ │ │ └── sft.sh
│ │ │ ├── qwen2_5_omni/
│ │ │ │ ├── infer.py
│ │ │ │ ├── infer.sh
│ │ │ │ └── sft.sh
│ │ │ └── regression/
│ │ │ ├── deploy.sh
│ │ │ ├── infer.sh
│ │ │ └── sft.sh
│ │ ├── sequence_parallel/
│ │ │ ├── sequence_parallel.sh
│ │ │ ├── sequence_parallel_512k.sh
│ │ │ ├── sequence_parallel_dpo.sh
│ │ │ ├── sequence_parallel_emb.sh
│ │ │ ├── sequence_parallel_grpo.sh
│ │ │ ├── sequence_parallel_reranker.sh
│ │ │ └── sequence_parallel_seq_cls.sh
│ │ ├── streaming/
│ │ │ ├── lazy_tokenize.sh
│ │ │ └── streaming.sh
│ │ ├── think_model/
│ │ │ ├── deepseek_r1.sh
│ │ │ ├── qwen3_demo1.sh
│ │ │ └── qwen3_demo2.sh
│ │ └── tuners/
│ │ ├── adalora/
│ │ │ └── train.sh
│ │ ├── adapter/
│ │ │ └── train.sh
│ │ ├── boft/
│ │ │ └── train.sh
│ │ ├── bone/
│ │ │ └── train.sh
│ │ ├── dora/
│ │ │ └── train.sh
│ │ ├── galore/
│ │ │ ├── train_galore.sh
│ │ │ └── train_qgalore.sh
│ │ ├── lisa/
│ │ │ └── train.sh
│ │ ├── llamapro/
│ │ │ └── train.sh
│ │ ├── longlora/
│ │ │ └── train.sh
│ │ ├── lora/
│ │ │ └── train.sh
│ │ ├── lora-ga/
│ │ │ └── train.sh
│ │ ├── neftune/
│ │ │ └── train.sh
│ │ ├── olora/
│ │ │ └── train.sh
│ │ ├── pissa/
│ │ │ └── train.sh
│ │ ├── qlora/
│ │ │ └── train.sh
│ │ ├── reft/
│ │ │ └── train.sh
│ │ └── unsloth/
│ │ └── train.sh
│ └── yaml/
│ ├── sft.sh
│ └── sft.yaml
├── requirements/
│ ├── docs.txt
│ ├── eval.txt
│ ├── framework.txt
│ ├── install_all.sh
│ ├── ray.txt
│ ├── swanlab.txt
│ └── tests.txt
├── requirements.txt
├── scripts/
│ ├── benchmark/
│ │ ├── config/
│ │ │ └── tuner.json
│ │ ├── exp.py
│ │ ├── exp_utils.py
│ │ └── generate_report.py
│ └── utils/
│ ├── plot_loss.py
│ ├── run_dataset_info.py
│ ├── run_model_info.py
│ ├── run_template.py
│ └── test_link_valid.py
├── setup.cfg
├── setup.py
├── swift/
│ ├── __init__.py
│ ├── agent_template/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── deepseek_v3_1.py
│ │ ├── extra.py
│ │ ├── glm4.py
│ │ ├── hermes.py
│ │ ├── llama.py
│ │ ├── mapping.py
│ │ ├── minimax_m2.py
│ │ ├── mistral.py
│ │ ├── qwen.py
│ │ ├── qwen3_coder.py
│ │ ├── react.py
│ │ ├── seed_oss.py
│ │ ├── toolbench.py
│ │ └── youtu.py
│ ├── arguments/
│ │ ├── __init__.py
│ │ ├── app_args.py
│ │ ├── base_args/
│ │ │ ├── __init__.py
│ │ │ ├── base_args.py
│ │ │ ├── data_args.py
│ │ │ ├── generation_args.py
│ │ │ ├── model_args.py
│ │ │ ├── quant_args.py
│ │ │ └── template_args.py
│ │ ├── deploy_args.py
│ │ ├── eval_args.py
│ │ ├── export_args.py
│ │ ├── infer_args.py
│ │ ├── merge_args.py
│ │ ├── pretrain_args.py
│ │ ├── rlhf_args.py
│ │ ├── sampling_args.py
│ │ ├── sft_args.py
│ │ ├── tuner_args.py
│ │ └── webui_args.py
│ ├── callbacks/
│ │ ├── __init__.py
│ │ ├── activation_cpu_offload.py
│ │ ├── adalora.py
│ │ ├── base.py
│ │ ├── deepspeed_elastic.py
│ │ ├── early_stop.py
│ │ ├── lisa.py
│ │ ├── mapping.py
│ │ └── perf_log.py
│ ├── cli/
│ │ ├── __init__.py
│ │ ├── _megatron/
│ │ │ ├── __init__.py
│ │ │ ├── export.py
│ │ │ ├── main.py
│ │ │ ├── pt.py
│ │ │ ├── rlhf.py
│ │ │ └── sft.py
│ │ ├── app.py
│ │ ├── deploy.py
│ │ ├── eval.py
│ │ ├── export.py
│ │ ├── infer.py
│ │ ├── main.py
│ │ ├── merge_lora.py
│ │ ├── pt.py
│ │ ├── rlhf.py
│ │ ├── rollout.py
│ │ ├── sample.py
│ │ ├── sft.py
│ │ ├── utils.py
│ │ └── web_ui.py
│ ├── config/
│ │ ├── fsdp2.json
│ │ ├── zero0.json
│ │ ├── zero1.json
│ │ ├── zero2.json
│ │ ├── zero2_offload.json
│ │ ├── zero3.json
│ │ └── zero3_offload.json
│ ├── dataloader/
│ │ ├── __init__.py
│ │ ├── dispatcher.py
│ │ └── shard.py
│ ├── dataset/
│ │ ├── __init__.py
│ │ ├── data/
│ │ │ └── dataset_info.json
│ │ ├── dataset/
│ │ │ ├── __init__.py
│ │ │ ├── llm.py
│ │ │ └── mllm.py
│ │ ├── dataset_meta.py
│ │ ├── dataset_syntax.py
│ │ ├── indexed_dataset.py
│ │ ├── loader.py
│ │ ├── media.py
│ │ ├── packing.py
│ │ ├── preprocessor/
│ │ │ ├── __init__.py
│ │ │ ├── core.py
│ │ │ └── extra.py
│ │ ├── register.py
│ │ └── utils.py
│ ├── hub/
│ │ ├── __init__.py
│ │ ├── constant.py
│ │ └── hub.py
│ ├── infer_engine/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── grpo_vllm_engine.py
│ │ ├── infer_client.py
│ │ ├── infer_engine.py
│ │ ├── lmdeploy_engine.py
│ │ ├── patch.py
│ │ ├── protocol.py
│ │ ├── sglang_engine.py
│ │ ├── transformers_engine.py
│ │ ├── utils.py
│ │ └── vllm_engine.py
│ ├── loss/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── causal_lm.py
│ │ ├── embedding.py
│ │ ├── mapping.py
│ │ └── reranker.py
│ ├── loss_scale/
│ │ ├── __init__.py
│ │ ├── agent.py
│ │ ├── base.py
│ │ ├── config/
│ │ │ ├── agentflan.json
│ │ │ ├── alpha_umi.json
│ │ │ ├── hermes.json
│ │ │ ├── ignore_empty_think.json
│ │ │ ├── qwen.json
│ │ │ └── react.json
│ │ ├── mapping.py
│ │ ├── other.py
│ │ └── utils.py
│ ├── megatron/
│ │ ├── __init__.py
│ │ ├── arguments/
│ │ │ ├── __init__.py
│ │ │ ├── export_args.py
│ │ │ ├── megatron_args.py
│ │ │ ├── megatron_base_args.py
│ │ │ ├── pretrain_args.py
│ │ │ ├── rlhf_args.py
│ │ │ └── sft_args.py
│ │ ├── callbacks/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── default_flow.py
│ │ │ ├── mapping.py
│ │ │ ├── print.py
│ │ │ ├── swanlab.py
│ │ │ ├── tensorboard.py
│ │ │ ├── utils.py
│ │ │ └── wandb.py
│ │ ├── convert.py
│ │ ├── init.py
│ │ ├── model/
│ │ │ ├── __init__.py
│ │ │ ├── constant.py
│ │ │ ├── gpt_bridge.py
│ │ │ ├── gpt_model.py
│ │ │ ├── gpts/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── glm4.py
│ │ │ │ ├── minimax_m2.py
│ │ │ │ ├── olmoe.py
│ │ │ │ ├── qwen3_emb.py
│ │ │ │ └── qwen3_next.py
│ │ │ ├── mm_gpt_model.py
│ │ │ ├── mm_gpts/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── glm.py
│ │ │ │ ├── internvl.py
│ │ │ │ ├── kimi_vl.py
│ │ │ │ ├── llama4.py
│ │ │ │ ├── qwen.py
│ │ │ │ ├── qwen3_5.py
│ │ │ │ ├── qwen3_5_gdn.py
│ │ │ │ ├── qwen3_vl.py
│ │ │ │ └── utils.py
│ │ │ ├── model_config.py
│ │ │ ├── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── gated_delta_net.py
│ │ │ │ └── gated_self_attention.py
│ │ │ ├── register.py
│ │ │ └── rope.py
│ │ ├── pipelines/
│ │ │ ├── __init__.py
│ │ │ ├── export/
│ │ │ │ ├── __init__.py
│ │ │ │ └── export.py
│ │ │ └── train/
│ │ │ ├── __init__.py
│ │ │ ├── pretrain.py
│ │ │ ├── rlhf.py
│ │ │ └── sft.py
│ │ ├── trainers/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── batch_sampler.py
│ │ │ ├── dpo_trainer.py
│ │ │ ├── embedding_trainer.py
│ │ │ ├── gkd_trainer.py
│ │ │ ├── grpo_trainer.py
│ │ │ ├── kto_trainer.py
│ │ │ ├── reranker_trainer.py
│ │ │ ├── reward_trainer.py
│ │ │ ├── rlhf_mixin.py
│ │ │ ├── rollout_mixin.py
│ │ │ ├── trainer.py
│ │ │ ├── utils.py
│ │ │ └── vocab_parallel_utils.py
│ │ ├── tuners/
│ │ │ ├── __init__.py
│ │ │ └── lora.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── convert_utils.py
│ │ ├── megatron_lm_utils.py
│ │ ├── parallel_utils.py
│ │ ├── patcher.py
│ │ └── utils.py
│ ├── metrics/
│ │ ├── __init__.py
│ │ ├── acc.py
│ │ ├── base.py
│ │ ├── embedding.py
│ │ ├── mapping.py
│ │ ├── nlg.py
│ │ ├── reranker.py
│ │ └── utils.py
│ ├── model/
│ │ ├── __init__.py
│ │ ├── constant.py
│ │ ├── model_arch.py
│ │ ├── model_meta.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── baai.py
│ │ │ ├── baichuan.py
│ │ │ ├── baidu.py
│ │ │ ├── bert.py
│ │ │ ├── codefuse.py
│ │ │ ├── deepseek.py
│ │ │ ├── gemma.py
│ │ │ ├── glm.py
│ │ │ ├── internlm.py
│ │ │ ├── llama.py
│ │ │ ├── llava.py
│ │ │ ├── llm.py
│ │ │ ├── mamba.py
│ │ │ ├── microsoft.py
│ │ │ ├── minicpm.py
│ │ │ ├── minimax.py
│ │ │ ├── mistral.py
│ │ │ ├── mllm.py
│ │ │ ├── moonshot.py
│ │ │ ├── mplug.py
│ │ │ ├── openbuddy.py
│ │ │ ├── qwen.py
│ │ │ ├── seed.py
│ │ │ ├── skywork.py
│ │ │ ├── stepfun.py
│ │ │ ├── telechat.py
│ │ │ ├── tencent.py
│ │ │ ├── valley.py
│ │ │ └── yi.py
│ │ ├── npu_patcher.py
│ │ ├── patcher.py
│ │ ├── register.py
│ │ └── utils.py
│ ├── optimizers/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── galore/
│ │ │ ├── __init__.py
│ │ │ ├── adafactor.py
│ │ │ ├── adamw.py
│ │ │ ├── adamw8bit.py
│ │ │ ├── galore_projector.py
│ │ │ └── utils.py
│ │ ├── lorap.py
│ │ ├── mapping.py
│ │ ├── multimodal.py
│ │ ├── muon.py
│ │ └── muonclip.py
│ ├── pipelines/
│ │ ├── __init__.py
│ │ ├── app/
│ │ │ ├── __init__.py
│ │ │ ├── app.py
│ │ │ ├── build_ui.py
│ │ │ └── locale.py
│ │ ├── base.py
│ │ ├── eval/
│ │ │ ├── __init__.py
│ │ │ ├── eval.py
│ │ │ └── utils.py
│ │ ├── export/
│ │ │ ├── __init__.py
│ │ │ ├── cached_dataset.py
│ │ │ ├── export.py
│ │ │ ├── merge_lora.py
│ │ │ ├── ollama.py
│ │ │ └── quant.py
│ │ ├── infer/
│ │ │ ├── __init__.py
│ │ │ ├── deploy.py
│ │ │ ├── infer.py
│ │ │ ├── rollout.py
│ │ │ └── utils.py
│ │ ├── sampling/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── distill_sampler.py
│ │ │ ├── sampling.py
│ │ │ ├── utils.py
│ │ │ └── vanilla_sampler.py
│ │ ├── train/
│ │ │ ├── __init__.py
│ │ │ ├── kto.py
│ │ │ ├── pretrain.py
│ │ │ ├── rlhf.py
│ │ │ ├── sft.py
│ │ │ └── tuner.py
│ │ └── utils.py
│ ├── ray/
│ │ ├── __init__.py
│ │ ├── arguments.py
│ │ ├── base.py
│ │ └── resource_manager.py
│ ├── rewards/
│ │ ├── __init__.py
│ │ ├── orm.py
│ │ ├── prm.py
│ │ └── rm_plugin.py
│ ├── rlhf_trainers/
│ │ ├── __init__.py
│ │ ├── args_mixin.py
│ │ ├── arguments.py
│ │ ├── cpo_trainer.py
│ │ ├── dpo_trainer.py
│ │ ├── gkd_trainer.py
│ │ ├── grpo_trainer.py
│ │ ├── kto_trainer.py
│ │ ├── orpo_trainer.py
│ │ ├── ppo_trainer.py
│ │ ├── reward_trainer.py
│ │ ├── rlhf_mixin.py
│ │ ├── rollout_mixin.py
│ │ ├── utils.py
│ │ └── vllm_client.py
│ ├── rollout/
│ │ ├── __init__.py
│ │ ├── gym_env.py
│ │ └── multi_turn.py
│ ├── sequence_parallel/
│ │ ├── __init__.py
│ │ ├── ulysses.py
│ │ ├── utils.py
│ │ └── zigzag_ring_attn.py
│ ├── template/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── constant.py
│ │ ├── grounding.py
│ │ ├── register.py
│ │ ├── template_inputs.py
│ │ ├── template_meta.py
│ │ ├── templates/
│ │ │ ├── __init__.py
│ │ │ ├── baai.py
│ │ │ ├── baidu.py
│ │ │ ├── bert.py
│ │ │ ├── deepseek.py
│ │ │ ├── dots.py
│ │ │ ├── gemma.py
│ │ │ ├── glm.py
│ │ │ ├── idefics3.py
│ │ │ ├── internlm.py
│ │ │ ├── internvl.py
│ │ │ ├── kwai.py
│ │ │ ├── llama.py
│ │ │ ├── llava.py
│ │ │ ├── llm.py
│ │ │ ├── megrez.py
│ │ │ ├── microsoft.py
│ │ │ ├── midashenglm.py
│ │ │ ├── minicpm.py
│ │ │ ├── minimax.py
│ │ │ ├── minimind.py
│ │ │ ├── mistral.py
│ │ │ ├── molmo.py
│ │ │ ├── moonshot.py
│ │ │ ├── mplug.py
│ │ │ ├── openbuddy.py
│ │ │ ├── pixtral.py
│ │ │ ├── qwen.py
│ │ │ ├── seed.py
│ │ │ ├── stepfun.py
│ │ │ ├── tencent.py
│ │ │ ├── utils.py
│ │ │ ├── valley.py
│ │ │ └── yi.py
│ │ ├── utils.py
│ │ └── vision_utils.py
│ ├── trainers/
│ │ ├── __init__.py
│ │ ├── arguments.py
│ │ ├── embedding_trainer.py
│ │ ├── mixin.py
│ │ ├── patcher.py
│ │ ├── reranker_trainer.py
│ │ ├── seq2seq_trainer.py
│ │ ├── trainer.py
│ │ ├── trainer_factory.py
│ │ └── utils.py
│ ├── tuner_plugin/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── dummy.py
│ │ ├── ia3.py
│ │ ├── lora_llm.py
│ │ └── mapping.py
│ ├── tuners/
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── base.py
│ │ ├── llamapro.py
│ │ ├── longlora/
│ │ │ ├── __init__.py
│ │ │ ├── llama.py
│ │ │ └── longlora.py
│ │ ├── lora.py
│ │ ├── lora_layers.py
│ │ ├── mapping.py
│ │ ├── neftune.py
│ │ ├── part.py
│ │ ├── peft.py
│ │ ├── prompt.py
│ │ ├── reft.py
│ │ ├── restuning.py
│ │ ├── restuning_components.py
│ │ ├── scetuning/
│ │ │ ├── __init__.py
│ │ │ ├── scetuning.py
│ │ │ └── scetuning_components.py
│ │ ├── side.py
│ │ └── utils.py
│ ├── ui/
│ │ ├── __init__.py
│ │ ├── app.py
│ │ ├── base.py
│ │ ├── llm_eval/
│ │ │ ├── __init__.py
│ │ │ ├── eval.py
│ │ │ ├── llm_eval.py
│ │ │ ├── model.py
│ │ │ └── runtime.py
│ │ ├── llm_export/
│ │ │ ├── __init__.py
│ │ │ ├── export.py
│ │ │ ├── llm_export.py
│ │ │ ├── model.py
│ │ │ └── runtime.py
│ │ ├── llm_grpo/
│ │ │ ├── __init__.py
│ │ │ ├── advanced.py
│ │ │ ├── dataset.py
│ │ │ ├── external_rollout.py
│ │ │ ├── external_runtime.py
│ │ │ ├── grpo_advanced.py
│ │ │ ├── hyper.py
│ │ │ ├── llm_grpo.py
│ │ │ ├── lora.py
│ │ │ ├── model.py
│ │ │ ├── optimizer.py
│ │ │ ├── quantization.py
│ │ │ ├── report_to.py
│ │ │ ├── reward.py
│ │ │ ├── rollout.py
│ │ │ ├── runtime.py
│ │ │ ├── save.py
│ │ │ ├── target.py
│ │ │ └── tuner.py
│ │ ├── llm_infer/
│ │ │ ├── __init__.py
│ │ │ ├── generate.py
│ │ │ ├── llm_infer.py
│ │ │ ├── model.py
│ │ │ └── runtime.py
│ │ ├── llm_rlhf/
│ │ │ ├── __init__.py
│ │ │ ├── advanced.py
│ │ │ ├── dataset.py
│ │ │ ├── hyper.py
│ │ │ ├── llm_rlhf.py
│ │ │ ├── lora.py
│ │ │ ├── model.py
│ │ │ ├── optimizer.py
│ │ │ ├── quantization.py
│ │ │ ├── report_to.py
│ │ │ ├── rlhf.py
│ │ │ ├── runtime.py
│ │ │ ├── save.py
│ │ │ ├── target.py
│ │ │ └── tuner.py
│ │ ├── llm_sample/
│ │ │ ├── __init__.py
│ │ │ ├── llm_sample.py
│ │ │ ├── model.py
│ │ │ ├── runtime.py
│ │ │ └── sample.py
│ │ └── llm_train/
│ │ ├── __init__.py
│ │ ├── advanced.py
│ │ ├── dataset.py
│ │ ├── hyper.py
│ │ ├── llm_train.py
│ │ ├── lora.py
│ │ ├── model.py
│ │ ├── optimizer.py
│ │ ├── quantization.py
│ │ ├── report_to.py
│ │ ├── runtime.py
│ │ ├── save.py
│ │ ├── self_cog.py
│ │ ├── target.py
│ │ ├── task.py
│ │ ├── tuner.py
│ │ └── utils.py
│ ├── utils/
│ │ ├── __init__.py
│ │ ├── constants.py
│ │ ├── dequantizer.py
│ │ ├── env.py
│ │ ├── hf_config.py
│ │ ├── hub_utils.py
│ │ ├── import_utils.py
│ │ ├── io_utils.py
│ │ ├── logger.py
│ │ ├── np_utils.py
│ │ ├── processor_utils.py
│ │ ├── safetensors.py
│ │ ├── shutdown_manager.py
│ │ ├── tb_utils.py
│ │ ├── torch_utils.py
│ │ ├── transformers_utils.py
│ │ └── utils.py
│ └── version.py
└── tests/
├── __init__.py
├── app/
│ └── test_app.py
├── deploy/
│ ├── test_dataset.py
│ └── test_logprobs.py
├── eval/
│ └── test_eval.py
├── export/
│ └── test_quant.py
├── general/
│ ├── test_arch.py
│ ├── test_dataset.py
│ ├── test_model.py
│ ├── test_stream.py
│ └── test_template.py
├── hub/
│ ├── __init__.py
│ └── test_check_model.py
├── infer/
│ ├── test_agent.py
│ ├── test_infer.py
│ ├── test_logprobs.py
│ ├── test_main.py
│ ├── test_max_memory.py
│ ├── test_mllm.py
│ └── test_sglang.py
├── llm/
│ ├── __init__.py
│ ├── config/
│ │ ├── infer.json
│ │ └── sft.json
│ ├── data/
│ │ ├── alpaca.csv
│ │ ├── alpaca.jsonl
│ │ ├── alpaca2.csv
│ │ ├── chatml.jsonl
│ │ ├── conversations.jsonl
│ │ ├── multi_modal_1.jsonl
│ │ ├── multi_modal_2.jsonl
│ │ ├── multi_modal_3.jsonl
│ │ ├── sharegpt.jsonl
│ │ ├── swift_multi.json
│ │ ├── swift_multi.jsonl
│ │ ├── swift_pre.csv
│ │ ├── swift_pre.jsonl
│ │ ├── swift_single.csv
│ │ └── swift_single.jsonl
│ ├── test_custom.py
│ ├── test_dataset.py
│ ├── test_ollama_export.py
│ ├── test_run.py
│ ├── test_template.py
│ ├── test_utils.py
│ └── test_web_ui.py
├── megatron/
│ ├── export/
│ │ └── test_export.py
│ ├── test_align/
│ │ ├── test_llm.py
│ │ └── test_mllm.py
│ ├── test_embedding.py
│ ├── test_export.py
│ ├── test_gkd.py
│ ├── test_grpo.py
│ ├── test_kto.py
│ ├── test_lora.py
│ ├── test_rlhf.py
│ └── test_train.py
├── model_tag.py
├── models/
│ ├── test_flash_attn.py
│ ├── test_llm.py
│ └── test_mllm.py
├── run.py
├── run_config.yaml
├── sample/
│ └── test_client.py
├── test_align/
│ ├── test_cls.py
│ ├── test_lmdeploy_vlm.py
│ ├── test_padding_side.py
│ ├── test_rlhf_loss.py
│ ├── test_template/
│ │ ├── test_agent.py
│ │ ├── test_audio.py
│ │ ├── test_gene.py
│ │ ├── test_llm.py
│ │ ├── test_template.py
│ │ ├── test_tool.py
│ │ ├── test_video.py
│ │ └── test_vision.py
│ └── test_vllm_vlm.py
├── test_utils.py
├── train/
│ ├── test_channel.py
│ ├── test_cls.py
│ ├── test_embedding.py
│ ├── test_export_cached_dataset.py
│ ├── test_freeze.py
│ ├── test_gkd.py
│ ├── test_grounding.py
│ ├── test_grpo.py
│ ├── test_kto.py
│ ├── test_liger.py
│ ├── test_multilabel.py
│ ├── test_packing.py
│ ├── test_ppo.py
│ ├── test_pt.py
│ ├── test_resume_from_checkpoint.py
│ ├── test_rlhf.py
│ ├── test_sample.py
│ ├── test_sft.py
│ ├── test_train_eval.py
│ ├── test_vit_lr.py
│ └── test_vllm_importance_sampling_basic.py
├── tuners/
│ ├── __init__.py
│ ├── test_extra_state_dict.py
│ ├── test_merged_linear.py
│ ├── test_neft.py
│ ├── test_peft.py
│ ├── test_scetuning.py
│ ├── test_swift_base.py
│ ├── test_swift_device_map.py
│ └── test_swift_restuning.py
└── utils/
├── __init__.py
├── test_async_rewards.py
├── test_file_utils.py
├── test_io_utils.py
├── test_rewards.py
├── test_split_str_parts_by.py
└── test_torch_utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .dev_scripts/build_docs.sh
================================================
# Install the documentation requirements and run `make html` inside docs/.
# All paths are relative, so this is expected to be launched from the
# repository root.
pip install -r requirements/docs.txt
# Stop here if docs/ cannot be entered; otherwise the `rm -rf build` below
# would delete a `build` directory in whatever directory we were started from.
cd docs || exit 1
# Remove the previous build directory before rebuilding.
rm -rf build
# update api rst
#rm -rf source/api/
#sphinx-apidoc --module-first -o source/api/ ../modelscope/
make html
================================================
FILE: .dev_scripts/ci_container_test.sh
================================================
# CI entry point executed inside the test container (invoked with `bash`).
#
# When MODELSCOPE_SDK_DEBUG is "True" the script installs the test/framework
# dependencies, optionally runs the linters, and installs the package itself.
# Afterwards it runs the test command given as arguments, or
# `python tests/run.py --subprocess` when no arguments are passed.
# The script's exit status is the exit status of that test command.
if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
    # pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
    pip install -r requirements/tests.txt -i https://mirrors.aliyun.com/pypi/simple/
    git config --global --add safe.directory /ms-swift
    git config --global user.email tmp
    git config --global user.name tmp.com

    # linter test
    # use internal project for pre-commit due to the network problem
    # (`git remote -v` prints a fetch and a push line per remote, hence "> 1")
    if [ "$(git remote -v | grep -c alibaba)" -gt 1 ]; then
        if ! pre-commit run -c .pre-commit-config_local.yaml --all-files; then
            echo "linter test failed, please run 'pre-commit run --all-files' to check"
            echo "From the repository folder"
            echo "Run 'pip install -r requirements/tests.txt' install test dependencies."
            echo "Run 'pre-commit install' install pre-commit hooks."
            echo "Finally run linter with command: 'pre-commit run --all-files' to check."
            echo "Ensure there is no failure!!!!!!!!"
            # Exit statuses must be in 0..255; `exit -1` is not portable.
            exit 1
        fi
    fi

    pip install -r requirements/framework.txt -U -i https://mirrors.aliyun.com/pypi/simple/
    pip install decord einops -U -i https://mirrors.aliyun.com/pypi/simple/
    pip uninstall autoawq -y
    pip install optimum
    pip install diffusers
    pip install "transformers<5.0"
    # pip install autoawq -U --no-deps
    # test with install
    pip install .
    pip install auto_gptq bitsandbytes deepspeed -U -i https://mirrors.aliyun.com/pypi/simple/
else
    echo "Running case in release image, run case directly!"
fi

# remove torch_extensions folder to avoid ci hang.
rm -rf ~/.cache/torch_extensions

# Keep the command as an array so each argument survives exactly as passed
# (a flat string would be re-split on whitespace and glob-expanded).
if [ $# -eq 0 ]; then
    ci_command=(python tests/run.py --subprocess)
else
    ci_command=("$@")
fi
echo "Running case with command: ${ci_command[*]}"
"${ci_command[@]}"
================================================
FILE: .dev_scripts/dockerci.sh
================================================
#!/bin/bash
# Run the CI command in a Docker container bound to the first free GPU group.
#
# Each GPU group in $gpus is guarded by a lock file /tmp/gpu<group>; the first
# group whose lock can be taken without blocking is used, and the loop stops
# after one run.
#
# Read from the environment (not set here): IMAGE_NAME, IMAGE_VERSION,
# MODELSCOPE_CACHE, MODELSCOPE_HOME_CACHE, TEST_LEVEL, MODELSCOPE_DOMAIN,
# HUB_DATASET_ENDPOINT, MODEL_TAG_URL and the TEST_*/MODELSCOPE_API_TOKEN tokens.
# Optional overrides: CI_COMMAND, PR_CHANGED_FILES.
#
# Exit status: 0 on success, 1 if the test run fails or no GPU group is free.
MODELSCOPE_CACHE_DIR_IN_CONTAINER=/modelscope_cache
CODE_DIR=$PWD
CODE_DIR_IN_CONTAINER=/ms-swift
MODELSCOPE_SDK_DEBUG=True
echo "$USER"
gpus='0,1 2,3'
cpu_sets='0-15 16-31'
# Intentionally unquoted: split the space-separated list into array elements.
cpu_sets_arr=($cpu_sets)
is_get_file_lock=false
CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh python tests/run.py --parallel 2 --run_config tests/run_config.yaml}
echo "ci command: $CI_COMMAND"
PR_CHANGED_FILES="${PR_CHANGED_FILES:-}"
echo "PR modified files: $PR_CHANGED_FILES"
# Replace spaces with '#' so the list travels as a single word.
PR_CHANGED_FILES=${PR_CHANGED_FILES//[ ]/#}
echo "PR_CHANGED_FILES: $PR_CHANGED_FILES"

idx=0
for gpu in $gpus
do
    exec {lock_fd}>"/tmp/gpu$gpu" || exit 1
    flock -n "$lock_fd" || { echo "WARN: gpu $gpu is in use!" >&2; idx=$((idx+1)); continue; }
    echo "get gpu lock $gpu"
    CONTAINER_NAME="swift-ci-$idx"
    # Plain assignment: `let is_get_file_lock=true` would evaluate `true` as an
    # (unset) arithmetic variable and store 0 instead of the string "true".
    is_get_file_lock=true

    # pull image if there are update
    docker pull "${IMAGE_NAME}:${IMAGE_VERSION}"

    # Options shared by every run; built once so the debug and non-debug
    # variants cannot drift apart.
    docker_args=(
        --rm
        --name "$CONTAINER_NAME"
        --shm-size=16gb
        --cpuset-cpus="${cpu_sets_arr[$idx]}"
        # The value docker receives must itself contain the double quotes.
        --gpus='"'"device=$gpu"'"'
        -v "$CODE_DIR:$CODE_DIR_IN_CONTAINER"
        -v "$MODELSCOPE_CACHE:$MODELSCOPE_CACHE_DIR_IN_CONTAINER"
        -v "$MODELSCOPE_HOME_CACHE/$idx:/root"
        -v /home/admin/pre-commit:/home/admin/pre-commit
        -e CI_TEST=True
        -e "TEST_LEVEL=$TEST_LEVEL"
        -e "MODELSCOPE_CACHE=$MODELSCOPE_CACHE_DIR_IN_CONTAINER"
        -e "MODELSCOPE_DOMAIN=$MODELSCOPE_DOMAIN"
        -e "HUB_DATASET_ENDPOINT=$HUB_DATASET_ENDPOINT"
        -e "TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST"
        -e "TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV"
        -e MODELSCOPE_ENVIRONMENT=ci
        -e "TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN"
        -e "MODEL_TAG_URL=$MODEL_TAG_URL"
        -e "MODELSCOPE_API_TOKEN=$MODELSCOPE_API_TOKEN"
        -e "PR_CHANGED_FILES=$PR_CHANGED_FILES"
        --workdir="$CODE_DIR_IN_CONTAINER"
    )
    if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
        echo 'debugging'
        docker_args+=(-e MODELSCOPE_SDK_DEBUG=True)
    fi

    # $CI_COMMAND is intentionally unquoted: it is a command line that must be
    # split into the container's argv.
    if ! docker run "${docker_args[@]}" "${IMAGE_NAME}:${IMAGE_VERSION}" $CI_COMMAND; then
        echo "Running test case failed, please check the log!"
        # Exit statuses must be in 0..255; `exit -1` is not portable.
        exit 1
    fi
    break
done

if [ "$is_get_file_lock" = false ] ; then
    echo 'No free GPU!'
    exit 1
fi
================================================
FILE: .dev_scripts/dockerci_npu.sh
================================================
#!/bin/bash
# Run the CI command directly on the host (no `docker run`) for the first free
# device group.
#
# Each group in $gpus is guarded by a lock file /tmp/gpu<group>; the first
# group whose lock can be taken without blocking is used, and the loop stops
# after one run.
#
# Optional overrides from the environment: CI_COMMAND, PR_CHANGED_FILES,
# MODELSCOPE_CACHE (defaults to /modelscope_cache).
#
# Exit status: 0 on success, 1 if the test run fails, the code directory is
# unreachable, or no device group is free.
MODELSCOPE_CACHE_DIR=/modelscope_cache
CODE_DIR=$PWD
MODELSCOPE_SDK_DEBUG=True
echo "$USER"
gpus='0,1 2,3'
is_get_file_lock=false
CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh python tests/run.py --parallel 2 --run_config tests/run_config.yaml}
echo "ci command: $CI_COMMAND"
PR_CHANGED_FILES="${PR_CHANGED_FILES:-}"
echo "PR modified files: $PR_CHANGED_FILES"
# Replace spaces with '#' so the list travels as a single word.
PR_CHANGED_FILES=${PR_CHANGED_FILES//[ ]/#}
echo "PR_CHANGED_FILES: $PR_CHANGED_FILES"

idx=0
for gpu in $gpus
do
    exec {lock_fd}>"/tmp/gpu$gpu" || exit 1
    flock -n "$lock_fd" || { echo "WARN: gpu $gpu is in use!" >&2; idx=$((idx+1)); continue; }
    echo "get gpu lock $gpu"
    # Plain assignment: `let is_get_file_lock=true` would evaluate `true` as an
    # (unset) arithmetic variable and store 0 instead of the string "true".
    is_get_file_lock=true

    # Set the environment variables for the test run
    export CI_TEST=True
    export TEST_LEVEL=$TEST_LEVEL
    export MODELSCOPE_CACHE=${MODELSCOPE_CACHE:-$MODELSCOPE_CACHE_DIR}
    export MODELSCOPE_DOMAIN=$MODELSCOPE_DOMAIN
    export HUB_DATASET_ENDPOINT=$HUB_DATASET_ENDPOINT
    export TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST
    export TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV
    export MODELSCOPE_ENVIRONMENT='ci'
    export TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN
    export MODEL_TAG_URL=$MODEL_TAG_URL
    export MODELSCOPE_API_TOKEN=$MODELSCOPE_API_TOKEN
    export PR_CHANGED_FILES=$PR_CHANGED_FILES
    export CUDA_VISIBLE_DEVICES=$gpu
    if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
        export MODELSCOPE_SDK_DEBUG=True
        echo 'debugging'
    fi

    # Switch to the code directory and run the command
    cd "$CODE_DIR" || exit 1
    # Quoted so eval parses the command string exactly once, as written.
    if ! eval "$CI_COMMAND"; then
        echo "Running test case failed, please check the log!"
        # Exit statuses must be in 0..255; `exit -1` is not portable.
        exit 1
    fi
    break
done

if [ "$is_get_file_lock" = false ] ; then
    echo 'No free GPU!'
    exit 1
fi
================================================
FILE: .github/ISSUE_TEMPLATE/1-bug-report.yml
================================================
# Issue form for bug reports; issues created from it are labelled "bug".
name: "🐛 Bug Report"
description: Create a bug report to help us improve ms-swift
labels: ["bug"]
body:
- type: markdown
attributes:
value: |
Thank you for supporting ms-swift and taking the time to submit this issue.
感谢你对 ms-swift 的支持和抽出时间提交相关 issue。
- type: checkboxes
id: checklist
attributes:
label: Checklist / 检查清单
options:
- label: I have searched existing issues, and this is a new bug report. / 我已经搜索过现有的 issues,确认这是一个新的 bug report。
required: true
- type: textarea
id: bug-description
validations:
required: true
attributes:
label: Bug Description / Bug 描述
description: |
Please describe the issue you encountered. It's better to include error screenshots or stack trace information.
请详细描述你遇到的问题,最好包含报错截图或报错栈信息。
- type: textarea
id: reproduction-steps
validations:
required: true
attributes:
label: How to Reproduce / 如何复现
description: |
Please provide steps to reproduce the issue, including ms-swift version, runtime environment, and detailed reproduction steps.
请提供复现问题的步骤,包括 ms-swift 的版本、运行环境、详细的复现步骤等。
- type: textarea
id: additional-information
attributes:
label: Additional Information / 补充信息
description: |
Please provide any additional information here.
在这里补充其他相关信息。
================================================
FILE: .github/ISSUE_TEMPLATE/2-feature-request.yml
================================================
# Issue form for feature requests; issues created from it are labelled "enhancement".
name: "🚀 Feature Request"
description: Submit a request for a new feature
labels: ["enhancement"]
body:
- type: markdown
attributes:
value: |
Thank you for supporting ms-swift and taking the time to submit this issue.
感谢你对 ms-swift 的支持和抽出时间提交相关 issue。
- type: checkboxes
id: checklist
attributes:
label: Checklist / 检查清单
options:
- label: I have searched existing issues, and this is a new feature request. / 我已经搜索过现有的 issues,确认这是一个新的 Feature Request。
required: true
- type: textarea
id: feature-request-description
validations:
required: true
attributes:
label: Feature Request Description / Feature Request 描述
description: |
Please provide a detailed description of the new feature you would like to see added.
请详细描述您希望添加的新功能特性。
- type: textarea
id: pull-request
attributes:
label: Pull Request / Pull Request 信息
description: |
Have you already submitted or plan to submit a Pull Request? Please share your plans.
你是否已经提交或即将提交 Pull Request?请说明你的计划。
================================================
FILE: .github/ISSUE_TEMPLATE/3-question-discussion.yml
================================================
# Issue form for questions and discussions; issues created from it are labelled "question".
name: "🤔 Question & Discussion"
description: Create an issue for questions and discussions
labels: ["question"]
body:
- type: markdown
attributes:
value: |
Thank you for supporting ms-swift and taking the time to submit this issue.
感谢你对 ms-swift 的支持和抽出时间提交相关 issue。
- type: checkboxes
id: checklist
attributes:
label: Checklist / 检查清单
options:
- label: I have searched existing issues, and this is a new question or discussion topic. / 我已经搜索过现有的 issues,确认这是一个新的问题与讨论。
required: true
- type: textarea
id: question-description
validations:
required: true
attributes:
label: Question Description / 问题描述
description: |
Please describe the question or topic you would like to discuss.
请描述你想要讨论的问题或话题。
================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
# Disable blank issues so that every new issue starts from one of the forms in this directory.
blank_issues_enabled: false
================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
# PR type
- [ ] Bug Fix
- [ ] New Feature
- [ ] Document Updates
- [ ] More Models or Datasets Support
# PR information
Write the detailed information that belongs to this PR.
## Experiment results
Paste your experiment results here (if needed).
================================================
FILE: .github/SECURITY.md
================================================
# Reporting Security Issues
Security issues in a deep learning project usually come from non-standard third-party packages or continuously running services. If you encounter a security issue in our project, please report it to us. We appreciate your efforts to responsibly disclose your findings, and will make every effort to acknowledge your contributions.
================================================
FILE: .github/workflows/citest.yaml
================================================
# GPU CI workflow: runs the unit-test suite on a self-hosted runner by calling
# .dev_scripts/dockerci.sh. Changes that only touch packaging, requirements,
# docs, tools, dev scripts, READMEs or the lint/publish workflows are skipped.
name: citest
on:
push:
# NOTE(review): CONTRIBUTING.md names `main` as the target branch. Confirm that
# `master` still matches a live branch; if not, the push trigger only fires
# for release/** branches.
branches:
- master
- "release/**"
paths-ignore:
- "setup.*"
- "requirements.txt"
- "requirements/**"
- "docs/**"
- "tools/**"
- ".dev_scripts/**"
- "README.md"
- "README_*.md"
- "NOTICE"
- ".github/workflows/lint.yaml"
- ".github/workflows/publish.yaml"
pull_request:
# Same ignore list as the push trigger above; keep the two in sync.
paths-ignore:
- "setup.*"
- "requirements.txt"
- "requirements/**"
- "docs/**"
- "tools/**"
- ".dev_scripts/**"
- "README.md"
- "README_*.md"
- "NOTICE"
- ".github/workflows/lint.yaml"
- ".github/workflows/publish.yaml"
# One run per workflow and ref: a newer run cancels the one still in progress.
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
unittest:
# The type of runner that the job will run on
runs-on: [self-hosted]
timeout-minutes: 240
steps:
- name: ResetFileMode
shell: bash
run: |
# reset filemode to allow action runner to delete files
# generated by root in docker
set -e
source ~/.bashrc
sudo chown -R $USER:$USER $GITHUB_WORKSPACE
# Pull requests fetch two commits so that HEAD^1 exists for the diff in the
# next step; pushes fetch the full history (fetch-depth 0).
- name: Checkout
uses: actions/checkout@v3
env:
GIT_CONFIG_PARAMETERS: "'core.hooksPath='"
with:
lfs: 'true'
submodules: 'false'
fetch-depth: ${{ github.event_name == 'pull_request' && 2 || 0 }}
# Export the space-separated list of changed files as PR_CHANGED_FILES for
# the later steps (read by .dev_scripts/dockerci.sh).
- name: Get changed files
id: changed-files
run: |
if ${{ github.event_name == 'pull_request' }}; then
echo "PR_CHANGED_FILES=$(git diff --name-only -r HEAD^1 HEAD | xargs)" >> $GITHUB_ENV
else
echo "PR_CHANGED_FILES=$(git diff --name-only ${{ github.event.before }} ${{ github.event.after }} | xargs)" >> $GITHUB_ENV
fi
- name: Checkout LFS objects
run: git lfs checkout
# Source the runner-local environment file, then start the dockerised test run.
- name: Run unittest
shell: bash
run: |
set -e
source /mnt/modelscope/ci_env.sh
bash .dev_scripts/dockerci.sh
================================================
FILE: .github/workflows/citest_npu.yaml
================================================
# NPU CI workflow: runs the unit-test suite inside an ascendai/cann container
# on the linux-aarch64-a2-1 runner by calling .dev_scripts/dockerci_npu.sh.
# Uses the same trigger and ignore lists as the GPU `citest` workflow.
name: citest-npu
on:
push:
# NOTE(review): CONTRIBUTING.md names `main` as the target branch. Confirm that
# `master` still matches a live branch; if not, the push trigger only fires
# for release/** branches.
branches:
- master
- "release/**"
paths-ignore:
- "setup.*"
- "requirements.txt"
- "requirements/**"
- "docs/**"
- "tools/**"
- ".dev_scripts/**"
- "README.md"
- "README_*.md"
- "NOTICE"
- ".github/workflows/lint.yaml"
- ".github/workflows/publish.yaml"
pull_request:
# Same ignore list as the push trigger above; keep the two in sync.
paths-ignore:
- "setup.*"
- "requirements.txt"
- "requirements/**"
- "docs/**"
- "tools/**"
- ".dev_scripts/**"
- "README.md"
- "README_*.md"
- "NOTICE"
- ".github/workflows/lint.yaml"
- ".github/workflows/publish.yaml"
# One run per workflow and ref: a newer run cancels the one still in progress.
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
unittest:
# The type of runner that the job will run on
runs-on: [linux-aarch64-a2-1]
timeout-minutes: 240
container:
image: 'ascendai/cann:8.3.rc2-910b-ubuntu22.04-py3.11'
steps:
# Point apt and pip at the in-cluster cache service.
- name: Config mirrors
run: |
sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
# Pull requests fetch two commits so that HEAD^1 exists for the diff in the
# next step; pushes fetch the full history (fetch-depth 0).
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: ${{ github.event_name == 'pull_request' && 2 || 0 }}
# Export the space-separated list of changed files as PR_CHANGED_FILES for
# the later steps (read by .dev_scripts/dockerci_npu.sh).
- name: Get changed files
id: changed-files
run: |
if ${{ github.event_name == 'pull_request' }}; then
echo "PR_CHANGED_FILES=$(git diff --name-only -r HEAD^1 HEAD | xargs)" >> $GITHUB_ENV
else
echo "PR_CHANGED_FILES=$(git diff --name-only ${{ github.event.before }} ${{ github.event.after }} | xargs)" >> $GITHUB_ENV
fi
# NOTE(review): IMAGE_NAME and IMAGE_VERSION are exported below, but
# .dev_scripts/dockerci_npu.sh does not appear to read them; confirm whether
# they are still needed.
- name: Run unittest
shell: bash
run: |
set -e
export IMAGE_NAME=ascendai/cann
export IMAGE_VERSION=8.3.rc2-910b-ubuntu22.04-py3.11
export TEST_LEVEL=0
mkdir -p ~/.cache
export MODELSCOPE_CACHE=~/.cache
export CI_COMMAND='bash .dev_scripts/ci_container_test.sh python tests/run.py --parallel 2 --subprocess --run_config tests/run_config.yaml'
bash .dev_scripts/dockerci_npu.sh
================================================
FILE: .github/workflows/close_tale_issue.yaml
================================================
# Daily housekeeping: items inactive for 90 days are labelled `stale`, and
# closed 7 days later.
name: Close Stale Issues
on:
# Runs every day at 00:00 (cron), and can also be started manually.
schedule:
- cron: '0 0 * * *'
workflow_dispatch:
jobs:
close-stale:
runs-on: ubuntu-latest
steps:
# NOTE(review): confirm that `exempt-all-issue-labels` is an input recognized
# by actions/stale@v8; an unrecognized input would have no effect.
- name: Close stale issues
uses: actions/stale@v8
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
days-before-stale: 90
days-before-close: 7
stale-issue-message: 'This issue has been inactive for over 3 months and will be automatically closed in 7 days. If this issue is still relevant, please reply to this message.'
close-issue-message: 'This issue has been automatically closed due to inactivity. If needed, it can be reopened.'
stale-issue-label: 'stale'
exempt-all-issue-labels: true
================================================
FILE: .github/workflows/lint.yaml
================================================
# Lint workflow: runs every pre-commit hook over the whole repository on each
# push and pull request.
name: Lint test

on: [push, pull_request]

# One run per workflow and ref: a newer run cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      # v2 of checkout/setup-python runs on a retired Node runtime; use the
      # currently maintained major versions.
      - uses: actions/checkout@v4
      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install pre-commit hook
        run: |
          pip install pre-commit
      - name: Linting
        run: pre-commit run --all-files
================================================
FILE: .github/workflows/publish.yaml
================================================
# Release workflow: when a tag starting with `v` is pushed, build the sdist and
# wheel and upload them to PyPI.
name: release

on:
  push:
    tags:
      - 'v**'

# One run per workflow and ref: a newer run cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-publish
  cancel-in-progress: true

jobs:
  build-n-publish:
    runs-on: ubuntu-22.04
    #if: startsWith(github.event.ref, 'refs/tags')
    steps:
      # v2 of checkout/setup-python runs on a retired Node runtime; use the
      # currently maintained major versions.
      - uses: actions/checkout@v4
      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install wheel
        run: pip install wheel packaging setuptools==69.5.1
      - name: Build ModelScope Swift
        run: python setup.py sdist bdist_wheel
      - name: Publish package to PyPI
        # Pass the credentials through twine's environment variables rather
        # than interpolating the secret into the shell command line.
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
        run: |
          pip install twine
          twine upload dist/* --skip-existing
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
tmp
*.ttf
__pycache__/
*.py[cod]
*$py.class
test.py
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
/package
/temp
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.vscode
.idea
.run
# custom
# Project-specific artifacts: serialized data, archives, editor swap files,
# local helper scripts, and training / evaluation output directories.
*.pkl
*.pkl.json
*.log.json
*.whl
*.tar.gz
*.swp
*.log
source.sh
tensorboard.sh
.DS_Store
replace.sh
result.png
result.jpg
result.mp4
output/
outputs/
wandb/
swanlog/
*.out
benchmarks/
eval_output/
eval_outputs/
vlmeval/
my_model/
/data
result/
images
/custom/
megatron_output/
/*-mcore/
/*-hf/
/*_cached_dataset/
/sample_output/
# Pytorch
*.pth
*.pt
# ast template
ast_index_file.py
================================================
FILE: .pre-commit-config.yaml
================================================
# pre-commit hooks fetched from GitHub: flake8, isort, yapf, and a set of
# generic file hygiene hooks.
# Keep hook ids and revisions in sync with .pre-commit-config_local.yaml.
repos:
- repo: https://github.com/pycqa/flake8.git
rev: 7.3.0
hooks:
- id: flake8
- repo: https://github.com/PyCQA/isort.git
rev: 8.0.0
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-yapf.git
rev: v0.32.0
hooks:
- id: yapf
- repo: https://github.com/pre-commit/pre-commit-hooks.git
rev: v6.0.0
hooks:
- id: trailing-whitespace
- id: check-yaml
- id: end-of-file-fixer
- id: requirements-txt-fixer
- id: double-quote-string-fixer
- id: check-merge-conflict
# Normalize every line ending to LF.
- id: mixed-line-ending
args: ["--fix=lf"]
================================================
FILE: .pre-commit-config_local.yaml
================================================
# Variant of .pre-commit-config.yaml whose hook repositories are local clones
# under /home/admin/pre-commit (mounted into the CI container by
# .dev_scripts/dockerci.sh). .dev_scripts/ci_container_test.sh selects it
# with `pre-commit run -c`.
# Keep hook ids and revisions in sync with .pre-commit-config.yaml.
repos:
- repo: /home/admin/pre-commit/flake8
rev: 7.3.0
hooks:
- id: flake8
- repo: /home/admin/pre-commit/isort
rev: 8.0.0
hooks:
- id: isort
- repo: /home/admin/pre-commit/mirrors-yapf
rev: v0.32.0
hooks:
- id: yapf
- repo: /home/admin/pre-commit/pre-commit-hooks
rev: v6.0.0
hooks:
- id: trailing-whitespace
- id: check-yaml
- id: end-of-file-fixer
- id: requirements-txt-fixer
- id: double-quote-string-fixer
- id: check-merge-conflict
# Normalize every line ending to LF.
- id: mixed-line-ending
args: ["--fix=lf"]
================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
community
Examples of unacceptable behavior include:
* The use of sexualized language or imagery, and sexual attention or advances of
any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
contact@modelscope.cn.
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series of
actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within the
community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.1, available at
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
[https://www.contributor-covenant.org/translations][translations].
[homepage]: https://www.contributor-covenant.org
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
[Mozilla CoC]: https://github.com/mozilla/diversity
[FAQ]: https://www.contributor-covenant.org/faq
[translations]: https://www.contributor-covenant.org/translations
================================================
FILE: CONTRIBUTING.md
================================================
# Contributor Guide
_Welcome to offer PRs, bug reports, documentation supplements or other types of contributions to SWIFT!_
## Table of Contents
- [Code of Conduct](#-code-of-conduct)
- [Contribution Process](#-contribution-process)
- [Hardware support](#-hardware-support)
## 📖 Code of Conduct
Please refer to our [Code of Conduct documentation](./CODE_OF_CONDUCT.md).
## 🔁 Contribution Process
### What We Need
- New Technologies and New Models: SWIFT needs to support more open-source models and datasets, or new technologies that we have not paid attention to. If you are interested please submit a PR to us.
- Technical Propagation: If you are interested in technical propagation, you are welcome to help us write tutorials, documents or videos on any website, and send us the link.
- Community Contribution: You can write technical articles related to SWIFT, and submit them to us. After review and approval, we will publish them on the official ModelScope accounts (Zhihu, WeChat, etc.), with your name credited.
### Incentives
- We will issue electronic certificates to contributors on behalf of the ModelScope community, to encourage your selfless contributions.
- We will offer small souvenirs related to the ModelScope Community.
- We will provide free A10 computing power during the development period. For more details, please refer to the [Hardware support](#-hardware-support) section.
### Submitting PR (Pull Requests)
Any feature development is carried out in the form of Fork and then PR on GitHub.
1. Fork: Go to the [ms-swift](https://github.com/modelscope/ms-swift) page and click the **Fork button**. After completion, a SWIFT code repository will be cloned under your personal organization.
2. Clone: Clone the code repository generated in the first step to your local machine and **create a new branch** for development. During development, please click the **Sync Fork button** in time to synchronize with the `main` branch to prevent code expiration and conflicts.
3. Submit PR: After development and testing, push the code to the remote branch. On GitHub, go to the **Pull Requests page**, create a new PR, select your code branch as the source branch, and the `modelscope/ms-swift:main` branch as the target branch.
4. Write Description: It is necessary to provide a good feature description in the PR, so that the reviewers know the content of your modification.
5. Review: We hope that the code to be merged is concise and efficient, so we may raise some questions and discuss them. Please note that any issues raised in the review are aimed at the code itself, not at you personally. Once all issues are discussed and resolved, your code will be approved.
### Code Standards and Development Approach
SWIFT has conventional variable naming conventions and development approaches. Please follow these approaches as much as possible during development.
1. Variable names are separated by underscores, and class names are named with the first letter of each word capitalized.
2. All Python indentation uses four spaces instead of a tab.
3. Choose well-known open-source libraries, avoid using closed-source libraries or unstable open-source libraries, and avoid repeating the existing code.
After the PR is submitted, SWIFT will perform two types of tests:
- Code Lint Test: A static code compliance check test. Please make sure that you have performed code lint locally in advance.
```shell
pip install pre-commit # In the swift folder
pre-commit run --all-files # Fix the errors reported by pre-commit until all checks are successful
```
- CI Tests: Smoke tests and unit tests, please refer to the next section.
### Running CI Tests
Before submitting the PR, please ensure that your development code is protected by test cases, such as smoke tests for new features, or unit tests for various edge cases. Reviewers will also pay attention to this during code review. At the same time, there will be dedicated services running CI Tests, running all test cases, and the code can only be merged after the test cases pass.
## ✅ Hardware support
SWIFT will provide hardware support for developers, including free GPUs. If needed, please email us ([contact@modelscope.cn](mailto:contact@modelscope.cn)) or join our WeChat group:
================================================
FILE: CONTRIBUTING_CN.md
================================================
# 贡献者指引
*欢迎帮SWIFT提供Feature PR、Bug反馈、文档补充或其他类型的贡献!*
## 目录
- [代码规约](#-代码规约)
- [贡献流程](#-贡献流程)
- [资源支持](#-资源支持)
## 📖 代码规约
请查看我们的[代码规约文档](./CODE_OF_CONDUCT.md).
## 🔁 贡献流程
### 我们需要什么
- 新技术和新模型:SWIFT需要支持更多的开源模型和数据集,或我们没有关注到的新技术,如果您对此有兴趣,可以提交PR给我们。
- 技术布道:如果您对技术布道有兴趣,欢迎在任何网站上帮我们撰写教程文档或视频等,并将链接发给我们。
- 社区供稿:您可以撰写和SWIFT有关的技术文章,并供稿给我们,我们审核通过后会在魔搭官方账号(知乎、公众号等)上进行发布,并署上您的名字。
### 激励
- 我们会以魔搭社区的身份给贡献者颁发电子证书,以鼓励您的无私贡献。
- 我们会赠送魔搭社区相关周边小礼品。
- 我们会赠送开发期间的免费A10算力,具体可以查看[资源支持](#-资源支持)章节。
### 提交PR(Pull Requests)
任何feature开发都在github上以先Fork后PR的形式进行。
1. Fork:进入[ms-swift](https://github.com/modelscope/ms-swift)页面后,点击**Fork按钮**执行。完成后会在您的个人组织下克隆出一个SWIFT代码库
2. Clone:将第一步产生的代码库clone到本地并**拉新分支**进行开发,开发中请及时点击**Sync Fork按钮**同步`main`分支,防止代码过期并冲突
3. 提交PR:开发、测试完成后将代码推送到远程分支。在github上点击**Pull Requests页面**,新建一个PR,源分支选择您提交的代码分支,目标分支选择`modelscope/ms-swift:main`分支
4. 撰写描述:在PR中填写良好的feature描述是必要的,让Reviewers知道您的修改内容
5. Review:我们希望合入的代码简洁高效,因此可能会提出一些问题并讨论。请注意,任何review中提出的问题是针对代码本身,而非您个人。在所有问题讨论通过后,您的代码会被通过
### 代码规范和开发方式
SWIFT有约定俗成的变量命名方式和开发方式。在开发中请尽量遵循这些方式。
1. 变量命名以下划线分割,类名以所有单词首字母大写方式命名
2. 所有的python缩进都是四个空格取代一个tab
3. 选用知名的开源库,避免使用闭源库或不稳定的开源库,避免重复造轮子
SWIFT在PR提交后会进行两类测试:
- Code Lint测试 对代码进行静态规范走查的测试,为保证该测试通过,请保证本地预先进行了Code lint。方法是:
```shell
pip install pre-commit
# 在swift文件夹内
pre-commit run --all-files
# 对pre-commit报的错误进行修改,直到所有的检查都是成功状态
```
- CI Tests 冒烟测试和单元测试,请查看下一章节
### Running CI Tests
在提交PR前,请保证您的开发代码已经受到了测试用例的保护。例如,对新功能的冒烟测试,或者各种边缘case的单元测试等。在代码review时Reviewers也会关注这一点。同时,也会有服务专门运行CI Tests,运行所有的测试用例,测试用例通过后代码才可以合并。
## ✅ 资源支持
SWIFT会为开发者提供资源支持,包括免费的GPU算力。如果需要请邮件联系我们([contact@modelscope.cn](mailto:contact@modelscope.cn))或加入我们的微信群:
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: MANIFEST.in
================================================
recursive-include requirements *.txt
================================================
FILE: Makefile
================================================
# Developer convenience targets: build the wheel and docs, lint, test, clean.
# Directories removed by `make clean`; in this file they are referenced only by
# that target.
WHL_BUILD_DIR :=package
DOC_BUILD_DIR :=docs/build/
# default rule
# `default` is the first target in the file, so a bare `make` runs it; it has no
# recipe of its own and only depends on `whl` and `docs`.
default: whl docs
# Build the documentation by delegating to the repository's docs build script.
.PHONY: docs
docs:
bash .dev_scripts/build_docs.sh
# Run the lint script.
# NOTE(review): .dev_scripts/linter.sh does not appear in the repository's
# .dev_scripts/ listing (build_docs.sh, ci_container_test.sh, dockerci.sh,
# dockerci_npu.sh) -- confirm the script exists or update this target.
.PHONY: linter
linter:
bash .dev_scripts/linter.sh
# Run the CI test script.
# NOTE(review): .dev_scripts/citest.sh is likewise not in the .dev_scripts/
# listing -- confirm the script exists or update this target.
.PHONY: test
test:
bash .dev_scripts/citest.sh
# Build a source distribution and a wheel through setup.py.
.PHONY: whl
whl:
python setup.py sdist bdist_wheel
# Remove the directories named by WHL_BUILD_DIR and DOC_BUILD_DIR.
# NOTE(review): setuptools normally writes to build/ and dist/, which are not
# removed here -- confirm that WHL_BUILD_DIR matches the real output directory.
.PHONY: clean
clean:
rm -rf $(WHL_BUILD_DIR) $(DOC_BUILD_DIR)
================================================
FILE: README.md
================================================
# SWIFT (Scalable lightWeight Infrastructure for Fine-Tuning)
ModelScope Community Website
中文   |   English  
Paper   | English Documentation   |   中文文档  
## 📖 Table of Contents
- [Groups](#-groups)
- [Introduction](#-introduction)
- [News](#-news)
- [Installation](#%EF%B8%8F-installation)
- [Quick Start](#-quick-start)
- [Usage](#-usage)
- [License](#-license)
- [Citation](#-citation)
## ☎ Groups
You can contact us and communicate with us by adding our group:
[Discord Group](https://discord.com/invite/D27yfEFVz5) | WeChat Group
:-------------------------:|:-------------------------:
|
## 📝 Introduction
🍲 **ms-swift** is a large model and multimodal large model fine-tuning and deployment framework provided by the ModelScope community. It now supports training (pre-training, fine-tuning, human alignment), inference, evaluation, quantization, and deployment for 600+ text-only large models and 400+ multimodal large models. Large models include: Qwen3, Qwen3.5, InternLM3, GLM4.5, Mistral, DeepSeek-R1, Llama4, etc. Multimodal large models include: Qwen3-VL, Qwen3-Omni, Llava, InternVL3.5, MiniCPM-V-4, Ovis2.5, GLM4.5-V, DeepSeek-VL2, etc.
🍔 In addition, ms-swift integrates the latest training technologies, including Megatron parallelism techniques such as TP, PP, CP, and EP to accelerate training, as well as numerous reinforcement learning algorithms from the GRPO family, including GRPO, DAPO, GSPO, SAPO, CISPO, RLOO, Reinforce++, etc., to enhance model intelligence. ms-swift supports a wide range of training tasks, including preference learning algorithms such as DPO, KTO, RM, CPO, SimPO, ORPO, as well as Embedding, Reranker, and sequence classification tasks. ms-swift provides full-pipeline support for large model training, including acceleration for inference, evaluation, and deployment modules using vLLM, SGLang, and LMDeploy, as well as model quantization using GPTQ, AWQ, BNB, and FP8 technologies.
**Why Choose ms-swift?**
- 🍎 **Model Types**: Supports **600+ text-only large models**, **400+ multimodal large models**, and All-to-All full-modality models across the full pipeline from training to deployment, with Day-0 support for popular models.
- **Dataset Types**: Built-in 150+ datasets for pre-training, fine-tuning, human alignment, multimodal and various other tasks, with support for custom datasets. Users only need to prepare datasets for one-click training.
- **Hardware Support**: Supports A10/A100/H100, RTX series, T4/V100, CPU, MPS, and domestic hardware Ascend NPU, etc.
- **Lightweight Training**: Supports lightweight fine-tuning methods such as LoRA, QLoRA, DoRA, LoRA+, LLaMAPro, LongLoRA, LoRA-GA, ReFT, RS-LoRA, Adapter, LISA, etc.
- **Quantized Training**: Supports training on BNB, AWQ, GPTQ, AQLM, HQQ, EETQ quantized models, requiring only 9GB training resources for 7B models.
- **Memory Optimization**: GaLore, Q-Galore, UnSloth, Liger-Kernel, Flash-Attention 2/3, and **Ulysses and Ring-Attention sequence parallelism techniques** support, reducing memory consumption for long-text training.
- **Distributed Training**: Supports distributed data parallelism (DDP), device_map simple model parallelism, DeepSpeed ZeRO2 ZeRO3, FSDP/FSDP2, and Megatron distributed training technologies.
- 🍓 **Multimodal Training**: Supports multimodal packing technology to improve training speed by 100%+, supports mixed modality data training with text, images, video and audio, and supports independent control of vit/aligner/llm.
- **Agent Training**: Supports Agent templates, allowing one dataset to be used for training different models.
- 🍊 **Training Tasks**: Supports pre-training and instruction fine-tuning, as well as training tasks such as DPO, GKD, KTO, RM, CPO, SimPO, ORPO, and supports **Embedding/Reranker** and sequence classification tasks.
- 🥥 **Megatron Parallelism**: Provides TP/PP/SP/CP/ETP/EP/VPP parallel strategies to significantly boost **MoE model training speed**. Supports full-parameter and LoRA training methods for 300+ pure text large models and 100+ multimodal large models. Supports CPT/SFT/GRPO/DPO/KTO/RM training tasks.
- 🍉 **Reinforcement Learning**: Built-in **rich GRPO family algorithms**, including GRPO, DAPO, GSPO, SAPO, CISPO, CHORD, RLOO, Reinforce++, etc. Supports synchronous and asynchronous vLLM engine inference acceleration, with extensible reward functions, multi-turn inference Schedulers, and environments through plugins.
- **Full-Pipeline Capabilities**: Covers the entire workflow of training, inference, evaluation, quantization, and deployment.
- **UI Training**: Provides Web-UI interface for training, inference, evaluation, and quantization, completing the full pipeline for large models.
- **Inference Acceleration**: Supports Transformers, vLLM, SGLang, and LMDeploy inference acceleration engines, providing OpenAI interfaces for accelerating inference, deployment, and evaluation modules.
- **Model Evaluation**: Uses EvalScope as the evaluation backend, supporting 100+ evaluation datasets for evaluating text-only and multimodal models.
- **Model Quantization**: Supports quantization export for AWQ, GPTQ, FP8, and BNB. Exported models support inference acceleration using vLLM/SGLang/LMDeploy.
## 🎉 News
- 🎁 2026.03.03: **ms-swift v4.0** major version is officially released. For release notes, please refer to [here](https://github.com/modelscope/ms-swift/releases/tag/v4.0.0). You can provide your suggestions to us in [this issue](https://github.com/modelscope/ms-swift/issues/7250). Thank you for your support.
- 🎁 2025.11.14: Megatron GRPO is now available! Check out the [docs](./docs/source_en/Megatron-SWIFT/GRPO.md) and [examples](examples/megatron/grpo).
- 🎁 2025.11.04: Support for [Mcore-Bridge](docs/source_en/Megatron-SWIFT/Mcore-Bridge.md), making Megatron training as simple and easy to use as transformers.
- 🎁 2025.10.28: Support for Ray is now available. Documentation can be found [here](docs/source_en/Instruction/Ray.md).
- 🎁 2025.09.07: Added support for CHORD training algorithm. See the [documentation](./docs/source_en/Instruction/GRPO/AdvancedResearch/CHORD.md).
- 🎁 2025.09.06: Ulysses can now be used with ring-attention, allowing sequences to be sharded into any number of chunks (no longer limited by the number of heads). The argument remains `--sequence_parallel_size N`.
- 🎁 2025.09.02: Megatron-SWIFT now supports multimodal model training. Documentation can be found [here](./docs/source_en/Megatron-SWIFT/Multimodal-Model.md).
- 🎁 2025.08.12: Support [Dynamic Fine-Tuning](https://arxiv.org/abs/2508.05629)(DFT) in SFT training, use parameter `--enable_dft_loss true`. Training scripts can be found [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/full/dft.sh).
- 🎁 2025.07.09: Megatron-SWIFT supports LoRA training. Compared to ms-swift, it achieves significant speedup on MoE models. Training scripts can be found [here](https://github.com/modelscope/ms-swift/blob/main/examples/megatron/lora).
- 🎁 2025.06.23: Fine-tuning of reranker models is supported. Training scripts can be found here: [Reranker](https://github.com/modelscope/ms-swift/blob/main/examples/train/reranker/train_reranker.sh).
- 🎁 2025.06.15: Support for GKD training on both pure text large models and multimodal models. Training scripts can be found here: [Pure Text](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/gkd), [Multimodal](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/gkd).
More
- 🎁 2025.06.11: Support for using Megatron parallelism techniques for RLHF training. The training script can be found [here](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf).
- 🎁 2025.05.29: Support sequence parallel in pretrain, sft, dpo and grpo, check script [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/sequence_parallel).
- 🎁 2025.05.11: GRPO now supports custom processing logic for reward models. See the GenRM example [here](./docs/source_en/Instruction/GRPO/DeveloperGuide/reward_model.md).
- 🎁 2025.04.15: The ms-swift paper has been accepted by AAAI 2025. You can find the paper at [this link](https://ojs.aaai.org/index.php/AAAI/article/view/35383).
- 🎁 2025.03.23: Multi-round GRPO is now supported for training multi-turn dialogue scenarios (e.g., agent tool calling). Please refer to the [doc](./docs/source_en/Instruction/GRPO/DeveloperGuide/multi_turn.md).
- 🎁 2025.03.16: Support for Megatron's parallel training techniques is now available. Please see the [Megatron-SWIFT training documentation](https://swift.readthedocs.io/en/latest/Megatron-SWIFT/Quick-start.html).
- 🎁 2025.03.15: Fine-tuning of embedding models for both pure text and multimodal models is supported. Please check the [training script](examples/train/embedding).
- 🎁 2025.03.05: The hybrid mode for GRPO is supported, with a script for training a 72B model on 4 GPUs (4*80G) available [here](examples/train/grpo/internal/vllm_72b_4gpu.sh). Tensor parallelism with vllm is also supported, with the training script available [here](examples/train/grpo/internal).
- 🎁 2025.02.21: The GRPO algorithm now supports LMDeploy, with the training script available [here](examples/train/grpo/internal/full_lmdeploy.sh). Additionally, the performance of the GRPO algorithm has been tested, achieving a training speed increase of up to 300% using various tricks. Please check the WanDB table [here](https://wandb.ai/tastelikefeet/grpo_perf_test?nw=nwuseryuzezyz).
- 🎁 2025.02.21: The `swift sample` command is now supported. The reinforcement fine-tuning script can be found [here](docs/source_en/Instruction/Reinforced-Fine-tuning.md), and the large model API distillation sampling script is available [here](examples/sampler/distill/distill.sh).
- 🔥 2025.02.12: Support for the GRPO (Group Relative Policy Optimization) training algorithm has been added. Documentation is available [here](docs/source_en/Instruction/GRPO/GetStarted/GRPO.md).
- 🎁 2024.12.04: Major update to **ms-swift 3.0**. Please refer to the [release notes and changes](docs/source_en/Instruction/ReleaseNote3.0.md).
- 🎉 2024.08.12: The ms-swift paper has been published on arXiv and can be read [here](https://arxiv.org/abs/2408.05517).
- 🔥 2024.08.05: Support for using [evalscope](https://github.com/modelscope/evalscope/) as a backend for evaluating large models and multimodal models.
- 🔥 2024.07.29: Support for using [vllm](https://github.com/vllm-project/vllm) and [lmdeploy](https://github.com/InternLM/lmdeploy) to accelerate inference for large models and multimodal models. When performing infer/deploy/eval, you can specify `--infer_backend vllm/lmdeploy`.
- 🔥 2024.07.24: Support for human preference alignment training for multimodal large models, including DPO/ORPO/SimPO/CPO/KTO/RM/PPO.
- 🔥 2024.02.01: Support for Agent training! The training algorithm is derived from [this paper](https://arxiv.org/pdf/2309.00986.pdf).
## 🛠️ Installation
To install using pip:
```shell
pip install ms-swift -U
# Using uv
pip install uv
uv pip install ms-swift -U --torch-backend=auto
```
To install from source:
```shell
# pip install git+https://github.com/modelscope/ms-swift.git
git clone https://github.com/modelscope/ms-swift.git
cd ms-swift
# The main branch is for swift 4.x. To install swift 3.x, please run the following command:
# git checkout release/3.12
pip install -e .
# Using uv
uv pip install -e . --torch-backend=auto
```
Running Environment:
| | Range | Recommended | Notes |
|--------------|--------------|---------------------|-------------------------------------------|
| python | >=3.9 | 3.11/3.12 | |
| cuda | | cuda12 | No need to install if using CPU, NPU, MPS |
| torch | >=2.0 | 2.8.0/2.10.0 | |
| transformers | >=4.33 | 4.57.6/5.2.0 | |
| modelscope | >=1.23 | | |
| peft | >=0.11,<0.19 | | |
| flash_attn | | 2.8.3/3.0.0b1 | |
| trl | >=0.15,<0.29 | 0.28.0 | RLHF |
| deepspeed | >=0.14 | 0.18.8 | Training |
| vllm | >=0.5.1 | 0.11.0/0.17.1 | Inference/Deployment |
| sglang | >=0.4.6 | | Inference/Deployment |
| lmdeploy | >=0.5 | 0.10.1 | Inference/Deployment |
| evalscope | >=1.0 | | Evaluation |
| gradio | | 5.32.1 | Web-UI/App |
For more optional dependencies, you can refer to [here](https://github.com/modelscope/ms-swift/blob/main/requirements/install_all.sh).
## 🚀 Quick Start
10 minutes of self-cognition fine-tuning of Qwen3-4B-Instruct-2507 on a single 3090 GPU:
### Command Line Interface (Recommended)
```shell
# 13GB
CUDA_VISIBLE_DEVICES=0 \
swift sft \
--model Qwen/Qwen3-4B-Instruct-2507 \
--tuner_type lora \
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
'AI-ModelScope/alpaca-gpt4-data-en#500' \
'swift/self-cognition#500' \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--gradient_accumulation_steps 16 \
--eval_steps 50 \
--save_steps 50 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--model_author swift \
--model_name swift-robot
```
Tips:
- If you want to train with a custom dataset, you can refer to [this guide](https://swift.readthedocs.io/en/latest/Customization/Custom-dataset.html) to organize your dataset format and specify `--dataset <dataset_path>`.
- The `--model_author` and `--model_name` parameters are only effective when the dataset includes `swift/self-cognition`.
- To train with a different model, simply modify `--model <model_id/model_path>`.
- By default, **ModelScope** is used for downloading models and datasets. If you want to use HuggingFace, simply specify `--use_hf true`.
After training is complete, use the following command to infer with the trained weights:
- Here, `--adapters` should be replaced with the last checkpoint folder generated during training. Since the adapters folder contains the training parameter file `args.json`, there is no need to specify `--model`, `--system` separately; Swift will automatically read these parameters. To disable this behavior, you can set `--load_args false`.
```shell
# Using an interactive command line for inference.
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--adapters output/vx-xxx/checkpoint-xxx \
--stream true \
--temperature 0 \
--max_new_tokens 2048
# merge-lora and use vLLM for inference acceleration
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--adapters output/vx-xxx/checkpoint-xxx \
--stream true \
--merge_lora true \
--infer_backend vllm \
--vllm_max_model_len 8192 \
--temperature 0 \
--max_new_tokens 2048
```
Finally, use the following command to push the model to ModelScope:
```shell
CUDA_VISIBLE_DEVICES=0 \
swift export \
--adapters output/vx-xxx/checkpoint-xxx \
--push_to_hub true \
--hub_model_id '<your-model-id>' \
--hub_token '<your-sdk-token>' \
--use_hf false
```
### Web-UI
The Web-UI is a **zero-threshold** training and deployment interface solution based on Gradio interface technology. For more details, you can check [here](https://swift.readthedocs.io/en/latest/GetStarted/Web-UI.html).
```shell
SWIFT_UI_LANG=en swift web-ui
```

### Using Python
ms-swift also supports training and inference using Python. Below is pseudocode for training and inference. For more details, you can refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/notebook/qwen2_5-self-cognition/self-cognition-sft.ipynb).
Training:
```python
from peft import LoraConfig, get_peft_model
from swift import get_model_processor, get_template, load_dataset, EncodePreprocessor
from swift.trainers import Seq2SeqTrainer, Seq2SeqTrainingArguments
# Retrieve the model and template, and add a trainable LoRA module
model, tokenizer = get_model_processor(model_id_or_path, ...)
template = get_template(tokenizer, ...)
lora_config = LoraConfig(...)
model = get_peft_model(model, lora_config)
# Download and load the dataset, and encode the text into tokens
train_dataset, val_dataset = load_dataset(dataset_id_or_path, ...)
train_dataset = EncodePreprocessor(template=template)(train_dataset, num_proc=num_proc)
val_dataset = EncodePreprocessor(template=template)(val_dataset, num_proc=num_proc)
# Train the model
training_args = Seq2SeqTrainingArguments(...)
trainer = Seq2SeqTrainer(
model=model,
args=training_args,
template=template,
train_dataset=train_dataset,
eval_dataset=val_dataset,
)
trainer.train()
```
Inference:
```python
from swift import TransformersEngine, InferRequest, RequestConfig
# Perform inference using the native Transformers engine
engine = TransformersEngine(model_id_or_path, adapters=[lora_checkpoint])
infer_request = InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}])
request_config = RequestConfig(max_tokens=max_new_tokens, temperature=temperature)
resp_list = engine.infer([infer_request], request_config)
print(f'response: {resp_list[0].choices[0].message.content}')
```
## ✨ Usage
Here is a minimal end-to-end example, from training to deployment, using ms-swift. For more details, you can check the [examples](https://github.com/modelscope/ms-swift/tree/main/examples).
- If you want to use other models or datasets (including multimodal models and datasets), you only need to modify `--model` to specify the corresponding model's ID or path, and modify `--dataset` to specify the corresponding dataset's ID or path.
- By default, ModelScope is used for downloading models and datasets. If you want to use HuggingFace, simply specify `--use_hf true`.
| Useful Links |
| ------ |
| [🔥Command Line Parameters](https://swift.readthedocs.io/en/latest/Instruction/Command-line-parameters.html) |
| [Megatron-SWIFT](https://swift.readthedocs.io/en/latest/Megatron-SWIFT/Quick-start.html) |
| [GRPO](https://swift.readthedocs.io/en/latest/Instruction/GRPO/GetStarted/GRPO.html) |
| [Supported Models and Datasets](https://swift.readthedocs.io/en/latest/Instruction/Supported-models-and-datasets.html) |
| [Custom Models](https://swift.readthedocs.io/en/latest/Customization/Custom-model.html), [🔥Custom Datasets](https://swift.readthedocs.io/en/latest/Customization/Custom-dataset.html) |
| [LLM Tutorial](https://github.com/modelscope/modelscope-classroom/tree/main/LLM-tutorial) |
### Training
Supported Training Methods:
| Method | Full-Parameter | LoRA | QLoRA | Deepspeed | Multi-Machine | Multimodal |
| ------------------------------------------------------------ | ------------------------------------------------------------ | ---- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
| [Pre-training](https://github.com/modelscope/ms-swift/blob/main/examples/train/pretrain) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Supervised Fine-Tuning](https://github.com/modelscope/ms-swift/blob/main/examples/train/lora_sft.sh) | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/full/train.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/qlora) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-gpu/deepspeed) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-node) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal) |
| [GRPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [GKD](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/gkd) | ✅ | ✅ | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/gkd) |
| [PPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/ppo) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| [DPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/dpo) | ✅ | ✅ | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/dpo) |
| [KTO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/kto.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/kto.sh) |
| [Reward Model](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/rm.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [CPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/cpo.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [SimPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/simpo.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [ORPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/orpo.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Embedding](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Reranker](https://github.com/modelscope/ms-swift/tree/main/examples/train/reranker) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Sequence Classification](https://github.com/modelscope/ms-swift/blob/main/examples/train/seq_cls) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
Pre-training:
```shell
# 8*A100
NPROC_PER_NODE=8 \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
swift pt \
--model Qwen/Qwen2.5-7B \
--dataset swift/chinese-c4 \
--streaming true \
--tuner_type full \
--deepspeed zero2 \
--output_dir output \
--max_steps 10000 \
...
```
Fine-tuning:
```shell
CUDA_VISIBLE_DEVICES=0 swift sft \
--model Qwen/Qwen2.5-7B-Instruct \
--dataset AI-ModelScope/alpaca-gpt4-data-en \
--tuner_type lora \
--output_dir output \
...
```
RLHF:
```shell
CUDA_VISIBLE_DEVICES=0 swift rlhf \
--rlhf_type dpo \
--model Qwen/Qwen2.5-7B-Instruct \
--dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
--tuner_type lora \
--output_dir output \
...
```
### Megatron-SWIFT
ms-swift supports using Megatron parallelism techniques to accelerate training, including large-scale cluster training and MoE model training. The following training methods are supported:
| Method | Full-Parameter | LoRA | MoE | Multimodal | FP8 |
| ---------------------- | -------------- | ---- | ---- | ---------- | ---- |
| Pre-training | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Supervised Fine-Tuning](https://github.com/modelscope/ms-swift/tree/main/examples/megatron) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [GRPO](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/grpo) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [GKD](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/gkd) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [DPO](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/dpo) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [KTO](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/kto) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [RM](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/rm) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Embedding](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/embedding) | ✅ | ✅| ✅ | ✅ | ✅ |
| [Reranker](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/reranker) | ✅ | ✅| ✅ | ✅ | ✅ |
| [Sequence Classification](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/seq_cls) | ✅ | ✅ | ✅ | ✅ | ✅ |
```shell
NPROC_PER_NODE=2 CUDA_VISIBLE_DEVICES=0,1 megatron sft \
--model Qwen/Qwen2.5-7B-Instruct \
--save_safetensors true \
--dataset AI-ModelScope/alpaca-gpt4-data-zh \
--tuner_type lora \
--output_dir output \
...
```
### Reinforcement Learning
ms-swift supports a rich set of GRPO family algorithms:
| Method | Full-Parameter | LoRA | Multimodal | Multi-Machine |
| ------------------------------------------------------------ | -------------- | ---- | ---------- | ------------- |
| [GRPO](https://swift.readthedocs.io/en/latest/Instruction/GRPO/GetStarted/GRPO.html) | ✅ | ✅ | ✅ | ✅ |
| [DAPO](https://swift.readthedocs.io/en/latest/Instruction/GRPO/AdvancedResearch/DAPO.html) | ✅ | ✅ | ✅ | ✅ |
| [GSPO](https://swift.readthedocs.io/en/latest/Instruction/GRPO/AdvancedResearch/GSPO.html) | ✅ | ✅ | ✅ | ✅ |
| [SAPO](https://swift.readthedocs.io/en/latest/Instruction/GRPO/AdvancedResearch/SAPO.html) | ✅ | ✅ | ✅ | ✅ |
| [CISPO](https://swift.readthedocs.io/en/latest/Instruction/GRPO/AdvancedResearch/CISPO.html) | ✅ | ✅ | ✅ | ✅ |
| [CHORD](https://swift.readthedocs.io/en/latest/Instruction/GRPO/AdvancedResearch/CHORD.html) | ✅ | ✅ | ✅ | ✅ |
| [RLOO](https://swift.readthedocs.io/en/latest/Instruction/GRPO/AdvancedResearch/RLOO.html) | ✅ | ✅ | ✅ | ✅ |
| [Reinforce++](https://swift.readthedocs.io/en/latest/Instruction/GRPO/AdvancedResearch/REINFORCEPP.html) | ✅ | ✅ | ✅ | ✅ |
```shell
CUDA_VISIBLE_DEVICES=0,1,2,3 NPROC_PER_NODE=4 \
swift rlhf \
--rlhf_type grpo \
--model Qwen/Qwen2.5-7B-Instruct \
--tuner_type lora \
--use_vllm true \
--vllm_mode colocate \
--dataset AI-MO/NuminaMath-TIR#10000 \
--output_dir output \
...
```
### Inference
```shell
CUDA_VISIBLE_DEVICES=0 swift infer \
--model Qwen/Qwen2.5-7B-Instruct \
--stream true \
--infer_backend transformers \
--max_new_tokens 2048
# LoRA
CUDA_VISIBLE_DEVICES=0 swift infer \
--model Qwen/Qwen2.5-7B-Instruct \
--adapters swift/test_lora \
--stream true \
--infer_backend transformers \
--temperature 0 \
--max_new_tokens 2048
```
### Interface Inference
```shell
CUDA_VISIBLE_DEVICES=0 swift app \
--model Qwen/Qwen2.5-7B-Instruct \
--stream true \
--infer_backend transformers \
--max_new_tokens 2048
```
### Deployment
```shell
CUDA_VISIBLE_DEVICES=0 swift deploy \
--model Qwen/Qwen2.5-7B-Instruct \
--infer_backend vllm
```
### Sampling
```shell
CUDA_VISIBLE_DEVICES=0 swift sample \
--model LLM-Research/Meta-Llama-3.1-8B-Instruct \
--sampler_engine transformers \
--num_return_sequences 5 \
--dataset AI-ModelScope/alpaca-gpt4-data-zh#5
```
### Evaluation
```shell
CUDA_VISIBLE_DEVICES=0 swift eval \
--model Qwen/Qwen2.5-7B-Instruct \
--infer_backend lmdeploy \
--eval_backend OpenCompass \
--eval_dataset ARC_c
```
### Quantization
```shell
CUDA_VISIBLE_DEVICES=0 swift export \
--model Qwen/Qwen2.5-7B-Instruct \
--quant_bits 4 --quant_method awq \
--dataset AI-ModelScope/alpaca-gpt4-data-zh \
--output_dir Qwen2.5-7B-Instruct-AWQ
```
### Push Model
```shell
swift export \
--model <model-path> \
--push_to_hub true \
--hub_model_id '<model-id>' \
--hub_token '<sdk-token>'
```
## 🏛 License
This framework is licensed under the [Apache License (Version 2.0)](https://github.com/modelscope/ms-swift/blob/main/LICENSE). For models and datasets, please refer to the original resource page and follow the corresponding License.
## 📎 Citation
```bibtex
@misc{zhao2024swiftascalablelightweightinfrastructure,
title={SWIFT:A Scalable lightWeight Infrastructure for Fine-Tuning},
author={Yuze Zhao and Jintao Huang and Jinghan Hu and Xingjun Wang and Yunlin Mao and Daoze Zhang and Zeyinzi Jiang and Zhikai Wu and Baole Ai and Ang Wang and Wenmeng Zhou and Yingda Chen},
year={2024},
eprint={2408.05517},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2408.05517},
}
```
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=modelscope/ms-swift&type=Date)](https://star-history.com/#modelscope/ms-swift&Date)
================================================
FILE: README_CN.md
================================================
# SWIFT (Scalable lightWeight Infrastructure for Fine-Tuning)
魔搭社区官网
中文  |  English  
论文   | English Documentation   |   中文文档  
## 📖 目录
- [用户群](#-用户群)
- [简介](#-简介)
- [新闻](#-新闻)
- [安装](#%EF%B8%8F-安装)
- [快速开始](#-快速开始)
- [如何使用](#-如何使用)
- [License](#-license)
- [引用](#-引用)
## ☎ 用户群
请扫描下面的二维码来加入我们的交流群:
[Discord Group](https://discord.com/invite/D27yfEFVz5) | 微信群
:-------------------------:|:-------------------------:
|
## 📝 简介
🍲 **ms-swift**是魔搭社区提供的大模型与多模态大模型微调部署框架,现已支持600+纯文本大模型与400+多模态大模型的训练(预训练、微调、人类对齐)、推理、评测、量化与部署。其中大模型包括:Qwen3、Qwen3.5、InternLM3、GLM4.5、Mistral、DeepSeek-R1、Llama4等模型,多模态大模型包括:Qwen3-VL、Qwen3-Omni、Llava、InternVL3.5、MiniCPM-V-4、Ovis2.5、GLM4.5-V、DeepSeek-VL2等模型。
🍔 除此之外,ms-swift汇集了最新的训练技术:集成Megatron并行技术(TP、PP、CP、EP等)为训练提供加速,并内置众多GRPO族强化学习算法(GRPO、DAPO、GSPO、SAPO、CISPO、RLOO、Reinforce++等)以提升模型智能。ms-swift支持广泛的训练任务,包括DPO、KTO、RM、CPO、SimPO、ORPO等偏好学习算法,以及Embedding、Reranker、序列分类任务。ms-swift提供了大模型训练全链路的支持,包括使用vLLM、SGLang和LMDeploy对推理、评测、部署模块提供加速,以及使用GPTQ、AWQ、BNB、FP8技术对大模型进行量化。
**为什么选择ms-swift?**
- 🍎 **模型类型**:支持**600+纯文本大模型**、**400+多模态大模型**以及All-to-All全模态模型训练到部署全流程,热门模型Day0支持。
- **数据集类型**:内置150+预训练、微调、人类对齐、多模态等各种任务数据集,并支持自定义数据集,用户只需准备数据集即可一键训练。
- **硬件支持**:支持A10/A100/H100、RTX系列、T4/V100、CPU、MPS以及国产硬件Ascend NPU等。
- **轻量训练**:支持了LoRA、QLoRA、DoRA、LoRA+、LLaMAPro、LongLoRA、LoRA-GA、ReFT、RS-LoRA、Adapter、LISA等轻量微调方式。
- **量化训练**:支持对BNB、AWQ、GPTQ、AQLM、HQQ、EETQ量化模型进行训练,7B模型训练只需9GB训练资源。
- **显存优化**: GaLore、Q-Galore、UnSloth、Liger-Kernel、Flash-Attention 2/3 以及 **Ulysses和Ring-Attention序列并行技术**支持,降低长文本训练显存占用。
- **分布式训练**:支持分布式数据并行(DDP)、device_map简易模型并行、DeepSpeed ZeRO2 ZeRO3、FSDP/FSDP2以及Megatron等分布式训练技术。
- 🍓 **多模态训练**:支持多模态packing技术提升训练速度100%+,支持文本、图像、视频和语音混合模态数据训练,支持vit/aligner/llm单独控制。
- **Agent训练**:支持Agent template,准备一套数据集可用于不同模型的训练。
- 🍊 **训练任务**:支持预训练和指令微调,以及DPO、GKD、KTO、RM、CPO、SimPO、ORPO等训练任务,支持**Embedding/Reranker**和序列分类任务。
- 🥥 **Megatron并行技术**:提供TP/PP/SP/CP/ETP/EP/VPP并行策略,显著提升**MoE模型训练速度**。支持300+纯文本大模型和100+多模态大模型的全参数和LoRA训练方法。支持CPT/SFT/GRPO/DPO/KTO/RM训练任务。
- 🍉 **强化学习**:内置**丰富GRPO族算法**,包括GRPO、DAPO、GSPO、SAPO、CISPO、CHORD、RLOO、Reinforce++等,支持同步和异步vLLM引擎推理加速,可使用插件拓展奖励函数、多轮推理调度器以及环境等。
- **全链路能力**:覆盖训练、推理、评测、量化和部署全流程。
- **界面训练**:提供使用Web-UI界面的方式进行训练、推理、评测、量化,完成大模型的全链路。
- **推理加速**:支持Transformers、vLLM、SGLang和LmDeploy推理加速引擎,并提供OpenAI接口,为推理、部署和评测模块提供加速。
- **模型评测**:以EvalScope作为评测后端,支持100+评测数据集对纯文本和多模态模型进行评测。
- **模型量化**:支持AWQ、GPTQ、FP8和BNB的量化导出,导出的模型支持使用vLLM/SGLang/LmDeploy推理加速。
## 🎉 新闻
- 🎁 2026.03.03: **ms-swift v4.0**大版本正式发布,release note参考[这里](https://github.com/modelscope/ms-swift/releases/tag/v4.0.0),您的建议可以在[这个issue](https://github.com/modelscope/ms-swift/issues/7250)中反馈给我们,感谢您的支持。
- 🎁 2025.11.14: Megatron GRPO现已支持!查看[文档](./docs/source/Megatron-SWIFT/GRPO.md)和[示例](examples/megatron/grpo)。
- 🎁 2025.11.04: 支持[Mcore-Bridge](docs/source/Megatron-SWIFT/Mcore-Bridge.md),使Megatron训练像transformers一样简单易用。
- 🎁 2025.10.28: Ray [已支持](docs/source/Instruction/Ray.md)。
- 🎁 2025.09.07: 支持CHORD训练算法,请查看[文档](docs/source/Instruction/GRPO/AdvancedResearch/CHORD.md)。
- 🎁 2025.09.06: Ulysses现已支持与ring-attention结合使用,使得输入序列可以被切分成任意数量的块(不再受限于num_heads),命令参数仍然是`--sequence_parallel_size N`。
- 🎁 2025.09.02: Megatron-SWIFT支持多模态模型训练。文档参考[这里](./docs/source/Megatron-SWIFT/Mcore-Bridge.md)。
- 🎁 2025.08.12: 支持在SFT训练中使用[Dynamic Fine-Tuning](https://arxiv.org/abs/2508.05629)(DFT),使用参数 `--enable_dft_loss true`。训练脚本参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/full/dft.sh)
- 🎁 2025.07.09: Megatron-SWIFT支持LoRA训练。相比ms-swift,在MoE模型提速显著。训练脚本参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/megatron/lora)。
- 🎁 2025.06.23: 支持Reranker模型训练,训练脚本参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/reranker/train_reranker.sh)。
- 🎁 2025.06.15: 支持对纯文本大模型和多模态模型进行GKD训练。训练脚本参考这里:[纯文本](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/gkd), [多模态](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/gkd)。
更多
- 🎁 2025.06.11: 支持使用Megatron并行技术进行RLHF训练,训练脚本参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf)。
- 🎁 2025.05.29: 支持pt、sft、dpo、grpo的序列并行,具体请查看[脚本](https://github.com/modelscope/ms-swift/tree/main/examples/train/sequence_parallel)。
- 🎁 2025.05.11: GRPO中的奖励模型支持自定义处理逻辑,GenRM的例子参考[这里](./docs/source/Instruction/GRPO/DeveloperGuide/reward_model.md)。
- 🎁 2025.04.15: ms-swift论文已经被AAAI 2025接收,论文地址在[这里](https://ojs.aaai.org/index.php/AAAI/article/view/35383)。
- 🎁 2025.03.23: 支持了多轮GRPO,用于构建多轮对话场景的训练(例如agent tool calling),请查看[文档](docs/source/Instruction/GRPO/DeveloperGuide/multi_turn.md)。
- 🎁 2025.03.16: 支持了Megatron的并行技术进行训练,请查看[Megatron-SWIFT训练文档](https://swift.readthedocs.io/zh-cn/latest/Megatron-SWIFT/Quick-start.html)。
- 🎁 2025.03.15: 支持纯文本和多模态模型的embedding模型的微调,请查看[训练脚本](examples/train/embedding)。
- 🎁 2025.03.05: 支持GRPO的hybrid模式,4GPU(4*80G)训练72B模型的脚本参考[这里](examples/train/grpo/internal/vllm_72b_4gpu.sh)。同时支持vllm的tensor并行,训练脚本参考[这里](examples/train/grpo/internal)。
- 🎁 2025.02.21: GRPO算法支持使用LMDeploy,训练脚本参考[这里](examples/train/grpo/internal/full_lmdeploy.sh)。此外测试了GRPO算法的性能,使用一些tricks使训练速度提高到300%。WandB表格请查看[这里](https://wandb.ai/tastelikefeet/grpo_perf_test?nw=nwuseryuzezyz)。
- 🎁 2025.02.21: 支持`swift sample`命令。强化微调脚本参考[这里](docs/source/Instruction/Reinforced-Fine-tuning.md),大模型API蒸馏采样脚本参考[这里](examples/sampler/distill/distill.sh)。
- 🔥 2025.02.12: 支持GRPO (Group Relative Policy Optimization) 训练算法,文档参考[这里](docs/source/Instruction/GRPO/GetStarted/GRPO.md)。
- 🎁 2024.12.04: **ms-swift3.0**大版本更新。请查看[发布说明和更改](docs/source/Instruction/ReleaseNote3.0.md)。
- 🎉 2024.08.12: ms-swift论文已经发布到arXiv上,可以点击[这里](https://arxiv.org/abs/2408.05517)阅读。
- 🔥 2024.08.05: 支持使用[evalscope](https://github.com/modelscope/evalscope/)作为后端进行大模型和多模态模型的评测。
- 🔥 2024.07.29: 支持使用[vllm](https://github.com/vllm-project/vllm), [lmdeploy](https://github.com/InternLM/lmdeploy)对大模型和多模态大模型进行推理加速,在infer/deploy/eval时额外指定`--infer_backend vllm/lmdeploy`即可。
- 🔥 2024.07.24: 支持对多模态大模型进行人类偏好对齐训练,包括DPO/ORPO/SimPO/CPO/KTO/RM/PPO。
- 🔥 2024.02.01: 支持Agent训练!训练算法源自这篇[论文](https://arxiv.org/pdf/2309.00986.pdf)。
## 🛠️ 安装
使用pip进行安装:
```shell
pip install ms-swift -U
# 使用uv
pip install uv
uv pip install ms-swift -U --torch-backend=auto
```
从源代码安装:
```shell
# pip install git+https://github.com/modelscope/ms-swift.git
git clone https://github.com/modelscope/ms-swift.git
cd ms-swift
# main分支为swift4.x。若安装swift3.x,请运行以下命令
# git checkout release/3.12
pip install -e .
# 使用uv
uv pip install -e . --torch-backend=auto
```
运行环境:
| | 范围 | 推荐 | 备注 |
|--------------|--------------|---------------------|--------------------|
| python | >=3.9 | 3.11/3.12 | |
| cuda | | cuda12 | 使用cpu、npu、mps则无需安装 |
| torch | >=2.0 | 2.8.0/2.10.0 | |
| transformers | >=4.33 | 4.57.6/5.2.0 | |
| modelscope | >=1.23 | | |
| peft | >=0.11,<0.19 | | |
| flash_attn | | 2.8.3/3.0.0b1 | |
| trl | >=0.15,<0.29 | 0.28.0 | RLHF |
| deepspeed | >=0.14 | 0.18.8 | 训练 |
| vllm | >=0.5.1 | 0.11.0/0.17.1 | 推理/部署 |
| sglang | >=0.4.6 | | 推理/部署 |
| lmdeploy | >=0.5 | 0.10.1 | 推理/部署 |
| evalscope | >=1.0 | | 评测 |
| gradio | | 5.32.1 | Web-UI/App |
更多可选依赖可以参考[这里](https://github.com/modelscope/ms-swift/blob/main/requirements/install_all.sh)。
## 🚀 快速开始
**10分钟**在单卡3090上对Qwen3-4B-Instruct-2507进行自我认知微调:
### 命令行(推荐)
```shell
# 13GB
CUDA_VISIBLE_DEVICES=0 \
swift sft \
--model Qwen/Qwen3-4B-Instruct-2507 \
--tuner_type lora \
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
'AI-ModelScope/alpaca-gpt4-data-en#500' \
'swift/self-cognition#500' \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--gradient_accumulation_steps 16 \
--eval_steps 50 \
--save_steps 50 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--model_author swift \
--model_name swift-robot
```
小贴士:
- 如果要使用自定义数据集进行训练,你可以参考[这里](https://swift.readthedocs.io/zh-cn/latest/Customization/Custom-dataset.html)组织数据集格式,并指定`--dataset <dataset_path>`。
- `--model_author`和`--model_name`参数只有当数据集中包含`swift/self-cognition`时才生效。
- 如果要使用其他模型进行训练,你只需要修改`--model <model_id/model_path>`即可。
- 默认使用**ModelScope**进行模型和数据集的下载。如果要使用HuggingFace,指定`--use_hf true`即可。
训练完成后,使用以下命令对训练后的权重进行推理:
- 这里的`--adapters`需要替换成训练生成的last checkpoint文件夹。由于adapters文件夹中包含了训练的参数文件`args.json`,因此不需要额外指定`--model`,`--system`,swift会自动读取这些参数。如果要关闭此行为,可以设置`--load_args false`。
```shell
# 使用交互式命令行进行推理
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--adapters output/vx-xxx/checkpoint-xxx \
--stream true \
--temperature 0 \
--max_new_tokens 2048
# merge-lora并使用vLLM进行推理加速
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--adapters output/vx-xxx/checkpoint-xxx \
--stream true \
--merge_lora true \
--infer_backend vllm \
--vllm_max_model_len 8192 \
--temperature 0 \
--max_new_tokens 2048
```
最后,使用以下命令将模型推送到ModelScope:
```shell
CUDA_VISIBLE_DEVICES=0 \
swift export \
--adapters output/vx-xxx/checkpoint-xxx \
--push_to_hub true \
--hub_model_id '<your-model-id>' \
--hub_token '<your-sdk-token>' \
--use_hf false
```
### Web-UI
Web-UI是基于gradio界面技术的**零门槛**训练、部署界面方案,具体可以查看[这里](https://swift.readthedocs.io/zh-cn/latest/GetStarted/Web-UI.html)。
```shell
swift web-ui
```

### 使用Python
ms-swift也支持使用python的方式进行训练和推理。下面给出训练和推理的**伪代码**,具体可以查看[这里](https://github.com/modelscope/ms-swift/blob/main/examples/notebook/qwen2_5-self-cognition/self-cognition-sft.ipynb)。
训练:
```python
from peft import LoraConfig, get_peft_model
from swift import get_model_processor, get_template, load_dataset, EncodePreprocessor
from swift.trainers import Seq2SeqTrainer, Seq2SeqTrainingArguments
# 获取模型和template,并加入可训练的LoRA模块
model, tokenizer = get_model_processor(model_id_or_path, ...)
template = get_template(tokenizer, ...)
lora_config = LoraConfig(...)
model = get_peft_model(model, lora_config)
# 下载并载入数据集,并将文本encode成tokens
train_dataset, val_dataset = load_dataset(dataset_id_or_path, ...)
train_dataset = EncodePreprocessor(template=template)(train_dataset, num_proc=num_proc)
val_dataset = EncodePreprocessor(template=template)(val_dataset, num_proc=num_proc)
# 进行训练
training_args = Seq2SeqTrainingArguments(...)
trainer = Seq2SeqTrainer(
model=model,
args=training_args,
template=template,
train_dataset=train_dataset,
eval_dataset=val_dataset,
)
trainer.train()
```
推理:
```python
from swift import TransformersEngine, InferRequest, RequestConfig
# 使用原生 transformers 引擎进行推理
engine = TransformersEngine(model_id_or_path, adapters=[lora_checkpoint])
infer_request = InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}])
request_config = RequestConfig(max_tokens=max_new_tokens, temperature=temperature)
resp_list = engine.infer([infer_request], request_config)
print(f'response: {resp_list[0].choices[0].message.content}')
```
## ✨ 如何使用
这里给出使用ms-swift进行训练到部署的最简示例,具体可以查看[examples](https://github.com/modelscope/ms-swift/tree/main/examples)。
- 若想使用其他模型或者数据集(含多模态模型和数据集),你只需要修改`--model`指定对应模型的id或者path,修改`--dataset`指定对应数据集的id或者path即可。
- 默认使用ModelScope进行模型和数据集的下载。如果要使用HuggingFace,指定`--use_hf true`即可。
| 常用链接 |
| ------ |
| [🔥命令行参数](https://swift.readthedocs.io/zh-cn/latest/Instruction/Command-line-parameters.html) |
| [Megatron-SWIFT](https://swift.readthedocs.io/zh-cn/latest/Megatron-SWIFT/Quick-start.html) |
| [GRPO](https://swift.readthedocs.io/zh-cn/latest/Instruction/GRPO/GetStarted/GRPO.html) |
| [支持的模型和数据集](https://swift.readthedocs.io/zh-cn/latest/Instruction/Supported-models-and-datasets.html) |
| [自定义模型](https://swift.readthedocs.io/zh-cn/latest/Customization/Custom-model.html), [🔥自定义数据集](https://swift.readthedocs.io/zh-cn/latest/Customization/Custom-dataset.html) |
| [大模型教程](https://github.com/modelscope/modelscope-classroom/tree/main/LLM-tutorial) |
### 训练
支持的训练方法:
| 方法 | 全参数 | LoRA | QLoRA | Deepspeed | 多机 | 多模态 |
| ------ | ------ |---------------------------------------------------------------------------------------------| ----- | ------ | ------ |----------------------------------------------------------------------------------------------|
| [预训练](https://github.com/modelscope/ms-swift/blob/main/examples/train/pretrain) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [指令监督微调](https://github.com/modelscope/ms-swift/blob/main/examples/train/lora_sft.sh) | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/full/train.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/qlora) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-gpu/deepspeed) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-node) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal) |
| [GRPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [GKD](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/gkd) | ✅ | ✅ | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/gkd) |
| [PPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/ppo) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| [DPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/dpo) | ✅ | ✅ | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/dpo) |
| [KTO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/kto.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/kto.sh) |
| [奖励模型](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/rm.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [CPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/cpo.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [SimPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/simpo.sh) | ✅ | ✅ | ✅ | ✅| ✅ | ✅ |
| [ORPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/orpo.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Embedding](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Reranker](https://github.com/modelscope/ms-swift/tree/main/examples/train/reranker) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [序列分类](https://github.com/modelscope/ms-swift/blob/main/examples/train/seq_cls) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
预训练:
```shell
# 8*A100
NPROC_PER_NODE=8 \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
swift pt \
--model Qwen/Qwen2.5-7B \
--dataset swift/chinese-c4 \
--streaming true \
--tuner_type full \
--deepspeed zero2 \
--output_dir output \
--max_steps 10000 \
...
```
微调:
```shell
CUDA_VISIBLE_DEVICES=0 swift sft \
--model Qwen/Qwen2.5-7B-Instruct \
--dataset AI-ModelScope/alpaca-gpt4-data-zh \
--tuner_type lora \
--output_dir output \
...
```
RLHF:
```shell
CUDA_VISIBLE_DEVICES=0 swift rlhf \
--rlhf_type dpo \
--model Qwen/Qwen2.5-7B-Instruct \
--dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
--tuner_type lora \
--output_dir output \
...
```
### Megatron-SWIFT
ms-swift支持使用Megatron并行技术加速训练,包括大规模集群训练和MoE模型训练。以下为支持的训练方法:
| 方法 | 全参数 | LoRA | MoE | 多模态 | FP8 |
| ------ | ------ | ---- | ----- | ----- | ----- |
| 预训练 | ✅ | ✅| ✅ | ✅ | ✅ |
| [指令监督微调](https://github.com/modelscope/ms-swift/tree/main/examples/megatron) | ✅ | ✅| ✅ | ✅ | ✅ |
| [GRPO](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/grpo) | ✅ | ✅| ✅ | ✅ | ✅ |
| [GKD](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/gkd) | ✅ | ✅| ✅ | ✅ | ✅ |
| [DPO](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/dpo) | ✅ | ✅| ✅ | ✅ | ✅ |
| [KTO](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/kto) | ✅ | ✅| ✅ | ✅ | ✅ |
| [RM](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/rm) | ✅ | ✅| ✅ | ✅ | ✅ |
| [Embedding](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/embedding) | ✅ | ✅| ✅ | ✅ | ✅ |
| [Reranker](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/reranker) | ✅ | ✅| ✅ | ✅ | ✅ |
| [序列分类](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/seq_cls) | ✅ | ✅| ✅ | ✅ | ✅ |
```shell
NPROC_PER_NODE=2 CUDA_VISIBLE_DEVICES=0,1 megatron sft \
--model Qwen/Qwen2.5-7B-Instruct \
--save_safetensors true \
--dataset AI-ModelScope/alpaca-gpt4-data-zh \
--tuner_type lora \
--output_dir output \
...
```
### 强化学习
ms-swift支持丰富GRPO族算法:
| 方法 | 全参数 | LoRA | 多模态 | 多机 |
| ------ | ------ | ---- | ----- | ----- |
| [GRPO](https://swift.readthedocs.io/zh-cn/latest/Instruction/GRPO/GetStarted/GRPO.html) | ✅ | ✅| ✅ | ✅ |
| [DAPO](https://swift.readthedocs.io/zh-cn/latest/Instruction/GRPO/AdvancedResearch/DAPO.html) | ✅ | ✅| ✅ | ✅ |
| [GSPO](https://swift.readthedocs.io/zh-cn/latest/Instruction/GRPO/AdvancedResearch/GSPO.html) | ✅ | ✅| ✅ | ✅ |
| [SAPO](https://swift.readthedocs.io/zh-cn/latest/Instruction/GRPO/AdvancedResearch/SAPO.html) | ✅ | ✅| ✅ | ✅ |
| [CISPO](https://swift.readthedocs.io/zh-cn/latest/Instruction/GRPO/AdvancedResearch/CISPO.html) | ✅ | ✅| ✅ | ✅ |
| [CHORD](https://swift.readthedocs.io/zh-cn/latest/Instruction/GRPO/AdvancedResearch/CHORD.html) | ✅ | ✅| ✅ | ✅ |
| [RLOO](https://swift.readthedocs.io/zh-cn/latest/Instruction/GRPO/AdvancedResearch/RLOO.html) | ✅ | ✅| ✅ | ✅ |
| [Reinforce++](https://swift.readthedocs.io/zh-cn/latest/Instruction/GRPO/AdvancedResearch/REINFORCEPP.html) | ✅ | ✅| ✅ | ✅ |
```shell
CUDA_VISIBLE_DEVICES=0,1,2,3 NPROC_PER_NODE=4 \
swift rlhf \
--rlhf_type grpo \
--model Qwen/Qwen2.5-7B-Instruct \
--tuner_type lora \
--use_vllm true \
--vllm_mode colocate \
--dataset AI-MO/NuminaMath-TIR#10000 \
--output_dir output \
...
```
### 推理
```shell
CUDA_VISIBLE_DEVICES=0 swift infer \
--model Qwen/Qwen2.5-7B-Instruct \
--stream true \
--infer_backend transformers \
--max_new_tokens 2048
# LoRA
CUDA_VISIBLE_DEVICES=0 swift infer \
--model Qwen/Qwen2.5-7B-Instruct \
--adapters swift/test_lora \
--stream true \
--infer_backend transformers \
--temperature 0 \
--max_new_tokens 2048
```
### 界面推理
```shell
CUDA_VISIBLE_DEVICES=0 swift app \
--model Qwen/Qwen2.5-7B-Instruct \
--stream true \
--infer_backend transformers \
--max_new_tokens 2048 \
--lang zh
```
### 部署
```shell
CUDA_VISIBLE_DEVICES=0 swift deploy \
--model Qwen/Qwen2.5-7B-Instruct \
--infer_backend vllm
```
### 采样
```shell
CUDA_VISIBLE_DEVICES=0 swift sample \
--model LLM-Research/Meta-Llama-3.1-8B-Instruct \
--sampler_engine transformers \
--num_return_sequences 5 \
--dataset AI-ModelScope/alpaca-gpt4-data-zh#5
```
### 评测
```shell
CUDA_VISIBLE_DEVICES=0 swift eval \
--model Qwen/Qwen2.5-7B-Instruct \
--infer_backend lmdeploy \
--eval_backend OpenCompass \
--eval_dataset ARC_c
```
### 量化
```shell
CUDA_VISIBLE_DEVICES=0 swift export \
--model Qwen/Qwen2.5-7B-Instruct \
--quant_bits 4 --quant_method awq \
--dataset AI-ModelScope/alpaca-gpt4-data-zh \
--output_dir Qwen2.5-7B-Instruct-AWQ
```
### 推送模型
```shell
swift export \
--model <model-path> \
--push_to_hub true \
--hub_model_id '<model-id>' \
--hub_token '<sdk-token>'
```
## 🏛 License
本框架使用[Apache License (Version 2.0)](https://github.com/modelscope/ms-swift/blob/main/LICENSE)进行许可。模型和数据集请查看原资源页面并遵守对应License。
## 📎 引用
```bibtex
@misc{zhao2024swiftascalablelightweightinfrastructure,
title={SWIFT:A Scalable lightWeight Infrastructure for Fine-Tuning},
author={Yuze Zhao and Jintao Huang and Jinghan Hu and Xingjun Wang and Yunlin Mao and Daoze Zhang and Zeyinzi Jiang and Zhikai Wu and Baole Ai and Ang Wang and Wenmeng Zhou and Yingda Chen},
year={2024},
eprint={2408.05517},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2408.05517},
}
```
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=modelscope/ms-swift&type=Date)](https://star-history.com/#modelscope/ms-swift&Date)
================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
================================================
FILE: docs/README.md
================================================
## maintain docs
1. build docs
```shell
# in root directory:
make docs
```
2. doc string format
We adopt the Google-style docstring format as the standard; please refer to the following documents.
1. Google Python style guide docstring [link](https://google.github.io/styleguide/pyguide.html#381-docstrings)
2. Google docstring example [link](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html)
3. Sample: torch.nn.modules.conv [link](https://pytorch.org/docs/stable/_modules/torch/nn/modules/conv.html#Conv1d)
4. load function as an example:
```python
def load(file, file_format=None, **kwargs):
"""Load data from json/yaml/pickle files.
This method provides a unified api for loading data from serialized files.
Args:
file (str or :obj:`Path` or file-like object): Filename or a file-like
object.
file_format (str, optional): If not specified, the file format will be
inferred from the file extension, otherwise use the specified one.
Currently supported formats include "json", "yaml/yml".
Examples:
>>> load('/path/of/your/file') # file is stored in disk
>>> load('https://path/of/your/file') # file is stored on internet
>>> load('oss://path/of/your/file') # file is stored in petrel
Returns:
The content from the file.
"""
```
================================================
FILE: docs/make.bat
================================================
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd
================================================
FILE: docs/source/.readthedocs.yaml
================================================
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# Required
version: 2
# Set the OS, Python version and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.10"
# Build documentation in the "docs/" directory with Sphinx
sphinx:
configuration: docs/source/conf.py
# Optionally build your docs in additional formats such as PDF and ePub
# formats:
# - pdf
# - epub
# Optional but recommended, declare the Python requirements required
# to build your documentation
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
install:
- requirements: requirements/docs.txt
- requirements: requirements/framework.txt
================================================
FILE: docs/source/BestPractices/Elastic.md
================================================
# Elastic
## 安装依赖
集群部署K8S,并在集群中部署DLrover,[DLRover](https://github.com/intelligent-machine-learning/dlrover),
`pip install dlrover && pip install tornado && pip install kubernetes && pip install ms-swift`
经过反复测试验证的训练镜像中的其它依赖以及版本:
deepspeed 0.16.5(需参考https://github.com/deepspeedai/DeepSpeed/pull/7585/files 修复universal checkpoint 相关问题)
pytorch 2.6.0
## 如何启动
通过在`--callbacks`中添加`deepspeed_elastic`(可选`graceful_exit`)启用弹性训练,并配置DeepSpeed弹性参数。
命令组成=dlrover-run +dlrover 命令参数+swift 启动命令 +swift参数,dlrover-run除自定义的参数外,其他参数与torchrun一致;
dlrover-run 参数如下:
```
usage: dlrover-run [-h] [--nnodes NNODES] [--nproc-per-node NPROC_PER_NODE]
[--rdzv-backend RDZV_BACKEND] [--rdzv-endpoint RDZV_ENDPOINT] [--rdzv-id RDZV_ID]
[--rdzv-conf RDZV_CONF] [--standalone] [--max-restarts MAX_RESTARTS]
[--monitor-interval MONITOR_INTERVAL] [--start-method {spawn,fork,forkserver}]
[--role ROLE] [-m] [--no-python] [--run-path] [--log-dir LOG_DIR] [-r REDIRECTS]
[-t TEE] [--local-ranks-filter LOCAL_RANKS_FILTER] [--node-rank NODE_RANK]
[--master-addr MASTER_ADDR] [--master-port MASTER_PORT] [--local-addr LOCAL_ADDR]
[--logs-specs LOGS_SPECS] [--precheck {0,1,2}] [--node_unit NODE_UNIT]
[--auto_config] [--auto_tunning] [--exclude-straggler] [--save_at_breakpoint]
[--accelerator {nvidia.com/gpu,ascend-npu}] [--training_port TRAINING_PORT]
[--switchbox-check] [--box-pairs PAIR [PAIR ...]] [--min-bandwidth MIN_BANDWIDTH]
[--min-channels MIN_CHANNELS] [--numa-affinity] [--network-check]
[--comm-perf-test] [--ucp_device_type UCP_DEVICE_TYPE]
training_script
```
在弹性训练中我们需要关注的参数为:
--nnodes NNODES Number of nodes, or the range of nodes in form
`<minimum_nodes>:<maximum_nodes>`.
--nproc-per-node NPROC_PER_NODE Number of processes per node.
示例:
```bash
model=your model path
dataset=your dataset
output= your output dir
export CUDA_VISIBLE_DEVICES=0 根据实际使用的GPU情况设置
deepspeed_config_or_type=deepspeed类型或者配置文件的路径,如 zero1 或者/xxx/ms-swift/swift/llm/ds_config/zero1.json
# Launch elastic LoRA SFT through dlrover-run with the deepspeed_elastic and graceful_exit callbacks.
# The final argument carries no trailing backslash, so the command ends here when pasted into a shell.
dlrover-run --nnodes 1:$NODE_NUM --nproc_per_node=1 \
/opt/conda/lib/python3.10/site-packages/swift/cli/sft.py --model $model \
--model_type qwen3 \
--tuner_type lora \
--torch_dtype bfloat16 \
--dataset $dataset \
--num_train_epochs 4 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 5e-7 \
--gradient_accumulation_steps 8 \
--eval_steps 500 \
--save_steps 10 \
--save_total_limit 20 \
--logging_steps 1 \
--output_dir $output \
--warmup_ratio 0.01 \
--dataloader_num_workers 4 \
--temperature 1.0 \
--system 'You are a helpful assistant.' \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--dataset_num_proc 1 \
--use_flash_ckpt true \
--callbacks deepspeed_elastic graceful_exit \
--deepspeed $deepspeed_config_or_type
```
## 配置文件示例
默认情况下的zero1为以下示例配置,
```json
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"zero_optimization": {
"stage": 1,
"offload_optimizer": {
"device": "none",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"overlap_comm": false,
"reduce_scatter": true,
"reduce_bucket_size": 2e8,
"contiguous_gradients": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false,
"elasticity": {
"ignore_non_elastic_batch_info": true,
"enabled": true,
"max_train_batch_size": 8,
"micro_batch_sizes": [
4,
2
],
"min_gpus": 1,
"max_gpus": 4,
"min_time": 20,
"version": 0.1
}
}
```
如果用户需要自定义,可以在启动命令中deepspeed_config_or_type指定自定义的zero1.json的存放路径,其中弹性相关的配置为:
```json
...
"elasticity": {
"ignore_non_elastic_batch_info": true,
"enabled": true,
"max_train_batch_size": 8,
"micro_batch_sizes": [
4,
2
],
"min_gpus": 1,
"max_gpus": 4,
"min_time": 20,
"version": 0.1
}
```
- ignore_non_elastic_batch_info:代表在elasticity里的配置会忽略外层的batch_size相关的配置,训练过程中会根据实际的训练进程个数实时修改batch_size等相关的参数
计算原则为:
global-training-batch-size = micro-batch-size * gradient-accumulation-steps * world-size
- max_train_batch_size:最大batch_size数
- micro_batch_sizes:elasticity下允许的每卡micro-batch size列表,相当于train_micro_batch_size_per_gpu的候选值
- min_gpus:最小gpu数目
- max_gpus:最大gpu数目
更详细的内容见:[Deepspeed](https://www.deepspeed.ai/docs/config-json/#elastic-training-config-v01-and-v02)
## 启动训练
```yaml
---
apiVersion: elastic.iml.github.io/v1alpha1
kind: ElasticJob
metadata:
name: deepspeed-elastic-swift
namespace: dlrover
spec:
distributionStrategy: AllreduceStrategy
optimizeMode: single-job
replicaSpecs:
worker:
replicas: 1 #【这里需要与启动命令中的--nnodes NNODES的最大值一致】
template:
spec:
restartPolicy: Never
containers:
- name: main
image: #【训练镜像,需要安装deepspeed,dlrover 和swift 】
imagePullPolicy: IfNotPresent
command:
- /bin/bash
- -c
- sh start.sh # 启动脚本
resources:
limits:
cpu: '8'
memory: 16Gi
nvidia.com/gpu: '1'
volumeMounts:
- mountPath: /model
name: volume-model
- mountPath: /dev/shm
name: volume-shm
restartPolicy: Never
volumes:
- hostPath:
path: /model
type: Directory
name: volume-model
- emptyDir:
medium: Memory
sizeLimit: 200Gi
name: volume-shm
```
================================================
FILE: docs/source/BestPractices/Embedding.md
================================================
# Embedding训练
SWIFT已经支持Embedding模型的训练,包括纯文本和多模态两个类型。目前已经支持的模型有:
1. modernbert embedding模型
- [ModelScope](https://modelscope.cn/models/iic/gte-modernbert-base) [Hugging Face](https://huggingface.co/Alibaba-NLP/gte-modernbert-base)
2. gte embedding模型
- 1.5B: [ModelScope](https://www.modelscope.cn/models/iic/gte_Qwen2-1.5B-instruct) [Hugging Face](https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct)
- 7B: [ModelScope](https://www.modelscope.cn/models/iic/gte_Qwen2-7B-instruct) [Hugging Face](https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct)
3. gme embedding模型
- 2B: [ModelScope](https://www.modelscope.cn/models/iic/gme-Qwen2-VL-2B-Instruct) [Hugging Face](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct)
- 7B: [ModelScope](https://www.modelscope.cn/models/iic/gme-Qwen2-VL-7B-Instruct) [Hugging Face](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct)
4. qwen3-embedding模型
- 0.6B: [ModelScope](https://www.modelscope.cn/models/Qwen/Qwen3-Embedding-0.6B) [Hugging Face](https://huggingface.co/Qwen/Qwen3-Embedding-0.6B)
- 4B: [ModelScope](https://www.modelscope.cn/models/Qwen/Qwen3-Embedding-4B) [Hugging Face](https://huggingface.co/Qwen/Qwen3-Embedding-4B)
- 8B: [ModelScope](https://www.modelscope.cn/models/Qwen/Qwen3-Embedding-8B) [Hugging Face](https://huggingface.co/Qwen/Qwen3-Embedding-8B)
5. qwen3-vl-embedding模型
- 2B: [ModelScope](https://www.modelscope.cn/models/Qwen/Qwen3-VL-Embedding-2B) [Hugging Face](https://huggingface.co/Qwen/Qwen3-VL-Embedding-2B)
- 8B: [ModelScope](https://www.modelscope.cn/models/Qwen/Qwen3-VL-Embedding-8B) [Hugging Face](https://huggingface.co/Qwen/Qwen3-VL-Embedding-8B)
开发者可以自行集成自己的模型,模型forward输出值需要满足:
```text
{"last_hidden_state": some-embedding-tensor}
```
返回值是一个json,具有`last_hidden_state` key,value是embedding tensor即可,输入部分可以使用我们已经支持的template。用户也可以通过指定
```shell
--task_type embedding
```
参数来将任意一个其他模型转换为embedding模型进行训练。
需要注意的是,SWIFT目前支持的embedding模型均为纯文本或多模态LLM,目前并不支持CLIP类型的模型训练。
此外,SWIFT支持的所有embedding模型在模型forward最后都增加了normalize,如自行增加新模型请注意增加normalize层。
## loss
目前SWIFT支持的Embedding模型可以使用的loss有:
- cosine_similarity: cosine相似度loss,计算两个embedding的相似度,并根据label的值拟合,实际为MSE loss
- contrastive: 可调margin的对比学习loss,label仅支持0和1两个值
- online_contrastive: 考虑hard negative和hard positive部分的contrastive loss,label仅支持0和1两个值
- infonce: 在同一个batch中不同row两两计算cosine相似度,并使row内部相似度最大,不同row相似度最小,不需要label
loss的源代码可以在[这里](https://github.com/modelscope/ms-swift/blob/main/swift/loss/mapping.py)找到。
## 数据集格式
> 注:
> 1. `<image>`标签可以出现在`messages`/`positive_messages`/`negative_messages`的任意位置;它们各自拥有独立的`images`/`positive_images`/`negative_images`字段用于提供图片路径或URL。
> 2. 不再需要跨字段的“对应顺序”。对齐规则为:`images`的长度等于`messages`中`<image>`标签的数量;`positive_images`与`negative_images`均为“list of list”,其外层长度分别等于`positive_messages`与`negative_messages`的长度;并且外层每一项的内层列表长度等于该条消息序列中`<image>`标签的数量。
> 3. `messages`代表anchor样本(anchor sample);`positive_messages`/`negative_messages`为“list of messages”(因此多一层`[]`);相应地,`positive_images`/`negative_images`也多一层`[]`并与之逐项对齐。
> 4. 也支持`<video>`, `<audio>`标签;可按相同规则分别通过`videos`/`positive_videos`/`negative_videos`与`audios`/`positive_audios`/`negative_audios`提供对应模态数据。
> 5. 当前约束:`positive_messages`的外层长度必须为1(即仅提供一个positive样本);对应地,`positive_images`的外层长度也必须为1。
### cosine_similarity loss对应的格式
```json lines
# LLM
{"messages": [{"role": "user", "content": "sentence1"}], "positive_messages": [[{"role": "user", "content": "sentence2"}]], "label": 0.8}
# MLLM
{"messages": [{"role": "user", "content": "<image>"}], "images": ["/some/images1.jpg"],"positive_messages": [[{"role": "user", "content": "<image>sentence"}]], "positive_images": [["/some/images2.jpg"]], "label": 0.7}
{"messages": [{"role": "user", "content": "sentence1"}], "positive_messages": [[{"role": "user", "content": "<image>sentence2"}]], "positive_images": [["/some/images.jpg"]], "label": 0.7}
```
### contrastive/online_contrastive loss对应的格式
```json lines
# LLM
{"messages": [{"role": "user", "content": "sentence1"}], "positive_messages": [[{"role": "user", "content": "sentence2"}]], "label": 1}
# MLLM
{"messages": [{"role": "user", "content": "<image>"}], "images": ["/some/images1.jpg"], "positive_messages": [[{"role": "user", "content": "<image>sentence"}]], "positive_images": [["/some/images2.jpg"]], "label": 1}
{"messages": [{"role": "user", "content": "sentence1"}], "positive_messages": [[{"role": "user", "content": "<image>sentence2"}]], "positive_images": [["/some/images.jpg"]], "label": 0}
```
评测的指标分别是两个embedding的欧式距离、点积等的pearson系数以及spearman系数,共八个指标。
### infonce 格式
```json lines
# LLM
{"messages": [{"role": "user", "content": "sentence1"}], "positive_messages": [[{"role": "user", "content": "sentence2"}]]}
# MLLM
{"messages": [{"role": "user", "content": "<image>"}], "images": ["/some/images.jpg"], "positive_messages": [[{"role": "user", "content": "sentence"}]]}
{"messages": [{"role": "user", "content": "<image>sentence1"}], "images": ["/some/images.jpg"], "positive_messages": [[{"role": "user", "content": "<image>sentence2"}]], "positive_images": [["/some/positive_images.jpg"]], "negative_messages": [[{"role": "user", "content": "<image><image>sentence3"}], [{"role": "user", "content": "<image>sentence4"}]], "negative_images": [["/some/negative_images1.jpg", "/some/negative_images2.jpg"], ["/some/negative_images3.jpg"]]}
```
infonce loss支持几个环境变量:
1. `INFONCE_TEMPERATURE`: temperature参数,不设置的话默认值是0.1
2. `INFONCE_USE_BATCH`: 使用sample内部的`negative_messages`(hard negative样例)还是使用一个batch内其他样本作为in-batch negatives;默认为True,表示使用batch内部的样本作为负例
3. `INFONCE_HARD_NEGATIVES`: hard negatives的数量;如果不设置会使用数据中提供的所有`negative_messages`。由于长度未必一致,因此会采用for循环计算loss(计算会慢)。若设置为某个数值,则不足会随机采样补齐,超长会选用前`INFONCE_HARD_NEGATIVES`个
4. `INFONCE_MASK_FAKE_NEGATIVE`: mask掉假negative。默认为False,开启时会判断 `positive_similarity + INFONCE_FAKE_NEG_MARGIN`,比该阈值大的样本相似度会被设置为 `-inf`,以防止正样本泄露问题
5. `INFONCE_FAKE_NEG_MARGIN`:假负样本屏蔽的边际,默认 `0.1`。
6. `INFONCE_INCLUDE_QQ`:是否在分母中加入 q–q 分量(query 间相似度)作为负例,默认 `False`。
7. `INFONCE_INCLUDE_DD`:是否在分母中加入 d–d 分量(正样本文档与 batch 内所有文档的相似度)作为负例,默认 `False`。
> 也可以在数据集中将hard negatives数量设置为数量相等,这样即使不设置也不会使用for循环方式,加快计算速度
> `negative_messages`也可以不提供。在这种情况下,保持`INFONCE_USE_BATCH=True`,会使用一个batch内部的其他样本作为负例
infonce loss的评测会有下面几个指标:
- mean_neg 所有hard_negative的平均值
- mean_pos 所有positive的平均值
- margin positive-max_hard_negative的平均值
## 训练
SWIFT提供的脚手架训练脚本:
- [Qwen3-Embedding/Qwen3-VL-Embedding模型](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding/qwen3)
- [GME模型](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding/train_gme.sh)
## 推理
SWIFT已经支持GME、GTE、Qwen3-Embedding模型的部署,请查看[这里](https://github.com/modelscope/ms-swift/blob/main/examples/deploy/embedding/client.py)。
- 推理脚本参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_embedding.py)。
也可以使用原模型的代码进行推理:
https://www.modelscope.cn/models/iic/gte_Qwen2-7B-instruct
https://www.modelscope.cn/models/iic/gme-Qwen2-VL-7B-Instruct
如果使用了其他模型从0训练embedding(例如,原版`qwen2-vl`模型+`--task_type embedding`),也可以使用gme的推理代码,但请注意:
https://www.modelscope.cn/models/iic/gme-Qwen2-VL-7B-Instruct/file/view/master/gme_inference.py?status=1#L111
这里的模板请修改为模型自身的template,以免最后的embedding对不上。需要额外注意的是,gme模型的template和`qwen2-vl`或`qwen2.5-vl`系列的chatml template并不相同,其推理代码最后的结束字符是`<|endoftext|>`而非`<|im_end|>`.
## 高级功能
- Qwen3-Embedding 自定义 Instruction:
- 默认无 Instruction,输入模板为:`{Query}<|endoftext|>`。
- 通过在 system message 中添加 Instruction,可将输入改为:`{Instruction} {Query}<|endoftext|>`。
- 示例:
```json lines
{"messages": [
{"role": "system", "content": "请用中文回答,并输出简洁要点"},
{"role": "user", "content": "介绍一下Qwen3-Embedding"}
]}
```
> 说明:Qwen3-Embedding 模板会将 system 内容前置拼接到首条 user 消息中,并使用 `<|endoftext|>` 作为结束标记。
### 转换前后示例
- 不加 Instruction:
输入数据(messages):
```json lines
{"messages": [
{"role": "user", "content": "北京明天天气如何?"}
]}
```
模板转换后(送入模型的实际文本):
```text
北京明天天气如何?<|endoftext|>
```
- 加 Instruction:
输入数据(messages,包含system):
```json lines
{"messages": [
{"role": "system", "content": "请使用中文、精炼输出要点"},
{"role": "user", "content": "北京明天天气如何?"}
]}
```
模板转换后(送入模型的实际文本):
```text
请使用中文、精炼输出要点 北京明天天气如何?<|endoftext|>
```
- positive/negative 同理:
若在某个 positive/negative 的消息序列中提供 system,则会将该 system 内容前置到该序列首条 user 内容之前;未提供 system 则不前置。
输入数据(包含一个 positive 带 system,和一个 negative 无 system):
```json lines
{
"messages": [
{"role": "user", "content": "Anchor"}
],
"positive_messages": [[
{"role": "system", "content": "指令"},
{"role": "user", "content": "Positive"}
]],
"negative_messages": [[
{"role": "user", "content": "Negative"}
]]
}
```
模板转换后(送入模型的实际文本):
```text
Anchor<|endoftext|>
指令 Positive<|endoftext|>
Negative<|endoftext|>
```
================================================
FILE: docs/source/BestPractices/GRPO-Code-Training.md
================================================
# GRPO代码训练
本文档介绍如何使用GRPO对模型进行代码训练
模型:[Qwen/Qwen2.5-7B-Instruct](https://www.modelscope.cn/models/Qwen/Qwen2.5-7B-Instruct)
数据集:[open-r1/verifiable-coding-problems-python-10k](https://www.modelscope.cn/datasets/open-r1/verifiable-coding-problems-python-10k/dataPeview)
数据集样例
```json
{
"problem": "Solve the following coding problem using the programming language python: Polycarp has $n$ different binary words. A word called binary if it contains only characters '0' and '1'. For example, these words are binary: \"0001\", \"11\", \"0\" and \"0011100\". Polycarp wants to offer his set of $n$ binary words to play a game \"words\". In this game, players name words and each next word (starting from the second) must start with the last character of the previous word. The first word can be any. For example, these sequence of words can be named during the game: \"0101\", \"1\", \"10\", \"00\", \"00001\". Word reversal is the operation of reversing the order of the characters. For example, the word \"0111\" after the reversal becomes \"1110\", the word \"11010\" after the reversal becomes \"01011\". Probably, Polycarp has such a set of words that there is no way to put them in the order correspondent to the game rules. In this situation, he wants to reverse some words from his set so that: the final set of $n$ words still contains different words (i.e. all words are unique); there is a way to put all words of the final set of words in the order so that the final sequence of $n$ words is consistent with the game rules. Polycarp wants to reverse minimal number of words. Please, help him. -----Input----- The first line of the input contains one integer $t$ ($1 \\le t \\le 10^4$) — the number of test cases in the input. Then $t$ test cases follow. The first line of a test case contains one integer $n$ ($1 \\le n \\le 2\\cdot10^5$) — the number of words in the Polycarp's set. Next $n$ lines contain these words. All of $n$ words aren't empty and contains only characters '0' and '1'. The sum of word lengths doesn't exceed $4\\cdot10^6$. All words are different. Guaranteed, that the sum of $n$ for all test cases in the input doesn't exceed $2\\cdot10^5$. Also, guaranteed that the sum of word lengths for all test cases in the input doesn't exceed $4\\cdot10^6$. 
-----Output----- Print answer for all of $t$ test cases in the order they appear. If there is no answer for the test case, print -1. Otherwise, the first line of the output should contain $k$ ($0 \\le k \\le n$) — the minimal number of words in the set which should be reversed. The second line of the output should contain $k$ distinct integers — the indexes of the words in the set which should be reversed. Words are numerated from $1$ to $n$ in the order they appear. If $k=0$ you can skip this line (or you can print an empty line). If there are many answers you can print any of them. -----Example----- Input 4 4 0001 1000 0011 0111 3 010 101 0 2 00000 00001 4 01 001 0001 00001 Output 1 3 -1 0 2 1 2 The input will be stdin and you should print your solution to stdout Now solve the problem and return the code.",
"verification_info": {
"language": "python",
"test_cases": [
{
"input": "4\n4\n0001\n1000\n0011\n0111\n3\n010\n101\n0\n2\n00000\n00001\n4\n01\n001\n0001\n00001\n",
"output": "1\n3 \n-1\n0\n\n2\n1 2 \n",
"type": "stdin_stdout"
}
]
}
}
```
`verification_info` 提供了程序语言以及测试用例,其中包含输入和预期的输出。
## 奖励函数
使用`code_reward`和`code_format`奖励进行训练,实现细节见[代码](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/plugin/plugin.py)
- `code_reward`通过[e2b](https://e2b.dev/)或[judge0](https://judge0.com/)执行生成的代码,根据数据集中的测试用例对代码进行验证给出奖励值。
- `code_format`要求模型输出包含代码块的格式化回答。
注:当前通过e2b执行代码仅支持python语言,如需执行其他语言,可以使用judge0执行([judge0支持语言列表](https://github.com/judge0/judge0?tab=readme-ov-file#supported-languages))。
## 训练脚本
### e2b
- 在[e2b](https://e2b.dev/dashboard)注册获取E2B_API_KEY,并设置为环境变量。
- `--reward_funcs`添加`external_code_reward`作为奖励函数。
- `--external_plugins`设置为plugin.py的路径。
首先拉起 vLLM server
```bash
CUDA_VISIBLE_DEVICES=7 \
swift rollout \
--model Qwen/Qwen2.5-7B-Instruct \
--vllm_enable_lora true \
--vllm_max_lora_rank 16
```
```bash
E2B_API_KEY=xxx \
WANDB_API_KEY=xxx \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6 \
NPROC_PER_NODE=7 \
swift rlhf \
--rlhf_type grpo \
--model Qwen/Qwen2.5-7B-Instruct \
--external_plugins examples/train/grpo/plugin/plugin.py \
--reward_funcs external_code_reward external_code_format \
--reward_weights 1.0 0.1 \
--vllm_mode server \
--use_vllm true \
--vllm_server_host 127.0.0.1 \
--vllm_server_port 8000 \
--tuner_type lora \
--lora_rank 16 \
--lora_alpha 32 \
--torch_dtype bfloat16 \
--dataset 'open-r1/verifiable-coding-problems-python-10k' \
--load_from_cache_file true \
--max_completion_length 2048 \
--num_train_epochs 1 \
--per_device_train_batch_size 2 \
--per_device_eval_batch_size 2 \
--learning_rate 1e-6 \
--gradient_accumulation_steps 1 \
--eval_steps 200 \
--save_steps 200 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 4096 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--dataset_num_proc 4 \
--num_generations 14 \
--temperature 0.9 \
--system 'examples/train/grpo/prompt.txt' \
--deepspeed zero2 \
--log_completions true \
--report_to wandb
```
### judge0
- 设置环境变量:
- (必需)JUDGE0_ENDPOINT: judge0访问地址。
- (可选)JUDGE0_X_AUTH_TOKEN: judge0访问Token。
- `--reward_funcs`添加`external_code_reward_by_judge0`作为奖励函数。
- `--external_plugins`设置为plugin.py的路径。
```bash
JUDGE0_ENDPOINT=xxx \
JUDGE0_X_AUTH_TOKEN=xxx \
WANDB_API_KEY=xxx \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6 \
NPROC_PER_NODE=7 \
swift rlhf \
--rlhf_type grpo \
--model Qwen/Qwen2.5-7B-Instruct \
--external_plugins examples/train/grpo/plugin/plugin.py \
--reward_funcs external_code_reward_by_judge0 external_code_format \
--reward_weights 1.0 0.1 \
--vllm_mode server \
--use_vllm true \
--vllm_server_host 127.0.0.1 \
--vllm_server_port 8000 \
--tuner_type lora \
--torch_dtype bfloat16 \
--dataset 'open-r1/verifiable-coding-problems-python-10k' \
--load_from_cache_file true \
--max_completion_length 2048 \
--num_train_epochs 1 \
--per_device_train_batch_size 2 \
--per_device_eval_batch_size 2 \
--learning_rate 1e-6 \
--gradient_accumulation_steps 1 \
--eval_steps 200 \
--save_steps 200 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 4096 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--dataset_num_proc 4 \
--num_generations 14 \
--temperature 0.9 \
--system 'examples/train/grpo/prompt.txt' \
--deepspeed zero2 \
--log_completions true \
--report_to wandb
```
训练奖励曲线图

================================================
FILE: docs/source/BestPractices/GRPO-Multi-Modal-Training.md
================================================
# 多模态GRPO完整实验流程
本文介绍如何使用SWIFT GRPO进行多模态模型和任务的训练。目标是对多个多模态任务进行训练,提升任务精度,任务定义和训练参数等参考了 [R1-V](https://github.com/Deep-Agent/R1-V.git) 和 [open-r1-multimodal](https://github.com/EvolvingLMMs-Lab/open-r1-multimodal.git)
## ClevrCount 任务
### 任务与数据集定义
本任务从clevr_cogen_a_train数据集出发,模型的目标是输出图像中包含的物体数量,因此,我们定义数据集如下:
```python
class ClevrPreprocessor(ResponsePreprocessor):
    """Dataset preprocessor for the CLEVR counting task.

    Appends an output-format instruction to every query, asking for the reasoning
    inside ``<think> </think>`` and the final number inside ``<answer> </answer>``.
    """

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        """Rewrite ``row['query']`` with the format instruction, then defer to the parent.

        Args:
            row: One dataset row; a missing ``query`` key is treated as an empty string.

        Returns:
            Whatever the parent ``preprocess`` returns for the updated row.
        """
        query = row.get('query', '')
        # The literal <think>/<answer> tag names must be present in the prompt; without
        # them the instruction does not tell the model which tags to emit.
        query = f"""{query} Output the thinking process in <think> </think> and
 final answer (number) in <answer> </answer> tags."""
        row.update({'query': query})
        return super().preprocess(row)
register_dataset(
DatasetMeta(
ms_dataset_id='AI-ModelScope/clevr_cogen_a_train',
subsets=[
SubsetDataset(
name='default',
subset='default',
split=['train'],
),
],
preprocess_func=ClevrPreprocessor(),
tags=['qa', 'math']))
```
这里重新定义dataset preprocessor的目的是修改query。数据集示例样本如下,包含messages,images和solution字段,solution会送入后续的奖励函数中,而messages和images则会作为模型输入。
- 注意:`{'role': 'assistant', 'content': '<answer> 3 </answer>'}`将会在GRPOTrainer中被移除,可以忽略。'solution'字段将会透传入ORM中。在自定义数据集时,'images'字段组织成`["image_path1", "image_path2"]`即可。
```json
{
"images": ["image_path1", "image_path2"],
"messages": [
{
"role": "user",
"content": "How many items are there in the image? Output the thinking process in <think> </think> and \n final answer (number) in <answer> </answer> tags."
}
],
"solution": "<answer> 3 </answer>"
}
```
## 奖励函数定义:
本任务使用的奖励函数有两个,一个是 Deepseek-R1 中提到的格式奖励函数,另一是 ClevrCount 的准确性奖励函数。前者已经在swift中内置,通过 `--reward_funcs format` 可以直接使用,而后者需要我们自己定义,在这里我们使用 external_plugin 的方式定义准确性奖励函数,将代码放在`swift/examples/train/grpo/plugin/plugin.py`中。
在这里,奖励函数的输入包括completions和solution两个字段,分别表示模型生成的文本和真值。每个都是list,支持多个completion同时计算。注意,在这里,solution字段是数据集中定义的字段透传而来,如果有任务上的变动,可以分别对数据集和奖励函数做对应的改变即可。
```python
class MultiModalAccuracyORM(ORM):
    """Accuracy reward: 1.0 when a completion's answer matches the ground truth, otherwise 0.0."""

    def __call__(self, completions, solution, **kwargs) -> List[float]:
        """
        Reward function that checks if the completion is correct.

        Symbolic verification with ``math_verify`` is tried first; if it does not
        succeed, the text inside ``<answer>...</answer>`` (or the whole string when
        the tags are absent) is compared by exact string match.

        Args:
            completions (list[str]): Generated outputs
            solution (list[str]): Ground Truths.

        Returns:
            list[float]: Reward scores
        """
        rewards = []
        from math_verify import parse, verify
        for content, sol in zip(completions, solution):
            reward = 0.0
            # Try symbolic verification first
            try:
                answer = parse(content)
                if float(verify(answer, parse(sol))) > 0:
                    reward = 1.0
            except Exception:
                pass  # Continue to next verification method if this fails

            # If symbolic verification failed, try string matching
            if reward == 0.0:
                try:
                    # Extract answer from solution if it has think/answer tags
                    sol_match = re.search(r'<answer>(.*?)</answer>', sol)
                    ground_truth = sol_match.group(1).strip() if sol_match else sol.strip()

                    # Extract answer from content if it has think/answer tags
                    content_match = re.search(r'<answer>(.*?)</answer>', content)
                    student_answer = content_match.group(1).strip() if content_match else content.strip()

                    # Compare the extracted answers
                    if student_answer == ground_truth:
                        reward = 1.0
                except Exception:
                    pass  # Keep reward as 0.0 if both methods fail
            rewards.append(reward)
        return rewards
orms['external_r1v_acc'] = MultiModalAccuracyORM
```
### GRPO训练实验记录
#### 训练参数:
我们选取 Qwen2.5-VL-3B-Instruct 作为基础模型进行训练,选取 Instruct 而不是基模的主要原因是可以更快地获取 format reward。我们在八卡 GPU 上进行实验。如果遇到vllm部署qwen2.5-vl报错,可以参考[issue](https://github.com/vllm-project/vllm/issues/13285)
由于任务简单,我们设置max_completion_length为1024,奖励函数选择external_r1v_acc和format,学习率和beta分别设置为1e-6和0.001。其他设置如下所示,batch_size和num_generations的设置原则可以参考[GRPO完整流程](./GRPO.md)。
首先拉起 external vLLM server
```bash
CUDA_VISIBLE_DEVICES=6,7 \
swift rollout \
--model Qwen/Qwen2.5-VL-3B-Instruct \
--vllm_data_parallel_size 2
```
```shell
# GRPO training for the ClevrCount task on 6 GPUs, using the vLLM rollout server at 127.0.0.1:8000.
# The final argument carries no trailing backslash, so the command ends here when pasted into a shell.
WANDB_API_KEY=your_wandb_api_key \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 \
NPROC_PER_NODE=6 \
swift rlhf \
--rlhf_type grpo \
--model Qwen/Qwen2.5-VL-3B-Instruct \
--external_plugins examples/train/grpo/plugin/plugin.py \
--reward_funcs external_r1v_acc format \
--use_vllm true \
--vllm_mode server \
--vllm_server_host 127.0.0.1 \
--vllm_server_port 8000 \
--tuner_type full \
--torch_dtype bfloat16 \
--dataset 'AI-ModelScope/clevr_cogen_a_train' \
--load_from_cache_file true \
--max_completion_length 1024 \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--learning_rate 1e-6 \
--gradient_accumulation_steps 2 \
--save_strategy 'steps' \
--eval_strategy 'steps' \
--eval_steps 1000 \
--save_steps 1000 \
--save_total_limit 10 \
--logging_steps 1 \
--output_dir output/GRPO_CLEVR_COUNTDOWN \
--warmup_ratio 0.01 \
--dataloader_num_workers 4 \
--num_generations 24 \
--temperature 1.0 \
--system 'examples/train/grpo/prompt.txt' \
--deepspeed zero3 \
--log_completions true \
--report_to wandb \
--num_iterations 1 \
--async_generate false \
--beta 0.001
#### 实验现象

由于数据集和任务比较简单,模型训练了500个step已经基本收敛,有以下观察。
1. 自定义的ClevrORM在不断增加,证明模型学习到了如何完成这一任务,最终任务成功率从初始的0.4攀升到1左右。
2. 由于是Instruct模型,Format Reward基本稳定在1,这可能也与数据集所有样本都是一样的query有一定关系。
3. reward_std 稳定到0.1以下。
4. completion length最终稳定在60-80,模型学习到的固定输出范式是逐个物体列举数数。
step 1:
```
" The image shows 5 distinct items: a cyan metallic cylinder, a red metallic cylinder, a small purple metallic cylinder, a large gray metallic sphere, and a small yellow metallic cube. To count the number of items, I add the counts of each type of object: 1 cyan cylinder, 1 red cylinder, 1 purple cylinder, 1 gray sphere, and 1 yellow cube. Therefore, the total number of items is 5. \n5 "
"Let's count the number of items in the image:\n1. A large cyan metallic cylinder.\n2. A small purple metallic cylinder.\n3. A large red metallic cylinder.\n4. A large gray metallic sphere.\n5. A small yellow metallic cube.\n\n5 "
" The image shows four distinct objects: a large cyan metallic cylinder, a smaller purple metallic cylinder, a large red metallic cylinder, and a small yellow metallic cube. Counting each of these items individually, there are five items in total. \n5 "
```
step 400:
```
" Counting the number of items in the image:\n1. Green matte sphere\n2. Large metallic yellow sphere\n3. Small metallic brown cube\n\nThere are three distinct objects in total.\n \n3 "
" Counting the objects in the image:\n1. Large yellow metallic sphere\n2. Small green rubber sphere\n3. Small brown metallic cube\n\nThere are three distinct items in the image.\n \n3 "
```
总体来讲,这一任务比较简单,reward的收敛也比较典型。
## Geometric QA任务
### 任务与数据集定义
本任务为Geometric QA任务,任务描述为:给定一个几何图形,回答有关几何图形的数学问题。原始数据来自于[论文](https://arxiv.org/pdf/2312.11370),[R1-V](https://github.com/Deep-Agent/R1-V.git)对数据进行了预处理,将所有数据全部处理成了problem-solution的格式,而图像则保留在image字段中,因此,我们不需要额外定义数据集,直接使用`--dataset AI-ModelScope/GEOQA_R1V_Train_8K`即可。
### 奖励函数
由于也是数学题,同时,答案也处理成了最终结果,因此,我们直接使用以上定义过的`MultiModalAccuracyORM`奖励函数。
### GRPO训练实验记录
#### 训练参数:
选取的模型和大部分超参数与上一个实验相似,主要有两点不同:
1. SWIFT 已支持`--num_iterations`参数,单次rollout可以进行多次更新,这里设置为2。
2. 在实验时发现,在数学问题中,训练可能会出现不稳定现象,导致模型训崩,具体表现为所有reward迅速降低,loss、grad_norm和kl都迅速增大,后续也难以恢复正常状态。因此,这里设置`--max_grad_norm 0.5`,保证稳定训练,当然,这种现象的出现也有一定的随机性。
```shell
# GRPO training for the Geometric QA task on 6 GPUs, using the vLLM rollout server at 127.0.0.1:8000.
# The final argument carries no trailing backslash, so the command ends here when pasted into a shell.
WANDB_API_KEY=your_wandb_api_key \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 \
MAX_PIXELS=401408 \
NPROC_PER_NODE=6 \
swift rlhf \
--rlhf_type grpo \
--model Qwen/Qwen2.5-VL-3B-Instruct \
--external_plugins examples/train/grpo/plugin/plugin.py \
--reward_funcs external_r1v_acc format \
--use_vllm true \
--vllm_mode server \
--vllm_server_host 127.0.0.1 \
--vllm_server_port 8000 \
--tuner_type full \
--torch_dtype bfloat16 \
--dataset 'AI-ModelScope/GEOQA_R1V_Train_8K' \
--load_from_cache_file true \
--max_completion_length 1024 \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--learning_rate 1e-6 \
--gradient_accumulation_steps 2 \
--save_strategy 'steps' \
--eval_strategy 'steps' \
--eval_steps 400 \
--save_steps 400 \
--save_total_limit 10 \
--logging_steps 1 \
--output_dir output/GRPO_GEOQA \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--num_generations 8 \
--temperature 1.0 \
--repetition_penalty 1.1 \
--system 'examples/train/grpo/prompt.txt' \
--deepspeed zero3 \
--log_completions true \
--report_to wandb \
--num_iterations 2 \
--async_generate false \
--beta 0.001 \
--max_grad_norm 0.5
#### 实验现象

训练曲线如上图所示。
1. 相比于Count任务,Geometric QA任务的难度明显更大,因此,Accuracy Reward收敛明显更慢,同时震荡幅度也明显更大。
2. completion_length的变化趋势比较明显,最终收敛到250 token左右。
3. reward基本收敛,标准差在一共1300个step时稳定在0.2左右。
以下是训练后的模型completion示例:
```
User:
"If AB is the tangent to circle O with the tangent point A, and BO intersects circle O at point C, and point D is located on circle O, what is the measure of angle ADC if the degree of angle ABO is 32.0?"
Assistant:
"\nTo find the measure of angle ADC, we first need to understand the properties of the tangent and the circle. Since AB is a tangent to the circle at point A, the line segment OA (which is perpendicular to the tangent AB) is perpendicular to AB. This means that angle OAB is 90 degrees.\n\nGiven that angle ABO is 32 degrees, we can find angle BAO by subtracting angle ABO from 90 degrees:\n∠BAO = 90° - ∠ABO = 90° - 32° = 58°\n\nSince angle BAO is an angle in the minor arc AD, the angle ADC, which is an angle in the minor arc AD, is half the angle BAO. Therefore, angle ADC is:\n∠ADC = 1/2 × ∠BAO = 1/2 × 58° = 29°\n\nSo, the measure of angle ADC is 29 degrees.\n \n\nThe measure of angle ADC is 29 degrees.\n "
```
## Multimodal Open R1 数据集实验
### 任务与数据集定义
本任务为参考[open-r1-multimodal](https://github.com/EvolvingLMMs-Lab/open-r1-multimodal.git)的实验,使用数据集:[lmms-lab/multimodal-open-r1-8k-verified](https://www.modelscope.cn/datasets/lmms-lab/multimodal-open-r1-8k-verified),该数据集专注于多模态的数学推理任务,数据由GPT4o基于`Math360K`和`Geo170K`数据集生成,包含推理路径和可验证答案。数据集中已包含了image, problem和solution字段,我们也不需要针对prompt进行修改,因此无需额外定义数据集。
### 奖励函数
我们直接使用以上定义过的`MultiModalAccuracyORM`奖励函数。
### GRPO训练实验记录
#### 训练参数:
选取的模型和大部分超参数与上一个实验相似,由于训练的时候出现了OOM,我们设置`MAX_PIXELS=262144`以降低显存占用。
```shell
# GRPO training on multimodal-open-r1-8k-verified with 6 GPUs; MAX_PIXELS is lowered to reduce memory use.
# The final argument carries no trailing backslash, so the command ends here when pasted into a shell.
WANDB_API_KEY=your_wandb_api_key \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 \
MAX_PIXELS=262144 \
MASTER_PORT=29600 \
NPROC_PER_NODE=6 \
swift rlhf \
--rlhf_type grpo \
--model Qwen/Qwen2.5-VL-3B-Instruct \
--external_plugins examples/train/grpo/plugin/plugin.py \
--reward_funcs external_r1v_acc format \
--use_vllm true \
--vllm_mode server \
--vllm_server_host 127.0.0.1 \
--vllm_server_port 8000 \
--tuner_type full \
--torch_dtype bfloat16 \
--dataset 'lmms-lab/multimodal-open-r1-8k-verified' \
--load_from_cache_file true \
--max_completion_length 1024 \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--learning_rate 1e-6 \
--gradient_accumulation_steps 2 \
--save_strategy 'steps' \
--eval_strategy 'steps' \
--eval_steps 400 \
--save_steps 400 \
--save_total_limit 10 \
--logging_steps 1 \
--output_dir output/GRPO_GEOQA \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--num_generations 8 \
--temperature 1.0 \
--repetition_penalty 1.1 \
--system 'examples/train/grpo/prompt.txt' \
--deepspeed zero3 \
--log_completions true \
--report_to wandb \
--num_iterations 2 \
--async_generate false \
--beta 0.001 \
--max_grad_norm 0.5
#### 实验现象

训练曲线如上图所示。
一共8k数据,训练了1268个step,Accuracy Reward收敛到0.5左右,completion_length基本收敛到200token左右,reward_std最终在0.2左右震荡。
以下是训练后的模型completion示例:
```
User:
"Based on the map, which state falls into the lowest percentage range of lots, and what is that percentage range?"
Assistant:
"\nThe image provided shows a color-coded map of the USA indicating different ranges of lot percentages across various states. According to the legend at the bottom, the lowest percentage range (6.0% - 6.5%) is represented by white. In the image, Alabama (AL) is shaded in white, which corresponds to the 6.0% - 6.5% category. Therefore, based on the map, the state that falls into the lowest percentage range of lots is Alabama, with the percentage range of 6.0% - 6.5%.\nTherefore, the answer is 6.0% - 6.5%.\n \nAlabama "
```
================================================
FILE: docs/source/BestPractices/GRPO.md
================================================
# GRPO完整实验流程
本文从较为简单的数学任务 Countdown Game 出发,从数据集定义、奖励函数定义和GRPO训练几个步骤介绍完整的GRPO训练流程。任务定义和训练参数等参考了 [mini-deepseek-r1](https://github.com/philschmid/deep-learning-pytorch-huggingface/blob/main/training/mini-deepseek-r1-aha-grpo.ipynb)。
## 任务与数据集定义
Countdown Game 的任务目标是根据给定的几个数字和加减乘除四种运算,得到目标数字,因此,我们定义数据集如下:
```python
class CoundownTaskPreprocessor(ResponsePreprocessor):
    """Turn one Countdown row into a sample with a `query` prompt and a `target` answer.

    The prompt asks the model to reason inside `<think> </think>` and to put the final
    equation inside `<answer> </answer>`; the accuracy reward extracts the equation from
    the `<answer>` tags, so the tag names must be spelled out in the prompt.
    """

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        """Build the prompt for `row` and delegate the rest to the parent preprocessor.

        Args:
            row: A dataset row; `row['nums']` is read and `row['response']` (if present)
                is removed and reused as the target number.

        Returns:
            Whatever `ResponsePreprocessor.preprocess` returns for the updated row, which
            now carries the extra `target` and `query` fields.
        """
        numbers = row['nums']
        # `response` is popped so that it is exposed as `target` rather than as a reference answer.
        target = row.pop('response', None)
        query = f"""
Using the numbers {numbers}, create an equation that equals {target}.
You can use basic arithmetic operations (+, -, *, /) and each number can only be used once.
Show your work in <think> </think> tags. And return the final equation and answer in <answer> </answer> tags,
for example <answer> (1 + 2) / 3 * 4 = 4 </answer>.
"""
        row.update({'target': target, 'query': query})
        return super().preprocess(row)
# Register the Countdown dataset (ModelScope id below, `default` subset) and attach
# a CoundownTaskPreprocessor instance as its `preprocess_func`.
register_dataset(
    DatasetMeta(
        ms_dataset_id='zouxuhong/Countdown-Tasks-3to4',
        subsets=['default'],
        preprocess_func=CoundownTaskPreprocessor(),
        tags=['math']))
```
通过 template, 使用 numbers 和 target 完成任务定义,并给到 query 字段供模型采样使用。同时,我们需要保留 nums 和 target 两个字段,用于后续的奖励函数计算。
## 奖励函数定义:
本任务使用的奖励函数有两个,一个是 Deepseek-R1 中提到的格式奖励函数,另一个是 Countdown Game 的准确性奖励函数。前者已经在swift中内置,通过 `--reward_funcs format` 可以直接使用,而后者需要我们自己定义,在这里我们使用 external_plugin 的方式定义准确性奖励函数,将代码放在`swift/examples/train/grpo/plugin/plugin.py`中。
在这里,奖励函数的输入包括 completions、target 和 nums 三个字段,分别表示模型生成的文本、目标答案和可用的数字。每个都是list,支持多个 completion 同时计算。注意,在这里,除了 completions 之外的参数都是数据集中定义的字段透传而来,如果有任务上的变动,可以分别对数据集和奖励函数做对应的改变即可。
```python
class CountdownORM(ORM):
    """Accuracy reward for the Countdown task: 1.0 for a valid, correct equation, else 0.0."""

    def __call__(self, completions, target, nums, **kwargs) -> List[float]:
        """
        Evaluates completions based on mathematical correctness of the answer.

        A completion earns 1.0 only if it contains `<answer>...</answer>`, the equation
        uses exactly the provided numbers (each once), contains nothing but digits,
        operators, parentheses, dots and whitespace, and evaluates to the target.

        Args:
            completions (list[str]): Generated outputs
            target (list): Expected results, one per completion (anything `float()` accepts)
            nums (list[list[int]]): Available numbers, one list per completion

        Returns:
            list[float]: Reward scores (0.0 or 1.0), aligned with `completions`
        """
        rewards = []
        for completion, gt, numbers in zip(completions, target, nums):
            try:
                # Check if the format is correct: the equation must sit inside <answer> tags.
                match = re.search(r'<answer>(.*?)<\/answer>', completion)
                if match is None:
                    rewards.append(0.0)
                    continue
                # Extract the "answer" part from the completion
                equation = match.group(1).strip()
                # Keep only the left-hand side when the model writes "expr = value".
                if '=' in equation:
                    equation = equation.split('=')[0]
                # Extract all numbers from the equation
                used_numbers = [int(n) for n in re.findall(r'\d+', equation)]
                # Check if all numbers are used exactly once
                if sorted(used_numbers) != sorted(numbers):
                    rewards.append(0.0)
                    continue
                # Define a regex pattern that only allows numbers, operators, parentheses, and whitespace
                allowed_pattern = r'^[\d+\-*/().\s]+$'
                if not re.match(allowed_pattern, equation):
                    rewards.append(0.0)
                    continue
                # NOTE(review): `eval` runs on model-generated text. It is only reached after the
                # character whitelist above and runs with empty builtins/locals; keep both guards.
                result = eval(equation, {'__builtins__': None}, {})
                # Check if the equation is correct and matches the ground truth
                if abs(float(result) - float(gt)) < 1e-5:
                    rewards.append(1.0)
                else:
                    rewards.append(0.0)
            except Exception:
                # If evaluation fails (syntax error, division by zero, ...), reward is 0
                rewards.append(0.0)
        return rewards


# Expose the reward under the name used by `--reward_funcs external_countdown`.
orms['external_countdown'] = CountdownORM
```
## GRPO训练实验记录
首先贴上GRPO的公式:
$$
{\scriptstyle
\begin{aligned}
\mathcal{J}_{G R P O}(\theta) & =\mathbb{E}\left[q \sim P(Q),\left\{o_i\right\}_{i=1}^G \sim \pi_{\theta_{o l d}}(O \mid q)\right] \\
& \frac{1}{G} \sum_{i=1}^G \frac{1}{\left|o_i\right|} \sum_{t=1}^{\left|o_i\right|}\left\{\min \left[\frac{\pi_\theta\left(o_{i, t} \mid q, o_{i, < t}\right)}{\pi_{\theta_{o l d}}\left(o_{i, t} \mid q, o_{i, < t}\right)} \hat{A}_{i, t}, \operatorname{clip}\left(\frac{\pi_\theta\left(o_{i, t} \mid q, o_{i, < t}\right)}{\pi_{\theta_{o l d}}\left(o_{i, t} \mid q, o_{i, < t}\right)}, 1-\varepsilon, 1+\varepsilon\right) \hat{A}_{i, t}\right]-\beta \mathbb{D}_{K L}\left[\pi_\theta \| \pi_{r e f}\right]\right\}
\end{aligned}
}
$$
以下是训练过程中不同阶段的模型输出示例。
训练初期:
```
"\nWe need to create an equation using the given numbers [70, 5, 42, 30] exactly once and make it equal to 77. \n\nFirst, let's consider the possibility of multiplication or division to get closer to 77. Notice that 77 is close to 70 and 42. If we can use these numbers together, it might work.\n\nMultiplying 70 by 1.1 would give us approximately 77, which is very close. So, let's try to make 1.1 using the remaining numbers 5 and 30. \nIf we take 30/5 = 6, and then 5/6 = 0.8333..., this isn't useful here, so let's look at other possibilities.\n\nNext, let's consider if there's another way to achieve 77 using addition, subtraction, or another combination of our numbers. Given the numbers provided, it seems that multiplication or division will be key. \n\nWe can try subtracting 5 from 30 to get 25. Then, if we divide 42 by 25, we don't get a whole number, but we can round it to the nearest whole number, 1. Now, we need to make the equation fit 77. Let's multiply 25 by 3 and add 7 (since 77 = 25*3 + 7). However, we only have one 7 available and we need to find a way to use the remaining numbers to get 7.\n\nLet's try: \n70 - 42 = 28 \nThen, 28 + 5 = 33 \nNext, 33 + 30 = 63 \nFinally, 77 - 63 = 14, which doesn't help.\n\nConsidering different combinations, one viable way is:\n(70 - 42) * (30 / 5) = 28 * 6 = 168 / 2 = 84 / 2 = 42 / 2 = 21 / 3 = 7\n\nSo, (70 - 42) * (30 / 5) = 77.\n\n(70 - 42) * (30 / 5) = 77 "
```
step 200:
```
"\nWe need to use all the given numbers in a way that the resultant value of the calculation is 99. Given these restrictions, it's clear we should look into how the numbers relate and the mathematical operations that will bring them together to meet our condition.\n\nFirstly, let's evaluate the sums, differences, and multiplication/division among the numbers and their relations to see how this could lead us towards an outcome of 99.\n- Summing the three numbers: 96 + 70 + 73 = 239, which is far from 99. This shows addition isn't practical.\n- Subtracting numbers: For instance, subtracting the lowest two (i.e., 73 - 70 = 3) and combining with the highest one (96) does not work. Also, there is no other combination of subtraction that would result in the number needing to add up to 99.\n- Multiplying or dividing the numbers alone wouldn’t reach to any near solution since none of the products or division results of the numbers themselves is very close to 99. We might think more carefully about the arrangement of them using a combination of the operations provided.\nTherefore, considering the possible arrangement, a combination like taking (73 - 70 + 96) may produce an effective solution as this will sum the values in such a manner that it aligns with the goal of 99.\n\nSo we conclude that the operation: 73 - 70 + 96 gives the right sequence to equal 99.\n \n\n73 - 70 + 96 = 99 "
```
step 500:
```
"\nLet's begin by exploring various possible combinations of the given numbers using the arithmetic operators we have access to.\n\n- If I add the numbers, I get: 56 + 73 + 51 = 180 which is not equal to our target number (68)\n- If I subtract one number from another I will still exceed our target value as well since the maximum difference is: |(difference between max-min):| | (73 - 51) or | (73 - 56), etc.|\n- Let's multiply the numbers individually and see if the division operation can bring us closer to the goal:\n - (73 * 51) - 56 = 3723 - 56 = 3667 which isn't the goal either.\n - The only operator left is addition and subtraction; I'll try the combination: \n ((73 - 56) + 51) gives me (17 + 51) = 68.\n \nThus, our solution fits the provided problem constraints and meets the required outcome.\n \n(73 - 56) + 51 = 68 "
```
step 1000:
```
"\nWe need to form an expression using the given numbers (50, 86, and 49) such that it equals 85. Let's try different combinations:\n\n(86 - 50) + 49 = 76 + 49 = 125 (too big)\n(86 - 49) + 50 = 37 + 50 = 87 (too big)\n(86 - 50) + 49 = 37 + 49 = 86 (too big)\n(86 + 50) - 49 = 136 - 49 = 87 (too big)\n(86 - 50) + (49 - x)? Not working!\n(50 + 86) - 49: this is 136 which isn't our aim.\n50 + (86 - 49) = 50 + 37 = 87 which wasn't correct.\n50 - 86 + 49 = -16 + 49: This doesn't match up as well.\n86 - (50 + 49) = 86 - 99 which doesn't work too.\n\nWe try one more:\n(86 + 49) - 50 = 135 - 50 = 85!\n\nThat's it!\n \n\n(86 + 49) - 50 = 85 "
```
step 2000:
```
"\nWe need to form an expression using the numbers 58, 9, 78, and 44 which evaluates to 73. Let's try different combinations:\n(78 - 58) + (9 + 44) = 10 + 53 = 63 (too low)\n(78 - 58) + (9 - 44) = 20 - 35 = -15 (too low)\n(78 - 58) + (44 - 9) = 20 + 35 = 55 (too low)\n(78 + 58) - (9 + 44) = 136 - 53 = 83 (too high)\n(78 + 58) - (9 - 44) = 136 + 35 = 171 (too high)\n(78 + 58) - (44 + 9) = 136 + 53 = 189 (too high)\n(78 + 9) - (58 + 44) = 87 - 102 = -15 (too low)\n(78 + 9) - (58 - 44) = 87 - 14 = 73\n\nSo our solution is: (78 + 9) - (58 - 44) = 73 \n\n(78 + 9) - (58 - 44) = 73 "
```
附learning_rate和beta分别取值1e-6和0.04的不稳定实验记录,模型在step 200左右出现了震荡,format和CountdownORM瞬间走低:

================================================
FILE: docs/source/BestPractices/MLLM-Registration.md
================================================
# 注册多模态模型最佳实践
本文将介绍如何在ms-swift中注册多模态模型,并成功推理和训练。本文将以Qwen2.5-Omni为例子,注册新的model_type和template `my_qwen2_5_omni`,并支持文本、图片、视频和音频的训练。由于Qwen2.5-Omni已经在ms-swift中注册,我们可以通过显式指定model_type和template来使用我们自定义的部分。
## 环境准备
```shell
# 避免未来出现与文档的不兼容情况
pip install "ms-swift>=4.0"
pip install "transformers==4.57.*" "qwen_omni_utils==0.0.8"
```
## 注册模型
第一步,我们需要注册模型,来获取模型和processor。
```python
from transformers import PretrainedConfig, PreTrainedModel
from swift.model import (Model, ModelGroup, ModelMeta, MultiModelKeys, get_model_processor, register_model,
register_model_arch, ModelLoader)
from swift.model.models.qwen import patch_qwen_vl_utils
from swift.model.patcher import patch_get_input_embeddings
from swift.model.utils import use_submodel_func
from swift.utils import get_env_args, Processor
# Describe which parameter-name prefixes belong to the LLM, the encoders, the aligner and
# the generator of the custom `my_qwen2_5_omni` architecture.
register_model_arch(
    MultiModelKeys(
        'my_qwen2_5_omni',
        # `freeze_llm`, `freeze_vit` and `freeze_aligner` decide their behaviour from the values below.
        # Example, full-parameter training: `freeze_vit=True` freezes the parameters of layers whose
        # names start with `thinker.audio_tower` or `thinker.visual`.
        # Example, LoRA training: `freeze_vit=False` additionally adds LoRA to the Linear layers whose
        # names start with `thinker.audio_tower` or `thinker.visual`.
        language_model=['thinker.model', 'thinker.lm_head'],
        vision_tower=['thinker.audio_tower', 'thinker.visual'],
        aligner=['thinker.audio_tower.proj', 'thinker.visual.merger'],
        # The generator part is never trained, i.e. it always stays frozen.
        # If `thinker.audio_tower` / `thinker.audio_tower.proj` should never be trained, put them
        # into `generator` and remove them from `vision_tower` / `aligner`.
        generator=['talker', 'token2wav'],
    ))
class Qwen2_5OmniLoader(ModelLoader):
    """Loader that supplies the config, processor and model for `my_qwen2_5_omni`."""

    def get_config(self, model_dir: str) -> PretrainedConfig:
        """Load the Qwen2.5-Omni config from `model_dir`.

        When the `ENABLE_AUDIO_OUTPUT` value obtained through `get_env_args` is not None,
        it overrides `config.enable_audio_output`.
        """
        from transformers import Qwen2_5OmniConfig
        config = Qwen2_5OmniConfig.from_pretrained(model_dir, trust_remote_code=True)
        enable_audio_output = get_env_args('ENABLE_AUDIO_OUTPUT', bool, None)
        if enable_audio_output is not None:
            config.enable_audio_output = enable_audio_output
        return config

    def get_processor(self, model_dir: str, config: PretrainedConfig) -> Processor:
        """Load the Qwen2.5-Omni processor from `model_dir` and patch `qwen_omni_utils`."""
        from transformers import Qwen2_5OmniProcessor
        from qwen_omni_utils import vision_process
        processor = Qwen2_5OmniProcessor.from_pretrained(model_dir, trust_remote_code=True)
        # Control constants in qwen_omni_utils library via environment variables,
        # e.g., `MAX_PIXELS`, etc.
        patch_qwen_vl_utils(vision_process)
        return processor

    def get_model(self, model_dir: str, config: PretrainedConfig, processor: Processor,
                  model_kwargs) -> PreTrainedModel:
        """Load the model through the parent loader, then apply the Omni-specific patches."""
        from transformers import Qwen2_5OmniForConditionalGeneration
        print('Run my_qwen2_5_omni...')
        # Only fall back to the Omni class when no model class has been chosen already.
        self.auto_model_cls = self.auto_model_cls or Qwen2_5OmniForConditionalGeneration
        model = super().get_model(model_dir, config, processor, model_kwargs)
        # For multimodal model consistency, we replace the model's forward/generate functions
        # with those of its language_model.
        # Handle additional parts separately.
        use_submodel_func(model, 'thinker')
        # Avoid inplace operations on leaf_variable during training
        # (replacing parts of input_embeds with images_embeds)
        patch_get_input_embeddings(model.thinker.visual, 'patch_embed')
        # Some custom settings for model/config (usually not needed; configure based on
        # specific model if errors occur during training/inference)
        model.config.keys_to_ignore_at_inference += ['hidden_states', 'attention_mask']
        model.config.talker_config.pad_token_id = None
        return model
# Register the custom model type together with its loader, template and architecture.
register_model(
    ModelMeta(
        'my_qwen2_5_omni',
        [
            ModelGroup([
                Model('Qwen/Qwen2.5-Omni-3B', 'Qwen/Qwen2.5-Omni-3B'),
                Model('Qwen/Qwen2.5-Omni-7B', 'Qwen/Qwen2.5-Omni-7B'),
            ]),
        ],
        # Loader class used to obtain the model and the processor.
        Qwen2_5OmniLoader,
        template='my_qwen2_5_omni',
        is_multimodal=True,  # whether this is a multimodal model
        model_arch='my_qwen2_5_omni',  # usually only set for multimodal models
        # Used for automatic matching of model_type
        architectures=['Qwen2_5OmniModel', 'Qwen2_5OmniForConditionalGeneration'],
        # Dependency versions shown to the user as a hint (may be removed)
        requires=['transformers>=4.50', 'soundfile', 'qwen_omni_utils', 'decord'],
        # Tags shown to the user as a hint (may be removed)
        tags=['vision', 'video', 'audio'],
        # Extra files that must be saved for full-parameter training / merge-lora
        additional_saved_files=['spk_dict.pt'],
    ))
if __name__ == '__main__':
    # Test and debug: load the model and processor through the newly registered model_type.
    model, processor = get_model_processor('Qwen/Qwen2.5-Omni-7B', model_type='my_qwen2_5_omni')
```
## 注册模板
第二步,我们需要注册模板,来自定义如何将文本、图片、视频和音频进行预处理(`_encode`和`_data_collator`方法)。这是ms-swift支持多模态模型训练的关键模块。预处理方式请参考transformers推理实现,并进行对齐。
template的功能如下:
1. 支持正常推理与训练,预处理文本和多模态信息,并支持grounding任务。
2. 支持padding_free和packing训练。
3. 支持混合模态数据训练。
```python
from functools import partial
from typing import Any, Dict, List, Literal, Optional
import torch
from transformers.integrations import is_deepspeed_zero3_enabled
from swift import get_model_processor
from swift.template import StdTemplateInputs, Template, TemplateMeta, get_template, register_template
from swift.template.utils import Context, findall
from swift.template.vision_utils import load_audio
from swift.utils import Processor, get_env_args, get_logger, get_packed_seq_params, is_deepspeed_enabled, to_float_dtype
logger = get_logger()
class Qwen2_5OmniTemplate(Template):
use_model = True  # whether the model has to take part in preprocessing
# Note: not every multimodal model supports padding_free/packing; models that live in the
# `transformers` library usually do.
support_padding_free = True  # whether padding_free and packing are supported (multimodal models)
norm_bbox = 'none'  # whether grounding tasks use absolute coordinates or norm1000 coordinates
# The tokens listed here are never truncated (e.g. when `--truncation_strategy left/right` is set)
# and are printed in abbreviated form (by `template.safe_decode`).
placeholder_tokens = ['<|IMAGE|>', '<|AUDIO|>', '<|VIDEO|>']
def init_processor(self, processor: Processor) -> None:
    """Set up the extra constants this template needs once a processor is attached.

    Returns immediately when `processor` is None. Otherwise the parent initialisation
    runs first, then the video chunking defaults are copied from
    `Qwen2_5OmniProcessorKwargs._defaults` and the remaining settings are obtained
    through `get_env_args`.
    """
    if processor is None:
        return
    super().init_processor(processor)
    from transformers.models.qwen2_5_omni.processing_qwen2_5_omni import Qwen2_5OmniProcessorKwargs
    video_defaults = Qwen2_5OmniProcessorKwargs._defaults['videos_kwargs']
    self.seconds_per_chunk = video_defaults['seconds_per_chunk']
    self.position_id_per_seconds = video_defaults['position_id_per_seconds']
    self.use_audio_in_video = get_env_args('use_audio_in_video', bool, False)
    # The feature extractor's own sampling rate is the fallback value.
    fallback_sampling_rate = self.processor.feature_extractor.sampling_rate
    self.sampling_rate = get_env_args('sampling_rate', int, fallback_sampling_rate)
    # See the custom grounding-dataset documentation for the meaning of `QWENVL_BBOX_FORMAT`.
    self.bbox_format = get_env_args('QWENVL_BBOX_FORMAT', str, 'legacy')
def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                inputs: StdTemplateInputs) -> List[Context]:
    """Load one multimodal item and return the model-specific tag that replaces the generic one.

    Example: the generic image tag (`<image>`) becomes `<|vision_bos|><|IMAGE|><|vision_eos|>`.

    The loaded data is written back into `inputs.images` / `inputs.audios` / `inputs.videos`
    at position `index`. When `use_audio_in_video` is enabled, the audio track of a video is
    additionally inserted into `inputs.audios` as a `(waveform, 'video')` tuple.
    """
    # Loading the multimodal data may also be done in `_encode`; use whichever is more convenient.
    from qwen_omni_utils import fetch_image, fetch_video
    if media_type == 'image':
        inputs.images[index] = fetch_image({'image': inputs.images[index]})
        return ['<|vision_bos|><|IMAGE|><|vision_eos|>']
    elif media_type == 'audio':
        if self.mode != 'vllm':  # no processing is needed in the 'vllm' inference scenario
            inputs.audios[index] = load_audio(inputs.audios[index], self.sampling_rate)
        return ['<|audio_bos|><|AUDIO|><|audio_eos|>']
    elif media_type == 'video':
        video = inputs.videos[index]
        _video = fetch_video({'video': video})
        if isinstance(_video, torch.Tensor):
            _video = _video.to(torch.uint8)
        inputs.videos[index] = _video
        if self.use_audio_in_video:
            import librosa
            # Remote videos are opened through ffmpeg before being handed to librosa.
            if video.startswith('http://') or video.startswith('https://'):
                import audioread
                video = audioread.ffdec.FFmpegAudioFile(video)
            video = librosa.load(video, sr=self.sampling_rate)[0]
            # The 'video' marker lets `_encode` tell video audio apart from plain audio.
            inputs.audios.insert(inputs.audio_idx, (video, 'video'))
            inputs.audio_idx += 1
            return ['<|vision_bos|><|audio_bos|><|VIDEO|><|audio_eos|><|vision_eos|>']
        else:
            return ['<|vision_bos|><|VIDEO|><|vision_eos|>']
def replace_ref(self, ref: str, index: int, inputs: StdTemplateInputs) -> List[Context]:
    """Render the generic grounding reference tag (`<ref-object>`) for this model.

    With the 'legacy' bbox format the reference text is wrapped in the object-ref special
    tokens; with any other format it is returned unchanged.
    """
    if self.bbox_format != 'legacy':
        return [ref]
    return [f'<|object_ref_start|>{ref}<|object_ref_end|>']
def replace_bbox(self, bbox: List[int], index: int, inputs: StdTemplateInputs) -> List[Context]:
    """Render the generic grounding box tag (`<bbox>`) for this model.

    With the 'legacy' bbox format the box is formatted by `_get_bbox_str` and wrapped in the
    box special tokens; with any other format the plain `str(bbox)` is returned.
    """
    if self.bbox_format != 'legacy':
        return [str(bbox)]
    return [f'<|box_start|>{self._get_bbox_str(bbox)}<|box_end|>']
def packing_row(self, row: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Pack several encoded samples into one row (packing & mrope support).

    Overriding this method is usually unnecessary; it is done here to supply custom mrope
    position_ids, computed per sample and concatenated along the last dimension.
    """
    position_ids = []
    for r in row:
        # Work on a shallow copy so the caller's sample keeps its original `input_ids`.
        r = r.copy()
        r['input_ids'] = torch.tensor(r['input_ids'])[None]
        position_ids.append(self._get_position_ids(r))
    packed = super().packing_row(row)
    packed['position_ids'] = torch.concat(position_ids, dim=-1)
    return packed
def _get_new_tokens_use_audio_in_video(self, i, *, video_grid_thw, video_second_per_grid, audio_lengths,
                                       video_token_id, audio_token_id):
    """Helper for `use_audio_in_video=True`: build the interleaved video/audio tokens of video `i`.

    Video and audio position indices are split into chunks of `tokens_per_chunk` by
    `processor.get_chunked_index`; the result alternates a run of video tokens and a run of
    audio tokens per chunk.
    """
    merge_size = self.processor.image_processor.merge_size
    grid_thw = video_grid_thw[i]
    height = grid_thw[1] // merge_size
    width = grid_thw[2] // merge_size
    audio_token_indices = torch.arange(audio_lengths[i])
    # One index per temporal grid step, repeated for every (height, width) cell, then flattened.
    video_token_indices = torch.arange(grid_thw[0]).reshape(-1, 1, 1)
    video_token_indices = torch.broadcast_to(video_token_indices,
                                             (video_token_indices.shape[0], height, width)).reshape(-1)
    video_token_indices = (video_token_indices * video_second_per_grid[i] * self.position_id_per_seconds)
    tokens_per_chunk = int(self.position_id_per_seconds * self.seconds_per_chunk)
    video_chunk_indexes = self.processor.get_chunked_index(video_token_indices, tokens_per_chunk)
    audio_chunk_indexes = self.processor.get_chunked_index(audio_token_indices, tokens_per_chunk)
    res = []
    for j in range(max(len(video_chunk_indexes), len(audio_chunk_indexes))):
        # NOTE(review): `*_token_id` are presumably token-id lists (callers pass `self._tokenize(...)`
        # results), so `*` repeats the token for the length of the chunk -- confirm.
        if j < len(video_chunk_indexes):
            video_seq_length = video_chunk_indexes[j][1] - video_chunk_indexes[j][0]
            res += video_token_id * video_seq_length
        if j < len(audio_chunk_indexes):
            audio_seq_length = audio_chunk_indexes[j][1] - audio_chunk_indexes[j][0]
            res += audio_token_id * audio_seq_length
    return res
def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
    """Convert text/images/audios/videos into input_ids, labels, loss_scale and multimodal tensors.

    The logic can usually be borrowed from the preprocessing code of the corresponding model.
    Recommendation: align inference first, then train.

    Returns:
        The dict produced by `Template._encode`, with the placeholder tokens expanded to
        their real lengths and the processor's multimodal outputs merged in.
    """
    encoded = Template._encode(self, inputs)  # handles the plain-text part; see the custom-model docs
    logger.info_once('Run qwen2_5_omni template')
    processor = self.processor
    # `replace_tag` stores audio extracted from videos as `(waveform, 'video')` tuples.
    # Unwrap them BEFORE calling the processor, which expects plain waveforms, and remember
    # which entries came from videos: video_audios_mask is True for audio taken from a video
    # and False for standalone audio.
    video_audios_mask = []
    for i, audio in enumerate(inputs.audios):
        if isinstance(audio, tuple) and audio[1] == 'video':
            inputs.audios[i] = audio[0]
            video_audios_mask.append(True)
        else:
            video_audios_mask.append(False)
    video_audios_mask = torch.tensor(video_audios_mask)
    # Obtain the multimodal content
    media_inputs = processor(
        text='',
        audio=inputs.audios or None,
        images=inputs.images or None,
        videos=inputs.videos or None,
        do_resize=False,
        return_tensors='pt')
    # The input_ids and attention_mask produced by `processor` are not used,
    # because the processor does not produce `labels`.
    media_inputs.pop('input_ids')
    media_inputs.pop('attention_mask')
    media_inputs = to_float_dtype(media_inputs, self.model_info.torch_dtype)
    input_ids = encoded['input_ids']
    labels = encoded['labels']
    loss_scale = encoded.get('loss_scale', None)
    # Audio modality
    audio_token_id = self._tokenize('<|AUDIO|>')
    idx_list = findall(input_ids, audio_token_id)  # locate every audio token
    feature_attention_mask = media_inputs.get('feature_attention_mask')
    if feature_attention_mask is not None:
        audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
        audio_lengths = ((audio_feature_lengths - 1) // 2 + 1 - 2) // 2 + 1
    else:
        audio_lengths = None
    audio_lengths_origin = audio_lengths
    if idx_list:
        # Drop the audio that belongs to videos (handled in the video branch below)
        if self.use_audio_in_video:
            audio_lengths = audio_lengths[~video_audios_mask]

        def _get_new_audio_tokens(i):
            return audio_token_id * audio_lengths[i]

        # Expand the multimodal placeholder tokens in input_ids
        input_ids, labels, loss_scale = self._extend_tokens(input_ids, labels, loss_scale, idx_list,
                                                            _get_new_audio_tokens)
    # Image and video modalities
    for media_type in ['image', 'video']:
        token = f'<|{media_type.upper()}|>'
        token_id = self._tokenize(token)
        idx_list = findall(input_ids, token_id)
        if idx_list:
            merge_size = processor.image_processor.merge_size
            media_grid_thw = media_inputs.get(f'{media_type}_grid_thw')
            if media_type == 'video' and self.use_audio_in_video:
                # Only the audio tracks that came from videos are interleaved with video tokens.
                audio_lengths = audio_lengths_origin[video_audios_mask]
                video_second_per_grid = media_inputs['video_second_per_grid']
                _get_new_tokens_use_audio_in_video = partial(
                    self._get_new_tokens_use_audio_in_video,
                    video_grid_thw=media_grid_thw,
                    video_second_per_grid=video_second_per_grid,
                    audio_lengths=audio_lengths,
                    video_token_id=token_id,
                    audio_token_id=audio_token_id)
                input_ids, labels, loss_scale = self._extend_tokens(input_ids, labels, loss_scale, idx_list,
                                                                    _get_new_tokens_use_audio_in_video)
            else:

                def _get_new_tokens(i):
                    token_len = (media_grid_thw[i].prod() // (merge_size**2))
                    return token_id * token_len

                input_ids, labels, loss_scale = self._extend_tokens(input_ids, labels, loss_scale, idx_list,
                                                                    _get_new_tokens)
    encoded['input_ids'] = input_ids
    encoded['labels'] = labels
    encoded['loss_scale'] = loss_scale
    encoded.update(media_inputs)  # merge the multimodal content into the result
    return encoded
def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
    """Build `inputs_embeds` that already contain the multimodal information.

    This function is usually used to fix zero2/zero3 hangs in mixed-modality training, where
    some ranks see text-only data (the ViT is skipped) while others see images (the ViT runs).
    A dummy image is created to solve this.
    It is registered in the pre_forward_hook that runs before `model.forward`.
    It must return the input_embeds that contain the multimodal information.
    Outside of training, `inputs` is returned unchanged.
    """
    if not self.is_training:
        return inputs
    input_ids = inputs['input_ids']
    input_features = inputs.get('input_features')
    feature_attention_mask = inputs.get('feature_attention_mask')
    base_model = self.get_base_model(model)
    inputs_embeds = base_model.thinker.model.embed_tokens(input_ids)
    thinker_config = model.config.thinker_config
    # Helper that handles mixed text/image/video data (it creates the dummy image internally)
    inputs_embeds = self._get_inputs_embeds_hf(inputs_embeds, inputs, model.thinker.visual, self.processor,
                                               thinker_config)
    # Mixed-modality data that may contain audio
    if input_features is None:
        if is_deepspeed_enabled() and not is_deepspeed_zero3_enabled():
            # Note: in the transformers implementation, the number of passes through the audio
            # layers depends on the number of audios, so zero3 hangs when ranks hold different
            # numbers of audios (fixing that requires changing transformers). Use zero2 here.
            input_features = input_ids.new_zeros([1, 128, 128], dtype=model.thinker.audio_tower.dtype)
            feature_attention_mask = input_ids.new_ones([1, 128], dtype=torch.bool)
            audio_res = model.thinker.get_audio_features(input_features, feature_attention_mask)
            # Compatible with transformers 5.0
            if hasattr(audio_res, 'last_hidden_state'):
                audio_embeds = audio_res.last_hidden_state
            else:
                audio_embeds = audio_res
            # Multiplying by 0. leaves the values unchanged while keeping the audio tower in the graph.
            inputs_embeds = inputs_embeds + audio_embeds.mean() * 0.
    else:
        audio_res = model.thinker.get_audio_features(input_features, feature_attention_mask)
        # Compatible with transformers 5.0
        if hasattr(audio_res, 'last_hidden_state'):
            audio_embeds = audio_res.last_hidden_state
        else:
            audio_embeds = audio_res
        # Write the audio embeddings into the positions of the audio placeholder tokens.
        audio_mask = (input_ids == thinker_config.audio_token_index).unsqueeze(-1).expand_as(inputs_embeds)
        audio_embeds = audio_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
        inputs_embeds = inputs_embeds.masked_scatter(audio_mask, audio_embeds)
    return {'inputs_embeds': inputs_embeds}
def _get_position_ids(self, inputs: Dict[str, Any]):
    """Helper that obtains the mrope position_ids via `self.model.thinker.get_rope_index`.

    Note that `video_second_per_grid` is popped from `inputs`. When `attention_mask` is
    missing, an all-ones mask shaped like `input_ids` is used.
    """
    feature_attention_mask = inputs.get('feature_attention_mask')
    if feature_attention_mask is not None:
        audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
    else:
        audio_feature_lengths = None
    video_second_per_grid = inputs.pop('video_second_per_grid', None)
    input_ids = inputs['input_ids']
    attention_mask = inputs.get('attention_mask')
    if attention_mask is None:
        attention_mask = torch.ones_like(input_ids)
    position_ids, _ = self.model.thinker.get_rope_index(
        input_ids,
        inputs.get('image_grid_thw'),
        inputs.get('video_grid_thw'),
        attention_mask,
        self.use_audio_in_video,
        audio_feature_lengths,
        video_second_per_grid,
    )
    return self._concat_text_position_ids(position_ids)  # the first dimension holds text_position_ids
def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
    """The `collate_fn` handed to the dataloader.

    Adds mrope position_ids for regular (non padding_free) training batches and splits any
    `position_ids` into the text part (index 0) and the remaining mrope part.
    """
    res = super()._data_collator(batch, padding_to=padding_to)
    if not self.padding_free and self.is_training:
        # In the padding_free/packing scenario, position_ids are handled in `packing_row` instead.
        res['position_ids'] = self._get_position_ids(res)
    if 'position_ids' in res:
        # Create `packed_seq_params` to support padding_free/packing & flash-attn
        position_ids = res['position_ids']
        res['position_ids'] = position_ids[1:]
        res['text_position_ids'] = text_position_ids = position_ids[0]
        # https://github.com/huggingface/transformers/pull/40194
        res.update(get_packed_seq_params(text_position_ids))
    return res
def _data_collator_mm_data(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Collate the multimodal part of `_data_collator` (compatible with padding_free/packing).

    On top of the parent result, `video_second_per_grid` is gathered across the batch and the
    audio tensors of all samples that have them are concatenated.
    """
    collated = super()._data_collator_mm_data(batch)
    seconds_per_grid = self.gather_list(batch, 'video_second_per_grid')
    if seconds_per_grid:
        collated['video_second_per_grid'] = seconds_per_grid
    # Collect the audio features and their masks from the samples that carry them.
    audio_features = []
    audio_masks = []
    for sample in batch:
        if sample.get('input_features') is not None:
            audio_features.append(sample['input_features'])
        if sample.get('feature_attention_mask') is not None:
            audio_masks.append(sample['feature_attention_mask'])
    if audio_features:
        collated['input_features'] = torch.concat(audio_features)
        collated['feature_attention_mask'] = torch.concat(audio_masks)
    return collated
def generate(self, model, *args, **kwargs):
    """Text generation entry point; `TransformersEngine` calls `template.generate`.

    Overridden to customise generation: when `video_grid_thw` is present in `kwargs`,
    `use_audio_in_video` is forwarded as well.
    """
    if kwargs.get('video_grid_thw') is not None:
        kwargs['use_audio_in_video'] = self.use_audio_in_video
    return super().generate(model, *args, **kwargs)
# Register the chat template under the name referenced by `ModelMeta(template=...)`;
# `template_cls` selects the multimodal template class defined in this file.
register_template(
    TemplateMeta('my_qwen2_5_omni', prefix=[], prompt=['<|im_start|>user\n{{QUERY}}<|im_end|>\n<|im_start|>assistant\n'],
                 chat_sep=['<|im_end|>\n'], suffix=['<|im_end|>'],
                 system_prefix=['<|im_start|>system\n{{SYSTEM}}<|im_end|>\n'],
                 default_system='You are a helpful assistant.', stop_words=['<|endoftext|>'],
                 agent_template='hermes',
                 template_cls=Qwen2_5OmniTemplate))
if __name__ == '__main__':
    # Test and debug: encode one video + image sample in training mode and print the result.
    model, processor = get_model_processor('Qwen/Qwen2.5-Omni-7B', model_type='my_qwen2_5_omni')
    template = get_template(processor, template_type='my_qwen2_5_omni')
    data = {
        'messages': [
            {'role': 'user', 'content': '描述视频与图片内容。'},
            {'role': 'assistant', 'content': '一个小孩和一只猫咪。'},
        ],
        'videos': ['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4'],
        'images': ['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png'],
    }
    template.set_mode('train')
    encoded = template.encode(data)
    print('input_ids: ' + template.safe_decode(encoded['input_ids']))
    print('labels: ' + template.safe_decode(encoded['labels']))
    print('keys: ' + str(encoded.keys()))
```
## 推理对齐
接下来,你需要进行TransformersEngine与transformers的推理对齐。通常你需要对齐`input_ids`以及输出内容。你可以书写以下测试函数:
```python
import os
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info
from modelscope import snapshot_download
from swift.infer_engine import TransformersEngine, InferRequest, RequestConfig
import requests
def infer_hf():
    """Run the reference inference with plain transformers.

    Returns:
        A tuple `(input_ids, text)`: the prompt token ids as a list and the decoded
        response, used to check alignment against the ms-swift engine.

    Raises:
        requests.HTTPError: if the sample video cannot be downloaded.
    """
    model_dir = snapshot_download('Qwen/Qwen2.5-Omni-7B')
    model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
        model_dir, torch_dtype="auto", device_map="auto", attn_implementation='flash_attention_2')
    processor = Qwen2_5OmniProcessor.from_pretrained(model_dir)
    # The video is read with decord, which does not support URLs yet, so download it first.
    resp = requests.get('https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4')
    # Fail loudly instead of writing an HTTP error page into the video file.
    resp.raise_for_status()
    with open('_baby.mp4', 'wb') as f:
        f.write(resp.content)
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "video", "video": "_baby.mp4"},
                {"type": "image", "image": "http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png"},
                {"type": "text", "text": "描述视频和图像。"},
            ],
        },
    ]
    USE_AUDIO_IN_VIDEO = False
    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
    inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True,
                       use_audio_in_video=USE_AUDIO_IN_VIDEO)
    inputs = inputs.to(model.device).to(model.dtype)
    text_ids = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO, thinker_do_sample=False,
                              return_audio=False)
    # Decode only the newly generated tokens (everything after the prompt).
    text = processor.batch_decode(text_ids[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    return inputs['input_ids'][0].tolist(), text[0]
def test_my_qwen2_5_omni():
    """Run the same request through `TransformersEngine` with the custom model_type.

    Returns:
        A tuple `(input_ids, resp)`: the token ids produced by the template and the
        content of the first choice's message.
    """
    engine = TransformersEngine('Qwen/Qwen2.5-Omni-7B', model_type='my_qwen2_5_omni', attn_impl='flash_attention_2')
    infer_request = InferRequest(messages=[{
        "role": "user",
        "content": "描述视频和图像。",
    }],
        videos=["https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4"],
        images=["http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png"],
    )
    # temperature=0 is used so the response can be compared with the transformers run.
    request_config = RequestConfig(temperature=0, max_tokens=512)
    input_ids = engine.template.encode(infer_request)['input_ids']
    resp_list = engine.infer([infer_request], request_config)
    resp = resp_list[0].choices[0].message.content
    return input_ids, resp
if __name__ == '__main__':
    # Enable debug mode, which prints the input_ids and generate_ids of `TransformersEngine.infer`
    os.environ['SWIFT_DEBUG'] = '1'
    input_ids_hf, response_hf = infer_hf()
    input_ids_swift, response_swift = test_my_qwen2_5_omni()
    # Check that input_ids and responses are aligned
    assert input_ids_hf == input_ids_swift
    assert response_hf == response_swift
```
## 开始训练
使用python代码训练,这通常更容易debug:
```python
from swift import sft_main, SftArguments
import os
if __name__ == '__main__':
    # Set before training starts; presumably read through the environment-variable patching
    # of qwen_omni_utils -- confirm.
    os.environ['MAX_PIXELS'] = '1003520'
    # LoRA fine-tuning that explicitly selects the custom model_type and template.
    sft_main(SftArguments(
        model='Qwen/Qwen2.5-Omni-7B',
        dataset=['AI-ModelScope/LaTeX_OCR#5000'],
        model_type='my_qwen2_5_omni',
        template='my_qwen2_5_omni',
        load_from_cache_file=True,
        split_dataset_ratio=0.01,
        tuner_type='lora',
        torch_dtype='bfloat16',
        attn_impl='flash_attn',
        padding_free=True,
        num_train_epochs=1,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=1e-4,
        lora_rank=8,
        lora_alpha=32,
        target_modules=['all-linear'],
        freeze_vit=True,
        freeze_aligner=True,
        gradient_accumulation_steps=1,
        eval_steps=50,
        save_steps=50,
        save_total_limit=2,
        logging_steps=5,
        max_length=2048,
        output_dir='output',
        warmup_ratio=0.05,
        dataloader_num_workers=4,
        dataset_num_proc=1,
    ))
```
使用命令行训练:
```shell
# 4 * 35GiB
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
NPROC_PER_NODE=4 \
VIDEO_MAX_PIXELS=50176 \
FPS_MAX_FRAMES=12 \
MAX_PIXELS=1003520 \
swift sft \
--model Qwen/Qwen2.5-Omni-7B \
--model_type my_qwen2_5_omni \
--template my_qwen2_5_omni \
--external_plugins 'examples/custom/my_qwen2_5_omni/my_register.py' \
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#2000' \
'AI-ModelScope/LaTeX_OCR:human_handwrite#2000' \
'speech_asr/speech_asr_aishell1_trainsets:validation#2000' \
'swift/VideoChatGPT:all#2000' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--tuner_type lora \
--torch_dtype bfloat16 \
--attn_impl flash_attn \
--padding_free true \
--packing true \
--num_train_epochs 3 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--freeze_vit true \
--freeze_aligner true \
--gradient_accumulation_steps 1 \
--eval_steps 50 \
--save_steps 50 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 4096 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--dataset_num_proc 1 \
--deepspeed zero2
```
训练后对验证集进行推理:(环境变量请与训练时对齐)
```shell
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
CUDA_VISIBLE_DEVICES=0 \
VIDEO_MAX_PIXELS=50176 \
FPS_MAX_FRAMES=12 \
MAX_PIXELS=1003520 \
swift infer \
--adapters output/vx-xxx/checkpoint-xxx \
--stream true \
--max_new_tokens 512 \
--load_data_args true
```
使用以下命令将训练权重推送到 Modelscope:
```shell
# Replace <model-id> and <sdk-token> with your ModelScope model id and SDK token.
swift export \
    --adapters output/vx-xxx/checkpoint-xxx \
    --push_to_hub true \
    --hub_model_id '<model-id>' \
    --hub_token '<sdk-token>'
```
================================================
FILE: docs/source/BestPractices/Metax-support.md
================================================
# Metax支持
## 1. 在 Metax 平台上使用 Swift
你可以选择构建自己的镜像,也可以直接拉取已有的预构建镜像。本文以拉取预构建镜像为例,演示如何在 Metax 上使用 ms-swift。
### 1.1. 启动 ms-swift 容器
```bash
docker pull mx-devops-acr-cn-shanghai.cr.volces.com/opensource/public-ai-release/maca/ms-swift:3.10.3-maca.ai3.3.0.16-torch2.6-py310-ubuntu22.04-amd64
# 你可以根据需要调整 --privileged 参数,并仅挂载特定的 GPU 卡。
# 更多信息请参考我们的官方文档:https://developer.metax-tech.com
# 必须通过 --device 挂载 Metax GPU 设备:--device=/dev/dri --device=/dev/mxcd
docker run -it --net=host --uts=host --ipc=host --privileged=true --group-add video \
--shm-size 100gb --ulimit memlock=-1 \
--security-opt seccomp=unconfined --security-opt apparmor=unconfined \
--device=/dev/dri --device=/dev/mxcd \
-v /root/workspace:/external \
--name swift_test \
mx-devops-acr-cn-shanghai.cr.volces.com/opensource/public-ai-release/maca/ms-swift:3.10.3-maca.ai3.3.0.16-torch2.6-py310-ubuntu22.04-amd64
```
## 2. 环境检查
### 2.1. 检查 Metax GPU 是否可用
得益于与 CUDA 的兼容性,我们可以像使用 NVIDIA GPU 一样检查 Metax 设备是否可用:
```python
import torch
print(torch.cuda.is_available())
# True
```
### 2.2. 检查 GPU 之间的 P2P 连接拓扑
```bash
mx-smi topo -m
# output
=================== MetaX System Management Interface Log ===================
Timestamp : Wed Feb 11 16:37:10 2026
Attached GPUs : 8
Device link type matrix
GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 Node Affinity CPU Affinity
GPU0 X MX MX MX NODE NODE NODE NODE 0 0-31,64-95
GPU1 MX X MX MX NODE NODE NODE NODE 0 0-31,64-95
GPU2 MX MX X MX NODE NODE NODE NODE 0 0-31,64-95
GPU3 MX MX MX X NODE NODE NODE NODE 0 0-31,64-95
GPU4 NODE NODE NODE NODE X MX MX MX 0 0-31,64-95
GPU5 NODE NODE NODE NODE MX X MX MX 0 0-31,64-95
GPU6 NODE NODE NODE NODE MX MX X MX 0 0-31,64-95
GPU7 NODE NODE NODE NODE MX MX MX X 0 0-31,64-95
Legend:
X = Self
SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
PIX = Connection traversing at most a single PCIe bridge
MX = Connection traversing MetaXLink
ETH = Connection traversing Eth
NA = Connection type is unknown
```
### 2.3. 查看 GPU 状态
```bash
mx-smi
# output
=================== MetaX System Management Interface Log ===================
Timestamp : Wed Feb 11 09:55:49 2026
Attached GPUs : 8
+---------------------------------------------------------------------------------+
| MX-SMI 2.2.9 Kernel Mode Driver Version: 3.4.4 |
| MACA Version: 3.3.0.15 BIOS Version: 1.30.0.0 |
|------------------+-----------------+---------------------+----------------------|
| Board Name | GPU Persist-M | Bus-id | GPU-Util sGPU-M |
| Pwr:Usage/Cap | Temp Perf | Memory-Usage | GPU-State |
|==================+=================+=====================+======================|
| 0 MetaX C500 | 0 Off | 0000:0e:00.0 | 0% Disabled |
| 57W / 350W | 35C P0 | 826/65536 MiB | Available |
+------------------+-----------------+---------------------+----------------------+
| 1 MetaX C500 | 1 Off | 0000:0f:00.0 | 0% Disabled |
| 58W / 350W | 37C P0 | 826/65536 MiB | Available |
+------------------+-----------------+---------------------+----------------------+
| 2 MetaX C500 | 2 Off | 0000:10:00.0 | 0% Disabled |
| 58W / 350W | 36C P0 | 826/65536 MiB | Available |
+------------------+-----------------+---------------------+----------------------+
| 3 MetaX C500 | 3 Off | 0000:12:00.0 | 0% Disabled |
| 60W / 350W | 35C P0 | 826/65536 MiB | Available |
+------------------+-----------------+---------------------+----------------------+
| 4 MetaX C500 | 4 Off | 0000:35:00.0 | 0% Disabled |
| 57W / 350W | 33C P0 | 826/65536 MiB | Available |
+------------------+-----------------+---------------------+----------------------+
| 5 MetaX C500 | 5 Off | 0000:36:00.0 | 0% Disabled |
| 56W / 350W | 34C P0 | 826/65536 MiB | Available |
+------------------+-----------------+---------------------+----------------------+
| 6 MetaX C500 | 6 Off | 0000:37:00.0 | 0% Disabled |
| 55W / 350W | 34C P0 | 826/65536 MiB | Available |
+------------------+-----------------+---------------------+----------------------+
| 7 MetaX C500 | 7 Off | 0000:38:00.0 | 0% Disabled |
| 56W / 350W | 36C P0 | 826/65536 MiB | Available |
+------------------+-----------------+---------------------+----------------------+
+---------------------------------------------------------------------------------+
| Process: |
| GPU PID Process Name GPU Memory |
| Usage(MiB) |
|=================================================================================|
| no process found |
+---------------------------------------------------------------------------------+
```
## 3. 运行示例
我们支持直接使用社区版 Swift,同时在镜像中 /workspace 目录下提供了经过更多优化的版本。强烈建议优先使用该目录下的软件包。
### 3.1. 运行 Swift 示例
在大多数场景下,可直接运行 Swift 的训练示例:
```bash
# We assume that the ms-swift code is under /workspace
cd /workspace/ms-swift/
bash examples/train/full/train.sh
```
运行输出示例(节选):
```bash
# output:
{'loss': 1.47077751, 'grad_norm': 10.5625, 'learning_rate': 2e-06, 'token_acc': 0.65511727, 'epoch': 0.01, 'global_step/max_steps': '1/94', 'percentage': '1.06%', 'elapsed_time': '2s', 'remaining_time': '4m 28s', 'memory(GiB)': 4.87, 'train_speed(iter/s)': 0.345807}
{'loss': 1.58882141, 'grad_norm': 10.75, 'learning_rate': 1e-05, 'token_acc': 0.61763144, 'epoch': 0.05, 'global_step/max_steps': '5/94', 'percentage': '5.32%', 'elapsed_time': '10s', 'remaining_time': '3m 12s', 'memory(GiB)': 5.64, 'train_speed(iter/s)': 0.461462}
{'loss': 1.56617603, 'grad_norm': 12.8125, 'learning_rate': 9.92e-06, 'token_acc': 0.61519274, 'epoch': 0.11, 'global_step/max_steps': '10/94', 'percentage': '10.64%', 'elapsed_time': '20s', 'remaining_time': '2m 52s', 'memory(GiB)': 5.64, 'train_speed(iter/s)': 0.485796}
{'loss': 1.63347206, 'grad_norm': 13.6875, 'learning_rate': 9.69e-06, 'token_acc': 0.60373975, 'epoch': 0.16, 'global_step/max_steps': '15/94', 'percentage': '15.96%', 'elapsed_time': '30s', 'remaining_time': '2m 39s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.493855}
{'loss': 1.60613976, 'grad_norm': 11.0, 'learning_rate': 9.32e-06, 'token_acc': 0.59997221, 'epoch': 0.21, 'global_step/max_steps': '20/94', 'percentage': '21.28%', 'elapsed_time': '39s', 'remaining_time': '2m 27s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.500516}
{'loss': 1.45015478, 'grad_norm': 15.25, 'learning_rate': 8.8e-06, 'token_acc': 0.62373584, 'epoch': 0.27, 'global_step/max_steps': '25/94', 'percentage': '26.60%', 'elapsed_time': '49s', 'remaining_time': '2m 16s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.50548}
{'loss': 1.39427547, 'grad_norm': 13.9375, 'learning_rate': 8.18e-06, 'token_acc': 0.6357994, 'epoch': 0.32, 'global_step/max_steps': '30/94', 'percentage': '31.91%', 'elapsed_time': '59s', 'remaining_time': '2m 5s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.508409}
{'loss': 1.53672237, 'grad_norm': 11.125, 'learning_rate': 7.45e-06, 'token_acc': 0.61650612, 'epoch': 0.37, 'global_step/max_steps': '35/94', 'percentage': '37.23%', 'elapsed_time': '1m 8s', 'remaining_time': '1m 55s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.510425}
{'loss': 1.54039021, 'grad_norm': 13.8125, 'learning_rate': 6.65e-06, 'token_acc': 0.61613974, 'epoch': 0.43, 'global_step/max_steps': '40/94', 'percentage': '42.55%', 'elapsed_time': '1m 18s', 'remaining_time': '1m 45s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.512302}
{'loss': 1.40159426, 'grad_norm': 9.4375, 'learning_rate': 5.79e-06, 'token_acc': 0.64041773, 'epoch': 0.48, 'global_step/max_steps': '45/94', 'percentage': '47.87%', 'elapsed_time': '1m 27s', 'remaining_time': '1m 35s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.512983}
{'loss': 1.54977188, 'grad_norm': 11.9375, 'learning_rate': 4.91e-06, 'token_acc': 0.61078816, 'epoch': 0.53, 'global_step/max_steps': '50/94', 'percentage': '53.19%', 'elapsed_time': '1m 37s', 'remaining_time': '1m 25s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.514489}
{'loss': 1.6754509, 'grad_norm': 13.0625, 'learning_rate': 4.04e-06, 'token_acc': 0.58574393, 'epoch': 0.59, 'global_step/max_steps': '55/94', 'percentage': '58.51%', 'elapsed_time': '1m 46s', 'remaining_time': '1m 15s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.515752}
{'loss': 1.37204351, 'grad_norm': 9.25, 'learning_rate': 3.19e-06, 'token_acc': 0.6391937, 'epoch': 0.64, 'global_step/max_steps': '60/94', 'percentage': '63.83%', 'elapsed_time': '1m 56s', 'remaining_time': '1m 5s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.516829}
{'loss': 1.47697926, 'grad_norm': 11.375, 'learning_rate': 2.4e-06, 'token_acc': 0.62817259, 'epoch': 0.69, 'global_step/max_steps': '65/94', 'percentage': '69.15%', 'elapsed_time': '2m 5s', 'remaining_time': '55s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.517947}
{'loss': 1.4336628, 'grad_norm': 8.125, 'learning_rate': 1.69e-06, 'token_acc': 0.63453862, 'epoch': 0.75, 'global_step/max_steps': '70/94', 'percentage': '74.47%', 'elapsed_time': '2m 14s', 'remaining_time': '46s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.518833}
{'loss': 1.54315252, 'grad_norm': 9.625, 'learning_rate': 1.08e-06, 'token_acc': 0.60202073, 'epoch': 0.8, 'global_step/max_steps': '75/94', 'percentage': '79.79%', 'elapsed_time': '2m 24s', 'remaining_time': '36s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.519627}
{'loss': 1.47180223, 'grad_norm': 9.5625, 'learning_rate': 6e-07, 'token_acc': 0.62211501, 'epoch': 0.85, 'global_step/max_steps': '80/94', 'percentage': '85.11%', 'elapsed_time': '2m 33s', 'remaining_time': '26s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.520284}
{'loss': 1.44068375, 'grad_norm': 10.125, 'learning_rate': 2.5e-07, 'token_acc': 0.62673112, 'epoch': 0.91, 'global_step/max_steps': '85/94', 'percentage': '90.43%', 'elapsed_time': '2m 43s', 'remaining_time': '17s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.520331}
{'loss': 1.44893646, 'grad_norm': 8.375, 'learning_rate': 5e-08, 'token_acc': 0.63837478, 'epoch': 0.96, 'global_step/max_steps': '90/94', 'percentage': '95.74%', 'elapsed_time': '2m 52s', 'remaining_time': '7s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.520707}
{'train_runtime': 183.4332, 'train_samples_per_second': 8.177, 'train_steps_per_second': 0.512, 'train_loss': 1.50650934, 'token_acc': 0.6194337, 'epoch': 1.0, 'global_step/max_steps': '94/94', 'percentage': '100.00%', 'elapsed_time': '3m 3s', 'remaining_time': '0s', 'memory(GiB)': 6.5, 'train_speed(iter/s)': 0.512463}
Train: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 94/94 [03:03<00:00, 1.95s/it]
[INFO:swift] last_model_checkpoint: /workspace/ms-swift/output/v0-20260211-143035/checkpoint-94
[INFO:swift] best_model_checkpoint: None
[INFO:swift] images_dir: /workspace/ms-swift/output/v0-20260211-143035/images
[INFO:swift] End time of running main: 2026-02-11 14:34:09.521336
```
### 3.2. 使用 Megatron-LM 作为 Swift 后端
若希望使用 Megatron-LM 作为 Swift 的后端,需设置 `MEGATRON_LM_PATH` 环境变量:
```bash
export MEGATRON_LM_PATH=/workspace/Megatron-LM-0.15.0
cd /workspace/ms-swift
bash examples/megatron/pretrain.sh
```
### 3.3. 使用其他版本的 ms-swift
Metax 平台要求使用与 Maca 兼容的软件包。例如,编译可能依赖 torch2.8,因此需使用 torch2.8+maca3.3.x.x 版本。
默认情况下,安装会覆盖环境中已有的 PyTorch。因此,建议使用 --no-deps 参数进行安装:
```bash
git clone -b ${SWIFT_VERSION} https://github.com/modelscope/ms-swift.git
cd ms-swift
pip install . --no-deps
```
每次环境变更后,请检查 PyTorch 版本及其可用性:
```bash
pip list |grep torch
# output:
# torch2.x.x+metax3.x.x.x
```
```python
import torch
torch.cuda.is_available()
```
### 3.4. Metax 与 NVIDIA CUDA 的差异
Metax 在大部分接口上与 NVIDIA 对齐,但在某些软件行为和环境变量上存在差异。
#### 3.4.1. MACA_MPS_MODE
默认情况下,MACA 不允许多个进程共享同一块 GPU。如果 GPU 已被占用,则无法启动新进程。
如需启用类似 MPS(Multi-Process Service)的功能,需设置:`MACA_MPS_MODE=1`
```bash
# 运行其他脚本...
export MACA_MPS_MODE=1
cd /workspace/ms-swift/
bash examples/train/full/train.sh
```
#### 3.4.2. MCCL_SOCKET_IFNAME GLOO_SOCKET_IFNAME & MCCL_IB_HCA
在多节点训练时,建议设置以下环境变量以确保节点间通信正常:
> MCCL_SOCKET_IFNAME:用于 MCCL 通信的网络接口
> GLOO_SOCKET_IFNAME:用于 GLOO 通信的网络接口
> MCCL_IB_HCA:指定使用的 InfiniBand 设备
可通过 ifconfig 和 mx-smi 确定所用网卡和 IB 设备:
```bash
ifconfig
# output
ens20f0np0: xxx
inet: your node ip
xxx
...
```
```bash
mx-smi topo -n
# output
mx-smi version: 2.2.9
=================== MetaX System Management Interface Log ===================
Timestamp : Wed Feb 11 18:53:44 2026
Attached GPUs : 8
Device link type matrix
GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 NIC0 NIC1 NIC2 NIC3 NIC4 NIC5 Node Affinity CPU Affinity
GPU0 X MX MX MX NODE NODE NODE NODE PIX PIX NODE NODE SYS SYS 0 0-31,64-95
GPU1 MX X MX MX NODE NODE NODE NODE PIX PIX NODE NODE SYS SYS 0 0-31,64-95
GPU2 MX MX X MX NODE NODE NODE NODE PIX PIX NODE NODE SYS SYS 0 0-31,64-95
GPU3 MX MX MX X NODE NODE NODE NODE PIX PIX NODE NODE SYS SYS 0 0-31,64-95
GPU4 NODE NODE NODE NODE X MX MX MX NODE NODE PIX PIX SYS SYS 0 0-31,64-95
GPU5 NODE NODE NODE NODE MX X MX MX NODE NODE PIX PIX SYS SYS 0 0-31,64-95
GPU6 NODE NODE NODE NODE MX MX X MX NODE NODE PIX PIX SYS SYS 0 0-31,64-95
GPU7 NODE NODE NODE NODE MX MX MX X NODE NODE PIX PIX SYS SYS 0 0-31,64-95
NIC0 PIX PIX PIX PIX NODE NODE NODE NODE X PIX NODE NODE SYS SYS
NIC1 PIX PIX PIX PIX NODE NODE NODE NODE PIX X NODE NODE SYS SYS
NIC2 NODE NODE NODE NODE PIX PIX PIX PIX NODE NODE X PIX SYS SYS
NIC3 NODE NODE NODE NODE PIX PIX PIX PIX NODE NODE PIX X SYS SYS
NIC4 SYS SYS SYS SYS SYS SYS SYS SYS SYS SYS SYS SYS X PIX
NIC5 SYS SYS SYS SYS SYS SYS SYS SYS SYS SYS SYS SYS PIX X
Legend:
X = Self
SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
PIX = Connection traversing at most a single PCIe bridge
MX = Connection traversing MetaXLink
ETH = Connection traversing Eth
NA = Connection type is unknown
NIC Legend:
NIC0: mlx5_0
NIC1: mlx5_1
NIC2: mlx5_2
NIC3: mlx5_3
NIC4: mlx5_4
NIC5: mlx5_5
# 根据拓扑信息可知:
# 1. GPU0–GPU3 与 NIC0/NIC1(即 mlx5_0, mlx5_1)通信
# 2. GPU4–GPU7 与 NIC2/NIC3(即 mlx5_2, mlx5_3)通信
```
因此,推荐设置如下:
`MCCL_SOCKET_IFNAME=ens20f0np0`
`GLOO_SOCKET_IFNAME=ens20f0np0`
`MCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3`
```bash
# node 1
export MCCL_SOCKET_IFNAME=ens20f0np0
export GLOO_SOCKET_IFNAME=ens20f0np0
export MCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3
cd /workspace/ms-swift/
bash examples/train/multi-node/torchrun/train_node1.sh
```
```bash
# node 2
# 需修改脚本中的 master_addr 为节点1的IP
export MCCL_SOCKET_IFNAME=ens20f0np0
export GLOO_SOCKET_IFNAME=ens20f0np0
export MCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3
cd /workspace/ms-swift/
bash examples/train/multi-node/torchrun/train_node2.sh
```
================================================
FILE: docs/source/BestPractices/More-Best-Practices.md
================================================
# 更多最佳实践
- [Qwen2.5自我认知微调](https://github.com/modelscope/ms-swift/tree/main/examples/notebook/qwen2_5-self-cognition)
- [Qwen2-VL Latex-OCR微调](https://github.com/modelscope/ms-swift/tree/main/examples/notebook/qwen2vl-ocr)
- [Qwen2.5-VL Grounding任务微调](https://github.com/modelscope/ms-swift/tree/main/examples/notebook/qwen2_5-vl-grounding)
================================================
FILE: docs/source/BestPractices/NPU-support.md
================================================
# NPU支持
我们在 ms-swift 上增加了对昇腾 NPU 的支持,用户可以在昇腾 NPU 上进行模型的微调和推理。
本文档介绍了如何在昇腾 NPU 上进行环境准备、模型微调、推理和部署。
## 安装
基础环境准备:
| software | version |
| --------- | --------------- |
| Python | >= 3.10, < 3.12 |
| CANN | == 8.5.1 |
| torch | == 2.7.1 |
| torch_npu | == 2.7.1.post2 |
基础环境准备请参照这份 [Ascend PyTorch 安装文档](https://gitcode.com/Ascend/pytorch)。
## 环境准备
实验环境:8 * 昇腾910B3 64G
### 环境安装
```shell
# 创建新的 conda 虚拟环境(可选)
conda create -n swift-npu python=3.10 -y
conda activate swift-npu
# 注意进行后续操作前要先 source 激活 CANN 环境
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# 设置 pip 全局镜像(可选,加速下载)
pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/
pip install ms-swift -U
# 或者使用源码安装(与上面的 pip 安装二选一)
git clone https://github.com/modelscope/ms-swift.git
cd ms-swift
pip install -e .
# 安装 torch-npu
pip install torch-npu decorator
# 如果你想要使用 deepspeed(控制显存占用,训练速度会有一定下降)
pip install deepspeed
# 如果需要使用 evaluation 功能,请安装以下包
pip install evalscope[opencompass]
# 如果需要使用 vllm-ascend 进行推理,请安装以下包
pip install vllm==0.14.0
pip install vllm-ascend==0.14.0rc1
```
测试环境是否安装正确,NPU能否被正常加载:
```python
from transformers.utils import is_torch_npu_available
import torch
print(is_torch_npu_available()) # True
print(torch.npu.device_count()) # 8
print(torch.randn(10, device='npu:0'))
```
**如果需要使用 MindSpeed(Megatron-LM),请按照下面引导安装必要依赖**
```shell
# 1. 获取并切换 Megatron-LM 至 core_v0.12.1 版本
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout core_v0.12.1
cd ..
# 2. 获取并安装 MindSpeed
git clone https://gitcode.com/Ascend/MindSpeed.git
cd MindSpeed
git checkout 2.3.0_core_r0.12.1
pip install -e .
cd ..
# 3. 设置环境变量
export PYTHONPATH=$PYTHONPATH:<your_local_megatron_lm_path>
export MEGATRON_LM_PATH=<your_local_megatron_lm_path>
```
执行如下命令验证 MindSpeed(Megatron-LM) 是否配置成功:
```shell
python -c "import mindspeed.megatron_adaptor; from swift.megatron.init import init_megatron_env; init_megatron_env(); print('✓ NPU环境下的Megatron-SWIFT配置验证成功!')"
```
### 环境查看
查看NPU的P2P连接,这里看到每个NPU都通过7条HCCS与其他NPU互联
```shell
(valle) root@valle:~/src# npu-smi info -t topo
NPU0 NPU1 NPU2 NPU3 NPU4 NPU5 NPU6 NPU7 CPU Affinity
NPU0 X HCCS HCCS HCCS HCCS HCCS HCCS HCCS 144-167
NPU1 HCCS X HCCS HCCS HCCS HCCS HCCS HCCS 144-167
NPU2 HCCS HCCS X HCCS HCCS HCCS HCCS HCCS 96-119
NPU3 HCCS HCCS HCCS X HCCS HCCS HCCS HCCS 96-119
NPU4 HCCS HCCS HCCS HCCS X HCCS HCCS HCCS 0-23
NPU5 HCCS HCCS HCCS HCCS HCCS X HCCS HCCS 0-23
NPU6 HCCS HCCS HCCS HCCS HCCS HCCS X HCCS 48-71
NPU7 HCCS HCCS HCCS HCCS HCCS HCCS HCCS X 48-71
Legend:
X = Self
SYS = Path traversing PCIe and NUMA nodes. Nodes are connected through SMP, such as QPI, UPI.
PHB = Path traversing PCIe and the PCIe host bridge of a CPU.
PIX = Path traversing a single PCIe switch
PXB = Path traversing multiple PCIe switches
HCCS = Connection traversing HCCS.
NA = Unknown relationship.
```
查看NPU状态, npu-smi命令详解可以查看[官方文档](https://support.huawei.com/enterprise/zh/doc/EDOC1100079287/10dcd668)
```shell
(valle) root@valle:~/src# npu-smi info
+------------------------------------------------------------------------------------------------+
| npu-smi 24.1.rc1.b030 Version: 24.1.rc1.b030 |
+---------------------------+---------------+----------------------------------------------------+
| NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page)|
| Chip | Bus-Id | AICore(%) Memory-Usage(MB) HBM-Usage(MB) |
+===========================+===============+====================================================+
| 0 910B3 | OK | 101.8 43 0 / 0 |
| 0 | 0000:C1:00.0 | 0 0 / 0 3318 / 65536 |
+===========================+===============+====================================================+
| 1 910B3 | OK | 92.0 39 0 / 0 |
| 0 | 0000:C2:00.0 | 0 0 / 0 3314 / 65536 |
+===========================+===============+====================================================+
| 2 910B3 | OK | 102.0 40 0 / 0 |
| 0 | 0000:81:00.0 | 0 0 / 0 3314 / 65536 |
+===========================+===============+====================================================+
| 3 910B3 | OK | 99.8 40 0 / 0 |
| 0 | 0000:82:00.0 | 0 0 / 0 3314 / 65536 |
+===========================+===============+====================================================+
| 4 910B3 | OK | 98.6 45 0 / 0 |
| 0 | 0000:01:00.0 | 0 0 / 0 3314 / 65536 |
+===========================+===============+====================================================+
| 5 910B3 | OK | 99.7 44 0 / 0 |
| 0 | 0000:02:00.0 | 0 0 / 0 3314 / 65536 |
+===========================+===============+====================================================+
| 6 910B3 | OK | 103.8 45 0 / 0 |
| 0 | 0000:41:00.0 | 0 0 / 0 3314 / 65536 |
+===========================+===============+====================================================+
| 7 910B3 | OK | 98.2 44 0 / 0 |
| 0 | 0000:42:00.0 | 0 0 / 0 3315 / 65536 |
+===========================+===============+====================================================+
```
## 微调
以下介绍LoRA的微调, 全参数微调设置参数`--tuner_type full`即可. **更多训练脚本**参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/ascend/train).
| 模型大小 | NPU数量 | deepspeed类型 | 最大显存占用量 |
| -------- | ------- | ------------- | -------------- |
| 7B | 1 | None | 1 * 28 GB |
| 7B | 4 | None | 4 * 22 GB |
| 7B | 4 | zero2 | 4 * 28 GB |
| 7B | 4 | zero3 | 4 * 22 GB |
| 7B | 8 | None | 8 * 22 GB |
| 14B | 1 | None | 1 * 45 GB |
| 14B | 8 | None | 8 * 51 GB |
| 14B | 8 | zero2 | 8 * 49 GB |
| 14B | 8 | zero3 | 8 * 31 GB |
### 单卡训练
通过如下命令启动单卡微调: (注意: 如果微调期间出现nan的情况, 请设置`--torch_dtype float32`.)
```shell
# 实验环境: 昇腾910B3
# 显存需求: 28 GB
# 运行时长: 8小时
ASCEND_RT_VISIBLE_DEVICES=0 \
swift sft \
--model Qwen/Qwen2-7B-Instruct \
--dataset AI-ModelScope/blossom-math-v2 \
--split_dataset_ratio 0.01 \
--num_train_epochs 5 \
--tuner_type lora \
--output_dir output \
--learning_rate 1e-4 \
--gradient_accumulation_steps 16 \
--save_steps 100 \
--eval_steps 100
```
### 数据并行训练
我们使用其中的4卡进行ddp训练
```shell
# 实验环境: 4 * 昇腾910B3
# 显存需求: 4 * 22 GB
# 运行时长: 2小时
NPROC_PER_NODE=4 \
ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
--model Qwen/Qwen2-7B-Instruct \
--dataset AI-ModelScope/blossom-math-v2 \
--split_dataset_ratio 0.01 \
--num_train_epochs 5 \
--tuner_type lora \
--output_dir output \
...
```
### Deepspeed训练
ZeRO2:
```shell
# 实验环境: 4 * 昇腾910B3
# 显存需求: 4 * 28GB
# 运行时长: 3.5小时
NPROC_PER_NODE=4 \
ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
--model Qwen/Qwen2-7B-Instruct \
--dataset AI-ModelScope/blossom-math-v2 \
--split_dataset_ratio 0.01 \
--num_train_epochs 5 \
--tuner_type lora \
--output_dir output \
--deepspeed zero2 \
...
```
ZeRO3:
```shell
# 实验环境: 4 * 昇腾910B3
# 显存需求: 4 * 22 GB
# 运行时长: 8.5小时
NPROC_PER_NODE=4 \
ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
--model Qwen/Qwen2-7B-Instruct \
--dataset AI-ModelScope/blossom-math-v2 \
--split_dataset_ratio 0.01 \
--num_train_epochs 5 \
--tuner_type lora \
--output_dir output \
--deepspeed zero3 \
...
```
## 推理
原始模型:
```shell
ASCEND_RT_VISIBLE_DEVICES=0 swift infer \
--model Qwen/Qwen2-7B-Instruct \
--stream true --max_new_tokens 2048
```
LoRA微调后:
```shell
ASCEND_RT_VISIBLE_DEVICES=0 swift infer \
--adapters xxx/checkpoint-xxx --load_data_args true \
--stream true --max_new_tokens 2048
# merge-lora并推理
ASCEND_RT_VISIBLE_DEVICES=0 swift export --adapters xxx/checkpoint-xxx --merge_lora true
ASCEND_RT_VISIBLE_DEVICES=0 swift infer \
--model xxx/checkpoint-xxx-merged --load_data_args true \
--stream true --max_new_tokens 2048
```
## 部署
### 使用原生transformers进行部署
原始模型:
```shell
ASCEND_RT_VISIBLE_DEVICES=0 swift deploy --model Qwen/Qwen2-7B-Instruct --max_new_tokens 2048
```
LoRA微调后:
```shell
ASCEND_RT_VISIBLE_DEVICES=0 swift deploy --adapters xxx/checkpoint-xxx --max_new_tokens 2048
# merge-lora并部署
ASCEND_RT_VISIBLE_DEVICES=0 swift export --adapters xxx/checkpoint-xxx --merge_lora true
ASCEND_RT_VISIBLE_DEVICES=0 swift deploy --model xxx/checkpoint-xxx-merged --max_new_tokens 2048
```
### 使用vLLM-ascend进行部署
使用pypi进行安装:
```shell
# Install vllm-project/vllm. The newest supported version is v0.11.0.
pip install vllm==0.11.0
# Install vllm-project/vllm-ascend from pypi.
pip install vllm-ascend==0.11.0rc3
```
原始模型:
```shell
ASCEND_RT_VISIBLE_DEVICES=0 swift deploy \
--model Qwen/Qwen2.5-7B-Instruct \
--infer_backend vllm \
--max_new_tokens 2048
```
LoRA微调后:
```shell
ASCEND_RT_VISIBLE_DEVICES=0 swift deploy \
--adapters xxx/checkpoint-xxx \
--infer_backend vllm \
--max_new_tokens 2048
# merge-lora并部署
ASCEND_RT_VISIBLE_DEVICES=0 swift export \
--adapters xxx/checkpoint-xxx \
--merge_lora true
ASCEND_RT_VISIBLE_DEVICES=0 swift deploy \
--model xxx/checkpoint-xxx-merged \
--infer_backend vllm \
--max_new_tokens 2048
```
## 支持现状
| 一级特性 | 特性 | 进展 |
| -------- | ------------------- | -------- |
| 训练范式 | CPT | 已支持 |
| | SFT | 已支持 |
| | DPO | 已支持 |
| | RM | 已支持 |
| 分布式 | DDP | 已支持 |
| | FSDP | 已支持 |
| | FSDP2 | 已支持 |
| | DeepSpeed | 已支持 |
| | MindSpeed(Megatron) | 已支持 |
| 低参微调 | FULL | 已支持 |
| | LoRA | 已支持 |
| | QLoRA | 暂不支持 |
| RLHF | GRPO | 已支持 |
| | PPO | 已支持 |
| 性能优化 | FA 等融合算子 | 已支持 |
| | Liger-Kernel | 暂不支持 |
| 部署 | PT | 已支持 |
| | vLLM | 已支持 |
| | SGLang | 暂不支持 |
------
### 表 1:SFT 类算法
| algorithm | model families | strategy | hardware |
| --------- | --------------------------- | --------------------- | ----------------- |
| SFT | Qwen2.5-0.5B-Instruct | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
| SFT | Qwen2.5-1.5B-Instruct | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
| SFT | Qwen2.5-7B-Instruct | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
| SFT | Qwen2.5-VL-3B-Instruct | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
| SFT | Qwen2.5-VL-7B-Instruct | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
| SFT | Qwen2.5-Omni-3B | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
| SFT | Qwen3-8B | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
| SFT | Qwen3-32B | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
| SFT | Qwen3-VL-30B-A3B-Instruct | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
| SFT | Qwen3-Omni-30B-A3B-Instruct | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
| SFT | InternVL3-8B | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
| SFT | Ovis2.5-2B | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
------
### 表 2:RL 类算法
| algorithm | model families | strategy | rollout engine | hardware |
| --------- | ------------------- | --------- | -------------- | ----------------- |
| **GRPO** | Qwen2.5-7B-Instruct | deepspeed | vllm-ascend | Atlas 900 A2 PODc |
| **GRPO** | Qwen3-8B | deepspeed | vllm-ascend | Atlas 900 A2 PODc |
| **DPO** | Qwen2.5-7B-Instruct | deepspeed | vllm-ascend | Atlas 900 A2 PODc |
| **DPO** | Qwen3-8B | deepspeed | vllm-ascend | Atlas 900 A2 PODc |
| **PPO** | Qwen2.5-7B-Instruct | deepspeed | vllm-ascend | Atlas 900 A2 PODc |
| **PPO** | Qwen3-8B | deepspeed | vllm-ascend | Atlas 900 A2 PODc |
---
### 表 3:当前 NPU 暂不支持 / 未完全验证的模块
| item |
| --------------------------------- |
| Liger-kernel |
| 量化/QLoRA相关 |
| 使用sglang作为推理引擎 |
| 使用megatron时开启ETP进行lora训练 |
## NPU微信群
================================================
FILE: docs/source/BestPractices/Qwen3-Best-Practice.md
================================================
# Qwen3最佳实践
讨论区:[issue 4030](https://github.com/modelscope/ms-swift/issues/4030)
Qwen文档: [https://qwen.readthedocs.io/en/latest/training/ms_swift.html](https://qwen.readthedocs.io/en/latest/training/ms_swift.html)
## 推理
思考模式:
```shell
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--model Qwen/Qwen3-8B \
--infer_backend vllm \
--stream true \
--max_new_tokens 2048 \
--vllm_max_model_len 8192
```
```text
<<< who are you?
<think>
Okay, the user is asking "who are you?" Let me start by introducing myself as Qwen, the large language model developed by Alibaba Cloud. I should mention my capabilities, like answering questions, creating content, and engaging in conversations. But I need to keep it concise. Also, the user might want to know how I can assist them. Maybe I should ask how I can help them today. Let me check if there's anything else important to include. Oh, I should make sure the tone is friendly and approachable. Alright, that should cover it.
</think>

Hello! I am Qwen, a large language model developed by Alibaba Cloud. I can assist with a wide range of tasks, such as answering questions, creating content, writing stories, coding, and more. How can I help you today? 😊
<<< clear
<<< who are you? /no_think
<think>

</think>

I am Qwen, a large language model developed by Alibaba Cloud. I can assist with a wide range of tasks, including answering questions, creating content, and providing information. How can I help you today?
```
非思考模式:
- 其中`--response_prefix`代表模型的输出会在其前缀后继续生成。等价于enable_thinking设置为False。
```shell
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--model Qwen/Qwen3-8B \
--infer_backend vllm \
--stream true \
--max_new_tokens 2048 \
--vllm_max_model_len 8192 \
--response_prefix '<think>\n\n</think>\n\n'
```
```text
<<< who are you?
I am Qwen, a large-scale language model developed by Alibaba Cloud. I am designed to assist with a wide range of tasks, including answering questions, creating content, and providing information. How can I assist you today?
```
## 训练
在开始训练之前,请确保您的环境已正确配置。
```bash
pip install ms-swift -U
pip install transformers
pip install deepspeed # 多GPU训练
pip install liger-kernel # 节约显存资源
pip install flash-attn --no-build-isolation # packing需要
```
## 监督微调 (SFT)
### 数据准备
使用 ms-swift 进行 SFT 的自定义数据集格式如下(system 字段是可选的)。您可以将其组织为 JSON、JSONL 或 CSV 格式。在训练脚本中指定 `--dataset <dataset_path>`。有关完整的数据集格式指南,请参考[自定义数据集文档](../Customization/Custom-dataset.md)。
```text
# 通用格式
{"messages": [
{"role": "system", "content": ""},
{"role": "user", "content": ""},
{"role": "assistant", "content": ""}
]}
# 带think的格式
{"messages": [
{"role": "user", "content": "Where is the capital of Zhejiang?"},
{"role": "assistant", "content": "<think>\n...\n</think>\n\nThe capital of Zhejiang is Hangzhou."}
]}
```
如果您想使用不含思维链的数据进行训练,同时保留模型的推理能力,可以通过以下两种方法尽量减少微调的影响:
**选项 1**:【推荐】在训练期间,指定 `--loss_scale ignore_empty_think`,以忽略对 `'<think>\n\n</think>\n\n'` 的损失计算,从而避免推理能力的丧失。训练脚本参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/think_model/qwen3_demo1.sh)。该方式同样适用于deepseek-r1等模型。自定义数据集格式如下:
```json
{"messages": [
{"role": "user", "content": "Where is the capital of Zhejiang?"},
{"role": "assistant", "content": "<think>\n\n</think>\n\nThe capital of Zhejiang is Hangzhou."}
]}
```
**选项 2**:在数据集的查询中添加 `/no_think`,以避免推理能力的丧失。训练脚本请参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/think_model/qwen3_demo2.sh)。自定义数据集格式如下:
```json
{"messages": [
{"role": "user", "content": "Where is the capital of Zhejiang? /no_think"},
{"role": "assistant", "content": "<think>\n\n</think>\n\nThe capital of Zhejiang is Hangzhou."}
]}
```
你可以使用以下命令获取蒸馏的推理数据集,在训练时,与不含思维链数据集进行混合,进一步缓解推理能力的丧失:
- 其中`--val_dataset`的选择任意。推理产生的`result_path`,可以直接在训练时指定`--dataset distill_dataset.jsonl`使用。
- 该思路同样适用于其他推理模型,例如deepseek-r1。
```shell
# 4 * 80GiB
NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift infer \
--model Qwen/Qwen3-32B \
--infer_backend vllm \
--val_dataset 'AI-ModelScope/alpaca-gpt4-data-en#5000' 'AI-ModelScope/alpaca-gpt4-data-zh#5000' \
--vllm_gpu_memory_utilization 0.9 \
--vllm_tensor_parallel_size 2 \
--vllm_max_model_len 8192 \
--max_new_tokens 4096 \
--write_batch_size 1000 \
--result_path distill_dataset.jsonl
```
### 30分钟自我认知微调
本节将介绍30分钟对 Qwen3-8B 进行自我认知微调。所需GPU显存为 22GB,可以在 ModelScope 提供的[免费算力](https://modelscope.cn/my/mynotebook) A10 中运行。
训练后,模型将不再认为自己是由“阿里云”训练的“Qwen”,而是由“swift”训练的“swift-robot”。
如果需要在离线环境下进行训练,可以手动下载模型和数据集,并指定 `--model <model-path>` 和 `--dataset <dataset-dir>`。数据集可以在 [Modelscope Hub](https://modelscope.cn/datasets/swift/self-cognition)上找到。对`swift/self-cognition`数据集的预处理函数可以查看[这里](https://github.com/modelscope/ms-swift/blob/36fdf381e5e88cb8a71c9d69c1d8936a989318cc/swift/llm/dataset/dataset/llm.py#L882)。
关于训练脚本中各参数的含义,请参考[命令行参数文档](../Instruction/Command-line-parameters.md)。
```bash
# 显存占用:22GB
CUDA_VISIBLE_DEVICES=0 \
swift sft \
--model Qwen/Qwen3-8B \
--tuner_type lora \
--dataset 'swift/Qwen3-SFT-Mixin#2000' \
'swift/self-cognition:qwen3#600' \
--load_from_cache_file true \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--gradient_accumulation_steps 16 \
--eval_steps 50 \
--save_steps 50 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--model_author swift \
--model_name swift-robot
```
微调完成后,可以使用以下脚本来测试微调结果。注意,`--adapters` 部分需要修改为最后保存检查点的目录路径:
```bash
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--adapters output/vx-xxx/checkpoint-xxx \
--stream true \
--temperature 0 \
--max_new_tokens 2048
```
```text
<<< who are you?
<think>
Okay, the user asked, "who are you?" I need to introduce myself. Let me start by stating my name, swift-robot. Then, I should mention that I'm an AI assistant developed by swift. I should explain my purpose, which is to provide information and assistance. I should also highlight my capabilities, like answering questions, generating text, and engaging in conversation. It's important to keep the tone friendly and approachable. Maybe add something about being here to help and encourage the user to ask anything. Let me check if I covered all the key points: name, developer, purpose, capabilities, and a welcoming statement. Yeah, that should do it. Now, let me put that into a concise and friendly response.
</think>

Hello! I am swift-robot, an artificial intelligence assistant developed by swift. My purpose is to provide information and assistance to users like you. I can answer questions, generate text, and engage in conversations on a wide range of topics. I am here to help, so feel free to ask me anything you need!
```
默认情况下,ms-swift 会使用 ModelScope 社区下载模型和数据集。如果想使用 HuggingFace 社区,则需要额外指定 `--use_hf true`。
合并 LoRA 权重:
```shell
swift export \
--adapters output/vx-xxx/checkpoint-xxx \
--merge_lora true
```
推送模型到 ModelScope/HuggingFace:
```bash
# 如果是推送完整的权重,需要修改`--adapters`为`--model`.
# Modelscope的hub_token可以在这里找到: https://modelscope.cn/my/myaccesstoken
swift export \
--adapters output/vx-xxx/checkpoint-xxx \
--push_to_hub true \
--hub_model_id '<model-id>' \
--hub_token '<sdk-token>' \
--use_hf false
```
如果要使用多 GPU 进行训练,以下提供了多 GPU 训练的示例:
```bash
# 4 * 60GB
# 你可以通过设置`--dataset AI-ModelScope/alpaca-gpt4-data-en`跑通实验
# 注意:如果你指定了`--packing true`, 你必须额外设置`--attn_impl flash_attn`
NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
--model Qwen/Qwen3-8B \
--tuner_type full \
--dataset '<your-dataset>' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--torch_dtype bfloat16 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-5 \
--gradient_accumulation_steps 4 \
--packing true \
--eval_steps 100 \
--save_steps 100 \
--logging_steps 5 \
--max_length 8192 \
--warmup_ratio 0.05 \
--dataloader_num_workers 8 \
--dataset_num_proc 8 \
--save_total_limit 2 \
--save_only_model true \
--output_dir output \
--deepspeed zero3 \
--use_liger_kernel true \
--attn_impl flash_attn
```
## 强化学习 (RL)
ms-swift 支持 DPO、GRPO、DAPO、PPO、KTO、GKD 等 RLHF 方法。本章将着重介绍使用 ms-swift 对 Qwen3-8B 进行 GRPO 训练。更多关于GRPO的内容,可以参考[GRPO文档](../Instruction/GRPO/GetStarted/GRPO.md)。更多RLHF训练脚本,参考[examples/train/rlhf](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf)。
### 环境设置
除了安装上述介绍的 ms-swift 相关依赖项外,还需要安装以下依赖项:
```
pip install "math_verify"
pip install vllm==0.8.5.post1
```
### 数据准备
使用 ms-swift 进行 GRPO 训练的数据集格式与 SFT 类似,但不需要最后一轮的 assistant 部分。如果使用 accuracy 作为奖励,则需要额外的 `solution` 列来计算准确率。
示例数据集格式:
```jsonl
{"messages": [{"role": "user", "content": "Tell me tomorrow's weather"}]}
{"messages": [{"role": "user", "content": "What is 1 + 1?"}, {"role": "assistant", "content": "It equals 2"}, {"role": "user", "content": "What about adding 1?"}]}
{"messages": [{"role": "user", "content": "What is your name?"}]}
```
关于其他 RLHF 算法的数据集准备,请参考[自定义数据集文档](../Customization/Custom-dataset.md#rlhf)。
数据集要求的注意事项:
- **奖励函数计算**:数据集格式取决于所使用的奖励函数。可能需要额外的列来支持特定的奖励计算。例如:
- 当使用内置的 accuracy 或 cosine 奖励时,数据集必须包含一个 `solution` 列以计算回复的准确性。
- 数据集中的其他列将作为 ``**kwargs`` 传递给奖励函数以实现进一步的自定义。
- **自定义奖励函数**:为了根据您的具体需求调整奖励函数,可以参考链接:[外部奖励插件](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin)。该插件提供了实现自定义奖励函数的示例和模板。
我们使用 AI-MO/NuminaMath-TIR 作为数据集,并使用 accuracy 函数计算模型回答的准确率奖励。
在训练过程中,使用 vLLM 加速采样过程。
```bash
# 70G*8
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
NPROC_PER_NODE=8 \
swift rlhf \
--rlhf_type grpo \
--model Qwen/Qwen3-8B \
--tuner_type full \
--dataset 'AI-MO/NuminaMath-TIR#5000' \
--load_from_cache_file true \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 2 \
--per_device_eval_batch_size 2 \
--learning_rate 1e-6 \
--save_total_limit 2 \
--logging_steps 5 \
--output_dir output \
--gradient_accumulation_steps 1 \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--max_length 4096 \
--max_completion_length 4096 \
--vllm_max_model_len 8192 \
--reward_funcs accuracy \
--num_generations 16 \
--use_vllm true \
--vllm_gpu_memory_utilization 0.4 \
--sleep_level 1 \
--offload_model true \
--offload_optimizer true \
--deepspeed zero3 \
--vllm_tensor_parallel_size 1 \
--temperature 1.0 \
--top_p 0.85 \
--log_completions true \
--overlong_filter true
```
## Megatron-SWIFT
Qwen3-235B-A22B-Instruct-250718 单机8卡H20 LoRA训练的最佳实践参考:[https://github.com/modelscope/ms-swift/pull/5033](https://github.com/modelscope/ms-swift/pull/5033)。
ms-swift 引入了 Megatron 并行技术以加速大模型的CPT/SFT/DPO/GRPO。支持的模型可以在[支持的模型文档](../Instruction/Supported-models-and-datasets.md)中找到。
关于环境准备,可以参考[Megatron-SWIFT训练文档](../Megatron-SWIFT/Quick-start.md)。
我们将使用阿里云 DLC 启动训练。训练环境由2台机器组成,每台配备8卡 80GiB A800 GPU。关于多节点启动方法的更多信息,请参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-node)。
```bash
# https://help.aliyun.com/zh/pai/user-guide/general-environment-variables
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
NNODES=$WORLD_SIZE \
NODE_RANK=$RANK \
megatron sft \
--model Qwen/Qwen3-30B-A3B-Base \
--save_safetensors true \
--dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--pipeline_model_parallel_size 2 \
--expert_model_parallel_size 8 \
--moe_grouped_gemm true \
--moe_shared_expert_overlap true \
--moe_permute_fusion true \
--moe_aux_loss_coeff 1e-3 \
--micro_batch_size 1 \
--global_batch_size 16 \
--packing true \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--train_iters 2000 \
--eval_iters 50 \
--finetune true \
--cross_entropy_loss_fusion true \
--lr 1e-5 \
--lr_warmup_fraction 0.05 \
--min_lr 1e-6 \
--output_dir megatron_output/Qwen3-30B-A3B-Base \
--eval_steps 200 \
--save_steps 200 \
--max_length 8192 \
--dataloader_num_workers 8 \
--dataset_num_proc 8 \
--no_save_optim true \
--no_save_rng true \
--sequence_parallel true \
--attention_backend flash
```
训练loss图(部分):
效果截图:
自定义数据集格式与`swift sft`相同,详见之前章节。只需指定 `--dataset <dataset-path>` 即可。
使用 `megatron sft` 和 `swift sft` 对 Qwen3-30B-A3B 模型进行全参数微调时,训练速度和 GPU 显存使用的对比情况如下:
| | Megatron-LM | DeepSpeed-ZeRO2 | DeepSpeed-ZeRO3 |
| -------- | ----------- | --------------- | --------------- |
| 训练速度 | 9.6s/it | - | 91.2s/it |
| 显存使用 | 16 * 60GiB | OOM | 16 * 80GiB |
================================================
FILE: docs/source/BestPractices/Qwen3-VL-Best-Practice.md
================================================
# Qwen3-VL最佳实践
## 环境准备
在开始推理和训练之前,请确保您的环境已准备就绪。
```shell
pip install "transformers>=4.57" "qwen_vl_utils>=0.0.14"
pip install "ms-swift>=4.0"
# pip install "vllm>=0.11.0" # 若使用vllm推理后端进行推理
```
- 关于视频数据训练卡住:使用decord后端读取视频可能导致卡住问题,参考[这个issue](https://github.com/dmlc/decord/issues/269)。你可以使用torchcodec后端,具体参考[qwen_vl_utils](https://github.com/QwenLM/Qwen3-VL/blob/50068df2334f309979ff05d75f1078c8309c63ed/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L390-L400)库。
## 推理
使用 transformers 推理:
```python
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from modelscope import snapshot_download
from qwen_vl_utils import process_vision_info
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
model_dir = snapshot_download('Qwen/Qwen3-VL-4B-Instruct')
model = Qwen3VLForConditionalGeneration.from_pretrained(
model_dir, dtype="auto", device_map="auto",
# attn_implementation='flash_attention_2',
)
processor = AutoProcessor.from_pretrained(model_dir)
messages = [
{
"role": "user",
"content": [
{
"type": "video",
"video": "https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4",
"max_pixels": 128*32*32,
"max_frames": 16,
},
{"type": "text", "text": "Describe this video."},
],
}
]
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True,
image_patch_size= 16,
return_video_metadata=True)
if video_inputs is not None:
video_inputs, video_metadatas = zip(*video_inputs)
video_inputs, video_metadatas = list(video_inputs), list(video_metadatas)
else:
video_metadatas = None
inputs = processor(text=[text], images=image_inputs, videos=video_inputs, video_metadata=video_metadatas, **video_kwargs, do_resize=False, return_tensors="pt")
inputs = inputs.to('cuda')
generated_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)
generated_ids_trimmed = [
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text[0])
# 'A baby wearing glasses sits on a bed, engrossed in reading a book. The baby turns the pages with both hands, occasionally looking up and smiling. The room is cozy, with a crib in the background and clothes scattered around. The baby’s focus and curiosity are evident as they explore the book, creating a heartwarming scene of early learning and discovery.'
```
使用 ms-swift 的 `TransformersEngine` 进行推理:
```python
import os
# os.environ['SWIFT_DEBUG'] = '1'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['VIDEO_MAX_TOKEN_NUM'] = '128'
os.environ['FPS_MAX_FRAMES'] = '16'
from swift.infer_engine import TransformersEngine, InferRequest, RequestConfig
engine = TransformersEngine('Qwen/Qwen3-VL-4B-Instruct') # attn_impl='flash_attention_2'
infer_request = InferRequest(messages=[{
"role": "user",
"content": 'Describe this video.',
}], videos=['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4'])
request_config = RequestConfig(max_tokens=128, temperature=0)
resp_list = engine.infer([infer_request], request_config=request_config)
response = resp_list[0].choices[0].message.content
# 'A baby wearing glasses sits on a bed, engrossed in reading a book. The baby turns the pages with both hands, occasionally looking up and smiling. The room is cozy, with a crib in the background and clothes scattered around. The baby’s focus and curiosity are evident as they explore the book, creating a heartwarming scene of early learning and discovery.'
# use stream
request_config = RequestConfig(max_tokens=128, temperature=0, stream=True)
gen_list = engine.infer([infer_request], request_config=request_config)
for chunk in gen_list[0]:
if chunk is None:
continue
print(chunk.choices[0].delta.content, end='', flush=True)
print()
```
使用命令行推理:
```shell
CUDA_VISIBLE_DEVICES=0 \
IMAGE_MAX_TOKEN_NUM=1024 \
VIDEO_MAX_TOKEN_NUM=128 \
FPS_MAX_FRAMES=16 \
swift infer \
--model Qwen/Qwen3-VL-4B-Instruct \
--stream true
```
```
<<< who are you?
Hello! I'm Qwen, a large-scale language model independently developed by the Tongyi Lab under Alibaba Group. My main functions include answering questions, creating text such as stories, official documents, emails, scripts, and more, as well as performing logical reasoning, programming, and other tasks. If you have any questions or need assistance, feel free to let me know anytime, and I'll do my best to help!
--------------------------------------------------
<<< describe the image.
Input an image path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png
This is a beautifully detailed, close-up portrait of an adorable tabby kitten, rendered with a soft, painterly effect that gives it a gentle, dreamy quality.
Here’s a breakdown of the image:
- **The Kitten:** The subject is a young, fluffy kitten with a classic tabby pattern. Its fur is a mix of white and soft grayish-brown stripes, with a prominent dark stripe running down the center of its forehead and over its nose. The kitten’s face is predominantly white, with delicate markings around its eyes and cheeks.
- **The Eyes:** Its most captivating feature is its large, round, and expressive eyes. They are a striking shade of bright blue-gray, with dark pupils that give it an intense, curious, and slightly innocent gaze. The eyes are wide open, suggesting the kitten is alert and attentive.
- **The Expression:** The kitten’s expression is sweet and innocent. Its small pink nose and slightly parted mouth give it a gentle, almost pleading look. Its whiskers are long and white, standing out against its fur.
- **The Style:** The image has a soft-focus, artistic quality, reminiscent of impressionist painting. The edges of the kitten’s fur are slightly blurred, creating a halo effect that draws attention to its face. The background is softly blurred with muted tones of green and gray, which helps the kitten stand out as the clear focal point.
- **Overall Impression:** The image evokes feelings of warmth, cuteness, and tenderness. The kitten appears to be looking directly at the viewer, creating a sense of connection and affection.
This is a lovely and charming depiction of a young kitten, capturing its innocence and charm in a visually appealing and emotionally engaging way.
--------------------------------------------------
<<< describe the video.
Input a video path or URL <<< https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4
This video captures a charming and adorable moment of a young child, likely a toddler, sitting on a bed and pretending to read a book. The child is wearing glasses, which adds a humorous and endearing touch to the scene — as if they’re a little scholar or librarian.
Here’s a breakdown of what unfolds:
- The child is seated cross-legged on a bed with a patterned quilt. Behind them, a crib and some household items are visible, suggesting a cozy bedroom setting.
- The child holds an open book and appears to be turning the pages with focused attention, mimicking the behavior of a real reader.
- At one point, the child looks up, smiles, or seems to react with delight — perhaps amused by something in the book or just enjoying the activity.
- The child’s movements are gentle and deliberate, showing a sense of concentration and curiosity. They turn pages, sometimes with one hand, and occasionally lift the book slightly as if to examine it more closely.
- The video has a warm, candid feel — it’s not staged, and the child’s natural behavior makes it feel authentic and heartwarming.
Overall, this is a sweet, lighthearted video that showcases the innocence and imagination of early childhood. The child’s engagement with the book, combined with their glasses and playful demeanor, creates a delightful and memorable scene.
```
- 其中特定模型参数,例如 `VIDEO_MAX_TOKEN_NUM` 等环境变量的含义参考[命令行参数文档](../Instruction/Command-line-parameters.md#qwen3_vl-qwen3_5)。
## 训练
本文档将介绍如何使用 ms-swift 与 Megatron-SWIFT 训练 Qwen3-VL。推荐 Dense 模型使用 ms-swift(即 transformers 后端,更加方便简单),而 Moe 模型使用 Megatron-SWIFT(即 megatron 后端,更快的训练速度,benchmark查看[这里](../Megatron-SWIFT/Quick-start.md#benchmark))。
如果您需要自定义数据集微调模型,你可以将数据准备成以下格式,并在命令行中设置`--dataset train.jsonl --val_dataset val.jsonl`,其中验证集为可选。更多介绍请参考[多模态数据集文档](../Customization/Custom-dataset.md#多模态)。
```jsonl
{"messages": [{"role": "user", "content": "浙江的省会在哪?"}, {"role": "assistant", "content": "浙江的省会在杭州。"}]}
{"messages": [{"role": "user", "content": "两张图片有什么区别"}, {"role": "assistant", "content": "前一张是小猫,后一张是小狗"}], "images": ["/xxx/x.jpg", "/xxx/x.png"]}
{"messages": [{"role": "system", "content": "你是个有用无害的助手"}, {"role": "user", "content": "图片中是什么,视频中是什么"}, {"role": "assistant", "content": "图片中是一个大象,视频中是一只小狗在草地上奔跑"}], "images": ["/xxx/x.jpg"], "videos": ["/xxx/x.mp4"]}
```
Qwen3-VL的bbox输出采用归一化1000的相对坐标。你可以使用 ms-swift 提供的 grounding 数据集格式,其中"bbox"中的坐标为绝对坐标,ms-swift 会自动将绝对坐标转为归一化1000的相对坐标。更多信息请参考[grounding数据集格式文档](../Customization/Custom-dataset.md#grounding)。
```jsonl
{"messages": [{"role": "user", "content": "找到图像中的<ref-object>"}, {"role": "assistant", "content": "[\n\t{\"bbox_2d\": <bbox>, \"label\": \"<ref-object>\"},\n\t{\"bbox_2d\": <bbox>, \"label\": \"<ref-object>\"}\n]"}], "images": ["cat.png"], "objects": {"ref": ["羊", "羊", "羊"], "bbox": [[90.9, 160.8, 135, 212.8], [360.9, 480.8, 495, 532.8]]}}
```
### Dense模型
以下提供对`Qwen3-VL-4B-Instruct`模型的微调脚本,我们使用混合模态数据作为Demo数据集,该示例脚本仅作为演示用途。训练显存为2 * 21GiB,训练时间为12分钟。
- 若觉得预处理时间太长,你可以将`--packing`去除,或者使用[cached dataset](https://github.com/modelscope/ms-swift/tree/main/examples/train/cached_dataset)。
```shell
# 2 * 21GiB
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
IMAGE_MAX_TOKEN_NUM=1024 \
VIDEO_MAX_TOKEN_NUM=128 \
FPS_MAX_FRAMES=16 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model Qwen/Qwen3-VL-4B-Instruct \
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#10000' \
'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
'swift/VideoChatGPT:Generic#2000' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--tuner_type lora \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--attn_impl flash_attn \
--padding_free true \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--freeze_vit true \
--freeze_aligner true \
--packing true \
--gradient_checkpointing true \
--vit_gradient_checkpointing false \
--gradient_accumulation_steps 2 \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 4096 \
--output_dir output \
--warmup_ratio 0.05 \
--deepspeed zero2 \
--dataset_num_proc 4 \
--dataloader_num_workers 4
```
训练结束后,我们使用以下脚本对验证集进行推理:
```shell
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
CUDA_VISIBLE_DEVICES=0 \
IMAGE_MAX_TOKEN_NUM=1024 \
VIDEO_MAX_TOKEN_NUM=128 \
FPS_MAX_FRAMES=16 \
swift infer \
--adapters output/vx-xxx/checkpoint-xxx \
--stream true \
--max_new_tokens 2048 \
--load_data_args true
```
```
--------------------------------------------------
[QUERY] Using LaTeX to perform OCR on the image.
[LABELS] 1 + \frac { 1 } { 1 ! } + \frac { 1 } { 2 ! } + \frac { 1 } { 3 ! } + \frac { 1 } { 4 ! }
[RESPONSE] 1 + \frac { 1 } { 1 ! } + \frac { 1 } { 2 ! } + \frac { 1 } { 3 ! } + \frac { 1 } { 4 ! }
--------------------------------------------------
[QUERY] What color suit is the man wearing while playing the saxophone on stage?
[LABELS] The man is wearing a black suit and white shirt while playing the saxophone on the red-floored stage.
[RESPONSE] The man is wearing a black suit while playing the saxophone on stage.
--------------------------------------------------
...
```
### Moe模型
以下提供对`Qwen3-VL-30B-A3B-Instruct`模型的微调脚本,我们使用 Megatron-SWIFT 进行单机全参数训练。我们同样采用混合数据进行训练,该示例脚本仅作为演示用途。训练所需显存资源为8 * 80GiB,训练时间为20分钟。
关于 Megatron-SWIFT 的环境安装,请参考[Megatron-SWIFT文档](../Megatron-SWIFT/Quick-start.md)。Megatron-SWIFT 与 ms-swift 共用 template 和 dataset 模块,因此前面介绍的自定义数据集格式和模型特有环境变量依旧生效。
微调脚本如下,训练技巧与并行技术的调整参考[Megatron-SWIFT文档](../Megatron-SWIFT/Quick-start.md#训练技巧)。
```shell
# 8 * 80GiB
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
OMP_NUM_THREADS=14 \
NPROC_PER_NODE=8 \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
IMAGE_MAX_TOKEN_NUM=1024 \
VIDEO_MAX_TOKEN_NUM=128 \
FPS_MAX_FRAMES=16 \
megatron sft \
--model Qwen/Qwen3-VL-30B-A3B-Instruct \
--save_safetensors true \
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#10000' \
'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
'swift/VideoChatGPT:Generic#2000' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--moe_permute_fusion true \
--tensor_model_parallel_size 4 \
--expert_model_parallel_size 8 \
--moe_grouped_gemm true \
--moe_shared_expert_overlap true \
--moe_aux_loss_coeff 1e-6 \
--micro_batch_size 1 \
--global_batch_size 4 \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--num_train_epochs 1 \
--finetune true \
--cross_entropy_loss_fusion true \
--lr 1e-5 \
--lr_warmup_fraction 0.05 \
--min_lr 1e-6 \
--output_dir megatron_output/Qwen3-VL-30B-A3B-Instruct \
--eval_steps 500 \
--save_steps 500 \
--max_length 4096 \
--packing true \
--dataloader_num_workers 8 \
--dataset_num_proc 8 \
--no_save_optim true \
--no_save_rng true \
--sequence_parallel true \
--moe_expert_capacity_factor 2 \
--attention_backend flash
```
训练结束后,我们使用以下脚本对验证集进行推理:
```shell
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
CUDA_VISIBLE_DEVICES=0 \
IMAGE_MAX_TOKEN_NUM=1024 \
VIDEO_MAX_TOKEN_NUM=128 \
FPS_MAX_FRAMES=16 \
swift infer \
--model megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx/checkpoint-xxx \
--stream true \
--max_new_tokens 2048 \
--load_data_args true
```
使用以下命令将训练权重推送到 Modelscope:
```shell
swift export \
--model megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx/checkpoint-xxx \
--push_to_hub true \
--hub_model_id '<model-id>' \
--hub_token '<sdk-token>'
```
================================================
FILE: docs/source/BestPractices/Qwen3_5-Best-Practice.md
================================================
# Qwen3.5 最佳实践
ms-swift 支持使用transformers/Megatron后端对[Qwen3.5](https://github.com/QwenLM/Qwen3.5) Dense/Moe模型进行训练。Qwen3.5 属于混合思考的多模态模型,结合了linear attention和full attention。本文将介绍如何对Qwen3.5 Dense/Moe模型进行推理、指令微调以及强化学习。
## 环境设置
```shell
pip install -U ms-swift
# "transformers==5.2.*" 会遇到与vllm的兼容问题,参考这个issue: https://github.com/modelscope/ms-swift/issues/8254
# "transformers==5.3.*" 会遇到视频训练问题,参考这个issue: https://github.com/modelscope/ms-swift/issues/8362
pip install -U "transformers==5.2.*" "qwen_vl_utils>=0.0.14" peft liger-kernel
# flash-linear-attention
# 请安装fla main分支,若出现训练缓慢的问题请参考:https://github.com/fla-org/flash-linear-attention/issues/758
pip install -U git+https://github.com/fla-org/flash-linear-attention
# causal_conv1d
pip install -U git+https://github.com/Dao-AILab/causal-conv1d --no-build-isolation
# flash-attention
pip install "flash-attn==2.8.3" --no-build-isolation
# deepspeed训练
pip install deepspeed
# vllm (torch2.10) for inference/deployment/RL
pip install -U "vllm>=0.17.0"
# 对于强化学习(RL)训练,需要覆盖 vLLM 的默认安装版本
pip install -U "transformers==5.2.*"
```
- Qwen3.5 视频数据训练卡住:使用decord后端读取视频可能导致卡住问题,参考[这个issue](https://github.com/dmlc/decord/issues/269)。你可以使用torchcodec后端,具体参考[qwen_vl_utils](https://github.com/QwenLM/Qwen3-VL/blob/50068df2334f309979ff05d75f1078c8309c63ed/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L390-L400)库。
## 推理
使用 ms-swift 的 `TransformersEngine` 进行推理:
- 其中特定模型参数,例如 `VIDEO_MAX_TOKEN_NUM` 等环境变量的含义与Qwen3-VL相同,参考[命令行参数文档](../Instruction/Command-line-parameters.md#qwen3_vl-qwen3_5)。
```python
import os
# os.environ['SWIFT_DEBUG'] = '1'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['IMAGE_MAX_TOKEN_NUM'] = '1024'
os.environ['VIDEO_MAX_TOKEN_NUM'] = '128'
os.environ['FPS_MAX_FRAMES'] = '16'
from swift import get_model_processor, get_template
from swift.infer_engine import TransformersEngine, InferRequest, RequestConfig
model, processor = get_model_processor('Qwen/Qwen3.5-4B') # attn_impl='flash_attention_2'
template = get_template(processor, enable_thinking=False)
engine = TransformersEngine(model, template=template)
infer_request = InferRequest(messages=[{
"role": "user",
"content": 'Describe this video.',
}], videos=['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4'])
request_config = RequestConfig(max_tokens=128, temperature=0)
resp_list = engine.infer([infer_request], request_config=request_config)
response = resp_list[0].choices[0].message.content
print(response)
# use stream
request_config = RequestConfig(max_tokens=128, temperature=0, stream=True)
gen_list = engine.infer([infer_request], request_config=request_config)
for chunk in gen_list[0]:
if chunk is None:
continue
print(chunk.choices[0].delta.content, end='', flush=True)
print()
```
使用命令行进行推理:
```shell
IMAGE_MAX_TOKEN_NUM=1024 \
VIDEO_MAX_TOKEN_NUM=128 \
FPS_MAX_FRAMES=16 \
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--model Qwen/Qwen3.5-4B \
--enable_thinking false \
--stream true
```
## 微调
本章将介绍如何使用 ms-swift 与 Megatron-SWIFT 训练 Qwen3.5。推荐 Dense 模型使用 ms-swift(即 transformers 后端,更加方便简单),而 Moe 模型使用 Megatron-SWIFT(即 megatron 后端,更快的训练速度)。
如果您需要自定义数据集微调模型,你可以将数据准备成以下格式,并在命令行中设置`--dataset train.jsonl --val_dataset val.jsonl`,其中验证集为可选。更多介绍请参考[多模态数据集文档](../Customization/Custom-dataset.md#多模态)。
```jsonl
{"messages": [{"role": "user", "content": "浙江的省会在哪?"}, {"role": "assistant", "content": "浙江的省会在杭州。"}]}
{"messages": [{"role": "user", "content": "两张图片有什么区别"}, {"role": "assistant", "content": "前一张是小猫,后一张是小狗"}], "images": ["/xxx/x.jpg", "/xxx/x.png"]}
{"messages": [{"role": "system", "content": "你是个有用无害的助手"}, {"role": "user", "content": "图片中是什么,视频中是什么"}, {"role": "assistant", "content": "图片中是一个大象,视频中是一只小狗在草地上奔跑"}], "images": ["/xxx/x.jpg"], "videos": ["/xxx/x.mp4"]}
```
Qwen3.5的bbox输出采用归一化1000的相对坐标。你可以使用 ms-swift 提供的 grounding 数据集格式,其中"bbox"中的坐标为绝对坐标,ms-swift 会自动将绝对坐标转为归一化1000的相对坐标。更多信息请参考[grounding数据集格式文档](../Customization/Custom-dataset.md#grounding)。
```jsonl
{"messages": [{"role": "user", "content": "找到图像中的<ref-object>"}, {"role": "assistant", "content": "[\n\t{\"bbox_2d\": <bbox>, \"label\": \"<ref-object>\"},\n\t{\"bbox_2d\": <bbox>, \"label\": \"<ref-object>\"}\n]"}], "images": ["cat.png"], "objects": {"ref": ["羊", "羊", "羊"], "bbox": [[90.9, 160.8, 135, 212.8], [360.9, 480.8, 495, 532.8]]}}
```
### Dense模型
以下提供对Qwen3.5-4B模型的微调脚本,该示例脚本仅作为演示用途。训练显存为 4 * 20GiB,训练时间为12分钟。由于transformers的GatedDeltaNet不支持packing/padding_free(megatron支持,见下文),因此我们使用group_by_length参数来加速训练,保证DP的负载均衡并减少micro batch中的零填充,但这会导致loss曲线跳动(因数据随机不充分),当然你也可以去掉此参数。
对模型进行微调的脚本如下:
```shell
# 4 * 20GiB
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
NPROC_PER_NODE=4 \
MAX_PIXELS=1003520 \
VIDEO_MAX_PIXELS=50176 \
FPS_MAX_FRAMES=12 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
--model Qwen/Qwen3.5-4B \
--tuner_type lora \
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
'AI-ModelScope/alpaca-gpt4-data-en#500' \
'swift/self-cognition#500' \
'AI-ModelScope/LaTeX_OCR:human_handwrite#2000' \
--load_from_cache_file true \
--add_non_thinking_prefix true \
--loss_scale ignore_empty_think \
--split_dataset_ratio 0.01 \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 4 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--gradient_accumulation_steps 1 \
--group_by_length true \
--output_dir output/Qwen3.5-4B \
--eval_steps 50 \
--save_steps 50 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 2048 \
--warmup_ratio 0.05 \
--dataset_num_proc 4 \
--dataloader_num_workers 4 \
--deepspeed zero2 \
--model_author swift \
--model_name swift-robot
```
训练结束后,使用以下脚本对验证集进行推理:
```shell
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
CUDA_VISIBLE_DEVICES=0 \
MAX_PIXELS=1003520 \
VIDEO_MAX_PIXELS=50176 \
FPS_MAX_FRAMES=12 \
swift infer \
--adapters output/Qwen3.5-4B/vx-xxx/checkpoint-xxx \
--stream true \
--enable_thinking false \
--max_new_tokens 512 \
--load_data_args true
```
```text
[QUERY] 你好,你是谁?
[RESPONSE]
你好,我是由swift开发的人工智能语言模型,我的名字叫swift-robot。很高兴能与你交流。
--------------------------------------------------
[QUERY] Using LaTeX to perform OCR on the image.
[LABELS] e = \sum _ { k = 0 } ^ { \infty } \frac { 1 } { k ! }
[RESPONSE]
e = \sum _ { k = 0 } ^ { \infty } \frac { 1 } { k ! }
```
使用python进行推理:
```python
import os
# os.environ['SWIFT_DEBUG'] = '1'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['IMAGE_MAX_TOKEN_NUM'] = '1024'
os.environ['VIDEO_MAX_TOKEN_NUM'] = '128'
os.environ['FPS_MAX_FRAMES'] = '16'
from peft import PeftModel
from swift import get_model_processor, get_template
from swift.infer_engine import TransformersEngine, InferRequest, RequestConfig
adapter_dir = 'output/Qwen3.5-4B/vx-xxx/checkpoint-xxx'
enable_thinking = False
model, processor = get_model_processor('Qwen/Qwen3.5-4B') # attn_impl='flash_attention_2'
model = PeftModel.from_pretrained(model, adapter_dir)
template = get_template(processor, enable_thinking=enable_thinking)
engine = TransformersEngine(model, template=template)
infer_request = InferRequest(messages=[{
"role": "user",
"content": 'who are you?',
}])
request_config = RequestConfig(max_tokens=128, temperature=0)
resp_list = engine.infer([infer_request], request_config=request_config)
response = resp_list[0].choices[0].message.content
print(response)
# use stream
request_config = RequestConfig(max_tokens=128, temperature=0, stream=True)
gen_list = engine.infer([infer_request], request_config=request_config)
for chunk in gen_list[0]:
if chunk is None:
continue
print(chunk.choices[0].delta.content, end='', flush=True)
print()
# I am an artificial intelligence assistant named swift-robot, trained by swift. I am designed to understand and generate natural language text in order to provide information, answer questions, and engage in conversation with humans. How can I assist you?
```
使用transformers后端训练MoE的例子参考:https://github.com/modelscope/ms-swift/blob/main/examples/models/qwen3_5/transformers.sh
### Moe模型
Qwen3.5-35B-A3B Megatron训练,环境的准备请参考[Megatron-SWIFT快速开始文档](../Megatron-SWIFT/Quick-start.md)。你可以在15分钟内跑完以下案例:
```shell
# 4 * 40GiB
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
MAX_PIXELS=1003520 \
VIDEO_MAX_PIXELS=50176 \
FPS_MAX_FRAMES=12 \
megatron sft \
--model Qwen/Qwen3.5-35B-A3B \
--save_safetensors true \
--merge_lora true \
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
'AI-ModelScope/alpaca-gpt4-data-en#500' \
'swift/self-cognition#500' \
'AI-ModelScope/LaTeX_OCR:human_handwrite#2000' \
--load_from_cache_file true \
--add_non_thinking_prefix true \
--loss_scale ignore_empty_think \
--split_dataset_ratio 0.01 \
--tuner_type lora \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--expert_model_parallel_size 4 \
--moe_permute_fusion true \
--moe_grouped_gemm true \
--moe_shared_expert_overlap true \
--moe_aux_loss_coeff 1e-6 \
--micro_batch_size 4 \
--global_batch_size 16 \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--num_train_epochs 1 \
--group_by_length true \
--finetune true \
--freeze_llm false \
--freeze_vit true \
--freeze_aligner true \
--cross_entropy_loss_fusion true \
--lr 1e-4 \
--lr_warmup_fraction 0.05 \
--min_lr 1e-5 \
--output_dir megatron_output/Qwen3.5-35B-A3B \
--eval_steps 200 \
--save_steps 200 \
--max_length 2048 \
--dataloader_num_workers 8 \
--dataset_num_proc 8 \
--no_save_optim true \
--no_save_rng true \
--sequence_parallel true \
--attention_backend flash \
--padding_free false \
--model_author swift \
--model_name swift-robot
```
训练结束后,使用以下脚本对验证集进行推理:
```shell
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
MAX_PIXELS=1003520 \
VIDEO_MAX_PIXELS=50176 \
FPS_MAX_FRAMES=12 \
swift infer \
--model megatron_output/Qwen3.5-35B-A3B/vx-xxx/checkpoint-xxx-merged \
--stream true \
--enable_thinking false \
--max_new_tokens 512 \
--load_data_args true
```
Megatron-SWIFT训练Qwen3.5的提示:
- 全参数训练:参考[这个例子](https://github.com/modelscope/ms-swift/tree/main/examples/models/qwen3_5/mcore_full.sh)。
- 关于MTP训练:ms-swift暂不支持多模态MTP的训练。如果你只训练纯文本数据,请设置`SKIP_MULTIMODAL_MTP_VALIDATION=1`环境变量,忽略检查。
- TP 限制解除:使用 "megatron-core>=0.16" 可解除 TP 受到的 `num_query_groups` 限制。
- 默认 `GatedDeltaNet` 使用 transformers 实现(为保证稳定性,暂时保持默认行为不变)。使用 "megatron-core>=0.16"并设置环境变量 `SWIFT_USE_MCORE_GDN=1`可切换至 mcore 实现,支持 GDN 的 TP 并降低显存。
- padding_free/packing的支持:packing可以提升训练速度,你需要设置`SWIFT_USE_MCORE_GDN=1`环境变量。参考[这个例子](https://github.com/modelscope/ms-swift/tree/main/examples/models/qwen3_5/packing.sh)。
- apply_wd_to_qk_layernorm:对 qk layernorm 应用权重衰减。默认为False。
## 强化学习(RL)
以 Qwen3.5-2B 模型为例,下面展示基于 [GSM8K](https://www.modelscope.cn/datasets/modelscope/gsm8k) 数据集进行 GRPO 和 GKD 训练,并以 GSM8K 评测集为标准验证训练效果。为避免模型输出过长的思维链,以下统一设置 `--enable_thinking false`。
### GRPO
#### Dense 模型
使用 GRPO 进行全参数训练,以 `gsm8k_accuracy` 和 `gsm8k_format` 作为奖励函数。奖励函数的实现参考 [gsm8k_plugin.py](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/plugin/gsm8k/gsm8k_plugin.py)。
```shell
SYSTEM_PROMPT="""You are a helpful math assistant. Solve the problem step by step and put your final answer within \\boxed{}."""
CUDA_VISIBLE_DEVICES=0,1,2,3 \
NPROC_PER_NODE=4 \
swift rlhf \
--rlhf_type grpo \
--model Qwen/Qwen3.5-2B \
--external_plugins examples/train/grpo/plugin/gsm8k/gsm8k_plugin.py \
--reward_funcs gsm8k_accuracy gsm8k_format \
--columns '{"answer": "solution"}' \
--enable_thinking false \
--use_vllm true \
--vllm_mode colocate \
--vllm_gpu_memory_utilization 0.4 \
--vllm_tensor_parallel_size 1 \
--vllm_max_model_len 10240 \
--sleep_level 1 \
--tuner_type full \
--torch_dtype bfloat16 \
--dataset 'modelscope/gsm8k' \
--load_from_cache_file true \
--max_length 2048 \
--max_completion_length 8192 \
--num_train_epochs 1 \
--per_device_train_batch_size 4 \
--gradient_accumulation_steps 4 \
--learning_rate 1e-6 \
--lr_scheduler_type cosine \
--save_steps 10 \
--save_total_limit 100 \
--logging_steps 1 \
--warmup_ratio 0.0 \
--dataloader_num_workers 4 \
--num_generations 8 \
--temperature 1.0 \
--system "$SYSTEM_PROMPT" \
--deepspeed zero2 \
--log_completions true \
--report_to tensorboard swanlab \
--max_grad_norm 1.0 \
--epsilon 0.2 \
--epsilon_high 0.28 \
--scale_rewards none
```
使用以下指令进行评测:
```shell
CUDA_VISIBLE_DEVICES=0 swift eval \
--model output/Qwen3.5-2B/vxx-xxx-xxx/checkpoint-xx \
--enable_thinking false \
--eval_dataset gsm8k \
--eval_backend Native --infer_backend vllm \
--eval_generation_config '{"max_tokens":8192,"temperature":0.0,"do_sample":false}'
```
以 10 步为间隔,前 50 步的 GSM8K 评测结果如下:
| 模型 / Steps | GSM8K Accuracy | 提升 |
|---|---|---|
| Qwen3.5-2B (baseline) | 0.7597 | - |
| GRPO 10 steps | 0.7650 | +0.53 |
| GRPO 20 steps | 0.7748 | +1.51 |
| GRPO 30 steps | 0.7779 | +1.82 |
| GRPO 40 steps | 0.7817 | +2.20 |
| GRPO 50 steps | 0.7885 | +2.88 |
#### MoE 模型
使用 Megatron 后端对 Qwen3.5-35B-A3B MoE 模型进行 GRPO LoRA 训练,在 [DAPO-Math-17k](https://www.modelscope.cn/datasets/open-r1/DAPO-Math-17k-Processed) 数据集上训练,使用 `accuracy` 作为奖励函数。
```shell
SYSTEM_PROMPT="""You are a helpful math assistant. Solve the problem step by step and put your final answer within \\boxed{}."""
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
NPROC_PER_NODE=8 \
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
megatron rlhf \
--rlhf_type grpo \
--model Qwen/Qwen3.5-35B-A3B \
--save_safetensors true \
--enable_thinking false \
--merge_lora true \
--context_parallel_size 1 \
--tensor_model_parallel_size 1 \
--expert_model_parallel_size 8 \
--pipeline_model_parallel_size 1 \
--moe_permute_fusion true \
--dataset open-r1/DAPO-Math-17k-Processed \
--system "$SYSTEM_PROMPT" \
--num_train_epochs 1 \
--global_batch_size 64 \
--micro_batch_size 1 \
--steps_per_generation 2 \
--num_generations 8 \
--reward_funcs accuracy \
--use_vllm true \
--vllm_mode colocate \
--vllm_gpu_memory_utilization 0.5 \
--vllm_tensor_parallel_size 2 \
--vllm_max_model_len 9192 \
--max_length 1000 \
--max_completion_length 8192 \
--tuner_type lora \
--target_modules all-linear \
--lr 5e-5 \
--bf16 true \
--beta 0.00 \
--epsilon 0.2 \
--epsilon_high 0.28 \
--dynamic_sample false \
--overlong_filter true \
--loss_type grpo \
--sleep_level 1 \
--offload_model true \
--offload_bridge false \
--offload_optimizer true \
--logging_steps 1 \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--finetune \
--dataloader_num_workers 8 \
--dataset_num_proc 8 \
--no_save_optim \
--no_save_rng \
--save_steps 20 \
--attention_backend flash \
--moe_expert_capacity_factor 2 \
--temperature 1.0 \
--padding_free false \
--sequence_parallel true \
--log_completions true \
--report_to tensorboard swanlab
```
使用以下指令在 AIME-2025 和 MATH-500 上评测:
```shell
CUDA_VISIBLE_DEVICES=0,1 swift eval \
--model /path/to/checkpoint \
--enable_thinking false \
--eval_dataset aime25 math_500 \
--eval_backend Native --infer_backend vllm \
--vllm_tensor_parallel_size 2 \
--vllm_gpu_memory_utilization 0.9 \
--vllm_max_model_len 10000 \
--eval_generation_config '{"max_tokens":8192,"temperature":0.0,"do_sample":false}' \
--eval_num_proc 8
```
在 AIME-2025 和 MATH-500 上的评测结果如下:
| 模型 / Steps | AIME-2025 | MATH-500 |
|---|---|---|
| Qwen3.5-35B-A3B (baseline) | 43.33 | 92.40 |
| Megatron GRPO 20 steps | 53.33 (+10.00) | 95.80 (+3.40) |
| Megatron GRPO 40 steps | 53.33 (+10.00) | 96.60 (+4.20) |
### GKD
使用 GKD 进行 LoRA 训练,以 Qwen3.5-9B 作为 teacher 模型。首先使用 vLLM 拉起 teacher server(也可以通过 `--teacher_model` 参数直接加载模型):
```shell
CUDA_VISIBLE_DEVICES=0 \
vllm serve Qwen/Qwen3.5-9B \
--port 8000 \
--tensor-parallel-size 1 \
--max-model-len 10240 \
--gpu-memory-utilization 0.8 \
--max-logprobs 64
```
然后在其余 GPU 上启动 GKD 训练:
```shell
NPROC_PER_NODE=3 \
CUDA_VISIBLE_DEVICES=1,2,3 \
swift rlhf \
--rlhf_type gkd \
--model Qwen/Qwen3.5-2B \
--teacher_model_server http://localhost:8000 \
--gkd_logits_topk 64 \
--enable_thinking false \
--tuner_type lora \
--use_vllm true \
--vllm_mode colocate \
--vllm_gpu_memory_utilization 0.5 \
--vllm_tensor_parallel_size 1 \
--vllm_max_model_len 10240 \
--sleep_level 0 \
--dataset 'modelscope/gsm8k' \
--lmbda 1 \
--seq_kd false \
--beta 0.5 \
--torch_dtype bfloat16 \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps 16 \
--learning_rate 5e-5 \
--logging_steps 1 \
--save_steps 100 \
--save_total_limit 10 \
--max_length 2048 \
--max_completion_length 8192 \
--warmup_ratio 0.1 \
--save_only_model true \
--dataloader_num_workers 4 \
--dataset_num_proc 4 \
--attn_impl flash_attn \
--report_to tensorboard swanlab
```
使用以下指令进行评测:
```shell
CUDA_VISIBLE_DEVICES=0 swift eval \
--model Qwen/Qwen3.5-2B \
--adapters output/Qwen3.5-2B/vxx-xxx-xxx/checkpoint-xx \
--merge_lora true \
--enable_thinking false \
--eval_dataset gsm8k \
--eval_backend Native --infer_backend vllm \
--eval_generation_config '{"max_tokens":8192,"temperature":0.0,"do_sample":false}'
```
以 100 步为间隔,前 300 步的 GSM8K 评测结果如下:
| 模型 / Steps | GSM8K Accuracy | 提升 |
|---|---|---|
| Qwen3.5-2B (baseline) | 0.7597 | - |
| GKD 100 steps | 0.7968 | +3.71 |
| GKD 200 steps | 0.8188 | +5.91 |
| GKD 300 steps | 0.8332 | +7.35 |
================================================
FILE: docs/source/BestPractices/Rapidly-Training-VL-model.md
================================================
# 快速训练VL模型
本文档提供从零开始快速训练视觉语言(Vision-Language, VL)模型的最佳实践。
涉及的模型链接:
- [Qwen2.5-VL-7B-Instruct](https://www.modelscope.cn/models/Qwen/Qwen2.5-VL-7B-Instruct)
- [Qwen3-8B](https://www.modelscope.cn/models/Qwen/Qwen3-8B)
训练的模型链接:
- [Simple-VL-8B](https://www.modelscope.cn/models/swift/Simple-VL-8B/summary)
本训练流程基于 Qwen2.5-VL-7B-Instruct 模型架构,将其内部的语言模型(LLM)部分替换为 Qwen3-8B 的权重,训练模型的视觉理解能力。具体步骤如下:
1. 修改原始模型的配置文件 config.json,使其适配 Qwen3-8B 的模型结构。
2. 初始化并加载新的模型权重,保存为新模型。
3. 对新模型进行两阶段微调:
1. 第一阶段:仅训练视觉到语言的对齐模块(aligner),冻结 ViT 和 LLM 部分。
2. 第二阶段:解冻所有模块,联合训练提升整体性能。
## 模型修改
### 修改配置文件 config.json
因为 Qwen2.5-VL-7B-Instruct 模型的底模 Qwen2.5-7B-Instruct 与 Qwen3-8B 在模型结构上存在部分差异(比如层数,hidden_state_dims),我们首先需要基于Qwen2.5-VL-7B-Instruct的config.json文件,创建一个新的config.json文件,并修改以下参数以对齐Qwen3-8B:
```
修改
1. hidden_size: 3584->4096
2. intermediate_size: 18944->12288
3. num_attention_heads: 28->32
4. num_key_value_heads: 4->8
5. num_hidden_layers: 28->36
6. vocab_size:152064->151936
7. max_window_layers:28->36
8. out_hidden_size: 3584->4096
新增
1. head_dim: 128
```
### 模型权重初始化与替换
使用以下 Python 脚本完成模型权重的初始化、替换与保存:
```python
import torch
from modelscope import Qwen2_5_VLForConditionalGeneration, AutoModelForCausalLM, AutoConfig
from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLPatchMerger, Qwen2_5_VLModel
from accelerate import Accelerator
# 加载原始 VL 模型和 Qwen3-8B 模型
qwen2_5_vl_7b_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-VL-7B-Instruct",
device_map="cuda",
torch_dtype=torch.bfloat16
)
device = qwen2_5_vl_7b_model.device
qwen3_8b_model = AutoModelForCausalLM.from_pretrained(
"Qwen/Qwen3-8B",
device_map=device,
torch_dtype=torch.bfloat16
)
# 加载配置
old_config = AutoConfig.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
new_config = AutoConfig.from_pretrained("/path/to/new_config_dir") # 新 config 的文件夹路径
new_visual_config = new_config.vision_config
# 1. 替换 ViT 到 LLM 的 merger(aligner) 层
new_merger = Qwen2_5_VLPatchMerger(
dim=new_visual_config.out_hidden_size,
context_dim=new_visual_config.hidden_size,
spatial_merge_size=new_visual_config.spatial_merge_size,
).to(device).to(torch.bfloat16)
qwen2_5_vl_7b_model.visual.merger = new_merger
# 2. 替换 VL 模型的 LLM 部分
new_llm_model = Qwen2_5_VLModel(new_config).to(device).to(torch.bfloat16)
for name, param in qwen3_8b_model.model.named_parameters():
if name in new_llm_model.state_dict():
new_llm_model.state_dict()[name].copy_(param)
qwen2_5_vl_7b_model.model = new_llm_model
qwen2_5_vl_7b_model.lm_head = qwen3_8b_model.lm_head
# 3. 保存修改后的模型
accelerator = Accelerator()
accelerator.save_model(
model=qwen2_5_vl_7b_model,
save_directory="/path/to/save/Qwen3-VL-Model",
max_shard_size="4GB",
safe_serialization=True
)
```
保存完权重后,将原 Qwen2.5-VL-7B-Instruct 模型文件夹中除模型权重文件(包括`model.safetensors.index.json`)以外的其他文件复制到新的模型权重文件夹中,并将 config.json 替换为新修改的 config.json 文件。
## 训练
为简化流程,我们跳过预训练(pretrain),直接进入监督微调(SFT)。训练分为两个阶段:
### stage1 训练 Aligner 层
仅训练视觉到语言的对齐层(Aligner),冻结 ViT 和 LLM 部分:
```bash
NNODES=$WORLD_SIZE \
NODE_RANK=$RANK \
NPROC_PER_NODE=8 \
MAX_PIXELS=1003520 \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
swift sft \
--model /path/to/new_vl_model \
--model_type qwen2_5_vl \
--tuner_type full \
--dataset xxx \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--torch_dtype bfloat16 \
--attn_impl flash_attn \
--freeze_vit true \
--freeze_llm true \
--freeze_aligner false \
--num_train_epochs 3 \
--per_device_train_batch_size 2 \
--learning_rate 5e-6 \
--gradient_accumulation_steps 8 \
--eval_steps -1 \
--save_steps 1000 \
--save_total_limit 10 \
--logging_steps 5 \
--max_length 8192 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--dataset_num_proc 8 \
--deepspeed zero2
```
### stage2 训练整个模型
解冻所有模块,联合训练以增强模型的整体视觉理解能力:
```bash
NNODES=$WORLD_SIZE \
NODE_RANK=$RANK \
NPROC_PER_NODE=8 \
MAX_PIXELS=1003520 \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
swift sft \
--model /path/to/stage1_checkpoint \
--model_type qwen2_5_vl \
--tuner_type full \
--dataset xxx \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--torch_dtype bfloat16 \
--attn_impl flash_attn \
--freeze_vit false \
--freeze_llm false \
--freeze_aligner false \
--num_train_epochs 3 \
--per_device_train_batch_size 2 \
--learning_rate 5e-6 \
--gradient_accumulation_steps 8 \
--eval_steps -1 \
--save_steps 1000 \
--save_total_limit 10 \
--logging_steps 5 \
--max_length 8192 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--dataset_num_proc 8 \
--deepspeed zero2
```
## 推理/部署/评测
### 推理
通过`swift infer`来推理训练得到的模型
```bash
swift infer \
--model /path/to/stage2_checkpoint
```
### 部署
使用 vLLM 加速模型服务部署:
```
CUDA_VISIBLE_DEVICES=0 \
MAX_PIXELS=1003520 \
VIDEO_MAX_PIXELS=50176 \
FPS_MAX_FRAMES=12 \
swift deploy \
--model /path/to/stage2_checkpoint \
--infer_backend vllm \
--vllm_gpu_memory_utilization 0.9 \
--vllm_max_model_len 8192 \
--max_new_tokens 2048 \
--vllm_limit_mm_per_prompt '{"image": 5, "video": 2}' \
--served_model_name Qwen3-VL
```
### 评测
通过 [EvalScope](https://github.com/modelscope/evalscope/) 对训练得到的 VL 模型进行评测
以下是以 MMMU benchmark 为例的评测代码:
```python
from evalscope import TaskConfig, run_task
task_cfg_dict = TaskConfig(
work_dir='outputs',
eval_backend='VLMEvalKit',
eval_config={
'data': ['MMMU_DEV_VAL'],
'mode': 'all',
'model': [
{'api_base': 'http://localhost:8000/v1/chat/completions',
'key': 'EMPTY',
'name': 'CustomAPIModel',
'temperature': 0.6,
'type': 'Qwen3-VL',
'img_size': -1,
'video_llm': False,
'max_tokens': 512,}
],
'reuse': False,
'nproc': 64,
'judge': 'exact_matching'},
)
run_task(task_cfg=task_cfg_dict)
```
================================================
FILE: docs/source/BestPractices/Reranker.md
================================================
# Reranker训练
SWIFT已经支持Reranker模型的训练,目前已经支持的模型有:
1. modernbert reranker模型
- [ModelScope](https://www.modelscope.cn/models/iic/gte-reranker-modernbert-base) [Hugging Face](https://huggingface.co/Alibaba-NLP/gte-reranker-modernbert-base)
2. qwen3-reranker模型
- 0.6B: [ModelScope](https://www.modelscope.cn/models/Qwen/Qwen3-Reranker-0.6B) [Hugging Face](https://huggingface.co/Qwen/Qwen3-Reranker-0.6B)
- 4B: [ModelScope](https://www.modelscope.cn/models/Qwen/Qwen3-Reranker-4B) [Hugging Face](https://huggingface.co/Qwen/Qwen3-Reranker-4B)
- 8B: [ModelScope](https://www.modelscope.cn/models/Qwen/Qwen3-Reranker-8B) [Hugging Face](https://huggingface.co/Qwen/Qwen3-Reranker-8B)
3. qwen3-vl-reranker模型
- 2B: [ModelScope](https://www.modelscope.cn/models/Qwen/Qwen3-VL-Reranker-2B) [Hugging Face](https://huggingface.co/Qwen/Qwen3-VL-Reranker-2B)
- 8B: [ModelScope](https://www.modelscope.cn/models/Qwen/Qwen3-VL-Reranker-8B) [Hugging Face](https://huggingface.co/Qwen/Qwen3-VL-Reranker-8B)
## 实现方式
目前SWIFT支持两种Reranker模型的实现方式,二者在架构和损失函数计算上有显著差异:
### 1. 分类式Reranker
**适用模型:** modernbert reranker模型(如gte-reranker-modernbert-base)
**核心原理:**
- 基于序列分类架构,在预训练模型基础上添加分类头
- 输入:query-document对,输出:单个相关性分数
### 2. 生成式Reranker
**适用模型:** qwen3-reranker模型(0.6B/4B/8B)
**核心原理:**
- 基于生成式语言模型架构(CausalLM)
- 输入:query-document对,输出:特定token的概率(如"yes"/"no")
- 通过对比最后位置特定token的logits进行分类
## 损失函数类型
SWIFT支持多种损失函数来训练Reranker模型:
### Pointwise损失函数
Pointwise方法将排序问题转化为二分类问题,独立处理每个query-document对:
- **核心思想:** 对每个query-document对进行二分类,判断文档是否与查询相关
- **损失函数:** 二分类交叉熵
- **适用场景:** 简单高效,适合大规模数据训练
环境变量配置:
- `GENERATIVE_RERANKER_POSITIVE_TOKEN`:正例token(默认:"yes")
- `GENERATIVE_RERANKER_NEGATIVE_TOKEN`:负例token(默认:"no")
### Listwise损失函数
Listwise方法将排序问题转化为多分类问题,从多个候选文档中选择正例:
- **核心思想:** 对每个query的候选文档组(1个正例 + n个负例)进行多分类,识别正例文档
- **损失函数:** 多分类交叉熵
- **适用场景:** 学习文档间的相对排序关系,更符合信息检索的实际需求
环境变量配置:
- `LISTWISE_RERANKER_TEMPERATURE`:softmax温度参数(默认:1.0)
- `LISTWISE_RERANKER_MIN_GROUP_SIZE`:最小组大小,如果组内文档数量小于该值,则不计算损失(默认:2)
**Listwise vs Pointwise:**
- **Pointwise:** 独立判断相关性,训练简单,但忽略了文档间的相对关系
- **Listwise:** 学习相对排序,性能更优,更适合排序任务的本质需求
loss的源代码可以在[这里](https://github.com/modelscope/ms-swift/blob/main/swift/loss/mapping.py)找到。
## 数据集格式
```json lines
# LLM
{"messages": [{"role": "user", "content": "query"}], "positive_messages": [[{"role": "assistant", "content": "relevant_doc1"}],[{"role": "assistant", "content": "relevant_doc2"}]], "negative_messages": [[{"role": "assistant", "content": "irrelevant_doc1"}],[{"role": "assistant", "content": "irrelevant_doc2"}], ...]}
# MLLM
{"messages": [{"role": "user", "content": "query"}], "images": ["/some/images.jpg"], "positive_messages": [[{"role": "assistant", "content": "relevant_doc1"}]], "positive_images": [["/some/positive_images.jpg"]], "negative_messages": [[{"role": "assistant", "content": "irrelevant_doc1"}], [{"role": "assistant", "content": "irrelevant_doc2"}]], "negative_images": [["/some/negative_images1.jpg", "/some/negative_images2.jpg"], ["/some/negative_images3.jpg"]]}
```
**字段说明:**
- `messages`:查询文本
- `positive_messages`:与查询相关的正例文档列表,支持多个正例
- `negative_messages`:与查询不相关的负例文档列表,支持多个负例
**环境变量配置:**
- `MAX_POSITIVE_SAMPLES`:每个query的最大正例数量(默认:1)
- `MAX_NEGATIVE_SAMPLES`:每个query的最大负例数量(默认:7)
> 默认会从每条数据中取出`MAX_POSITIVE_SAMPLES`条正样本和`MAX_NEGATIVE_SAMPLES`条负样本,每条正样本会和`MAX_NEGATIVE_SAMPLES`条负样本组成一个group,因此每条数据会扩展成`MAX_POSITIVE_SAMPLES`x`(1 + MAX_NEGATIVE_SAMPLES)`条数据。
> 如果数据中正例/负例数量不足,会取全部正例/负例,如果数据中正例和负例数量超过`MAX_POSITIVE_SAMPLES`和`MAX_NEGATIVE_SAMPLES`,会进行随机采样。
> **IMPORTANT**:展开后的数据会放在同一个batch中,因此每个设备上的实际批处理大小(effective batch size)将是 `per_device_train_batch_size` × `MAX_POSITIVE_SAMPLES` × (1 + `MAX_NEGATIVE_SAMPLES`)。请注意调整 `per_device_train_batch_size` 以避免显存不足。
## 脚手架
SWIFT提供的脚手架训练脚本:
- [Qwen3-Reranker/Qwen3-VL-Reranker](https://github.com/modelscope/ms-swift/blob/main/examples/train/reranker/qwen3)
- [Pointwise分类式Reranker](https://github.com/modelscope/ms-swift/blob/main/examples/train/reranker/train_reranker.sh)
- [Pointwise生成式Reranker](https://github.com/modelscope/ms-swift/blob/main/examples/train/reranker/train_generative_reranker.sh)
- [Listwise分类式Reranker](https://github.com/modelscope/ms-swift/blob/main/examples/train/reranker/train_reranker_listwise.sh)
- [Listwise生成式Reranker](https://github.com/modelscope/ms-swift/blob/main/examples/train/reranker/train_generative_reranker_listwise.sh)
推理脚本参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_reranker.py)。
## 高级功能
- Qwen3-Reranker 自定义 Instruction:
- 默认模板如下:
```text
<|im_start|>system
Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>
<|im_start|>user
<Instruct>: {Instruction}
<Query>: {Query}
<Document>: {Document}<|im_end|>
<|im_start|>assistant
```
- 默认 Instruction:
- `Given a web search query, retrieve relevant passages that answer the query`
- Instruction 优先级(就近覆盖):
- `positive_messages`/`negative_messages` 内提供的 `system` > 主 `messages` 的 `system` > 默认 Instruction。
- 即:若某个 positive/negative 的消息序列内包含 `system`,则优先使用该条;否则若主 `messages` 含 `system` 则使用之;两者都未提供时,使用默认 Instruction。
================================================
FILE: docs/source/Customization/Architecture.md
================================================
# 架构介绍
ms-swift 4.0 采用模块化设计,各功能模块分布在一级目录下,便于开发者进行自定义扩展。本文档将详细介绍各模块的功能及自定义方法。
## Agent Template
agent模板的mapping文件可以参考[这里](https://github.com/modelscope/ms-swift/blob/main/swift/agent_template/mapping.py)。agent template设计目标是,基于统一的Agent数据集格式,可以灵活切换不同模型进行训练,无需修改数据。训练时使用`--agent_template`指定对应的agent模板。
所有的AgentTemplate需要继承自`BaseAgentTemplate`,并实现其中的几个方法: `_format_tools`, `_format_tool_calls`, `_format_tool_responses`, `get_toolcall`。
- _format_tools: 将`tools`和`system`格式化,组成完整的system。
- _format_tool_calls: 将tool_call部分 `[{"role": "tool_call", "content": "..."}, {"role": "tool_call", "content": "..."}]`进行格式化,最后返回字符串。
- _format_tool_responses: 对tool(也称为tool_response)部分 `[{"role": "tool", "content": "..."}, {"role": "tool", "content": "..."}]`进行格式化。
- get_toolcall: 在部署的时候使用,用于解析模型输出内容中的工具名和参数,返回`List[Function]`。
如何debug:
```python
data = {"tools": "[{\"type\": \"function\", \"function\": {\"name\": \"realtime_aqi\", \"description\": \"天气预报。获取实时空气质量。当前空气质量,PM2.5,PM10信息\", \"parameters\": {\"type\": \"object\", \"properties\": {\"city\": {\"type\": \"string\", \"description\": \"城市名,例如:上海\"}}, \"required\": [\"city\"]}}}]", "messages": [{"role": "user", "content": "北京和上海今天的天气情况"}, {"role": "tool_call", "content": "{\"name\": \"realtime_aqi\", \"arguments\": {\"city\": \"北京\"}}"}, {"role": "tool_call", "content": "{\"name\": \"realtime_aqi\", \"arguments\": {\"city\": \"上海\"}}"}, {"role": "tool_response", "content": "{\"city\": \"北京\", \"aqi\": \"10\", \"unit\": \"celsius\"}"}, {"role": "tool_response", "content": "{\"city\": \"上海\", \"aqi\": \"72\", \"unit\": \"fahrenheit\"}"}, {"role": "assistant", "content": "根据天气预报工具,北京今天的空气质量指数为10,属于良好水平;上海今天的空气质量指数为72,属于轻度污染水平。"}]}
from swift import get_processor, get_template
tokenizer = get_processor('Qwen/Qwen3.5-2B')
template = get_template(tokenizer) # 使用默认agent模板
# template = get_template(tokenizer, agent_template='qwen3_5')
print(f'agent_template: {template._agent_template}')
template.set_mode('train')
encoded = template.encode(data)
print(f'[INPUT_IDS] {template.safe_decode(encoded["input_ids"])}\n')
print(f'[LABELS] {template.safe_decode(encoded["labels"])}')
```
如果你想要给我们提供PR,请参考[这里](https://github.com/modelscope/ms-swift/blob/main/tests/test_align/test_template/test_agent.py)书写你的测试案例。
## Callbacks
callbacks的mapping文件可以参考[这里](https://github.com/modelscope/ms-swift/blob/main/swift/callbacks/mapping.py)。callbacks可以对trainer中的关键节点的行为进行自定义。自定义后,你需要在mapping中进行注册,训练时使用`--callbacks`指定对应的回调类。例如,你可以自定义:
```python
class CustomCallback(TrainerCallback):
def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
# Doing something when the training begins.
pass
def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
# Doing something when save checkpoint
pass
```
所有的回调类需继承自base.py中的`TrainerCallback`,并覆盖其方法。接口与transformers的`TrainerCallback`一致,请参考transformers的[callback文档](https://huggingface.co/docs/transformers/main_classes/callback)。
## Loss
Loss的mapping文件可以参考[这里](https://github.com/modelscope/ms-swift/blob/main/swift/loss/mapping.py)。
swift支持自定义loss(当前只支持sft/pretrain/reranker/embedding任务),注册后在训练时设置`--loss_type <loss_type>`使用你定制的loss方法。
自定义Loss需继承自`BaseLoss`,并实现`__call__`方法,返回标量Tensor。你可以参考[CustomCrossEntropyLoss](https://github.com/modelscope/ms-swift/blob/0d7c9f5bc0e7e7d67d914ce6edeb9ce24f60746f/swift/loss/causal_lm.py#L5)进行定制。例如:
```python
class CustomLoss(BaseLoss):
def __call__(self, outputs, labels, **kwargs) -> torch.Tensor:
pass
```
## Loss Scale
loss scale的mapping文件可以参考[这里](https://github.com/modelscope/ms-swift/blob/main/swift/loss_scale/mapping.py)。在pretrain和sft任务中,可训练token的loss是平均的,即每个token被平等地对待。但在某些情况下,某些token需要被额外关注,并设置更高的权重或者对某些token不进行训练。loss_scale可以让开发者自由地定义自己的token权重。(预训练和SFT支持使用loss_scale控制token是否参与训练以及其权重大小,RLHF中只支持控制token是否参与训练)
你可以通过继承LossScale基类,并实现`get_loss_scale`方法来自定义loss scale。
```python
class CustomLossScale(LossScale):
def get_loss_scale(self, context: str, **kwargs) -> Tuple[List[str], List[float]]:
...
```
`get_loss_scale`函数需要返回一个Tuple,第一个返回值是拆解后的字符串的列表,第二个返回值是字符串对应的loss_scale的列表,float值代表了权重。例如下面的权重设置:
```text
["学习", "好", "数学", "是", "重要", "的"]
[1.0, 0.5, 2.0, 0.5, 2.0, 0.1]
```
例子中,我们更看重数学和重要两个词,因为其loss_scale为2.0。
当然我们也需要关注`__call__`方法的核心逻辑,即loss_scale基本策略(base_strategy)all/default/last_round 对loss_scale的影响,具体参考[命令行参数文档](../Instruction/Command-line-parameters.md)的介绍。以及数据集中的'loss'字段对loss_scale的影响,参考[自定义数据集文档](../Customization/Custom-dataset.md)。
```python
if loss or loss is None and (self.base_strategy == 'all' or
(self.base_strategy == 'default' and is_assistant) or
(self.base_strategy == 'last_round' and is_assistant and is_last_round)):
new_context, loss_scale = self.get_loss_scale(context, query=query)
else:
new_context, loss_scale = [context], [0.]
```
此外你也可以使用[json配置文件](https://github.com/modelscope/ms-swift/tree/main/swift/loss_scale/config),继承内置的ConfigLossScale类,来自定义loss_scale。目前支持两种配置方式:字符串精确匹配和正则表达式匹配。你可以参考[Agent支持文档](../Instruction/Agent-support.md#loss_scale的使用)的内容进行理解。
- 字符串精确匹配,例如参考`react.json`, `qwen.json`。json中需要书写`Dict[str, List[float]]`的映射。字符串代表关键词,列表中需要有两个值。我们会根据关键词,将字符串切分成多段字符串。列表的第一个值代表关键词的权重,列表的第二个值代表该关键词后,下一关键词前的内容的权重。
- 正则表达式匹配,例如参考`ignore_empty_think.json`, `hermes.json`。json中需要书写`Dict[str, float]`的映射。字符串代表正则表达式pattern,浮点数代表匹配字符串的权重。
如何debug:
```python
from swift import get_processor, get_template
data = {"messages": [
{"role": "user", "content": "今天的日期是多少?"},
{"role": "assistant", "content": (
"<think>\n我可以通过调用`get_date`函数来获取当前时间。\n</think>\n"
'<tool_call>\n{"name": "get_date", "arguments": {}}\n</tool_call>'
)}
]}
template = get_template(get_processor('Qwen/Qwen3-8B'), loss_scale='hermes')
template.set_mode('train')
inputs = template.encode(data)
print(template.safe_decode(inputs['labels']))
print(inputs['loss_scale'])
```
## Metrics
metrics的mapping文件可以参考[这里](https://github.com/modelscope/ms-swift/blob/main/swift/metrics/mapping.py)。该组件在ms-swift/Megatron-SWIFT中都有被使用。
- 如果是在ms-swift中被使用,你需要继承 base.py 中`EvalMetrics`基类,并实现`compute_metrics`函数,返回字典`Dict[str, float]`。你可以参考[NlgMetrics](https://github.com/modelscope/ms-swift/blob/0d7c9f5bc0e7e7d67d914ce6edeb9ce24f60746f/swift/metrics/nlg.py#L33)进行定制。
- 如果是在Megatron-SWIFT中被使用,你需要继承 utils.py 中`Metric`基类,并实现`update`和`compute`方法,compute方法需返回字典`Dict[str, float]`。
你可以自定义metrics(当前只支持sft/pretrain/reranker/embedding任务),在训练时设置`--eval_metric <metric_name>`使用你定制的metrics。
## Optimizers
optimizer的mapping文件可以参考[这里](https://github.com/modelscope/ms-swift/blob/main/swift/optimizers/mapping.py)。如果你需要自定义优化器,你需要继承`OptimizerCallback`基类,并覆盖`create_optimizer`函数。训练时使用`--optimizer <optimizer_name>`指定自定义的优化器。
- 你可以参考[MultimodalOptimizerCallback](https://github.com/modelscope/ms-swift/blob/0d7c9f5bc0e7e7d67d914ce6edeb9ce24f60746f/swift/optimizers/multimodal.py#L43)进行实现,该类实现了vit_lr, aligner_lr的功能,即对vit, aligner和LLM分别使用不同的学习率。
## Tuner Plugin
Tuner插件的mapping文件可以参考[这里](https://github.com/modelscope/ms-swift/blob/main/swift/tuner_plugin/mapping.py)。如果你需要自定义tuner,你需要继承`Tuner`基类,并覆盖`prepare_model`, `save_pretrained`, `from_pretrained`函数。
- prepare_model: 该函数在训练前被调用,将原始模型进行处理与准备,使用tuner封装,并设置可训练参数。例如:你可以对某些层附加LoRA,对某些层进行冻结等。
- save_pretrained: 该函数在训练中被调用,对模型进行保存。
- from_pretrained: 该函数在推理/断点续训时被调用,准备模型并读取权重。
你可以参考[LoRALLMTuner](https://github.com/modelscope/ms-swift/blob/0d7c9f5bc0e7e7d67d914ce6edeb9ce24f60746f/swift/tuner_plugin/lora_llm.py#L24)进行实现,该类实现了对LLM进行LoRA训练,对ViT进行全参数训练的功能。
## ORM
example参考[这里](https://github.com/modelscope/ms-swift/blob/main/swift/rewards/orm.py)。
ORM是结果奖励模型,用于判断response是否正确,一般使用正则表达式来实现。例如:
```python
class MathORM(ORM):
@staticmethod
def extract_boxed_result(text):
pattern = r'\\boxed{([^}]*)}'
match = re.search(pattern, text)
if match:
return match.group(1).strip()
else:
return None
def __call__(self, infer_requests: List[InferRequest], ground_truths: List[str],
**kwargs) -> List[float]:
rewards = []
predictions = [request.messages[-1]['content'] for request in infer_requests]
for prediction, ground_truth in zip(predictions, ground_truths):
res1 = MathORM.extract_boxed_result(prediction) or ''
res2 = MathORM.extract_boxed_result(ground_truth) or ''
rewards.append(float(res1.strip() == res2.strip()))
return rewards
orms = {
'math': MathORM,
}
```
在上面的代码中,我们定义了一个对数学response进行解析的过程,如果结果相同则返回score为1.0,否则为0.0。和PRM不同,这个类的`__call__`方法中有一个额外参数`ground_truths`,
该参数是对应的infer_requests的实际label(数据集中定义的标准response)。
## PRM
example参考[这里](https://github.com/modelscope/ms-swift/blob/main/swift/rewards/prm.py)。
PRM是过程奖励模型,PRM会在`swift sample`命令中使用。PRM需要支持的接口比较简单:
```python
class PRM:
def __init__(self):
# init here
pass
def __call__(self, infer_requests: List[InferRequest], **kwargs) -> List[Union[float, List[float]]]:
raise NotImplementedError
```
其中的InferRequest来自于`swift.infer_engine`。返回值类型为`List[Union[float, List[float]]]`,列表中的每个元素可能是单个reward,也可能是由若干reward组成的列表。开发者可以在infer_requests中拿到queries和responses,并按照自己的方式进行切分,例如:
```text
Let's think step by step.
Step1: xxx
Step2: xxx
So, the answer is ...
```
开发者可以在这里对过程进行切分,并按batch传入PRM中进行推理并返回rewards。更通用来说,开发者可以在这里调用一个远端URL,例如一个闭源PRM大模型并返回rewards。
## 其他目录结构介绍
- arguments: 命令行参数定义,例如:`SftArguments`, `RLHFArguments`等。
- cli: swift命令行机制以及启动文件。例如`swift sft ...`等价于`python swift/cli/main.py sft ...`也等价于`python swift/cli/sft.py ...`。
- config: deepspeed/fsdp2配置文件。
- dataloader: dataloader的实现,包括shard/dispatcher两种方式。
- dataset: 数据集相关模块实现,包括数据预处理、packing、流式数据等。内置数据集的注册在`dataset/dataset`和`dataset/data`文件夹内。具体参考[自定义数据集文档](Custom-dataset.md)。
- infer_engine: 推理引擎实现。包括transformers/vllm/sglang/lmdeploy为后端的推理引擎实现。
- megatron: Megatron-SWIFT 实现。
- model: 模型加载与注册。具体参考[自定义模型文档](Custom-model.md),[多模态模型注册最佳实践](../BestPractices/MLLM-Registration.md)。
- pipelines: `swift sft/rlhf/infer`等主函数pipeline实现,包括`sft_main/rlhf_main/infer_main`等。
- rlhf_trainers: GRPO/GKD/DPO/KTO/RM等算法的Trainer实现。
- rollout: RL算法中rollout过程的采样实现。
- rewards: RL算法中的奖励函数实现,支持自定义奖励计算逻辑。
- template: 对话模板的实现与注册,包含各个任务将messages转换成input_ids的逻辑,以及data_collator相关逻辑。具体参考[自定义模型文档](Custom-model.md),[多模态模型注册最佳实践](../BestPractices/MLLM-Registration.md)。
- trainers: 预训练/SFT/Embedding/Reranker/序列分类任务的Trainer实现。
- ui: `swift web-ui`界面训练与推理实现。
================================================
FILE: docs/source/Customization/Custom-dataset.md
================================================
# 自定义数据集
自定义数据集的接入方法有三种,对预处理函数的控制能力逐渐加强,但接入难度逐步增加。例如,方案一最为方便,但对预处理函数的控制能力最弱,需要预先对数据集进行转换,传入特定格式的数据集:
1. 【推荐】直接使用命令行传参的方式接入,即`--dataset <dataset_path1> <dataset_path2>`。这将使用AutoPreprocessor将数据集转换为标准格式(支持4种数据集格式,具体查看下面对AutoPreprocessor的介绍)。你可以使用`--columns`进行列名转换。支持传入csv、json、jsonl、txt、文件夹(例如git clone开源数据集)。该方案不需要修改dataset_info.json,适合刚接触ms-swift的用户,下面两种方案适合对ms-swift进行拓展的开发者。
2. 添加数据集到`dataset_info.json`中,可以参考ms-swift内置的[dataset_info.json](https://github.com/modelscope/ms-swift/blob/main/swift/dataset/data/dataset_info.json)。该方案也将使用AutoPreprocessor将数据集转换为标准格式。dataset_info.json为数据集元信息的list,每一项元信息必填ms_dataset_id/hf_dataset_id/dataset_path中的一项,通过`columns`字段进行列名转换。添加到`dataset_info.json`或者注册的数据集在运行[run_dataset_info.py](https://github.com/modelscope/ms-swift/blob/main/scripts/utils/run_dataset_info.py)时将自动产生[支持的数据集文档](https://swift.readthedocs.io/zh-cn/latest/Instruction/Supported-models-and-datasets.html)。此外,你可以采用外接`dataset_info.json`的方式,使用`--custom_dataset_info xxx.json`解析json文件(方便pip install而非git clone的用户),然后指定`--dataset <dataset_name>`。
3. 手动注册数据集,具有最灵活的预处理函数定制能力,支持使用函数对数据集进行预处理,但难度较高。可以参考[内置数据集](https://github.com/modelscope/ms-swift/blob/main/swift/dataset/dataset/llm.py)或者[examples](https://github.com/modelscope/ms-swift/blob/main/examples/custom)中的样例。你可以通过指定`--external_plugins xxx.py`解析外置注册内容(方便pip install而非git clone的用户)。
- 方案一和二在实现中借助了方案三,只是注册的过程为自动发生。
以下将对`AutoPreprocessor`可以处理的数据集格式进行介绍:
ms-swift的标准数据集格式可接受的keys包括: 'messages'、'rejected_response'、'label'、'images'、'videos'、'audios'、'tools'和'objects'。其中'messages'是必需的key,'rejected_response'用于DPO等RLHF训练,'label'用于KTO训练和分类模型训练,'images'、'videos'、'audios'用于存储多模态数据的路径或者url,'tools'用于Agent任务,'objects'用于grounding任务。
ms-swift中存在三种核心预处理器:`MessagesPreprocessor`、`AlpacaPreprocessor`、`ResponsePreprocessor`。MessagesPreprocessor用于将类messages和sharegpt格式的数据集转换为标准格式,AlpacaPreprocessor则转换alpaca格式的数据集,ResponsePreprocessor则转换类query/response格式的数据集。`AutoPreprocessor`则自动选择合适的预处理进行处理。
以下四种格式在`AutoPreprocessor`处理下都会转换成ms-swift标准格式中的messages字段,即都可以直接使用`--dataset <dataset_path>`接入:
messages格式(标准格式):
```jsonl
{"messages": [{"role": "system", "content": ""}, {"role": "user", "content": ""}, {"role": "assistant", "content": ""}, {"role": "user", "content": ""}, {"role": "assistant", "content": ""}]}
```
- 注意:system部分是可选的。数据集中的system优先级高于命令行传入的`--system`,最后是定义在template中的`default_system`。
sharegpt格式:
```jsonl
{"system": "", "conversation": [{"human": "", "assistant": ""}, {"human": "", "assistant": ""}]}
```
query-response格式:
```jsonl
{"system": "", "query": "", "response": "", "history": [["", ""]]}
```
注意:以下字段会自动转成对应的system、query、response字段。(solution字段会保留)
- system: 'system', 'system_prompt'.
- query: 'query', 'prompt', 'input', 'instruction', 'question', 'problem'.
- response: 'response', 'answer', 'output', 'targets', 'target', 'answer_key', 'answers', 'solution', 'text', 'completion', 'content'.
alpaca格式:
```jsonl
{"system": "", "instruction": "", "input": "", "output": ""}
```
- 注意:instruction和input字段将组合成query字段。若instruction和input不等于空字符串,`query = f'{instruction}\n{input}'`
## 标准数据集格式
以下给出ms-swift的标准数据集格式,其中system字段是可选的,默认使用template中定义的`default_system`。之前介绍的4种数据集格式也可以被AutoPreprocessor处理成标准数据集格式。
### 预训练
```jsonl
{"messages": [{"role": "assistant", "content": "I love music"}]}
{"messages": [{"role": "assistant", "content": "教练我要打篮球"}]}
{"messages": [{"role": "assistant", "content": "西红柿鸡蛋盖饭和地三鲜盖饭哪个更权威"}]}
```
### 监督微调
```jsonl
{"messages": [{"role": "system", "content": "你是个有用无害的助手"}, {"role": "user", "content": "告诉我明天的天气"}, {"role": "assistant", "content": "明天天气晴朗"}]}
{"messages": [{"role": "system", "content": "你是个有用无害的数学计算器"}, {"role": "user", "content": "1+1等于几"}, {"role": "assistant", "content": "等于2"}, {"role": "user", "content": "再加1呢"}, {"role": "assistant", "content": "等于3"}]}
```
- 可以通过增加"loss"字段,控制对应的模型回复部分是否计算损失。默认该字段为None。若"loss"设置为true,则对应content进行损失计算(对应loss_scale为1);若"loss"设置为false,则对应content不进行损失计算。需要注意的是,该功能只对"role"为"assistant"的部分生效;该功能优先级高于命令行参数 `--loss_scale`。示例数据格式如下:
```jsonl
{"messages": [{"role": "user", "content": "你好"}, {"role": "assistant", "content": "你好,有什么可以帮助你的吗?", "loss": false}, {"role": "user", "content": "1+1等于几?"}, {"role": "assistant", "content": "等于2", "loss": true}]}
```
#### channel loss
如果你要使用channel loss,你需要设置`--enable_channel_loss true`,并在数据集中增加"channel"字段。channel loss兼容packing/padding_free/loss_scale等技术。
```jsonl
{"messages": [{"role": "system", "content": "你是个有用无害的助手"}, {"role": "user", "content": "告诉我明天的天气"}, {"role": "assistant", "content": "明天天气晴朗"}], "channel": "general"}
{"messages": [{"role": "system", "content": "你是个有用无害的数学计算器"}, {"role": "user", "content": "1+1等于几"}, {"role": "assistant", "content": "等于2"}, {"role": "user", "content": "再加1呢"}, {"role": "assistant", "content": "等于3"}], "channel": "math"}
```
### RLHF
#### DPO/ORPO/CPO/SimPO/RM
```jsonl
{"messages": [{"role": "system", "content": "你是个有用无害的助手"}, {"role": "user", "content": "告诉我明天的天气"}, {"role": "assistant", "content": "明天天气晴朗"}], "rejected_response": "我不知道"}
{"messages": [{"role": "system", "content": "你是个有用无害的数学计算器"}, {"role": "user", "content": "1+1等于几"}, {"role": "assistant", "content": "等于2"}, {"role": "user", "content": "再加1呢"}, {"role": "assistant", "content": "等于3"}], "rejected_response": "我不知道"}
```
多模态数据的格式参考[多模态数据集](#多模态), 额外加入如`images`的列表示其他模态输入。当需要为偏好对数据关联不同的图片信息时,可通过`rejected_images`字段标注拒绝回答对应的图片信息。
对齐数据集中要求`rejected_images`和`rejected_response`至少提供一个。
> 注: RM 额外支持 margin 列,参考[RM文档](../Instruction/RLHF.md#rm)
当然,你也可以直接使用`rejected_messages`,而不是只提供`rejected_response`/`rejected_images`,这将提供更大的灵活度(例如多模态/agent场景)。若使用rejected_messages,在多模态场景下,你需要额外传入"rejected_images","rejected_audios","rejected_videos"等内容;在Agent场景下,你需要额外传入"rejected_tools"等内容。多模态数据格式例子如下:
- 若使用`rejected_response`,'rejected_images/rejected_audios/rejected_videos/rejected_tools'的默认值为'images/audios/videos/tools';若使用`rejected_messages`,则需要额外传入。
```jsonl
{"messages": [{"role": "user", "content": "这是什么"}, {"role": "assistant", "content": "这是一只小猫咪。"}], "images": ["cat.png"], "rejected_messages": [{"role": "user", "content": "这是什么"}, {"role": "assistant", "content": "这是一只小狗。"}], "rejected_images": ["cat.png"]}
{"messages": [{"role": "user", "content": "这是什么"}, {"role": "assistant", "content": "这是一只小猫咪。"}], "images": ["cat.png"], "rejected_messages": [{"role": "user", "content": "这是什么"}, {"role": "assistant", "content": "这是一只小猫咪。"}], "rejected_images": ["dog.png"]}
```
以上格式等价于:
```jsonl
{"messages": [{"role": "user", "content": "这是什么"}, {"role": "assistant", "content": "这是一只小猫咪。"}], "images": ["cat.png"], "rejected_response": "这是一只小狗。"}
{"messages": [{"role": "user", "content": "这是什么"}, {"role": "assistant", "content": "这是一只小猫咪。"}], "images": ["cat.png"], "rejected_images": ["dog.png"]}
# 例子一也可写成:
{"messages": [{"role": "user", "content": "这是什么"}, {"role": "assistant", "content": "这是一只小猫咪。"}], "images": ["cat.png"], "rejected_response": [{"role": "assistant", "content": "这是一只小狗。"}]}
```
你也可以将Agent数据集组织成以下形式:
```jsonl
# 会寻找`messages`最后一个user的位置,并替换之后的内容为`rejected_response`组成`rejected_messages`
{"tools": "[{\"type\": \"function\", \"function\": {\"name\": \"realtime_aqi\", \"description\": \"天气预报。获取实时空气质量。当前空气质量,PM2.5,PM10信息\", \"parameters\": {\"type\": \"object\", \"properties\": {\"city\": {\"type\": \"string\", \"description\": \"城市名,例如:上海\"}}, \"required\": [\"city\"]}}}]", "messages": [{"role": "user", "content": "北京和上海今天的天气情况"}, {"role": "tool_call", "content": "{\"name\": \"realtime_aqi\", \"arguments\": {\"city\": \"北京\"}}"}, {"role": "tool_call", "content": "{\"name\": \"realtime_aqi\", \"arguments\": {\"city\": \"上海\"}}"}, {"role": "tool_response", "content": "{\"city\": \"北京\", \"aqi\": \"10\", \"unit\": \"celsius\"}"}, {"role": "tool_response", "content": "{\"city\": \"上海\", \"aqi\": \"72\", \"unit\": \"fahrenheit\"}"}, {"role": "assistant", "content": "根据天气预报工具,北京今天的空气质量指数为10,属于良好水平;上海今天的空气质量指数为72,属于轻度污染水平。"}], "rejected_response": [{"role": "assistant", "content": "我不知道。"}]}
```
#### KTO
```jsonl
{"messages": [{"role": "system", "content": "你是个有用无害的助手"}, {"role": "user", "content": "告诉我明天的天气"}, {"role": "assistant", "content": "我不知道"}], "label": false}
{"messages": [{"role": "system", "content": "你是个有用无害的数学计算器"}, {"role": "user", "content": "1+1等于几"}, {"role": "assistant", "content": "等于2"}, {"role": "user", "content": "再加1呢"}, {"role": "assistant", "content": "等于3"}], "label": true}
```
#### PPO/GRPO
```jsonl
{"messages": [{"role": "system", "content": "你是个有用无害的助手"}, {"role": "user", "content": "告诉我明天的天气"}]}
{"messages": [{"role": "system", "content": "你是个有用无害的数学计算器"}, {"role": "user", "content": "1+1等于几"}, {"role": "assistant", "content": "等于2"}, {"role": "user", "content": "再加1呢"}]}
{"messages": [{"role": "user", "content": "你的名字是什么"}]}
```
- 注意:GRPO会透传所有额外的字段内容给ORM,而不像其他训练方法,默认将额外的字段删除。例如: 你可以额外传入'solution'。自定义的ORM需要包含一个位置参数completions,其他为关键字参数,由数据集额外字段透传。
#### GKD
若未开启`seq_kd`,即该参数为False。数据集格式如下(你可使用teacher模型提前蒸馏):
```jsonl
{"messages": [{"role": "system", "content": "你是个有用无害的助手"}, {"role": "user", "content": "告诉我明天的天气"}, {"role": "assistant", "content": "明天天气晴朗"}]}
{"messages": [{"role": "system", "content": "你是个有用无害的数学计算器"}, {"role": "user", "content": "1+1等于几"}, {"role": "assistant", "content": "等于2"}, {"role": "user", "content": "再加1呢"}, {"role": "assistant", "content": "等于3"}]}
```
若开启`seq_kd`,则不需要最后一轮的'assistant'部分(teacher模型在训练时生成数据):
```jsonl
{"messages": [{"role": "system", "content": "你是个有用无害的助手"}, {"role": "user", "content": "告诉我明天的天气"}]}
{"messages": [{"role": "system", "content": "你是个有用无害的数学计算器"}, {"role": "user", "content": "1+1等于几"}, {"role": "assistant", "content": "等于2"}, {"role": "user", "content": "再加1呢"}]}
```
### 序列分类
**单标签任务**:
```jsonl
{"messages": [{"role": "user", "content": "今天天气真好呀"}], "label": 1}
{"messages": [{"role": "user", "content": "今天真倒霉"}], "label": 0}
{"messages": [{"role": "user", "content": "好开心"}], "label": 1}
```
**多标签任务**:
```jsonl
{"messages": [{"role": "user", "content": ""}], "label": []}
{"messages": [{"role": "user", "content": ""}], "label": [0, 2]}
{"messages": [{"role": "user", "content": ""}], "label": [1, 3, 5]}
```
**单回归任务**:
```jsonl
{"messages": [{"role": "user", "content": "求两句话的相似度,范围为0-1。\nsentence1: \nsentence2: "}], "label": 0.8}
```
**多回归任务**:
```jsonl
{"messages": [{"role": "user", "content": ""}], "label": [1.2, -0.6, 0.8]}
```
### Embedding
请参考[embedding训练文档](../BestPractices/Embedding.md#数据集格式)
### Reranker
请参考[Reranker训练文档](../BestPractices/Reranker.md#数据集格式)
### 多模态
对于多模态数据集,和上述任务的格式相同。区别在于增加了`images`, `videos`, `audios`几个key,分别代表多模态资源的url或者path(推荐使用绝对路径),`<image>` `<video>` `<audio>`标签代表了插入图片/视频/音频的位置,ms-swift支持多图片/视频/音频的情况。这些特殊tokens将在预处理的时候进行替换,参考[这里](https://github.com/modelscope/ms-swift/blob/main/swift/template/templates/qwen.py#L198)。下面给出的四条示例分别展示了纯文本,以及包含图像、视频和音频数据的数据格式。
预训练:
```
{"messages": [{"role": "assistant", "content": "预训练的文本在这里"}]}
{"messages": [{"role": "assistant", "content": "<image>是一只小狗,<image>是一只小猫"}], "images": ["/xxx/x.jpg", "/xxx/x.png"]}
{"messages": [{"role": "assistant", "content": "<audio>描述了今天天气真不错"}], "audios": ["/xxx/x.wav"]}
{"messages": [{"role": "assistant", "content": "<image>是一个大象,<video>是一只狮子在跑步"}], "images": ["/xxx/x.jpg"], "videos": ["/xxx/x.mp4"]}
```
微调:
```jsonl
{"messages": [{"role": "user", "content": "浙江的省会在哪?"}, {"role": "assistant", "content": "浙江的省会在杭州。"}]}
{"messages": [{"role": "user", "content": "<image><image>两张图片有什么区别"}, {"role": "assistant", "content": "前一张是小猫,后一张是小狗"}], "images": ["/xxx/x.jpg", "/xxx/x.png"]}
{"messages": [{"role": "user", "content": "<audio>语音说了什么"}, {"role": "assistant", "content": "今天天气真好呀"}], "audios": ["/xxx/x.mp3"]}
{"messages": [{"role": "system", "content": "你是个有用无害的助手"}, {"role": "user", "content": "<image>图片中是什么,<video>视频中是什么"}, {"role": "assistant", "content": "图片中是一个大象,视频中是一只小狗在草地上奔跑"}], "images": ["/xxx/x.jpg"], "videos": ["/xxx/x.mp4"]}
```
- 注意:以下字段会自动转成对应的images, videos, audios字段。
- images: image, images.
- videos: video, videos.
- audios: audio, audios.
- 如果需要传入base64格式而不是文件路径,以下为样本例子:`"videos": ['data:video/mp4;base64,{base64_encoded}']`, `"images": ['data:image/jpg;base64,{base64_encoded}']`。
- 若你希望直接传入视频帧,而不是视频,你可以使用以下格式:`"videos": [["/xxx/x.png", "/xxx/y.png"], ["/xxx/a.png", "/xxx/b.png", "/xxx/c.png"]]`。该格式只有部分模型支持,包括Qwen2/2.5/3-VL、Qwen2.5/3-Omni以及其衍生模型。
多模态模型的RLHF和序列分类的数据格式可以参考纯文本大模型的格式,并在此基础上增加`images`等字段。
#### grounding
如果是grounding(物体检测)任务,ms-swift支持两种方式:
1. 直接使用对应模型grounding任务的数据集格式,例如qwen2-vl的格式如下:
```jsonl
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "描述图像"}, {"role": "assistant", "content": "<|object_ref_start|>一只狗<|object_ref_end|><|box_start|>(221,423),(569,886)<|box_end|>和<|object_ref_start|>一个女人<|object_ref_end|><|box_start|>(451,381),(733,793)<|box_end|>正在沙滩上玩耍"}], "images": ["/xxx/x.jpg"]}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "找到图像中的<|object_ref_start|>羊<|object_ref_end|>"}, {"role": "assistant", "content": "<|box_start|>(101,201),(150,266)<|box_end|><|box_start|>(401,601),(550,666)<|box_end|>"}], "images": ["/xxx/x.jpg"]}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "帮我打开谷歌浏览器"}, {"role": "assistant", "content": "Action: click(start_box='<|box_start|>(246,113)<|box_end|>')"}], "images": ["/xxx/x.jpg"]}
```
使用这种类型的数据需要注意:
- 不同模型grounding任务的特殊字符和数据集格式不同。
- 不同模型对bbox是否归一化的处理不同。例如:qwen2.5-vl使用绝对坐标,而qwen2/3-vl、internvl2.5需要对bbox的坐标进行千分位坐标归一化。
- 注意:Qwen2.5-VL采用绝对坐标,因此要小心每次的图像缩放,如果使用方案一的数据集格式,你需要预先对图像进行resize(H和W需要是28的倍数),并根据该尺寸缩放坐标点。如果使用方案二的数据集格式,ms-swift会帮助你处理图像的缩放问题,你依旧可以使用`MAX_PIXELS`或者`--max_pixels`等进行图像缩放(仅训练,推理场景,你依旧需要自己处理图像的缩放问题)。
2. 使用ms-swift的grounding数据格式:
```jsonl
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "描述图像"}, {"role": "assistant", "content": "<ref-object><bbox>和<ref-object><bbox>正在沙滩上玩耍"}], "images": ["/xxx/x.jpg"], "objects": {"ref": ["一只狗", "一个女人"], "bbox": [[331.5, 761.4, 853.5, 1594.8], [676.5, 685.8, 1099.5, 1427.4]]}}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "找到图像中的<ref-object>"}, {"role": "assistant", "content": "<bbox><bbox>"}], "images": ["/xxx/x.jpg"], "objects": {"ref": ["羊"], "bbox": [[90.9, 160.8, 135, 212.8], [360.9, 480.8, 495, 532.8]]}}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "帮我打开谷歌浏览器"}, {"role": "assistant", "content": "Action: click(start_box='<bbox>')"}], "images": ["/xxx/x.jpg"], "objects": {"ref": [], "bbox": [[615, 226]]}}
```
该格式将自动转换数据集格式为对应模型的grounding任务格式,且选择对应模型的bbox归一化方式。该格式比通用格式多了objects字段,该字段包含的字段有:
- ref: 用于替换messages中的`<ref-object>`。ref的长度需要与`<ref-object>`的数量一致。
- bbox: 用于替换messages中的`<bbox>`。若bbox中每个box长度为2,则代表x和y坐标,若box长度为4,则代表2个点的x和y坐标。bbox的长度需要与`<bbox>`的数量一致。
- 注意:`<ref-object>`和`<bbox>`并没有对应关系,ref和bbox各自替换各自的占位符。
- bbox_type: 可选项为'real','norm1'。默认为'real',即bbox为真实bbox值。若是'norm1',则bbox已经归一化为0~1。
- image_id: 通常用于多图grounding任务。该参数只有当bbox_type为'real'时生效,代表bbox对应的图片是第几张,用于缩放bbox。索引从0开始,默认全为第0张。image_id的数量需要和bbox的数量一致。例如:若bbox的长度为10,images的长度为2,那么image_id的长度需要是10,其值需要在`{0, 1}`集合内。
对于Qwen2.5-VL/Qwen3-VL,你可以使用环境变量`QWENVL_BBOX_FORMAT='new'`(默认为'legacy'),以兼容[官方cookbook](https://github.com/QwenLM/Qwen3-VL/blob/main/cookbooks/2d_grounding.ipynb)格式,并将数据集定义成以下格式:
```jsonl
{"messages": [{"role": "user", "content": "找到图像中的<ref-object>"}, {"role": "assistant", "content": "[\n\t{\"bbox_2d\": <bbox>, \"label\": \"<ref-object>\"},\n\t{\"bbox_2d\": <bbox>, \"label\": \"<ref-object>\"}\n]"}], "images": ["cat.png"], "objects": {"ref": ["羊", "羊", "羊"], "bbox": [[90.9, 160.8, 135, 212.8], [360.9, 480.8, 495, 532.8]]}}
```
以下代码可用于查看ms-swift的grounding数据格式经过编码后的最终格式:
```python
import os
os.environ["MAX_PIXELS"] = "1003520"
from swift import get_processor, get_template
processor = get_processor('Qwen/Qwen2.5-VL-7B-Instruct')
template = get_template(processor)
data = {...}
template.set_mode('train')
encoded = template.encode(data, return_template_inputs=True)
print(f'[INPUT_IDS] {template.safe_decode(encoded["input_ids"])}\n')
print(f'[LABELS] {template.safe_decode(encoded["labels"])}')
print(f'images: {encoded["template_inputs"].images}')
```
### Agent格式
这里分别提供了纯文本Agent和多模态Agent的示例数据样本:
```jsonl
{"tools": "[{\"type\": \"function\", \"function\": {\"name\": \"realtime_aqi\", \"description\": \"天气预报。获取实时空气质量。当前空气质量,PM2.5,PM10信息\", \"parameters\": {\"type\": \"object\", \"properties\": {\"city\": {\"type\": \"string\", \"description\": \"城市名,例如:上海\"}}, \"required\": [\"city\"]}}}]", "messages": [{"role": "user", "content": "北京和上海今天的天气情况"}, {"role": "tool_call", "content": "{\"name\": \"realtime_aqi\", \"arguments\": {\"city\": \"北京\"}}"}, {"role": "tool_call", "content": "{\"name\": \"realtime_aqi\", \"arguments\": {\"city\": \"上海\"}}"}, {"role": "tool_response", "content": "{\"city\": \"北京\", \"aqi\": \"10\", \"unit\": \"celsius\"}"}, {"role": "tool_response", "content": "{\"city\": \"上海\", \"aqi\": \"72\", \"unit\": \"fahrenheit\"}"}, {"role": "assistant", "content": "根据天气预报工具,北京今天的空气质量指数为10,属于良好水平;上海今天的空气质量指数为72,属于轻度污染水平。"}]}
{"tools": "[{\"type\": \"function\", \"function\": {\"name\": \"click\", \"description\": \"点击屏幕中的某个位置\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"integer\", \"description\": \"横坐标,表示屏幕上的水平位置\"}, \"y\": {\"type\": \"integer\", \"description\": \"纵坐标,表示屏幕上的垂直位置\"}}, \"required\": [\"x\", \"y\"]}}}]", "messages": [{"role": "user", "content": "现在几点了?"}, {"role": "assistant", "content": "\n我可以通过打开日历App来获取当前时间。\n \n"}, {"role": "tool_call", "content": "{\"name\": \"click\", \"arguments\": {\"x\": 105, \"y\": 132}}"}, {"role": "tool_response", "content": "{\"images\": \"\", \"status\": \"success\"}"}, {"role": "assistant", "content": "成功打开日历App,现在的时间为中午11点"}], "images": ["desktop.png", "calendar.png"]}
```
- agent_template为"react_en", "hermes"等情况下,该格式适配所有模型Agent训练,可以轻松在不同模型间切换。
- 其中tools是一个包含tool列表的json字符串,messages中role为'tool_call'和'tool_response/tool'的content部分都需要是json字符串。
- tools字段将在训练/推理时和`{"role": "system", ...}`部分组合,根据agent_template组成完整的system部分。
- `{"role": "tool_call", ...}`部分将根据agent_template自动转成对应格式的`{"role": "assistant", ...}`,多条连续的`{"role": "assistant", ...}`将拼接在一起组成完整的assistant_content。
- `{"role": "tool_response", ...}`也可以写成`{"role": "tool", ...}`,这两种写法是等价的。该部分也将根据`agent_template`自动转换格式。该部分在训练时将不进行损失的计算,角色类似于`{"role": "user", ...}`。
- 该格式支持并行调用工具,例子参考第一条数据样本。多模态Agent数据样本中`<image>`标签数量应与"images"长度相同,其标签位置代表图像特征的插入位置。当然也支持其他模态,例如audios, videos。
- 注意:您也可以手动将数据处理为role为system/user/assistant的messages格式。agent_template的作用是将其中的tools字段以及role为tool_call和tool_response的messages部分,自动映射为标准的role为system/user/assistant的messages格式。
- 更多请参考[Agent文档](../Instruction/Agent-support.md)。
### 文生图格式
```jsonl
{"messages": [{"role": "system", "content": "你是个有用无害的助手"}, {"role": "user", "content": "给我画出一个苹果"}, {"role": "assistant", "content": ""}], "images": ["/xxx/x.jpg"]}
```
## dataset_info.json
可以参考ms-swift内置的[dataset_info.json](https://github.com/modelscope/ms-swift/blob/main/swift/dataset/data/dataset_info.json)。该方案使用AutoPreprocessor预处理函数将数据集转换为标准格式。dataset_info.json文件中包含了数据集元信息的list,以下为一些例子:
```json
[
{
"ms_dataset_id": "xxx/xxx"
},
{
"dataset_path": ""
},
{
"ms_dataset_id": "",
"subsets": ["v1"],
"split": ["train", "validation"],
"columns": {
"input": "query",
"output": "response"
}
},
{
"ms_dataset_id": "",
"hf_dataset_id": "",
"subsets": [{
"subset": "subset1",
"columns": {
"problem": "query",
"content": "response"
}
},
{
"subset": "subset2",
"columns": {
"messages": "_",
"new_messages": "messages"
}
}]
}
]
```
支持以下参数:
- ms_dataset_id: 参考DatasetMeta参数。
- hf_dataset_id: 参考DatasetMeta参数。
- dataset_path: 参考DatasetMeta参数。
- dataset_name: 参考DatasetMeta参数。
- subsets: 参考DatasetMeta参数。
- split: 参考DatasetMeta参数。
- columns: 在数据集进行预处理前,对数据集进行列名转换。
## 数据集注册
register_dataset会在`DATASET_MAPPING`中注册数据集,调用函数`register_dataset(dataset_meta)`即可完成数据集注册,其中dataset_meta将存储数据集的元信息。DatasetMeta的参数列表如下:
- ms_dataset_id: ModelScope的dataset_id,默认为None。
- hf_dataset_id: HuggingFace的dataset_id,默认为None。
- dataset_path: 数据集**文件/文件夹**的本地路径(推荐使用绝对路径)。默认为None。
- dataset_name: 数据集别名,可以通过`--dataset <dataset_name>`指定数据集,这在dataset_path很长时很方便。默认为None。
- subsets: 子数据集的名字列表或者`SubsetDataset`对象的列表,默认为`['default']`。(只有dataset_id或者dataset_dir(git clone开源数据集)有子数据集和split的概念)。
- split: 默认为`['train']`。
- preprocess_func: 预处理函数或可调用对象,默认为`AutoPreprocessor()`。该预处理函数接口为传入`HfDataset`,并返回满足标准格式的`HfDataset`。
- load_function: 默认为`DatasetLoader.load`。若需要自定义载入函数,则该载入函数需返回满足标准格式的`HfDataset`,这将抛弃ms-swift的数据集载入机制,提供给用户最大的自由度。通常该参数不需要进行修改。
以下介绍注册数据集的例子:
```python
from swift.dataset import (
ResponsePreprocessor, DatasetMeta, register_dataset, SubsetDataset, load_dataset
)
from typing import Dict, Any
class CustomPreprocessor(ResponsePreprocessor):
def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
query = f"""任务:判断下面两句话语意是否相似。
句子1: {row['text1']}
句子2: {row['text2']}
请输出类别[0/1]: 0代表含义不同, 1代表含义相似。
"""
response = str(row['label'])
row = {
'query': query,
'response': response
}
return super().preprocess(row)
register_dataset(
DatasetMeta(
ms_dataset_id='swift/financial_classification',
subsets=[SubsetDataset('train', split=['train']), SubsetDataset('test', split=['test'])],
preprocess_func=CustomPreprocessor(),
))
if __name__ == '__main__':
# load_dataset returns train_dataset and val_dataset based on `split_dataset_ratio`
# Here, since we didn't pass `split_dataset_ratio` (defaults to 0), we take the first one (index 0)
dataset = load_dataset('swift/financial_classification:train')[0]
test_dataset = load_dataset('swift/financial_classification:test')[0]
print(f'dataset[0]: {dataset[0]}')
print(f'test_dataset[0]: {test_dataset[0]}')
```
================================================
FILE: docs/source/Customization/Custom-model.md
================================================
# 自定义模型
ms-swift内置的模型,你可以直接通过指定model_id或者model_path来使用:`--model <model_id_or_path>`。ms-swift会根据model_id/model_path的后缀和`config.json`文件来判断model_type。
每种model_type都有唯一的模型结构、template和加载方式。当然,你也可以手动传入`--model_type`、`--template`来进行覆盖。ms-swift已支持的model_type和template可以查看[支持的模型与数据集](../Instruction/Supported-models-and-datasets.md)。
以下介绍如何注册一个新模型和对应的template。最佳实践参考[注册多模态模型最佳实践](../BestPractices/MLLM-Registration.md)。
## 模型注册
自定义模型通常使用模型注册的方式进行,可以参考[内置模型](https://github.com/modelscope/ms-swift/blob/main/swift/model/models/qwen.py)、[内置对话模板](https://github.com/modelscope/ms-swift/blob/main/swift/template/templates/qwen.py)或者[examples](https://github.com/modelscope/ms-swift/blob/main/examples/custom)的示例代码。你可以通过指定`--external_plugins xxx.py`解析外置注册的内容(方便pip install而非git clone的用户)。
register_model会在`MODEL_MAPPING`中注册模型,调用函数`register_model(model_meta)`即可完成模型注册,其中model_meta将存储模型的元信息。ModelMeta的参数列表如下:
- model_type: 必填项。模型类型,也是唯一ID。
- model_groups: 必填项。罗列ModelScope/HuggingFace的模型id和模型本地路径。运行[run_model_info.py](https://github.com/modelscope/ms-swift/blob/main/scripts/utils/run_model_info.py)文件将自动产生[支持的模型文档](https://swift.readthedocs.io/zh-cn/latest/Instruction/Supported-models-and-datasets.html)以及自动根据`--model`后缀匹配model_type。
- loader: 模型和tokenizer/processor(多模态模型)的加载器。默认使用`swift.model.ModelLoader`。
- template: 命令行不额外指定`--template`时的默认template类型。默认为None。
- model_arch: 模型架构。默认为None。多模态模型训练需要设置该参数来确定llm/vit/aligner的前缀。
- architectures: config.json中的architectures项,用于自动匹配模型对应的model_type。默认为`[]`。
- additional_saved_files: 全参数训练和merge-lora时需要额外保存的文件。默认为`[]`。
- torch_dtype: 模型加载时未传入`torch_dtype`时的默认dtype。默认为None,从config.json中读取。
- is_multimodal: 是否是多模态模型,默认为False。
- ignore_patterns: 从hub端下载文件需要忽略的文件patterns,默认为`[]`。
register_template会在`TEMPLATE_MAPPING`中注册对话模板,调用函数`register_template(template_meta)`即可完成对话模板注册,其中template_meta将存储template的元信息。TemplateMeta的参数列表如下:
- template_type: 必填项。对话模板类型,也是唯一ID。
- prefix: 必填项。对话模板的前缀,通常包含system、bos_token等部分,独立于多轮对话而产生的对话模板循环。例如qwen的prefix为`[]`。
- prompt: 必填项。表示对话模板中的`{{RESPONSE}}`之前的对话部分。我们使用`{{QUERY}}`代表user询问部分的填充符。例如qwen的prompt为`['<|im_start|>user\n{{QUERY}}<|im_end|>\n<|im_start|>assistant\n']`。
- chat_sep: 必填项。多轮对话中每轮的分隔符。若设置为None,则该template不支持多轮对话。例如qwen的chat_sep为`['<|im_end|>\n']`。
- suffix: 默认为`[['eos_token_id']]`。对话模板的后缀部分,独立于多轮对话而产生的对话模板循环,通常为eos_token。例如qwen的suffix为`['<|im_end|>']`。
- template_cls: 默认为`Template`。通常在定义多模态模型的template时需要进行自定义,自定义`_encode`、`_post_encode`、`_data_collator`函数。
- system_prefix: 默认为None。含system的对话模板前缀。我们使用`{{SYSTEM}}`作为system的填充符。例如qwen的system_prefix为`['<|im_start|>system\n{{SYSTEM}}<|im_end|>\n']`。
- 注意:若system为空时,`prefix`可以被`system_prefix`替代,则可以将`prefix`写为含system的前缀,而无需设置`system_prefix`。
- 若prefix不含`{{SYSTEM}}`且未设置system_prefix,则该template不支持system。
- default_system: 默认为None。不传入`--system`时使用的默认system。例如qwen的default_system为`'You are a helpful assistant.'`。
- stop_words: 默认为`[]`。除了eos_token和`suffix[-1]`的额外停止符。例如qwen的stop_words为`['<|endoftext|>']`。
- 注意:推理时,输出的response将会过滤eos_token和`suffix[-1]`,但是会保留额外的stop_words。
================================================
FILE: docs/source/GetStarted/Quick-start.md
================================================
# 快速开始
🍲 ms-swift是魔搭社区提供的大模型与多模态大模型微调部署框架,现已支持600+纯文本大模型与400+多模态大模型的训练(预训练、微调、人类对齐)、推理、评测、量化与部署。其中大模型包括:Qwen3、Qwen3.5、InternLM3、GLM4.5、Mistral、DeepSeek-R1、Llama4等模型,多模态大模型包括:Qwen3-VL、Qwen3-Omni、Llava、InternVL3.5、MiniCPM-V-4、Ovis2.5、GLM4.5-V、DeepSeek-VL2等模型。
🍔 除此之外,ms-swift汇集了最新的训练技术,包括集成Megatron并行技术,包括TP、PP、CP、EP等为训练提供加速,以及众多GRPO算法族强化学习的算法,包括:GRPO、DAPO、GSPO、SAPO、CISPO、RLOO、Reinforce++等提升模型智能。ms-swift支持广泛的训练任务,包括DPO、KTO、RM、CPO、SimPO、ORPO等偏好学习算法,以及Embedding、Reranker、序列分类任务。ms-swift提供了大模型训练全链路的支持,包括使用vLLM、SGLang和LMDeploy对推理、评测、部署模块提供加速,以及使用GPTQ、AWQ、BNB、FP8技术对大模型进行量化。
**为什么选择ms-swift?**
- 🍎 **模型类型**:支持600+纯文本大模型、**400+多模态大模型**以及All-to-All全模态模型训练到部署全流程,热门模型Day0支持。
- **数据集类型**:内置150+预训练、微调、人类对齐、多模态等各种任务数据集,并支持自定义数据集,用户只需准备数据集即可一键训练。
- **硬件支持**:支持A10/A100/H100、RTX系列、T4/V100、CPU、MPS以及国产硬件Ascend NPU等。
- **轻量训练**:支持了LoRA、QLoRA、DoRA、LoRA+、LLaMAPro、LongLoRA、LoRA-GA、ReFT、RS-LoRA、Adapter、LISA等轻量微调方式。
- **量化训练**:支持对BNB、AWQ、GPTQ、AQLM、HQQ、EETQ量化模型进行训练,7B模型训练只需9GB训练资源。
- **显存优化**: GaLore、Q-Galore、UnSloth、Liger-Kernel、Flash-Attention 2/3 以及 **Ulysses和Ring-Attention序列并行技术**支持,降低长文本训练显存占用。
- **分布式训练**:支持分布式数据并行(DDP)、device_map简易模型并行、DeepSpeed ZeRO2 ZeRO3、FSDP/FSDP2以及Megatron等分布式训练技术。
- 🍓 **多模态训练**:支持多模态packing技术提升训练速度100%+,支持文本、图像、视频和语音混合模态数据训练,支持vit/aligner/llm单独控制。
- **Agent训练**:支持Agent template,准备一套数据集可用于不同模型的训练。
- 🍊 **训练任务**:支持预训练和指令微调,以及DPO、GKD、KTO、RM、CPO、SimPO、ORPO等训练任务,支持**Embedding/Reranker**和序列分类任务。
- 🥥 **Megatron并行技术**:提供TP/PP/SP/CP/ETP/EP/VPP并行策略,显著提升**MoE模型训练速度**。支持300+纯文本大模型和100+多模态大模型的全参数和LoRA训练方法。支持CPT/SFT/GRPO/DPO/KTO/RM训练任务。
- 🍉 **强化学习**:内置**丰富GRPO族算法**,包括GRPO、DAPO、GSPO、SAPO、CISPO、CHORD、RLOO、Reinforce++等,支持同步和异步vLLM引擎推理加速,可使用插件拓展奖励函数、多轮推理调度器以及环境等。
- **全链路能力**:覆盖训练、推理、评测、量化和部署全流程。
- **界面训练**:提供使用Web-UI界面的方式进行训练、推理、评测、量化,完成大模型的全链路。
- **推理加速**:支持Transformers、vLLM、SGLang和LmDeploy推理加速引擎,并提供OpenAI接口,为推理、部署和评测模块提供加速。
- **模型评测**:以EvalScope作为评测后端,支持100+评测数据集对纯文本和多模态模型进行评测。
- **模型量化**:支持AWQ、GPTQ、FP8和BNB的量化导出,导出的模型支持使用vLLM/SGLang/LmDeploy推理加速。
## 安装
ms-swift的安装请参考[安装文档](./SWIFT-installation.md)。
## 使用样例
10分钟在单卡3090上对Qwen3-4B-Instruct-2507进行自我认知微调:
```shell
# 13GB
CUDA_VISIBLE_DEVICES=0 \
swift sft \
--model Qwen/Qwen3-4B-Instruct-2507 \
--tuner_type lora \
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
'AI-ModelScope/alpaca-gpt4-data-en#500' \
'swift/self-cognition#500' \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--gradient_accumulation_steps 16 \
--eval_steps 50 \
--save_steps 50 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--model_author swift \
--model_name swift-robot
```
小贴士:
- 如果要使用自定义数据集进行训练,你可以参考[这里](../Customization/Custom-dataset.md)组织数据集格式,并指定`--dataset <dataset_path>`。
- `--model_author`和`--model_name`参数只有当数据集中包含`swift/self-cognition`时才生效。
- 如果要使用其他模型进行训练,你只需要修改`--model <model_id/model_path>`即可。
- 默认使用**ModelScope**进行模型和数据集的下载。如果要使用HuggingFace,指定`--use_hf true`即可。
训练完成后,使用以下命令对训练后的权重进行推理:
- 这里的`--adapters`需要替换成训练生成的last checkpoint文件夹。由于adapters文件夹中包含了训练的参数文件`args.json`,因此不需要额外指定`--model`,`--system`,swift会自动读取这些参数。如果要关闭此行为,可以设置`--load_args false`。
```shell
# 使用交互式命令行进行推理
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--adapters output/vx-xxx/checkpoint-xxx \
--stream true \
--temperature 0 \
--max_new_tokens 2048
# merge-lora并使用vLLM进行推理加速
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--adapters output/vx-xxx/checkpoint-xxx \
--stream true \
--merge_lora true \
--infer_backend vllm \
--vllm_max_model_len 8192 \
--temperature 0 \
--max_new_tokens 2048
```
最后,使用以下命令将模型推送到ModelScope:
```shell
CUDA_VISIBLE_DEVICES=0 \
swift export \
--adapters output/vx-xxx/checkpoint-xxx \
--push_to_hub true \
--hub_model_id '<model-id>' \
--hub_token '<sdk-token>' \
--use_hf false
```
## 了解更多
- 更多Shell脚本:[https://github.com/modelscope/ms-swift/tree/main/examples](https://github.com/modelscope/ms-swift/tree/main/examples)
- 使用Python:[https://github.com/modelscope/ms-swift/blob/main/examples/notebook/qwen2_5-self-cognition/self-cognition-sft.ipynb](https://github.com/modelscope/ms-swift/blob/main/examples/notebook/qwen2_5-self-cognition/self-cognition-sft.ipynb)
================================================
FILE: docs/source/GetStarted/SWIFT-installation.md
================================================
# SWIFT安装
## Wheel包安装
可以使用pip进行安装:
```shell
# 推荐
pip install 'ms-swift' -U
# 使用评测
pip install 'ms-swift[eval]' -U
# 全能力
pip install 'ms-swift[all]' -U
# 使用uv
pip install uv
uv pip install 'ms-swift' --torch-backend=auto
```
## 源代码安装
当前main分支为 swift4.x 版本。
```shell
# pip install git+https://github.com/modelscope/ms-swift.git
# 全能力
# pip install "git+https://github.com/modelscope/ms-swift.git#egg=ms-swift[all]"
git clone https://github.com/modelscope/ms-swift.git
cd ms-swift
pip install -e .
# 使用 uv
uv pip install -e . --torch-backend=auto
# 全能力
# pip install -e '.[all]'
```
安装swift3.x:
```shell
# pip install "git+https://github.com/modelscope/ms-swift.git@release/3.12"
# 全能力
# pip install "git+https://github.com/modelscope/ms-swift.git@release/3.12#egg=ms-swift[all]"
git clone -b release/3.12 https://github.com/modelscope/ms-swift.git
cd ms-swift
pip install -e .
# 全能力
# pip install -e '.[all]'
```
## 镜像
docker可以查看[这里](https://github.com/modelscope/modelscope/blob/build_swift_image/docker/build_image.py#L347)。
```
# swift4.0.2
modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.8.1-py311-torch2.10.0-vllm0.17.1-modelscope1.34.0-swift4.0.2
modelscope-registry.cn-beijing.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.8.1-py311-torch2.10.0-vllm0.17.1-modelscope1.34.0-swift4.0.2
modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.8.1-py311-torch2.10.0-vllm0.17.1-modelscope1.34.0-swift4.0.2
# swift3.12.5
modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.8.1-py311-torch2.9.0-vllm0.13.0-modelscope1.33.0-swift3.12.5
modelscope-registry.cn-beijing.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.8.1-py311-torch2.9.0-vllm0.13.0-modelscope1.33.0-swift3.12.5
modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.8.1-py311-torch2.9.0-vllm0.13.0-modelscope1.33.0-swift3.12.5
# swift3.11.3
modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.9.1-py311-torch2.8.0-vllm0.11.0-modelscope1.32.0-swift3.11.3
modelscope-registry.cn-beijing.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.9.1-py311-torch2.8.0-vllm0.11.0-modelscope1.32.0-swift3.11.3
modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.9.1-py311-torch2.8.0-vllm0.11.0-modelscope1.32.0-swift3.11.3
```
历史镜像
```
# swift3.10.3
modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.8.1-py311-torch2.8.0-vllm0.11.0-modelscope1.31.0-swift3.10.3
modelscope-registry.cn-beijing.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.8.1-py311-torch2.8.0-vllm0.11.0-modelscope1.31.0-swift3.10.3
modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.8.1-py311-torch2.8.0-vllm0.11.0-modelscope1.31.0-swift3.10.3
# swift3.9.3
modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.8.1-py311-torch2.8.0-vllm0.11.0-modelscope1.31.0-swift3.9.3
modelscope-registry.cn-beijing.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.8.1-py311-torch2.8.0-vllm0.11.0-modelscope1.31.0-swift3.9.3
modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.8.1-py311-torch2.8.0-vllm0.11.0-modelscope1.31.0-swift3.9.3
# swift3.8.3
modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.6.3-py311-torch2.7.1-vllm0.10.1.1-modelscope1.29.2-swift3.8.3
modelscope-registry.cn-beijing.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.6.3-py311-torch2.7.1-vllm0.10.1.1-modelscope1.29.2-swift3.8.3
modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.6.3-py311-torch2.7.1-vllm0.10.1.1-modelscope1.29.2-swift3.8.3
# swift3.7.2
modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.6.3-py311-torch2.7.1-vllm0.10.0-modelscope1.28.2-swift3.7.2
modelscope-registry.cn-beijing.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.6.3-py311-torch2.7.1-vllm0.10.0-modelscope1.28.2-swift3.7.2
modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.6.3-py311-torch2.7.1-vllm0.10.0-modelscope1.28.2-swift3.7.2
# swift3.6.4
modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py310-torch2.6.0-vllm0.8.5.post1-modelscope1.28.1-swift3.6.4
modelscope-registry.cn-beijing.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py310-torch2.6.0-vllm0.8.5.post1-modelscope1.28.1-swift3.6.4
modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py310-torch2.6.0-vllm0.8.5.post1-modelscope1.28.1-swift3.6.4
# swift3.5.3
modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py310-torch2.6.0-vllm0.8.5.post1-modelscope1.27.1-swift3.5.3
modelscope-registry.cn-beijing.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py310-torch2.6.0-vllm0.8.5.post1-modelscope1.27.1-swift3.5.3
modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py310-torch2.6.0-vllm0.8.5.post1-modelscope1.27.1-swift3.5.3
# swift3.4.1.post1
modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py311-torch2.6.0-vllm0.8.5.post1-modelscope1.26.0-swift3.4.1.post1
modelscope-registry.cn-beijing.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py311-torch2.6.0-vllm0.8.5.post1-modelscope1.26.0-swift3.4.1.post1
modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py311-torch2.6.0-vllm0.8.5.post1-modelscope1.26.0-swift3.4.1.post1
# swift3.3.0.post1
modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py311-torch2.6.0-vllm0.8.3-modelscope1.25.0-swift3.3.0.post1
modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py311-torch2.6.0-vllm0.8.3-modelscope1.25.0-swift3.3.0.post1
# swift3.2.2
modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py311-torch2.5.1-modelscope1.25.0-swift3.2.2
```
更多镜像可以查看[这里](https://modelscope.cn/docs/intro/environment-setup#%E6%9C%80%E6%96%B0%E9%95%9C%E5%83%8F)。
## 支持的硬件
| 硬件环境 | 备注 |
| --------------- | --------------------------- |
| A10/A100/H100 | |
| RTX20/30/40系列 | |
| T4/V100 | 部分模型出现NAN |
| Ascend NPU | 部分模型出现NAN或算子不支持 |
| MPS | 参考[issue 4572](https://github.com/modelscope/ms-swift/issues/4572) |
| CPU | |
## 运行环境
| | 范围 | 推荐 | 备注 |
|--------------|--------------|---------------------|--------------------|
| python | >=3.9 | 3.11/3.12 | |
| cuda | | cuda12 | 使用cpu、npu、mps则无需安装 |
| torch | >=2.0 | 2.8.0/2.10.0 | |
| transformers | >=4.33 | 4.57.6/5.2.0 | |
| modelscope | >=1.23 | | |
| peft | >=0.11,<0.19 | | |
| flash_attn | | 2.8.3/3.0.0b1 | |
| trl | >=0.15,<0.29 | 0.28.0 | RLHF |
| deepspeed | >=0.14 | 0.18.8 | 训练 |
| vllm | >=0.5.1 | 0.11.0/0.17.1 | 推理/部署 |
| sglang | >=0.4.6 | | 推理/部署 |
| lmdeploy | >=0.5 | 0.10.1 | 推理/部署 |
| evalscope | >=1.0 | | 评测 |
| gradio | | 5.32.1 | Web-UI/App |
更多可选依赖可以参考[这里](https://github.com/modelscope/ms-swift/blob/main/requirements/install_all.sh)。
## Notebook环境
Swift支持训练的绝大多数模型都可以在`A10`显卡上使用,用户可以使用ModelScope官方提供的免费显卡资源:
1. 进入[ModelScope](https://www.modelscope.cn)官方网站并登录。
2. 点击左侧的`我的Notebook`并开启一个免费GPU实例。
3. 愉快地薅A10显卡羊毛。
================================================
FILE: docs/source/GetStarted/Web-UI.md
================================================
# Web-UI
目前SWIFT已经支持了界面化的训练和推理,参数支持和脚本训练相同。在安装SWIFT后,使用如下命令:
```shell
swift web-ui --lang zh
# or en
swift web-ui --lang en
```
开启界面训练和推理。
SWIFT web-ui是命令行的高级封装,即,在界面上启动的训练、部署等任务,会在系统中以命令行启动一个独立的进程,伪代码类似:
```python
import os
os.system('swift sft --model xxx --dataset xxx')
```
这给web-ui带来了几个特性:
1. web-ui的每个超参数描述都带有`--xxx`的标记,这与[命令行参数](../Instruction/Command-line-parameters.md)的内容是一致的
2. web-ui可以在一台多卡机器上并行启动多个训练/部署任务
3. web-ui服务关闭后,后台服务是仍旧运行的,这防止了web-ui被关掉后影响训练进程,如果需要关闭后台服务,只需要**选择对应的任务**后在界面上的`运行时`tab点击杀死服务
4. 重新启动web-ui后,如果需要显示正在运行的服务,在`运行时`tab点击`找回运行时任务`即可
5. 训练界面支持显示运行日志,请在选择某个任务后手动点击`展示运行状态`,在训练时运行状态支持展示训练图表,图表包括训练loss、训练acc、学习率等基本指标,在人类对齐任务中界面图表为margin、logps等关键指标
6. web-ui的训练不支持PPO,该过程比较复杂,建议使用examples的[shell脚本](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf/ppo)直接运行
如果需要使用share模式,请添加`--share true`参数。注意:请不要在dsw、notebook等环境中使用该参数。
目前ms-swift额外支持了界面推理模式(即Space部署):
```shell
swift app --model '<model>' --studio_title My-Awesome-Space --stream true
# 或者
swift app --model '<model>' --adapters '<adapter>' --stream true
```
即可启动一个只有推理页面的应用,该应用会在启动时对模型进行部署并提供后续使用。
================================================
FILE: docs/source/Instruction/Agent-support.md
================================================
# Agent支持
## 数据集格式
ms-swift 使用 agent-template 实现了Agent数据格式与模型的解耦:基于统一的数据集格式,可以灵活切换不同模型进行训练,无需修改数据。
纯文本Agent和多模态Agent的示例数据样本如下:
```jsonl
{"tools": "[{\"type\": \"function\", \"function\": {\"name\": \"realtime_aqi\", \"description\": \"天气预报。获取实时空气质量。当前空气质量,PM2.5,PM10信息\", \"parameters\": {\"type\": \"object\", \"properties\": {\"city\": {\"type\": \"string\", \"description\": \"城市名,例如:上海\"}}, \"required\": [\"city\"]}}}]", "messages": [{"role": "user", "content": "北京和上海今天的天气情况"}, {"role": "tool_call", "content": "{\"name\": \"realtime_aqi\", \"arguments\": {\"city\": \"北京\"}}"}, {"role": "tool_call", "content": "{\"name\": \"realtime_aqi\", \"arguments\": {\"city\": \"上海\"}}"}, {"role": "tool_response", "content": "{\"city\": \"北京\", \"aqi\": \"10\", \"unit\": \"celsius\"}"}, {"role": "tool_response", "content": "{\"city\": \"上海\", \"aqi\": \"72\", \"unit\": \"fahrenheit\"}"}, {"role": "assistant", "content": "根据天气预报工具,北京今天的空气质量指数为10,属于良好水平;上海今天的空气质量指数为72,属于轻度污染水平。"}]}
{"tools": "[{\"type\": \"function\", \"function\": {\"name\": \"click\", \"description\": \"点击屏幕中的某个位置\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"integer\", \"description\": \"横坐标,表示屏幕上的水平位置\"}, \"y\": {\"type\": \"integer\", \"description\": \"纵坐标,表示屏幕上的垂直位置\"}}, \"required\": [\"x\", \"y\"]}}}]", "messages": [{"role": "user", "content": "<image>现在几点了?"}, {"role": "assistant", "content": "<think>\n我可以通过打开日历App来获取当前时间。\n</think>\n"}, {"role": "tool_call", "content": "{\"name\": \"click\", \"arguments\": {\"x\": 105, \"y\": 132}}"}, {"role": "tool_response", "content": "{\"images\": \"<image>\", \"status\": \"success\"}"}, {"role": "assistant", "content": "成功打开日历App,现在的时间为中午11点"}], "images": ["desktop.png", "calendar.png"]}
```
- agent_template为"react_en", "hermes"等情况下,该格式适配所有模型Agent训练,可以轻松在不同模型间切换。
- 其中tools是一个包含tool列表的json字符串,messages中role为'tool_call'和'tool_response/tool'的content部分都需要是json字符串。
- tools字段将在训练/推理时和`{"role": "system", ...}`部分组合,根据agent_template组成完整的system部分。
- `{"role": "tool_call", ...}`部分将根据agent_template自动转成对应格式的`{"role": "assistant", ...}`,多条连续的`{"role": "assistant", ...}`将拼接在一起组成完整的assistant_content。
- `{"role": "tool_response", ...}`也可以写成`{"role": "tool", ...}`,这两种写法是等价的。该部分也将根据`agent_template`自动转换格式。该部分在训练时将不进行损失的计算,角色类似于`{"role": "user", ...}`。
- 该格式支持并行调用工具,例子参考第一条数据样本。多模态Agent数据样本中`<image>`标签数量应与"images"长度相同,其标签位置代表图像特征的插入位置。当然也支持其他模态,例如audios, videos。
- 注意:您也可以手动将数据处理为role为system/user/assistant的messages格式。agent_template的作用是将其中的tools字段以及role为tool_call和tool_response的messages部分,自动映射为标准的role为system/user/assistant的messages格式。
以下为上述两条数据样本由qwen2_5和qwen2_5_vl的template进行encode后的input_ids和labels,选择的agent_template为**hermes**:
样本一(并行工具调用):
```text
[INPUT_IDS] <|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
# Tools
You may call one or more functions to assist with the user query.
You are provided with function signatures within <tools></tools> XML tags:
<tools>
{"type": "function", "function": {"name": "realtime_aqi", "description": "天气预报。获取实时空气质量。当前空气质量,PM2.5,PM10信息", "parameters": {"type": "object", "properties": {"city": {"type": "string", "description": "城市名,例如:上海"}}, "required": ["city"]}}}
</tools>
For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
<tool_call>
{"name": <function-name>, "arguments": <args-json-object>}
</tool_call><|im_end|>
<|im_start|>user
北京和上海今天的天气情况<|im_end|>
<|im_start|>assistant
<tool_call>
{"name": "realtime_aqi", "arguments": {"city": "北京"}}
</tool_call>
<tool_call>
{"name": "realtime_aqi", "arguments": {"city": "上海"}}
</tool_call><|im_end|>
<|im_start|>user
<tool_response>
{"city": "北京", "aqi": "10", "unit": "celsius"}
</tool_response>
<tool_response>
{"city": "上海", "aqi": "72", "unit": "fahrenheit"}
</tool_response><|im_end|>
<|im_start|>assistant
根据天气预报工具,北京今天的空气质量指数为10,属于良好水平;上海今天的空气质量指数为72,属于轻度污染水平。<|im_end|>
[LABELS] [-100 * 195]<tool_call>
{"name": "realtime_aqi", "arguments": {"city": "北京"}}
</tool_call>
<tool_call>
{"name": "realtime_aqi", "arguments": {"city": "上海"}}
</tool_call><|im_end|>[-100 * 67]根据天气预报工具,北京今天的空气质量指数为10,属于良好水平;上海今天的空气质量指数为72,属于轻度污染水平。<|im_end|>
```
样本二(多模态,混合assistant和tool_call):
```text
[INPUT_IDS] <|im_start|>system
You are a helpful assistant.
# Tools
You may call one or more functions to assist with the user query.
You are provided with function signatures within <tools></tools> XML tags:
<tools>
{"type": "function", "function": {"name": "click", "description": "点击屏幕中的某个位置", "parameters": {"type": "object", "properties": {"x": {"type": "integer", "description": "横坐标,表示屏幕上的水平位置"}, "y": {"type": "integer", "description": "纵坐标,表示屏幕上的垂直位置"}}, "required": ["x", "y"]}}}
</tools>
For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
<tool_call>
{"name": <function-name>, "arguments": <args-json-object>}
</tool_call><|im_end|>
<|im_start|>user
<|vision_start|>[151655 * 729]<|vision_end|>现在几点了?<|im_end|>
<|im_start|>assistant
<think>
我可以通过打开日历App来获取当前时间。
</think>
<tool_call>
{"name": "click", "arguments": {"x": 105, "y": 132}}
</tool_call><|im_end|>
<|im_start|>user
<tool_response>
{"images": "<|vision_start|>[151655 * 729]<|vision_end|>", "status": "success"}
</tool_response><|im_end|>
<|im_start|>assistant
成功打开日历App,现在的时间为中午11点<|im_end|>
[LABELS] [-100 * 924]<think>
我可以通过打开日历App来获取当前时间。
</think>
<tool_call>
{"name": "click", "arguments": {"x": 105, "y": 132}}
</tool_call><|im_end|>[-100 * 759]成功打开日历App,现在的时间为中午11点<|im_end|>
```
**react_en**是常用的agent template格式之一,以下为样本一由qwen2_5使用`agent_template='react_en'`进行encode后的input_ids和labels:
```text
[INPUT_IDS] <|im_start|>system
Answer the following questions as best you can. You have access to the following tools:
realtime_aqi: Call this tool to interact with the realtime_aqi API. What is the realtime_aqi API useful for? 天气预报。获取实时空气质量。当前空气质量,PM2.5,PM10信息 Parameters: {"type": "object", "properties": {"city": {"type": "string", "description": "城市名,例如:上海"}}, "required": ["city"]} Format the arguments as a JSON object.
Use the following format:
Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [realtime_aqi]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question
Begin!
<|im_end|>
<|im_start|>user
北京和上海今天的天气情况<|im_end|>
<|im_start|>assistant
Action: realtime_aqi
Action Input: {'city': '北京'}
Action: realtime_aqi
Action Input: {'city': '上海'}
Observation:{"city": "北京", "aqi": "10", "unit": "celsius"}
Observation:{"city": "上海", "aqi": "72", "unit": "fahrenheit"}
根据天气预报工具,北京今天的空气质量指数为10,属于良好水平;上海今天的空气质量指数为72,属于轻度污染水平。<|im_end|>
[LABELS] [-100 * 233]Action: realtime_aqi
Action Input: {'city': '北京'}
Action: realtime_aqi
Action Input: {'city': '上海'}
Observation:[-100 * 45]根据天气预报工具,北京今天的空气质量指数为10,属于良好水平;上海今天的空气质量指数为72,属于轻度污染水平。<|im_end|>
```
更多模型和agent_template的尝试可以使用以下代码,更多的agent template可选值参考[这里](https://github.com/modelscope/ms-swift/blob/main/swift/agent_template/__init__.py)。
```python
from swift import get_processor, get_template
tokenizer = get_processor('Qwen/Qwen3.5-2B')
template = get_template(tokenizer) # 使用默认agent模板
# template = get_template(tokenizer, agent_template='qwen3_5')
print(f'agent_template: {template._agent_template}')
data = {...}
template.set_mode('train')
encoded = template.encode(data)
print(f'[INPUT_IDS] {template.safe_decode(encoded["input_ids"])}\n')
print(f'[LABELS] {template.safe_decode(encoded["labels"])}')
```
## tools格式
tools字段提供了模型可以调用的API信息。你需要提供tools的名字,描述和参数,示例如下:
```python
tools = [{
'type': 'function',
'function': {
'name': 'get_current_weather',
'description': 'Get the current weather in a given location',
'parameters': {
'type': 'object',
'properties': {
'location': {
'type': 'string',
'description': 'The city and state, e.g. San Francisco, CA'
},
'unit': {
'type': 'string',
'enum': ['celsius', 'fahrenheit']
}
},
'required': ['location']
}
}
}]
```
## loss_scale的使用
loss_scale参数可用于调节模型输出部分在训练过程中的损失权重。目前支持两种配置方式:字符串精确匹配和正则表达式匹配。
1. 字符串匹配示例:ReACT 格式
以 ReACT 格式为例,可通过 `--loss_scale react` 启用相应的 loss_scale 配置(配置文件详见 [react.json](https://github.com/modelscope/ms-swift/blob/main/swift/loss_scale/config/react.json))。该方式基于字符串精确匹配,配置中的字典映射需提供一个包含两个元素的列表,分别表示:当前匹配字符串本身的损失权重,
从该字符串之后到下一个指定字符串之前的内容的损失权重。该设置的具体效果如下:
- 'Action:' 和 'Action Input:' 字段自身及其后续内容的损失权重均为 2;
- 'Thought:' 和 'Final Answer:' 字段自身及其后续内容的损失权重均为 1;
- 'Observation:' 字段自身的权重为 2,但其后跟随的工具调用结果部分的损失权重为 0。
2. 正则匹配示例:忽略空思维块
在训练推理模型时,我们可能需要忽略数据集中存在的形如 `'<think>\n\n</think>\n\n'`的空思维标记损失计算。此时可使用 `--loss_scale ignore_empty_think`(配置文件详见 [ignore_empty_think.json](https://github.com/modelscope/ms-swift/blob/main/swift/loss_scale/config/ignore_empty_think.json))。该配置采用正则表达式匹配方式,字典映射的列表只需指定一个值,表示匹配内容的损失权重。该设置的具体效果如下:
- 所有与正则表达式`<think>\\s*</think>\\s*`匹配的字符串,loss_scale为0,即不计算损失。
使用代码测试loss_scale:
```python
from swift import get_processor, get_template
data = {"messages": [
{"role": "user", "content": "aaaaa"},
{"role": "assistant", "content": "<think>\n\n</think>\n\nabc<think>\n\n</think>\n\n123"},
]}
template = get_template(get_processor('Qwen/Qwen3-8B'), loss_scale='ignore_empty_think')
template.set_mode('train')
inputs = template.encode(data)
print(template.safe_decode(inputs['labels']))
# '[-100 * 14]abc<think>\n\n</think>\n\n123<|im_end|>\n'
```
更多的loss_scale插件设计,请参考[架构](../Customization/Architecture.md#loss-scale)文档。
## 训练
- 训练Base模型的Agent能力,通过修改`--model`切换不同模型,参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/agent/qwen2_5.sh)。
- 训练GLM4的agent_template为hermes,参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/agent/glm4.sh)。
- 使用`--loss_scale`对模型输出部分的损失权重进行调整,参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/agent/loss_scale)。
## 推理
- 🚀原始模型或者全参数训练后模型的推理,参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_agent.py)。
- LoRA训练后推理,参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/agent/loss_scale/infer_lora.py)。
## 部署
服务端和客户端代码,参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/deploy/agent)。
================================================
FILE: docs/source/Instruction/Command-line-parameters.md
================================================
# 命令行参数
命令行参数的介绍会分为基本参数,原子参数、集成参数和特定模型参数。**命令行最终使用的参数列表为集成参数。集成参数继承自基本参数和一些原子参数**。特定模型参数是针对于具体模型的参数,可以通过`--model_kwargs`或者环境变量进行设置。Megatron-SWIFT命令行参数介绍可以在[Megatron-SWIFT训练文档](../Megatron-SWIFT/Command-line-parameters.md)中找到。
**提示:**
- 命令行传入list使用空格隔开即可。例如:`--dataset <dataset_path1> <dataset_path2>`。
- 命令行传入dict使用json。例如:`--model_kwargs '{"fps_max_frames": 12}'`。
- 带🔥的参数为重要参数,刚熟悉ms-swift的用户可以先关注这些命令行参数。
## 基本参数
- 🔥tuner_backend: 可选为'peft','unsloth'。默认为'peft'。
- 🔥tuner_type: 可选为'lora'、'full'、'longlora'、'adalora'、'llamapro'、'adapter'、'vera'、'boft'、'fourierft'、'reft'。默认为'lora'。
- 🔥adapters: 用于指定adapter的id/path的list,默认为`[]`。该参数通常用于推理/部署命令,例如:`swift infer --model '<model_id_or_path>' --adapters '<adapter_id_or_path>'`。该参数偶尔也用于断点续训,该参数与`resume_from_checkpoint`的区别在于,**该参数只读取adapter权重**,而不加载优化器和随机种子,并不跳过已训练的数据集部分。
- `--model`与`--adapters`的区别:`--model`后接完整权重的目录路径,内包含model/tokenizer/config等完整权重信息,例如`model.safetensors`。`--adapters`后接增量adapter权重目录路径的列表,内含adapter的增量权重信息,例如`adapter_model.safetensors`。
- 🔥external_plugins: 外部`plugin.py`文件列表,这些文件会被额外加载(即对模块进行`import`)。默认为`[]`。你可以传入自定义模型、对话模板和数据集注册的`.py`文件路径,参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/custom/sft.sh);或者自定义GRPO的组件,参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin/run_external_reward_func.sh)。
- seed: 全局随机种子,默认为42。
- 注意:该随机种子与控制数据集随机的`data_seed`互不影响。
- model_kwargs: 特定模型可传入的额外参数,该参数列表会在训练/推理时打印日志进行提示。例如`--model_kwargs '{"fps_max_frames": 12}'`。你也可以通过环境变量的方式设置,例如`FPS_MAX_FRAMES=12`。默认为None。
- 注意:**若你在训练时指定了特定模型参数,请在推理时也设置对应的参数**,这可以提高训练效果。
- 特定模型参数的含义可以在对应模型官方repo或者其推理代码中找到相应含义。ms-swift引入这些参数以确保训练的模型与官方推理代码效果对齐。
- load_args: 当指定`--resume_from_checkpoint`、`--model`、`--adapters`会读取保存文件中的`args.json`,读取的keys查看[base_args.py](https://github.com/modelscope/ms-swift/blob/main/swift/arguments/base_args/base_args.py)。推理和导出时默认为True,训练时默认为False。该参数通常不需要修改。
- load_data_args: 如果将该参数设置为True,则会额外读取`args.json`中的数据参数。默认为False。**该参数通常用于推理时对训练中切分的验证集进行推理**,例如:`swift infer --adapters xxx --load_data_args true --stream true --max_new_tokens 512`。
- use_hf: 控制模型下载、数据集下载、模型推送使用[ModelScope](https://modelscope.cn/)还是[HuggingFace](https://huggingface.co/)。默认为False,使用ModelScope。
- 提示:如果你想在国外访问ModelScope,可以尝试使用[ModelScope国际版](https://modelscope.ai/home),设置环境变量`MODELSCOPE_DOMAIN='www.modelscope.ai'`即可。
- hub_token: hub token. modelscope的hub token可以查看[这里](https://modelscope.cn/my/myaccesstoken)。默认为None。
- ddp_timeout: 默认为18000000,单位为秒。
- ddp_backend: 可选为"nccl"、"gloo"、"mpi"、"ccl"、"hccl"、"cncl"、"mccl"。默认为None,进行自动选择。
- ignore_args_error: 用于兼容jupyter notebook。默认为False。
### 模型参数
- 🔥model: [模型id](https://modelscope.cn/models)或模型本地路径。默认为None。
- 🔥model_type: 模型类型。**我们将相同的模型架构、模型加载过程、template定义为一个`model_type`**。默认为None,即**根据`--model`的后缀和config.json中的'architectures'属性进行自动选择**。对应模型的model_type可以在[支持的模型列表](./Supported-models-and-datasets.md)中找到。
- 注意:ms-swift中model_type的概念与`config.json`中的model_type不同。
- 自定义模型通常需要自行注册`model_type`和`template`,具体可以参考[自定义模型文档](../Customization/Custom-model.md)。
- model_revision: 模型版本,默认为None。
- task_type: 默认为'causal_lm'。可选为'causal_lm'、'seq_cls'、'embedding'、'reranker'和'generative_reranker'。seq_cls的例子可以查看[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/seq_cls),embedding的例子查看[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/embedding)。
- 若设置为'seq_cls',你通常需要额外设置`--num_labels`和`--problem_type`。
- 🔥torch_dtype: 模型权重的数据类型,支持`float16`,`bfloat16`,`float32`。默认为None,从'config.json'文件中读取。
- attn_impl: attention类型,可选项为'sdpa', 'eager', 'flash_attn', 'flash_attention_2', 'flash_attention_3'等。默认使用None,读取'config.json'。
- 注意:这几种attention实现并不一定都支持,这取决于对应模型transformers实现的支持情况。
- 若设置为'flash_attn'(兼容旧版本),则使用'flash_attention_2'。
- 🔥experts_impl: 专家实现类型,可选项为'grouped_mm', 'batched_mm', 'eager'。默认为None。该特性需要"transformers>=5.0.0"。
- new_special_tokens: 需要新增的特殊tokens。默认为`[]`。例子参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/new_special_tokens)。
- 注意:你也可以传入以`.txt`结尾的文件路径,每行为一个special token。
- num_labels: 分类模型(即`--task_type seq_cls`)需要指定该参数。代表标签数量,默认为None。
- problem_type: 分类模型(即`--task_type seq_cls`)需要指定该参数。可选为'regression', 'single_label_classification', 'multi_label_classification'。默认为None,若模型为 reward_model 或 num_labels 为1,该参数为'regression',其他情况,该参数为'single_label_classification'。
- rope_scaling: rope类型,你可以传入字符串,例如:`linear`、`dynamic`、`yarn`并结合传入`max_model_len`,ms-swift会自动设置对应的rope_scaling并覆盖'config.json'中的rope_scaling。或者你需要传入一个json字符串,例如`'{"factor":2.0, "type":"yarn"}'`,该值会直接覆盖'config.json'中的rope_scaling。默认为None。
- max_model_len: 如果使用`rope_scaling`并传入字符串,可以设置`max_model_len`,该参数用来计算rope的`factor`倍数。该参数默认为None。若为非None,该参数会**覆盖**'config.json'中的`max_position_embeddings`值。
- device_map: 模型使用的device_map配置,例如:'auto'、'cpu'、json字符串、json文件路径。该参数会**透传**入transformers的`from_pretrained`接口。默认为None,根据设备和分布式训练情况自动设置。
- max_memory: device_map设置为'auto'或者'sequential'时,会根据max_memory进行模型权重的device分配,例如:`--max_memory '{0: "20GB", 1: "20GB"}'`。默认为None。该参数会透传入transformers的`from_pretrained`接口。
- local_repo_path: 部分模型在加载时依赖于github repo,例如[deepseek-vl2](https://github.com/deepseek-ai/DeepSeek-VL2)。为了避免`git clone`时遇到网络问题,可以直接使用本地repo。该参数需要传入本地repo的路径, 默认为`None`。
- init_strategy: 加载模型时,初始化模型中所有未初始化的参数(自定义模型架构时)。可选为'zero', 'uniform', 'normal', 'xavier_uniform', 'xavier_normal', 'kaiming_uniform', 'kaiming_normal', 'orthogonal'。默认为None。
### 数据参数
- 🔥dataset: 数据集id或路径的list。默认为`[]`。每个数据集的传入格式为:`'数据集id or 数据集路径:子数据集#采样数量'`,其中子数据集和取样数据可选。本地数据集支持jsonl、csv、json、文件夹等。**hub端的开源数据集可以通过`git clone`到本地并将文件夹传入而离线使用**。自定义数据集格式可以参考[自定义数据集文档](../Customization/Custom-dataset.md)。你可以传入`--dataset <dataset1> <dataset2>`来使用多个数据集。
- 子数据集: 该参数只有当dataset为ID或者文件夹时生效。若注册时指定了subsets,且只有一个子数据集,则默认选择注册时指定的子数据集,否则默认为'default'。你可以使用`/`来选择多个子数据集,例如:`<dataset_id>:subset1/subset2`。你也可以使用'all'来选择注册时指定的所有子数据集,例如:`<dataset_id>:all`。注册例子可以参考[这里](https://modelscope.cn/datasets/swift/garbage_competition)。
- 采样数量: 默认使用完整的数据集。你可以通过设置`#采样数`对选择的数据集进行采样,例如`<dataset_id>#1000`。若采样数少于数据样本总数,则进行随机选择(不重复采样)。若采样数高于数据样本总数,则只额外随机采样`采样数%数据样本总数`的样本,数据样本重复采样`采样数//数据样本总数`次。注意:流式数据集(`--streaming true`)只进行顺序采样。若设置`--dataset_shuffle false`,则非流式数据集也进行顺序采样。
- 🔥val_dataset: 验证集id或路径的list。默认为`[]`。
- 🔥cached_dataset: 使用缓存数据集(使用`swift export --to_cached_dataset true ...`命令产生),避免大型数据集训练/推理时,tokenize过程占用gpu时间。该参数用于设置缓存训练数据集文件夹路径,默认为`[]`。例子参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/cached_dataset)。
- 提示:cached_dataset只会在数据集中额外存储length字段(为避免存储压力),并过滤掉会报错的数据样本。在训练/推理时,支持`--max_length`参数进行超长数据过滤/裁剪以及`--packing`参数。数据实际预处理过程将在训练时同步进行,该过程和训练是重叠的,并不会影响训练速度。
- cached_dataset在`ms-swift`和`Megatron-SWIFT`之间是通用的,且支持pt/sft/infer/rlhf,使用`--template_mode`设置训练类型;支持embedding/reranker/seq_cls任务,使用`--task_type`设置任务类型。
- 支持对cached_dataset进行采样,语法为`#采样数`,支持采样数高于和少于样本数的情况,功能与实现参考`--dataset`的介绍。
- cached_val_dataset: 缓存验证数据集的文件夹路径,默认为`[]`。
- 🔥split_dataset_ratio: 不指定val_dataset时从训练集拆分验证集的比例,默认为0.,即不从训练集切分验证集。
- data_seed: 数据集随机种子,默认为42。
- 🔥dataset_num_proc: 数据集预处理的进程数,默认为1。
- 提示:纯文本模型建议将该值开大加速预处理速度。而多模态模型不建议开太大,这可能导致更慢的预处理速度(多模态模型若出现cpu利用率100%,但是处理速度极慢的情况,建议额外设置`OMP_NUM_THREADS`环境变量)。
- 🔥load_from_cache_file: 是否从缓存中加载数据集,默认为False。**建议在实际运行时设置为True,debug阶段设置为False**。你可以修改`MODELSCOPE_CACHE`环境变量控制缓存的路径。
- dataset_shuffle: 是否对dataset进行随机操作。默认为True。
- 注意:**CPT/SFT的随机包括两个部分**:数据集的随机,由`dataset_shuffle`控制;train_dataloader中的随机,由`train_dataloader_shuffle`控制。
- val_dataset_shuffle: 是否对val_dataset进行随机操作。默认为False。
- streaming: 流式读取并处理数据集,默认False。(流式数据集的随机并不彻底,可能导致loss波动剧烈。)
- 注意:需要额外设置`--max_steps`,因为流式数据集无法获得其长度。你可以通过设置`--save_strategy epoch`并设置较大的max_steps来实现与`--num_train_epochs`等效的训练。或者,你也可以设置`max_epochs`确保训练到对应epochs时退出训练,并对权重进行验证和保存。
- 注意:流式数据集可以跳过预处理等待,将预处理时间与训练时间重叠。流式数据集的预处理只在rank0上进行,并通过数据分发的方式同步到其他进程,**其通常效率不如非流式数据集采用的数据分片读取方式**。当训练的world_size较大时,预处理和数据分发将成为训练瓶颈。
- interleave_prob: 默认值为 None。在组合多个数据集时,默认使用datasets库的 `concatenate_datasets` 函数;如果设置了该参数,则会使用 `interleave_datasets` 函数。该参数通常用于流式数据集的组合,并会作为参数传入 `interleave_datasets` 函数中。该参数不对`--val_dataset`生效。
- stopping_strategy: 可选为"first_exhausted", "all_exhausted",默认为"first_exhausted"。传入`interleave_datasets`函数中。该参数不对`--val_dataset`生效。
- shuffle_buffer_size: 该参数用于指定**流式数据集**的随机buffer大小,默认为1000。该参数只在`dataset_shuffle`设置为true时有效。
- download_mode: 数据集下载模式,包含`reuse_dataset_if_exists`和`force_redownload`,默认为'reuse_dataset_if_exists'。
- 通常在使用hub端数据集报错时设置为`--download_mode force_redownload`。
- columns: 用于对数据集进行列映射,使数据集满足AutoPreprocessor可以处理的样式,AutoPreprocessor可以处理的数据集格式查看[自定义数据集文档](../Customization/Custom-dataset.md)。你可以传入json字符串,例如:`'{"text1": "query", "text2": "response"}'`,代表将数据集中的"text1"映射为"query","text2"映射为"response",而query-response格式可以被AutoPreprocessor处理。默认为None。
- strict: 如果为True,则数据集只要某行有问题直接抛错,否则会丢弃出错数据样本。默认False。该参数通常用于排查错误。
- 🔥remove_unused_columns: 是否删除数据集中不被使用的列,默认为True。
- 若该参数设置为False,则将额外的数据集列传递至trainer的`compute_loss`函数内,**方便自定义损失函数使用额外的数据集列**。
- GRPO该参数的默认值为False。
- 🔥model_name: **仅用于自我认知任务**,只对`swift/self-cognition`数据集生效,替换掉数据集中的`{{NAME}}`通配符。传入模型中文名和英文名,以空格分隔,例如:`--model_name 小黄 'Xiao Huang'`。默认为None。
- 🔥model_author: 仅用于自我认知任务,只对`swift/self-cognition`数据集生效,替换掉数据集中的`{{AUTHOR}}`通配符。传入模型作者的中文名和英文名,以空格分隔,例如:`--model_author '魔搭' 'ModelScope'`。默认为None。
- custom_dataset_info: 自定义数据集注册的json文件路径,参考[自定义数据集](../Customization/Custom-dataset.md)和[内置'dataset_info.json'文件](https://github.com/modelscope/ms-swift/blob/main/swift/dataset/data/dataset_info.json)。默认为`[]`。
### 模板参数
- 🔥template: 对话模板类型。默认为None,自动选择对应model的template类型,对应关系参考[支持的模型列表](./Supported-models-and-datasets.md)。
- 🔥system: 自定义system字段,可以传入字符串或者**txt文件路径**。默认为None,使用注册template时的默认system。
- 注意:数据集中的system**优先级**最高,然后是`--system`,最后是注册template时设置的`default_system`。
- 🔥max_length: 限制单数据集样本经过`tokenizer.encode`后的tokens最大长度,超过的数据样本会根据`truncation_strategy`参数进行处理(避免训练OOM)。默认为None,即设置为模型支持的tokens最大长度(max_model_len)。
- 当PPO、GRPO、GKD和推理情况下,`max_length`代表`max_prompt_length`。
- truncation_strategy: 如果单样本的tokens超过`max_length`如何处理,支持'delete'、'left'、'right'和'split',代表删除、左侧裁剪、右侧裁剪和切成多条数据样本,默认为'delete'。
- 注意:`--truncation_strategy split`只支持预训练时使用,即`swift/megatron pt`场景下,该策略会将超长字段切成多条数据样本,从而避免tokens浪费。(该特性不兼容cached_dataset)
- 注意:若多模态模型的训练时将'truncation_strategy'设置为`left`或`right`,**ms-swift会保留所有的image_token等多模态tokens**,这可能会导致训练时OOM。
- 🔥max_pixels: 多模态模型输入图片的最大像素数(H\*W),将超过该限制的图像进行缩放(避免训练OOM)。默认为None,不限制最大像素数。
- 注意:该参数适用于所有的多模态模型。而Qwen2.5-VL特有的模型参数`MAX_PIXELS`(你可以在文档最下面找到)只针对Qwen2.5-VL模型。
- 🔥agent_template: Agent模板,确定如何将工具列表'tools'转换成'system'、如何在推理/部署时从模型回复中提取toolcall部分,以及确定'messages'中`{"role": "tool_call", "content": "xxx"}`, `{"role": "tool_response", "content": "xxx"}`的模板格式。可选为"react_en", "hermes", "glm4", "qwen_en", "toolbench"等,更多请查看[这里](https://github.com/modelscope/ms-swift/blob/main/swift/agent_template/mapping.py)。默认为None,根据模型类型进行自动选择。可以参考[Agent文档](./Agent-support.md)。
- norm_bbox: 控制如何缩放边界框(即数据集中的"bbox",里面的数据为绝对坐标,参考[自定义数据集文档](https://swift.readthedocs.io/zh-cn/latest/Customization/Custom-dataset.html#grounding))。选项为'norm1000'和'none'。'norm1000'表示将bbox坐标缩放至千分位坐标,而'none'表示不进行缩放。默认值为None,将根据模型自动选择。
- 当**图片在训练中发生缩放时**(例如设置了max_pixels参数),该参数也能很好进行解决。
- use_chat_template: 使用chat模板还是generation模板(generation模板通常用于预训练时)。默认为`True`。
- 注意:`swift pt`默认为False,使用generation模板。该参数可以很好的**兼容多模态模型**。
- padding_side: 当训练`batch_size>=2`时的padding_side,可选值为'left'、'right',默认为'right'。(推理时的batch_size>=2时,只进行左padding)。
- 注意:PPO和GKD默认设置为'left'。
- 🔥padding_free: 将一个batch中的数据进行展平而避免数据padding,从而降低显存占用并加快训练(**同一batch的不同序列之间依旧是不可见的**)。默认为False。当前支持CPT/SFT/DPO/GRPO/KTO/GKD。
- 注意:使用padding_free请结合`--attn_impl flash_attn`使用且"transformers>=4.44",具体查看[该PR](https://github.com/huggingface/transformers/pull/31629)。(同packing)
- **相较于packing,padding_free不需要额外的预处理时间,但packing的训练速度更快且显存占用更稳定**。
- 🔥loss_scale: 训练tokens的loss权重设置。默认为`'default'`。loss_scale包含3种基本策略:'default'、'last_round'、'all',以及其他策略:'ignore_empty_think'以及agent需要的:'react'、'hermes'、'qwen'、'agentflan'、'alpha_umi'等,可选值参考[loss_scale模块](https://github.com/modelscope/ms-swift/blob/main/swift/loss_scale/mapping.py)。ms-swift 支持了基本策略和其他策略的混用,例如:`'default+ignore_empty_think'`,`'last_round+ignore_empty_think'`。若没有指定基本策略,则默认为'default',例如:'hermes'与'default+hermes'等价。
- 'default': 所有response(含history)以权重1计算交叉熵损失(**messages中的system/user/多模态tokens以及Agent训练中`tool_response`部分不计算损失**)。(**SFT默认为该值**)
- 'last_round': 只计算最后一轮response的损失。最后一轮含义为最后一个"user"之后的所有内容。(**RLHF默认为该值**)
- 'all': 计算所有tokens的损失。(**`swift pt`默认为该值**)
- 'ignore_empty_think': 忽略空的`'<think>\n\n</think>\n\n'`损失计算。(满足正则匹配`'<think>\\s*</think>\\s*'`即可)。
- 'react', 'hermes', 'qwen': 将`tool_call`部分的loss权重调整为2。
- sequence_parallel_size: 序列并行大小,默认是1。当前支持CPT/SFT/DPO/GRPO。训练脚本参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/sequence_parallel)。
- template_backend: 选择template后端,可选为'swift'、'jinja',默认为'swift'。如果使用jinja,则使用transformers的`apply_chat_template`。
- 注意:jinja的template后端只支持推理,不支持训练(无法确定损失计算的tokens范围)。
- response_prefix: response的前缀字符,该参数只在推理时生效。默认为None,根据enable_thinking参数和模版类型确定。
- enable_thinking: 该参数在推理时生效,代表是否开启thinking模式。默认为None,默认值由模板(模型)类型确定(思考/混合思考模板为True,非思考模板为False)。若enable_thinking为False,则增加非思考前缀,例如Qwen3-8B混合思考模型增加前缀`'<think>\n\n</think>\n\n'`,Qwen3-8B-Thinking则不增加前缀。若enable_thinking为True,则增加思考前缀,例如`'<think>\n'`。注意:该参数的优先级低于response_prefix参数。
- 注意:对于思考模型(思考/混合思考)或显式开启enable_thinking,我们会在推理和训练时,对历史的思考内容进行删除(最后一轮的思考内容保留,即最后一个user信息后的内容)。若训练时的loss_scale基本策略不为last_round,例如为'default',则不对历史的思考内容进行删除。
- add_non_thinking_prefix: 该参数只在训练时生效,代表是否对数据样本assistant部分**不以思考标记`'<think>'`开头**的数据样本增加非思考前缀(通常混合思考模型含非思考前缀)。该特性可以让swift内置的数据集可以训练混合思考模型。默认值为True。例如:Qwen3-8B混合思考模型的非思考前缀为`'<think>\n\n</think>\n\n'`,Qwen3-8B-Thinking/Instruct的非思考前缀为`''`。注意:训练时,loss_scale的基本策略为last_round,则只对最后一轮做此修改;否则,例如为'default'、'all',则对每一轮数据做此修改。若设置为False,则不对数据样本增加非思考前缀。
### 生成参数
参考[generation_config](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig)文档。
- 🔥max_new_tokens: 推理最大生成新tokens的数量。默认为None,无限制。
- temperature: 温度参数,温度越高,输出的随机性越大。默认为None,读取'generation_config.json'。
- 你可以设置`--temperature 0`或者`--top_k 1`以取消推理随机性。
- top_k: top_k参数,保留概率最高的top_k数量 tokens用于生成,默认为None。读取'generation_config.json'。
- top_p: top_p参数,保留概率最高的累计概率达到 top_p 的tokens用于生成,默认为None。读取generation_config.json。
- repetition_penalty: 重复惩罚参数。1.0 表示不进行惩罚。默认为None,读取generation_config.json。
- num_beams: beam search的并行保留数量,默认为1。
- 🔥stream: 流式输出,默认为`None`,即使用交互式界面时为True,数据集批量推理时为False。
- stop_words: 除了eos_token外额外的停止词,默认为`[]`。
- 注意:eos_token会在输出response中被删除,额外停止词会在输出中保留。
- logprobs: 是否输出logprobs,默认为False。
- top_logprobs: 输出top_logprobs的数量,默认为None。
- structured_outputs_regex: 结构化输出(引导解码)的正则表达式模式。设置后,模型生成将被约束为匹配指定的正则表达式模式。仅在`infer_backend`为`vllm`时生效。默认为`None`。
### 量化参数
以下为加载模型时量化的参数,具体含义可以查看[量化](https://huggingface.co/docs/transformers/main/en/main_classes/quantization)文档。这里不包含`swift export`中涉及的`gptq`、`awq`量化参数。
- 🔥quant_method: 加载模型时采用的量化方法,可选项为'bnb'、'hqq'、'eetq'、'quanto'和'fp8',默认为None。
- 若对awq/gptq量化模型进行qlora训练,则不需要设置额外`quant_method`等量化参数。
- 🔥quant_bits: 量化bits数,默认为None。
- hqq_axis: hqq量化axis,默认为None。
- bnb_4bit_compute_dtype: bnb量化计算类型,可选为`float16`、`bfloat16`、`float32`。默认为None,设置为`torch_dtype`。
- bnb_4bit_quant_type: bnb量化类型,支持`fp4`和`nf4`,默认为`nf4`。
- bnb_4bit_use_double_quant: 是否使用双重量化,默认为`True`。
- bnb_4bit_quant_storage: bnb量化存储类型,默认为None。
### RAY参数
- use_ray: boolean类型。是否使用ray,默认为`False`
- ray_exp_name: ray实验名字,这个字段会用作cluster和worker名称前缀,可以不填
- device_groups: 字符串(jsonstring)类型。在使用ray时,该字段必须配置,具体可以查看[ray文档](Ray.md)。
### yaml支持
- config: 可以使用config代替命令行参数,例如:
```shell
swift sft --config demo.yaml
```
demo.yaml的内容为具体命令行配置:
```yaml
# Model args
model: Qwen/Qwen2.5-7B-Instruct
dataset: swift/self-cognition
...
# Train args
output_dir: xxx/xxx
gradient_checkpointing: true
...
```
## 原子参数
### Seq2SeqTrainer参数
该参数列表继承自transformers `Seq2SeqTrainingArguments`,ms-swift对其默认值进行了覆盖。未列出的请参考[HF官方文档](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Seq2SeqTrainingArguments)。
- 🔥output_dir: 模型预测结果和检查点将被写入的输出目录。默认为None,设置为`'output/<model_name>'`。
- 🔥gradient_checkpointing: 是否使用gradient_checkpointing,默认为True。该参数可以显著降低显存占用,但降低训练速度。
- 🔥vit_gradient_checkpointing: 多模态模型训练时,是否对vit部分开启gradient_checkpointing。默认为None,即设置为`gradient_checkpointing`。例子参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/vit_gradient_checkpointing.sh)。
- 注意:多模态模型且是LoRA训练时,当设置了`--freeze_vit false`,且命令行中出现以下警告:`UserWarning: None of the inputs have requires_grad=True. Gradients will be None`,请设置`--vit_gradient_checkpointing false`,或提相关issue。全参数训练则不会出现该问题。(如果RLHF LoRA训练中,ref_model抛出来的警告,则是正常的)
- 🔥deepspeed: 默认为None。可以设置为'zero0', 'zero1', 'zero2', 'zero3', 'zero2_offload', 'zero3_offload'来使用ms-swift内置的deepspeed配置文件。你也可以传入自定义deepspeed配置文件的路径。
- zero_hpz_partition_size: 默认为None,这个参数是ZeRO++的特性,即node内模型分片,node间数据分片,如果遇到grad_norm NaN,请尝试使用`--torch_dtype float16`。
- deepspeed_autotp_size: DeepSpeed张量并行大小,默认为1。使用DeepSpeed AutoTP时需将参数`--deepspeed`设置为'zero0'、'zero1'或'zero2'。(注意:该功能只支持全参数)
- 🔥fsdp: FSDP2分布式训练配置。默认为None。可以设置为'fsdp2'来使用ms-swift内置的FSDP2配置文件。你也可以传入自定义FSDP配置文件的路径。FSDP2是PyTorch原生的分布式训练方案,与DeepSpeed二选一使用。
- 🔥per_device_train_batch_size: 默认值1。
- 🔥per_device_eval_batch_size: 默认值1。
- 🔥gradient_accumulation_steps: 梯度累加。**默认为None,即设置gradient_accumulation_steps使得total_batch_size>=16**。total_batch_size等于`per_device_train_batch_size * gradient_accumulation_steps * world_size`。在GRPO训练中,默认为1。
- 在CPT/SFT训练中,梯度累加的训练效果等价使用更大的batch_size,但在RLHF训练中,训练效果并不等价。
- weight_decay: weight衰减系数,默认值0.1。
- adam_beta1: Adam系列优化器中一阶矩估计(动量)的指数衰减率。默认为0.9。
- adam_beta2: Adam系列优化器中二阶矩估计(方差)的指数衰减率。默认为0.95。
- adam_epsilon: Adam系列优化器中用于数值稳定性的epsilon值。默认为1e-8。
- 🔥learning_rate: 学习率,**全参数训练默认为1e-5,LoRA训练等tuners为1e-4**。
- 提示:若要设置`min_lr`,您可以传入参数`--lr_scheduler_type cosine_with_min_lr --lr_scheduler_kwargs '{"min_lr": 1e-6}'`。
- 🔥vit_lr: 当训练多模态大模型时,该参数指定vit的学习率,默认为None,等于learning_rate。通常与`--freeze_vit`、`--freeze_aligner`参数结合使用。
- 提示:在日志中打印的"learning_rate"为`param_groups[0]`的学习率,其中param_groups的顺序依次是vit, aligner, llm(若含可训练参数)。
- 🔥aligner_lr: 当训练多模态大模型时,该参数指定aligner的学习率,默认为None,等于learning_rate。
- lr_scheduler_type: lr_scheduler类型,默认为'cosine'。
- lr_scheduler_kwargs: lr_scheduler其他参数。默认为None。
- gradient_checkpointing_kwargs: 传入`torch.utils.checkpoint`中的参数。例如设置为`--gradient_checkpointing_kwargs '{"use_reentrant": false}'`。默认为None。
- 注意:当使用DDP而不使用deepspeed/fsdp,且gradient_checkpointing_kwargs为None,会默认设置其为`'{"use_reentrant": false}'`而避免出现报错。
- full_determinism: 确保训练中获得可重现的结果,注意:这会对性能产生负面影响。默认为False。
- 🔥report_to: 默认值为`tensorboard`。你也可以指定`--report_to tensorboard wandb swanlab`、`--report_to all`。
- 如果你指定了`--report_to wandb`,你可以通过`WANDB_PROJECT`设置项目名称,`WANDB_API_KEY`指定账户对应的API KEY。
- logging_first_step: 是否记录第一个step的日志,默认为True。
- logging_steps: 日志打印间隔,默认为5。
- router_aux_loss_coef: 用于moe模型训练时,设置 aux_loss 的权重,默认为`0.`。
- enable_dft_loss: 是否在SFT训练中使用[DFT](https://arxiv.org/abs/2508.05629) (Dynamic Fine-Tuning) loss,默认为False。
- enable_channel_loss: 启用channel loss,默认为`False`。你需要在数据集中准备"channel"字段,ms-swift会根据该字段分组统计loss(若未准备"channel"字段,则归为默认`None` channel)。数据集格式参考[channel loss](../Customization/Custom-dataset.md#channel-loss)。channel loss兼容packing/padding_free/loss_scale等技术。
- safe_serialization: 是否存储为safetensors,默认为True。
- max_shard_size: 单存储文件最大大小,默认'5GB'。
- logging_dir: tensorboard日志保存路径。默认为None,即设置为`f'{self.output_dir}/runs'`。
- predict_with_generate: 验证时使用生成式的方式,默认为False。
- metric_for_best_model: 默认为None,即当`predict_with_generate`设置为False时,设置为'loss',否则设置为'rouge-l'(在PPO训练时,不进行默认值设置;GRPO训练设置为'reward')。
- greater_is_better: 默认为None,即当`metric_for_best_model`含'loss'时,设置为False,否则设置为True。
- max_epochs: 训练到`max_epochs`时强制退出训练,并对权重进行验证和保存。该参数在使用流式数据集时很有用。默认为None。
其他重要参数:
- 🔥num_train_epochs: 训练的epoch数,默认为3。
- 🔥save_strategy: 保存模型的策略,可选为'no'、'steps'、'epoch',默认为'steps'。
- 🔥save_steps: 默认为500。
- 🔥eval_strategy: 评估策略。默认为None,跟随`save_strategy`的策略。
- 若不使用`val_dataset`和`eval_dataset`且`split_dataset_ratio`为0,则默认为'no'。
- 🔥eval_steps: 默认为None,如果存在评估数据集,则跟随`save_steps`的策略。
- eval_on_start: 是否在训练前执行一次评估步骤,以确保验证步骤能正常工作。默认为False。
- 🔥save_total_limit: 最多保存的checkpoint数,会将过期的checkpoint进行删除。默认为None,保存所有的checkpoint。若设置为2,则保存best checkpoint和last checkpoint。
- max_steps: 最大训练的steps数。在数据集为流式时需要被设置。默认为-1。
- 🔥warmup_ratio: 默认为0.。
- save_on_each_node: 在每一个节点都进行权重保存。默认为False。该参数在多机训练时需要被考虑。
- 提示:在多机训练时,通常将`output_dir`设置为节点共享目录,因此无需额外设置该参数。
- save_only_model: 是否只保存模型权重而不包含优化器状态,随机种子状态等内容,这在全参数训练时可以减少保存的时间消耗和空间占用。默认为False。
- 🔥resume_from_checkpoint: 断点续训参数,指定checkpoint路径。默认为None。
- 提示:**断点续训请保持其他参数不变,额外增加`--resume_from_checkpoint checkpoint_dir`**。权重等信息将在trainer中读取。
- 注意: resume_from_checkpoint会读取模型权重,优化器状态,随机种子,并从上次训练的steps继续开始训练。你可以指定`--resume_only_model`只读取模型权重。
- resume_only_model: 默认为False。如果在指定resume_from_checkpoint的基础上,将该参数设置为True,则仅resume模型权重,而忽略优化器状态和随机种子。
- 注意:**resume_only_model默认将进行数据跳过**,此行为可通过 `ignore_data_skip` 参数控制。
- ignore_data_skip: 当设置`resume_from_checkpoint`和`resume_only_model`时,该参数控制是否跳过已经训练的数据,并将epoch和迭代数等训练状态进行恢复。默认为False。若设置为True,则将不加载训练状态并不进行数据跳过,将从迭代数0开始训练。
- 🔥ddp_find_unused_parameters: 默认为None。
- 🔥dataloader_num_workers: 默认为None,若是windows平台,则设置为0,否则设置为1。
- dataloader_pin_memory: 默认为True。
- dataloader_persistent_workers: 默认为False。
- dataloader_prefetch_factor: 默认为None。若 `dataloader_num_workers > 0`,则设置为2。每个工作进程预先加载的批次数量。2 表示所有工作进程总共会预取 2 * num_workers 个批次。
- train_dataloader_shuffle: CPT/SFT训练的dataloader是否随机,默认为True。该参数对IterableDataset无效(即对流式数据集失效)。IterableDataset采用顺序的方式读取。
- optim: 优化器,默认值为 `"adamw_torch"` (对于 torch>=2.8 为 `"adamw_torch_fused"`)。完整的优化器列表请参见 [training_args.py](https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py) 中的 `OptimizerNames`。
- optim_args: 提供给优化器的可选参数,默认为None。
- group_by_length: 是否在训练数据集中将长度大致相同的样本分组在一起(有随机因素),以最小化填充并确保各节点与进程的负载均衡以提高效率。默认为False。具体算法参考`transformers.trainer_pt_utils.get_length_grouped_indices`。
- 🔥neftune_noise_alpha: neftune添加的噪声系数。默认为0,通常可以设置为5、10、15。
- 🔥use_liger_kernel: 是否启用[Liger](https://github.com/linkedin/Liger-Kernel)内核加速训练并减少显存消耗。默认为False。示例shell参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/liger)。
- 注意:liger_kernel不支持device_map,请使用DDP/DeepSpeed进行多卡训练。liger_kernel目前只支持`task_type='causal_lm'`。
- average_tokens_across_devices: 是否在设备之间进行token数平均。如果设置为True,将使用all_reduce同步`num_tokens_in_batch`以进行精确的损失计算。默认为False。
- max_grad_norm: 梯度裁剪。默认为1.。
- 注意:日志中的grad_norm记录的是裁剪前的值。
- push_to_hub: 推送checkpoint到hub。默认为False。
- hub_model_id: 默认为None。
- hub_private_repo: 默认为False。
### Tuner参数
- 🔥freeze_llm: 该参数只对多模态模型生效,可用于全参数训练和LoRA训练,但会产生不同的效果。若是全参数训练,将freeze_llm设置为True会将LLM部分权重进行冻结;若是LoRA训练且`target_modules`设置为'all-linear',将freeze_llm设置为True将会取消在LLM部分添加LoRA模块。该参数默认为False。
- 🔥freeze_vit: 该参数只对多模态模型生效,可用于全参数训练和LoRA训练,但会产生不同的效果。若是全参数训练,将freeze_vit设置为True会将vit部分权重进行冻结;若是LoRA训练且`target_modules`设置为'all-linear',将freeze_vit设置为True将会取消在vit部分添加LoRA模块。该参数默认为True。
- 注意:**这里的vit不仅限于vision_tower, 也包括audio_tower**。若是Omni模型,若你只希望对vision_tower加LoRA,而不希望对audio_tower加LoRA,你可以修改[这里的代码](https://github.com/modelscope/ms-swift/blob/a5d4c0a2ce0658cef8332d6c0fa619a52afa26ff/swift/llm/model/model_arch.py#L544-L554)。
- 🔥freeze_aligner: 该参数只对多模态模型生效,可用于全参数训练和LoRA训练,但会产生不同的效果。若是全参数训练,将freeze_aligner设置为True会将aligner(也称为projector)部分权重进行冻结;若是LoRA训练且`target_modules`设置为'all-linear',将freeze_aligner设置为True将会取消在aligner部分添加LoRA模块。该参数默认为True。
- 🔥target_modules: 指定lora模块, 默认为`['all-linear']`。你也可以设置为module的后缀,例如:`--target_modules q_proj k_proj v_proj`。该参数不限于LoRA,可用于其他tuners。
- 注意:在LLM和多模态LLM中,'all-linear'的行为有所不同。若是LLM则自动寻找除lm_head外的linear并附加tuner;**若是多模态LLM,则默认只在LLM上附加tuner,该行为可以被`freeze_llm`、`freeze_vit`、`freeze_aligner`控制**。
- 🔥target_regex: 指定lora模块的regex表达式,默认为`None`。如果该值传入,则target_modules参数失效。例如你可以设置`--target_regex '^(language_model).*\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$'`,将符合该正则的模块指定为LoRA模块。该参数不限于LoRA,可用于其他tuners。
- target_parameters: 要替换为LoRA的参数名称列表。该参数的行为与 `target_modules` 类似,但传入的应是参数名称而不是模块名称。该特性需要安装"peft>=0.17.0"。例如,在 Hugging Face Transformers 中许多混合专家(MoE)层中,并未使用 `nn.Linear`,而是使用了 `nn.Parameter`。这时可以使用target_parameters参数实现。
- init_weights: 初始化weights的方法,LoRA可以指定为`true`、`false`、`gaussian`、`pissa`、`pissa_niter_[number of iters]`,Bone可以指定为`true`、`false`、`bat`。默认值`true`。
- 🔥modules_to_save: 在已附加tuner后,额外指定一部分原模型模块参与训练和存储。默认为`[]`。该参数不限于LoRA,可用于其他tuners。例如设置为`--modules_to_save embed_tokens lm_head`,在LoRA训练中解开embed_tokens和lm_head层进行训练,这两部分的权重信息最终会保存在`adapter_model.safetensors`中。
#### 全参
- freeze_parameters: 需要被冻结参数的前缀,默认为`[]`。
- freeze_parameters_regex: 需要被冻结参数的正则表达式,默认为None。
- freeze_parameters_ratio: 从下往上冻结的参数比例,默认为0。可设置为1将所有参数冻结,结合`trainable_parameters`设置可训练参数。
- trainable_parameters: 额外可训练参数的前缀,默认为`[]`。
- trainable_parameters_regex: 匹配额外可训练参数的正则表达式,默认为None。
- 备注:`trainable_parameters`、`trainable_parameters_regex`的优先级高于`freeze_parameters`、`freeze_parameters_regex`和`freeze_parameters_ratio`。例如:当指定全参数训练时,会将所有模块设置为可训练的状态,随后根据`freeze_parameters`、`freeze_parameters_regex`、`freeze_parameters_ratio`将部分参数冻结,最后根据`trainable_parameters`、`trainable_parameters_regex`重新打开部分参数参与训练。
#### LoRA
- 🔥lora_rank: 默认为`8`。
- 🔥lora_alpha: 默认为`32`。
- lora_dropout: 默认为`0.05`。
- lora_bias: 默认为`'none'`,可以选择的值: 'none'、'all'。如果你要将bias全都设置为可训练,你可以设置为`'all'`。
- lora_dtype: 指定lora模块的dtype类型。支持'float16'、'bfloat16'、'float32'。默认为None,跟随peft行为。
- 🔥use_dora: 默认为`False`,是否使用`DoRA`。
- use_rslora: 默认为`False`,是否使用`RS-LoRA`。
- 🔥lorap_lr_ratio: LoRA+参数,默认值`None`,建议值为`10~16`。使用lora时额外指定该参数可使用lora+。
##### LoRA-GA
- lora_ga_batch_size: 默认值为 `2`。在 LoRA-GA 中估计梯度以进行初始化时使用的批处理大小。
- lora_ga_iters: 默认值为 `2`。在 LoRA-GA 中估计梯度以进行初始化时的迭代次数。
- lora_ga_max_length: 默认值为 `1024`。在 LoRA-GA 中估计梯度以进行初始化时的最大输入长度。
- lora_ga_direction: 默认值为 `ArB2r`。在 LoRA-GA 中使用估计梯度进行初始化时的初始方向。允许的值有:`ArBr`、`A2rBr`、`ArB2r` 和 `random`。
- lora_ga_scale: 默认值为 `stable`。LoRA-GA 的初始化缩放方式。允许的值有:`gd`、`unit`、`stable` 和 `weightS`。
- lora_ga_stable_gamma: 默认值为 `16`。当初始化时选择 `stable` 缩放时的 gamma 值。
#### FourierFt
FourierFt使用`target_modules`、`target_regex`、`modules_to_save`三个参数,含义见上面文档中的描述。额外参数包括:
- fourier_n_frequency: 傅里叶变换的频率数量, `int`类型, 类似于LoRA中的`r`. 默认值`2000`.
- fourier_scaling: W矩阵的缩放值, `float`类型, 类似LoRA中的`lora_alpha`. 默认值`300.0`.
#### BOFT
BOFT使用`target_modules`、`target_regex`、`modules_to_save`三个参数,含义见上面文档中的描述。额外参数包括:
- boft_block_size: BOFT块尺寸, 默认值4.
- boft_block_num: BOFT块数量, 不能和`boft_block_size`同时使用.
- boft_dropout: boft的dropout值, 默认0.0.
#### Vera
Vera使用`target_modules`、`target_regex`、`modules_to_save`三个参数,含义见上面文档中的描述。额外参数包括:
- vera_rank: Vera Attention的尺寸, 默认值256.
- vera_projection_prng_key: 是否存储Vera映射矩阵, 默认为True.
- vera_dropout: Vera的dropout值, 默认`0.0`.
- vera_d_initial: Vera的d矩阵的初始值, 默认`0.1`.
#### GaLore
- 🔥use_galore: 默认值False, 是否使用GaLore.
- galore_target_modules: 默认值None, 不传的情况下对attention和mlp应用GaLore.
- galore_rank: 默认值128, GaLore的rank值.
- galore_update_proj_gap: 默认值50, 分解矩阵的更新间隔.
- galore_scale: 默认值1.0, 矩阵权重系数.
- galore_proj_type: 默认值`std`, GaLore矩阵分解类型.
- galore_optim_per_parameter: 默认值False, 是否给每个Galore目标Parameter设定一个单独的optimizer.
- galore_with_embedding: 默认值False, 是否对embedding应用GaLore.
- galore_quantization: 是否使用q-galore. 默认值`False`.
- galore_proj_quant: 是否对SVD分解矩阵做量化, 默认`False`.
- galore_proj_bits: SVD量化bit数.
- galore_proj_group_size: SVD量化分组数.
- galore_cos_threshold: 投影矩阵更新的cos相似度阈值. 默认值0.4.
- galore_gamma_proj: 在投影矩阵逐渐相似后会拉长更新间隔, 本参数为每次拉长间隔的系数, 默认值2.
- galore_queue_size: 计算投影矩阵相似度的队列长度, 默认值5.
#### LISA
注意: LISA仅支持全参数,即`--tuner_type full`。
- 🔥lisa_activated_layers: 默认值`0`,代表不使用LISA,改为非0代表需要激活的layers个数,建议设置为2或8。
- lisa_step_interval: 默认值`20`,多少iter切换可反向传播的layers。
#### UNSLOTH
🔥unsloth无新增参数,对已有参数进行调节即可支持,例如:
```
--tuner_backend unsloth
--tuner_type full/lora
--quant_bits 4
```
#### LLAMAPRO
- 🔥llamapro_num_new_blocks: 默认值`4`, 插入的新layers总数.
- llamapro_num_groups: 默认值`None`, 分为多少组插入new_blocks, 如果为`None`则等于`llamapro_num_new_blocks`, 即每个新的layer单独插入原模型.
#### AdaLoRA
以下参数`tuner_type`设置为`adalora`时生效. adalora的`target_modules`等参数继承于lora的对应参数,但`lora_dtype`参数不生效。
- adalora_target_r: 默认值`8`, adalora的平均rank.
- adalora_init_r: 默认值`12`, adalora的初始rank.
- adalora_tinit: 默认值`0`, adalora的初始warmup.
- adalora_tfinal: 默认值`0`, adalora的final warmup.
- adalora_deltaT: 默认值`1`, adalora的step间隔.
- adalora_beta1: 默认值`0.85`, adalora的EMA参数.
- adalora_beta2: 默认值`0.85`, adalora的EMA参数.
- adalora_orth_reg_weight: 默认值`0.5`, adalora的正则化参数.
#### ReFT
以下参数`tuner_type`设置为`reft`时生效.
> 1. ReFT无法合并tuner
> 2. ReFT和gradient_checkpointing不兼容
> 3. 如果使用DeepSpeed遇到问题请暂时卸载DeepSpeed
- 🔥reft_layers: ReFT应用于哪些层上, 默认为`None`, 代表所有层, 可以输入层号的list, 例如`reft_layers 1 2 3 4`
- 🔥reft_rank: ReFT矩阵的rank, 默认为`4`.
- reft_intervention_type: ReFT的类型, 支持'NoreftIntervention', 'LoreftIntervention', 'ConsreftIntervention', 'LobireftIntervention', 'DireftIntervention', 'NodireftIntervention', 默认为`LoreftIntervention`.
- reft_args: ReFT Intervention中的其他支持参数, 以json-string格式输入.
### vLLM参数
参数含义可以查看[vllm文档](https://docs.vllm.ai/en/latest/serving/engine_args.html)。
- 🔥vllm_gpu_memory_utilization: GPU内存比例,取值范围为0到1。默认值`0.9`。
- 🔥vllm_tensor_parallel_size: tp并行数,默认为`1`。
- vllm_pipeline_parallel_size: pp并行数,默认为`1`。
- vllm_data_parallel_size: dp并行数,默认为`1`,在`swift deploy/rollout`命令中生效。
- 若在`swift infer`中,使用`NPROC_PER_NODE`来设置dp并行数。参考这里的[例子](https://github.com/modelscope/ms-swift/blob/main/examples/infer/vllm/mllm_ddp.sh)。
- vllm_enable_expert_parallel: 开启专家并行,默认为False。
- vllm_max_num_seqs: 单次迭代中处理的最大序列数,默认为`256`。
- 🔥vllm_max_model_len: 模型支持的最大长度。默认为`None`,即从config.json中读取。
- vllm_disable_custom_all_reduce: 禁用自定义的 all-reduce 内核,回退到 NCCL。为了稳定性,默认为`True`。
- vllm_enforce_eager: vllm使用pytorch eager模式还是建立cuda graph,默认为`False`。设置为True可以节约显存,但会影响效率。
- vllm_mm_processor_cache_gb: 多模态处理器缓存大小(GiB),用于缓存已处理的多模态输入(如图像、视频)避免重复处理。默认为`4`。设置为`0`可禁用缓存但会降低性能(不推荐)。仅对多模态模型生效。
- vllm_speculative_config: 推测解码配置,传入json字符串。默认为None。
- vllm_disable_cascade_attn: 是否强制关闭V1引擎的cascade attention实现以防止潜在数值误差,默认为False,由vLLM内部逻辑决定是否使用。
- 🔥vllm_limit_mm_per_prompt: 控制vllm使用多图,默认为`None`。例如传入`--vllm_limit_mm_per_prompt '{"image": 5, "video": 2}'`。
- vllm_max_lora_rank: 默认为`16`。vllm对于lora支持的参数。
- vllm_quantization: vllm可以在内部量化模型,参数支持的值详见[这里](https://docs.vllm.ai/en/latest/serving/engine_args.html)。
- 🔥vllm_enable_prefix_caching: 开启vllm的自动前缀缓存,节约重复查询前缀的处理时间,加快推理效率。默认为`None`,跟随vLLM行为。
- vllm_use_async_engine: vLLM backend下是否使用async engine。默认为None,会根据场景自动设置:encode任务(embedding、seq_cls、reranker、generative_reranker)默认为True,部署场景(swift deploy)默认为True,其他情况默认为False。注意:encode任务需使用async engine。
- vllm_reasoning_parser: 推理解析器类型,用于思考模型的思维链内容解析。默认为`None`。仅用于 `swift deploy` 命令。可选的种类参考[vLLM文档](https://docs.vllm.ai/en/latest/features/reasoning_outputs.html#streaming-chat-completions)。
- vllm_engine_kwargs: vllm的额外参数,格式为json字符串。默认为None。
### SGLang参数
参数含义可以查看[sglang文档](https://docs.sglang.ai/backend/server_arguments.html)。
- 🔥sglang_tp_size: tp数。默认为1。
- sglang_pp_size: pp数。默认为1。
- sglang_dp_size: dp数。默认为1。
- sglang_ep_size: ep数。默认为1。
- sglang_enable_ep_moe: 是否启用ep moe。默认为False。该参数已在最新sglang中移除。
- sglang_mem_fraction_static: 用于静态分配模型权重和KV缓存内存池的GPU内存比例。如果你遇到GPU内存不足错误,可以尝试降低该值。默认为None。
- sglang_context_length: 模型的最大上下文长度。默认为 None,将使用模型的`config.json`中的值。
- sglang_disable_cuda_graph: 禁用CUDA图。默认为False。
- sglang_quantization: 量化方法。默认为None。
- sglang_kv_cache_dtype: 用于k/v缓存存储的数据类型。'auto'表示将使用模型的数据类型。'fp8_e5m2'和'fp8_e4m3'适用于CUDA 11.8及以上版本。默认为'auto'。
- sglang_enable_dp_attention: 为注意力机制启用数据并行,为前馈网络(FFN)启用张量并行。数据并行的规模(dp size)应等于张量并行的规模(tp size)。目前支持DeepSeek-V2/3以及Qwen2/3 MoE模型。默认为False。
- sglang_disable_custom_all_reduce: 禁用自定义的 all-reduce 内核,回退到 NCCL。为了稳定性,默认为True。
- sglang_speculative_algorithm: 推测算法,可选值:None、"EAGLE"、"EAGLE3"、"NEXTN"、"STANDALONE"、"NGRAM"。默认为None。
- sglang_speculative_num_steps: 在推测解码中从草稿模型采样的步数。默认值为None。
- sglang_speculative_eagle_topk: 在 EAGLE2 算法中每步从草稿模型采样的 token 数量。默认值为None。
- sglang_speculative_num_draft_tokens: 在推测解码中从草稿模型采样的 token 数量。默认值为None。
### LMDeploy参数
参数含义可以查看[lmdeploy文档](https://lmdeploy.readthedocs.io/en/latest/api/pipeline.html#turbomindengineconfig)。
- 🔥lmdeploy_tp: tensor并行度。默认为`1`。
- lmdeploy_session_len: 最大会话长度。默认为`None`。
- lmdeploy_cache_max_entry_count: k/v缓存占用的GPU内存百分比。默认为`0.8`。
- lmdeploy_quant_policy: 默认为0。当需要将k/v量化为4或8位时,分别将其设置为4或8。
- lmdeploy_vision_batch_size: 传入VisionConfig的max_batch_size参数。默认为`1`。
### 合并参数
- 🔥merge_lora: 是否合并lora,本参数支持lora、llamapro、longlora,默认为False。例子参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/export/merge_lora.sh)。
- safe_serialization: 是否存储为safetensors,默认为True。
- max_shard_size: 单存储文件最大大小,默认'5GB'。
## 集成参数
### 训练参数
训练参数除包含[基本参数](#基本参数)、[Seq2SeqTrainer参数](#Seq2SeqTrainer参数)、[tuner参数](#tuner参数)外,还包含下面的部分:
- add_version: 在`output_dir`上额外增加目录`'<版本号>-<时间戳>'`防止权重覆盖,默认为True。
- check_model: 检查本地模型文件有损坏或修改并给出提示,默认为True。**如果是断网环境,请设置为False**。
- 🔥create_checkpoint_symlink: 额外创建checkpoint软链接,方便书写自动化训练脚本。best_model和last_model的软链接路径分别为f'{output_dir}/best'和f'{output_dir}/last'。
- 🔥packing: 使用`padding_free`的方式将不同长度的数据样本打包成**近似**统一长度的样本(packing能保证不对完整的序列进行切分),实现训练时各节点与进程的负载均衡(避免长文本拖慢短文本的训练速度),从而提高GPU利用率,保持显存占用稳定。当使用 `--attn_impl flash_attn` 时,可确保packed样本内的不同序列之间相互独立,互不可见。该参数默认为`False`,目前支持 CPT/SFT/DPO/KTO/GKD以及embedding/reranker/seq_cls任务的packing。注意:**packing会导致数据集样本数减少,请自行调节梯度累加数和学习率**。
- packing_length: packing的长度。默认为None,设置为max_length。
- packing_num_proc: packing的进程数,默认为1。需要注意的是,不同的`packing_num_proc`,最终形成的packed数据集是不同的。(该参数在流式packing时不生效)。通常不需要修改该值,packing速度远快于tokenize速度。
- lazy_tokenize: 是否使用lazy_tokenize。若该参数设置为False,则在训练之前对所有的数据集样本进行tokenize(多模态模型则包括从磁盘中读取图片)。该参数默认为None,在LLM训练中默认为False,而MLLM训练默认为True,节约内存。
- 注意:若你要进行图像的数据增强,你需要将lazy_tokenize(或streaming)设置为True,并修改Template类中的encode方法。
- use_logits_to_keep: 通过在`forward`中根据labels传入logits_to_keep,减少无效logits的计算与存储,从而减少显存占用并加快训练速度。默认为None,进行自动选择。
- acc_strategy: 训练和验证时计算acc的策略。可选为`seq`和`token`级别的acc,默认为`token`。
- max_new_tokens: 覆盖生成参数。predict_with_generate=True时的最大生成token数量,默认64。
- temperature: 覆盖生成参数。predict_with_generate=True时的temperature,默认0。
- optimizer: 使用的optimizers插件(优先级高于`--optim`),默认为None。可选optimizers参考[这里](https://github.com/modelscope/ms-swift/blob/main/swift/optimizers/mapping.py)。
- loss_type: 自定义的loss_type名称。默认为None,使用模型自带损失函数。可选loss参考[这里](https://github.com/modelscope/ms-swift/blob/main/swift/loss/mapping.py)。
- eval_metric: 自定义eval metric名称。默认为None。可选eval_metric参考[这里](https://github.com/modelscope/ms-swift/blob/main/swift/eval_metric/mapping.py)。
- 关于默认值:当`task_type`为'causal_lm', 且`predict_with_generate=True`的情况下默认设置为'nlg'。`task_type` 为'embedding',根据loss_type,默认值为'infonce' 或 'paired'。`task_type`为'reranker/generative_reranker',默认值为'reranker'。
- callbacks: 自定义trainer callback,默认为`[]`。可选callbacks参考[这里](https://github.com/modelscope/ms-swift/blob/main/swift/callbacks/mapping.py)。例如:通过在`callbacks`中添加`deepspeed_elastic`(可选`graceful_exit`)可以来启用弹性训练。参考[Elastic示例](../BestPractices/Elastic.md)
- early_stop_interval: 早停的间隔,会检验best_metric在early_stop_interval个周期内(基于`save_steps`, 建议`eval_steps`和`save_steps`设为同值)没有提升时终止训练。具体代码在[early_stop.py](https://github.com/modelscope/ms-swift/blob/main/swift/callbacks/early_stop.py)中。同时,如果有较为复杂的早停需求,直接覆盖callback.py中的已有实现即可。设置该参数时,自动加入`early_stop`的trainer callback。
- eval_use_evalscope: 是否使用evalscope进行训练时评测,需要设置该参数来开启评测,具体使用参考[示例](../Instruction/Evaluation.md#训练中评测)。
- eval_dataset: 评测数据集,可设置多个数据集,用空格分割。
- eval_dataset_args: 评测数据集参数,json格式,可设置多个数据集的参数。
- eval_limit: 评测数据集采样数。
- eval_generation_config: 评测时模型推理配置,json格式,默认为`{'max_tokens': 512}`。
- use_flash_ckpt: 是否启用[DLRover Flash Checkpoint](https://github.com/intelligent-machine-learning/dlrover)的flash checkpoint。默认为`false`,启用后,权重会先保存至共享内存,之后异步持久化;建议搭配`PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"` 一起使用,避免训练过程CUDA OOM。
#### SWANLAB
- swanlab_token: SwanLab的api-key。你也可以使用`SWANLAB_API_KEY`环境变量指定。
- swanlab_project: swanlab的project,可以在页面中预先创建[https://swanlab.cn/space/~](https://swanlab.cn/space/~)或自动创建,默认为"ms-swift"。
- swanlab_workspace: 默认为None,会使用api-key对应的username。
- swanlab_exp_name: 实验名,可以为空,为空时默认传入--output_dir的值。
- swanlab_notification_method: 在训练完成/发生错误时,swanlab的通知方式,具体参考[这里](https://docs.swanlab.cn/plugin/notification-dingtalk.html)。支持'dingtalk'、'lark'、'email'、'discord'、'wxwork'、'slack'。
- swanlab_webhook_url: 默认为None。swanlab的`swanlab_notification_method`对应的 webhook url。
- swanlab_secret: 默认为None。swanlab的`swanlab_notification_method`对应的 secret。
- swanlab_mode: 可选cloud和local,云模式或者本地模式。
### RLHF参数
RLHF参数继承于[训练参数](#训练参数)。
- 🔥rlhf_type: 人类对齐算法类型,支持'dpo'、'orpo'、'simpo'、'kto'、'cpo'、'rm'、'ppo'、'grpo'和'gkd'。默认为'dpo'。
- ref_model: 采用dpo、kto、ppo、grpo算法且使用全参数训练时需要传入。默认为None,设置为`--model`。
- ref_adapters: 默认为`[]`。若你要使用SFT产生的LoRA权重进行DPO/KTO/GRPO,请在训练时设置`--adapters sft_ckpt --ref_adapters sft_ckpt`。若是此场景的断点续训,则设置`--resume_from_checkpoint rlhf_ckpt --ref_adapters sft_ckpt`。
- ref_model_type: 同model_type。默认为None。
- ref_model_revision: 同model_revision。默认为None。
- 🔥beta: 控制与参考模型偏差程度的参数。beta值越高,表示与参考模型的偏差越小。默认为`None`,使用不同rlhf算法的默认值不同,其中`simpo`算法默认为`2.`,GRPO默认为`0.04`,GKD默认为0.5,其他算法默认为`0.1`。具体参考[文档](./RLHF.md)。
- label_smoothing: 是否使用DPO smoothing,默认值为`0`。
- max_completion_length: GRPO/PPO/GKD算法中的最大生成长度,默认为512。
- 🔥rpo_alpha: 来自[RPO 论文](https://arxiv.org/abs/2404.19733)中的参数,用于控制损失函数中NLL项的权重(即SFT损失),`loss = dpo_loss + rpo_alpha * sft_loss`,论文中推荐设置为`1.`。默认为`None`,即默认不引入sft_loss。
- ld_alpha: 来自[LD-DPO 论文](https://arxiv.org/abs/2409.06411),对超出公共前缀部分的logps加权 $\alpha$ 抑制长度偏好。
- discopop_tau: 来自 [DiscoPOP 论文](https://arxiv.org/abs/2406.08414)的温度参数 $\tau$ ,用于缩放 log-ratio。默认值0.05。在 loss_type 为 discopop 时生效。
- loss_type: 损失类型。默认为None,使用不同的rlhf算法,其默认值不同。
- DPO: 可选项参考[文档](https://huggingface.co/docs/trl/main/en/dpo_trainer#loss-functions),支持传入多个值实现混合训练([MPO](https://arxiv.org/abs/2411.10442)), 传入多个值时需要设置参数 loss_weights。默认为`sigmoid`。
- GRPO: 参考[GRPO参数](#grpo参数)。
- loss_weights: 在 DPO 训练中设置多个 loss_type 时,用于指定各个损失项的权重。
- cpo_alpha: CPO/SimPO loss 中 nll loss的系数, 默认为`1.`。
- simpo_gamma: SimPO算法中的reward margin项,论文建议设置为0.5-1.5,默认为`1.`。
- desirable_weight: KTO算法中用于抵消 desirable 和 undesirable 数量不均衡的影响,对 desirable 损失按该系数进行加权,默认为`1.`。
- undesirable_weight: KTO算法中用于抵消 desirable 和 undesirable 数量不均衡的影响,对 undesirable 损失按该系数进行加权,默认为`1.`。
- center_rewards_coefficient: 用于RM训练。用于激励奖励模型输出均值为零的奖励的系数,具体查看这篇[论文](https://huggingface.co/papers/2312.09244)。推荐值:0.01。
- loss_scale: 覆盖模板参数。rlhf训练时,默认为'last_round'。
- temperature: 默认为0.9,该参数将在PPO、GRPO、GKD中使用。
#### GKD参数
- lmbda: 默认为0.5。该参数在GKD中使用。控制学生数据比例的 lambda 参数(即策略内学生生成输出所占的比例)。若lmbda为0,则不使用学生生成数据。
- sft_alpha: 默认为0。控制GKD中加入sft_loss的权重。最后的loss为`gkd_loss + sft_alpha * sft_loss`。
- seq_kd: 默认为False。该参数在GKD中使用。控制是否执行序列级知识蒸馏(Sequence-Level KD)的 seq_kd 参数(可视为对教师模型生成输出的监督式微调)。
- 注意:你可以提前对数据集内容使用teacher模型进行推理(使用vllm/sglang/lmdeploy等推理引擎加速),并在训练时将`seq_kd`设置为False。或者将`seq_kd`设置为True,在训练时使用teacher模型生成序列(能保证多个epoch生成数据的不同,但效率较慢)。
- offload_teacher_model: 卸载教师模型以节约显存,只在采样/计算logps时加载,默认为False。
- truncation_strategy: 用于处理输入长度超过 max_length 的样本,支持 delete 和 left 两种策略,分别表示删除该样本和从左侧裁剪。默认值为 left。若使用 delete 策略,被删除的超长样本或编码失败的样本将在原数据集中通过重采样进行替换。
- log_completions: 是否记录训练中的模型生成内容,搭配 `--report_to wandb/swanlab` 使用。默认为False。
- 提示:若没有设置`--report_to wandb/swanlab`,则会在checkpoint中创建`completions.jsonl`来存储生成内容。
- 仅记录 vLLM 采样结果。
#### Reward/Teacher模型参数
reward模型参数将在PPO、GRPO中使用。
- reward_model: 默认为None。
- reward_adapters: 默认为`[]`。
- reward_model_type: 默认为None。
- reward_model_revision: 默认为None。
- teacher_model: 默认为None。rlhf_type为'gkd'时需传入此参数。
- teacher_adapters: 默认为`[]`。
- teacher_model_type: 默认为None。
- teacher_model_revision: 默认为None。
- teacher_deepspeed: 同 deepspeed 参数,控制 teacher model 的 deepspeed 配置,默认使用训练模型的 deepspeed 配置。
#### PPO参数
以下参数含义可以参考[这里](https://huggingface.co/docs/trl/main/ppo_trainer)。
- num_ppo_epochs: 默认为4。
- whiten_rewards: 默认为False。
- kl_coef: 默认为0.05。
- cliprange: 默认为0.2。
- vf_coef: 默认为0.1。
- cliprange_value: 默认为0.2。
- gamma: 默认为1.0。
- lam: 默认为0.95。
- num_mini_batches: 默认为1。
- local_rollout_forward_batch_size: 默认为64。
- num_sample_generations: 默认为10。
- missing_eos_penalty: 默认为None。
#### GRPO参数
- beta: KL正则系数,默认为0.04,设置为0时不加载ref model。
- per_device_train_batch_size: 每个设备训练批量大小,在GRPO中,指 completion 的批次大小。
- per_device_eval_batch_size: 每个设备评估批量大小,在GRPO中,指 completion 的批次大小。
- steps_per_generation: 每轮生成的优化步数,默认等于 gradient_accumulation_steps。与 generation_batch_size 只能同时设置一个。
- generation_batch_size: 总的采样 completion 批量大小,需要是 num_processes * per_device_train_batch_size 的倍数,默认等于 per_device_train_batch_size * steps_per_generation * num_processes。
- num_generations: 每个prompt采样的数量,论文中的G值,generation_batch_size 必须能被 num_generations 整除。默认为 8。
- num_generations_eval: 评估阶段每个prompt采样的数量。允许在评估时使用较少的生成数量以节省计算资源。如果为 None,则使用 num_generations 的值。默认为 None。
- ds3_gather_for_generation: 该参数适用于DeepSpeed ZeRO-3。如果启用,策略模型权重将被收集用于生成,从而提高生成速度。然而,禁用此选项允许训练超出单个GPU VRAM的模型,尽管生成速度会变慢。禁用此选项与vLLM生成不兼容。默认为True。
- reward_funcs: GRPO算法奖励函数,可选项为`accuracy`、`format`、`cosine`、`repetition`和`soft_overlong`,见swift/rewards/orm.py。你也可以在plugin中自定义自己的奖励函数。默认为`[]`。
- reward_weights: 每个奖励函数的权重。必须与奖励函数和奖励模型的总数量匹配。如果为 None,则所有奖励的权重都相等,为`1.0`。
- 提示:如果GRPO训练中包含`--reward_model`,则其加在奖励函数的最后位置。
- reward_model_plugin: 奖励模型逻辑,默认为orm逻辑, 详细见[自定义奖励模型](./GRPO/DeveloperGuide/reward_model.md#自定义奖励模型)。
- dataset_shuffle: 是否对dataset进行随机操作,默认为True。
- truncation_strategy: 用于处理输入长度超过 max_length 的样本,支持 delete 和 left 两种策略,分别表示删除该样本和从左侧裁剪。默认值为 left。若使用 delete 策略,被删除的超长样本或编码失败的样本将在原数据集中通过重采样进行替换。
- loss_type: loss 归一化的类型,可选项为['grpo', 'bnpo', 'dr_grpo', 'dapo', 'cispo', 'sapo'], 默认为'grpo', 具体参考[文档](./GRPO/DeveloperGuide/loss_types.md)
- log_completions: 是否记录训练中的模型生成内容,搭配 `--report_to wandb/swanlab` 使用。默认为False。
- 提示:若没有设置`--report_to wandb/swanlab`,则会在checkpoint中创建`completions.jsonl`来存储生成内容。
- use_vllm: 是否使用 vLLM 作为 GRPO 生成的 infer_backend,默认为False。
- vllm_mode: vLLM 集成模式,可选项为 `server` 和 `colocate`。server 模式使用 `swift rollout` 拉起的 vLLM 服务器进行采样,colocate 模式在程序内部署 vLLM。两种模式各自的参数见下文。
- vllm_mode server 参数
- vllm_server_host: vLLM server host地址,默认为None。
- vllm_server_port: vLLM server 服务端口,默认为8000。
- vllm_server_base_url: vLLM server的Base URL(比如 http://localhost:8000), 默认为None。设置后,忽略host和port设置。
- vllm_server_group_port: vllm server 内部通信端口,除非端口被占用,一般无需设置,默认为51216。
- vllm_server_timeout: 连接vLLM server的超时时间,默认为 240s。
- vllm_server_pass_dataset: 透传额外的数据集信息到vLLM server,用于多轮训练。
- async_generate: 异步rollout以提高训练速度,注意开启时采样会使用上一轮更新的模型进行采样,不支持多轮场景。默认`false`.
- enable_flattened_weight_sync: 是否使用 flattened tensor 进行权重同步。启用后会将多个参数打包为单个连续 tensor 进行传输,可提升同步效率,在 Server Mode 下生效,默认为 True。
- SWIFT_UPDATE_WEIGHTS_BUCKET_SIZE: 环境变量,用于控制flattened tensor 权重同步时的传输桶大小(bucket size),适用于 Server Mode 下的全参数训练,单位为 MB,默认值为 512 MB。
- vllm_mode colocate 参数(更多参数支持参考[vLLM参数](#vLLM参数)。)
- vllm_gpu_memory_utilization: vllm透传参数,默认为0.9。
- vllm_max_model_len: vllm透传参数,默认为None。
- vllm_enforce_eager: vllm透传参数,默认为False。
- vllm_limit_mm_per_prompt: vllm透传参数,默认为None。
- vllm_enable_prefix_caching: vllm透传参数,默认为True。
- vllm_tensor_parallel_size: tp并行数,默认为`1`。
- vllm_enable_lora: 支持vLLM Engine 加载 LoRA adapter,默认为False。用于加速LoRA训练的权重同步,具体参考[文档](./GRPO/GetStarted/GRPO.md#权重同步加速)。
- sleep_level: 训练时释放 vLLM 显存,可选项为[0, 1, 2], 默认为0,不释放。
- offload_optimizer: 是否在vLLM推理时offload optimizer参数,默认为False。
- offload_model: 是否在vLLM推理时 offload 模型,默认为False。
- completion_length_limit_scope: 在多轮对话中,`max_completion_length` 的限制范围。
`total`限制所有对话轮次的总输出长度不超过`max_completion_length`, `per_round`限制每一轮的输出长度。
- num_iterations: 每条数据的更新次数,[GRPO论文](https://arxiv.org/abs/2402.03300)中的 $\mu$ 值,默认为1。
- epsilon: clip 系数,默认为0.2。
- epsilon_high: upper clip 系数,默认为None,设置后与epsilon共同构成[epsilon, epsilon_high]裁剪范围。
- tau_pos: [SAPO](https://arxiv.org/abs/2511.20347)算法中正优势的温度参数,控制软门控函数的锐度。较大值使门控更锐利(接近硬裁剪),较小值使门控更平滑。默认为1.0。
- tau_neg: SAPO算法中负优势的温度参数,控制软门控函数的锐度。通常设置`tau_neg > tau_pos`以对负优势施加更强约束。默认为1.05。
- dynamic_sample: 筛除group内奖励标准差为0的数据,额外采样新数据,默认为False。
- max_resample_times: dynamic_sample设置下限制重采样次数,默认3次。
- overlong_filter: 跳过超长截断的样本,不参与loss计算,默认为False。
- delta: [INTELLECT-2 tech report](https://huggingface.co/papers/2505.07291)中双侧 GRPO 上界裁剪值。若设置,建议大于 1 + epsilon。默认为None。
- importance_sampling_level: 控制重要性采样比计算,可选项为 `token` 和 `sequence`,`token` 模式下保留原始的每个 token 的对数概率比,`sequence` 模式下则会对序列中所有有效 token 的对数概率比进行平均。[GSPO论文](https://arxiv.org/abs/2507.18071)中使用sequence级别计算来稳定训练,默认为`token`。
- advantage_estimator: 优势计算函数,默认为 `grpo`,即计算组内相对优势,可选项为 `grpo`、[`rloo`](./GRPO/AdvancedResearch/RLOO.md)、[`reinforce_plus_plus`](./GRPO/AdvancedResearch/REINFORCEPP.md)。
- kl_in_reward: 控制 KL 散度正则项的处理位置;`false`表示作为损失函数的独立正则项,`true`表示将 KL 直接并入奖励(从奖励中扣除)。默认情况与advantage_estimator绑定,`grpo`下默认为`false`,`rloo` 和 `reinforce_plus_plus` 下默认为 `true`。
- scale_rewards: 指定奖励的缩放策略。可选值包括 `group`(按组内标准差缩放)、`batch`(按整个批次的标准差缩放)、`none`(不进行缩放)、`gdpo`(对每个奖励函数分别进行组内归一化后加权聚合,参考 [GDPO 论文](https://arxiv.org/abs/2601.05242))。在 ms-swift < 3.10 版本中,该参数为布尔类型,`true` 对应 `group`,`false` 对应 `none`。默认值与 `advantage_estimator` 绑定:`grpo` 对应 `group`,`rloo` 对应 `none`,`reinforce_plus_plus` 对应 `batch`。
- 注意:`gdpo` 模式不支持 `kl_in_reward=True`,若同时设置会自动将 `kl_in_reward` 设为 `False`。
- GDPO 适用于多奖励优化场景:当使用多个奖励函数时,GDPO 会对每个奖励函数分别在组内进行标准化(减均值、除标准差),然后使用 `reward_weights` 进行加权求和,最后再进行批次级别的标准化。这种方式可以更好地保留各个奖励的相对差异,避免不同奖励组合坍塌成相同的 advantage 值。
- sync_ref_model: 是否定期同步ref_model,默认为False。
- ref_model_mixup_alpha: 控制在更新过程中model和先前ref_model之间的混合。更新公式为 $π_{ref} = α * π_θ + (1 - α) * π_{ref_{prev}}$。默认为0.6。
- ref_model_sync_steps: 同步频率,默认为512。
- move_model_batches: 在模型向vLLM等快速推理框架移动参数时,将layers分为多少个batch. 默认为None, 代表整个模型不进行拆分,否则拆分为move_model_batches+1(非layer参数)+1(多模态部分参数)个。
- multi_turn_scheduler: 多轮GRPO参数, 传入对应的plugin名称, 同时在plugin/multi_turn.py中添加好对应的实现。
- max_turns: 多轮GRPO的轮数上限。默认为None,不做限制。
- top_entropy_quantile: 仅对熵值处于前指定分位的 token 参与损失计算,默认为1.0,即不过滤低熵 token,具体参考[文档](./GRPO/AdvancedResearch/entropy_mask.md)
- log_entropy: 记录训练中的熵值变化动态,默认为False,具体参考[文档](./GRPO/GetStarted/GRPO.md#logged-metrics)
- rollout_importance_sampling_mode: 训推不一致校正模式,可选项为 `token_truncate`、`token_mask`、`sequence_truncate`、`sequence_mask`。默认为None,不启用校正。具体参考[文档](./GRPO/AdvancedResearch/training_inference_mismatch.md)
- rollout_importance_sampling_threshold: 重要性采样权重的阈值,用于截断或屏蔽极端权重。默认为2.0。
- log_rollout_offpolicy_metrics: 当 `rollout_importance_sampling_mode` 未设置时,是否记录训推不一致诊断指标(KL、PPL、χ²等)。当设置了 `rollout_importance_sampling_mode` 时,指标会自动记录。默认为False。
- off_policy_sequence_mask_delta: Off-Policy Sequence Masking 阈值,来自 [DeepSeek-V3.2 论文](https://arxiv.org/abs/2512.02556)。当设置此值时,会计算每个序列的 `mean(old_policy_logps - policy_logps)`,若该值大于阈值且该序列的优势为负,则 mask 掉该序列不参与损失计算。具体参考[文档](./GRPO/AdvancedResearch/training_inference_mismatch.md#off-policy-sequence-masking)
##### 奖励函数参数
内置的奖励函数参考[文档](./GRPO/DeveloperGuide/reward_function.md)
cosine 奖励参数
- cosine_min_len_value_wrong: cosine 奖励函数参数,生成错误答案时,最小长度对应的奖励值。默认值为-0.5。
- cosine_max_len_value_wrong: 生成错误答案时,最大长度对应的奖励值。默认值为0.0。
- cosine_min_len_value_correct: 生成正确答案时,最小长度对应的奖励值。默认值为1.0。
- cosine_max_len_value_correct: 生成正确答案时,最大长度对应的奖励值。默认值为0.5。
- cosine_max_len: 生成文本的最大长度限制。默认等于 max_completion_length。
repetition 奖励参数
- repetition_n_grams: 用于检测重复的 n-gram 大小。默认值为3。
- repetition_max_penalty: 最大惩罚值,用于控制惩罚的强度。默认值为-1.0。
soft overlong 奖励参数
- soft_max_length: 论文中的L_max,模型的最大生成长度,默认等于max_completion_length。
- soft_cache_length: 论文中的L_cache,控制长度惩罚区间,区间为[soft_max_length-soft_cache_length, soft_max_length]。
### 推理参数
推理参数除包含[基本参数](#基本参数)、[合并参数](#合并参数)、[vLLM参数](#vllm参数)、[LMDeploy参数](#LMDeploy参数)外,还包含下面的部分:
- 🔥infer_backend: 推理加速后端,支持'transformers'、'vllm'、'sglang'、'lmdeploy'四种推理引擎。默认为'transformers'。
- 注意:这四种引擎使用的都是swift的template,使用`--template_backend`控制。
- 🔥max_batch_size: 指定infer_backend为'transformers'时生效,用于批量推理,默认为1。若设置为-1,则不受限制。
- 🔥result_path: 推理结果存储路径(jsonl),默认为None。如果对数据集进行推理/评测,则默认保存在checkpoint目录(含args.json文件)或者'./result'目录,最终存储路径会在命令行中打印(交互式推理或部署默认不存储结果)。
- 注意:若已存在`result_path`文件,则会进行追加写入。
- write_batch_size: 结果写入`result_path`的batch_size。默认为1000。若设置为-1,则不受限制。
- metric: 对推理的结果进行评估,目前支持'acc'和'rouge'。默认为None,即不进行评估。
- val_dataset_sample: 推理数据集采样数,默认为None。
- reranker_use_activation: 在reranker推理时,是否在score之后使用sigmoid,默认为True。
### 部署参数
部署参数继承于[推理参数](#推理参数)。
- host: 服务host,默认为'0.0.0.0'。
- port: 端口号,默认为8000。
- api_key: 访问需要使用的api_key,默认为None。
- owned_by: 默认为`swift`。
- 🔥served_model_name: 提供服务的模型名称,默认使用model的后缀。
- verbose: 打印详细日志,默认为True。
- 注意:在`swift app`或者`swift eval`时,默认为False。
- log_interval: tokens/s统计值打印间隔,默认20秒。设置为-1则不打印。
- max_logprobs: 最多返回客户端的logprobs数量,默认为20。
### Rollout参数
Rollout参数继承于[部署参数](#部署参数)
- multi_turn_scheduler: 多轮GRPO训练规划器,传入对应的plugin名称, 同时在plugin/multi_turn.py中添加好对应的实现。默认为None,具体参考[文档](./GRPO/DeveloperGuide/multi_turn.md)。
- max_turns: 多轮GRPO训练下的最大轮数,默认为None,即不做约束。
- vllm_enable_lora: 支持vLLM Engine 加载 LoRA adapter,默认为False。用于加速LoRA训练的权重同步,具体参考[文档](./GRPO/GetStarted/GRPO.md#权重同步加速)。
- vllm_max_lora_rank: vLLM Engine LoRA参数,需大于等于训练的lora_rank,建议等于。默认为16。
### Web-UI参数
- server_name: web-ui的host,默认为'0.0.0.0'。
- server_port: web-ui的port,默认为7860。
- share: 默认为False。
- lang: web-ui的语言,可选为'zh', 'en'。默认为'zh'。
### App参数
App参数继承于[部署参数](#部署参数), [Web-UI参数](#web-ui参数)。
- base_url: 模型部署的base_url,例如`http://localhost:8000/v1`。默认为`None`,使用本地部署。
- studio_title: studio的标题。默认为None,设置为模型名。
- is_multimodal: 是否启动多模态版本的app。默认为None,自动根据model判断,若无法判断,设置为False。
- lang: 覆盖Web-UI参数,默认为'en'。
### 评测参数
评测参数继承于[部署参数](#部署参数)。
- 🔥eval_backend: 评测后端,默认为'Native',也可以指定为'OpenCompass'或'VLMEvalKit'。
- 🔥eval_dataset: 评测数据集,请查看[评测文档](./Evaluation.md)。
- eval_limit: 每个评测集的采样数,默认为None。
- eval_output_dir: 评测存储结果的文件夹,默认为'eval_output'。
- temperature: 覆盖生成参数,默认为0。
- eval_num_proc: 评测时客户端最大并发数,默认为16。
- eval_url: 评测url,例如`http://localhost:8000/v1`。例子可以查看[这里](https://github.com/modelscope/ms-swift/tree/main/examples/eval/eval_url)。默认为None,采用本地部署评估。
- eval_generation_config: 评测时模型推理配置,需传入json字符串格式,例如:`'{"max_new_tokens": 512}'`;默认为None。
- extra_eval_args: 额外评测参数,需传入json字符串格式,默认为空。仅对Native评测有效,更多参数说明请查看[这里](https://evalscope.readthedocs.io/zh-cn/latest/get_started/parameters.html)
- local_dataset: 部分评测集,如`CMB`无法直接运行,需要下载额外数据包才可以使用。设置本参数为`true`可以自动下载全量数据包,并在当前目录下创建`data`文件夹并开始评测。数据包仅会下载一次,后续会使用缓存。该参数默认为`false`。
- 注意:默认评测会使用`~/.cache/opencompass`下的数据集,在指定本参数后会直接使用当前目录下的data文件夹。
### 导出参数
导出参数除包含[基本参数](#基本参数)和[合并参数](#合并参数)外,还包含下面的部分:
- 🔥output_dir: 导出结果存储路径。默认为None,会自动设置合适后缀的路径。
- exist_ok: 如果output_dir存在,不抛出异常,进行覆盖。默认为False。
- 🔥quant_method: 可选为'gptq'、'awq'、'bnb'和'fp8',默认为None。例子参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize)。
- quant_n_samples: gptq/awq的校准集采样数,默认为256。
- quant_batch_size: 量化batch_size,默认为1。
- group_size: 量化group大小,默认为128。
- to_cached_dataset: 提前对数据集进行tokenize并导出,默认为False。例子参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/cached_dataset)。更多介绍请查看`cached_dataset`。
- 提示:你可以通过`--split_dataset_ratio`或者`--val_dataset`指定验证集内容。
- template_mode: 用于支持对`swift rlhf`训练的`cached_dataset`功能。该参数只在`--to_cached_dataset true`时生效。可选项包括: 'train'、'rlhf'和'kto'。其中`swift pt/sft`使用'train',`swift rlhf --rlhf_type kto`使用'kto',其他rlhf算法使用'rlhf'。注意:当前'gkd', 'ppo', 'grpo'算法不支持`cached_dataset`功能。默认为'train'。
- to_ollama: 产生ollama所需的Modelfile文件。默认为False。
- 🔥to_mcore: HF格式权重转成Megatron格式。默认为False。
- to_hf: Megatron格式权重转成HF格式。默认为False。
- mcore_model: mcore格式模型路径。默认为None。
- mcore_adapter: mcore格式模型的adapter路径,默认为None。
- thread_count: `--to_mcore true`时的模型切片数。默认为None,根据模型大小自动设置,使得最大分片小于10GB。
- 🔥offload_bridge: 将Megatron导出的、用于vLLM权重更新的HF格式权重存放在CPU主存中,以降低 GPU 显存占用。默认为 False。
- 🔥test_convert_precision: 测试HF和Megatron格式权重转换的精度误差。默认为False。
- test_convert_dtype: 转换精度测试使用的dtype,默认为'float32'。
- 🔥push_to_hub: 是否推送hub,默认为False。例子参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/export/push_to_hub.sh)。
- hub_model_id: 推送的model_id,默认为None。
- hub_private_repo: 是否是private repo,默认为False。
- commit_message: 提交信息,默认为'update files'。
### 采样参数
- prm_model: 过程奖励模型的类型,可以是模型id(以'transformers'方式拉起),或者plugin中定义的prm key(自定义推理过程)。
- orm_model: 结果奖励模型的类型,通常是通配符或测试用例等,一般定义在plugin中。
- sampler_type: 采样类型,目前支持 sample, distill
- sampler_engine: 支持`transformers`, `lmdeploy`, `vllm`, `client`, `no`,默认为`transformers`,采样模型的推理引擎。
- output_dir: 输出目录,默认为`sample_output`。
- output_file: 输出文件名称,默认为`None`使用时间戳作为文件名。传入时不需要传入目录,仅支持jsonl格式。
- override_exist_file: 如`output_file`存在,是否覆盖。
- num_sampling_batch_size: 每次采样的batch_size。
- num_sampling_batches: 共采样多少batch。
- n_best_to_keep: 返回多少最佳sequences。
- data_range: 本采样处理数据集的分片。传入格式为`2 3`,代表数据集分为3份处理(这意味着通常有三个`swift sample`在并行处理),本实例正在处理第3个分片。
- temperature: 在这里默认为1.0。
- prm_threshold: PRM阈值,低于该阈值的结果会被过滤掉,默认值为`0`。
- easy_query_threshold: 单个query的所有采样中,若被ORM评估为正确的采样比例大于该阈值,则该query会被丢弃,防止过于简单的query出现在结果中,默认为`None`,代表不过滤。
- engine_kwargs: 传入sampler_engine的额外参数,以json string传入,例如`{"cache_max_entry_count":0.7}`。
- num_return_sequences: 采样返回的原始sequence数量。默认为64,本参数对`sample`采样有效。
- cache_files: 为避免同时加载prm和generator造成显存OOM,可以分两步进行采样,第一步将prm和orm置为`None`,则所有结果都会输出到文件中,第二次运行采样将sampler_engine置为`no`并传入`--cache_files`为上次采样的输出文件,则会使用上次输出的结果进行prm和orm评估并输出最终结果。
- 注意:使用cache_files时,`--dataset`仍然需要传入,这是因为cache_files的id是由原始数据计算的md5,需要把两部分信息结合使用。
## 特定模型参数
除了以上参数外,有些模型还支持额外的具体模型参数。这些参数含义通常可以在对应模型官方repo或者其推理代码中找到相应含义。**ms-swift引入这些参数以确保训练的模型与官方推理代码效果对齐**。
- 特定模型参数可以通过`--model_kwargs`或者环境变量进行设置,例如: `--model_kwargs '{"fps_max_frames": 12}'`或者`FPS_MAX_FRAMES=12`。
- 注意:若你在训练时指定了特定模型参数,请在推理时也设置对应的参数,这可以确保推理与训练时的数据处理保持一致,从而保证训练后的模型效果。
### qwen2_vl, qvq, qwen2_5_vl, mimo_vl, keye_vl, keye_vl_1_5
参数含义与`qwen_vl_utils<0.0.12`或者`qwen_omni_utils`库中含义一致,可以查看[这里](https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L24)。ms-swift通过修改这些常数值来控制图片分辨率和视频帧率,避免训练时OOM。
- IMAGE_FACTOR: 默认为28。
- MIN_PIXELS: 默认为`4 * 28 * 28`。图像的最小分辨率。建议设置为28*28的倍数。
- 🔥MAX_PIXELS: 默认为`16384 * 28 * 28`。图像的最大分辨率。建议设置为28*28的倍数。
- MAX_RATIO: 默认为200。
- VIDEO_MIN_PIXELS: 默认为`128 * 28 * 28`。视频中一帧的最小分辨率。建议设置为28*28的倍数。
- 🔥VIDEO_MAX_PIXELS: 默认为`768 * 28 * 28`。视频中一帧的最大分辨率。建议设置为28*28的倍数。
- VIDEO_TOTAL_PIXELS: 默认为`24576 * 28 * 28`。
- FRAME_FACTOR: 默认为2。
- FPS: 默认为2.0。
- FPS_MIN_FRAMES: 默认为4。一段视频的最小抽帧数。
- 🔥FPS_MAX_FRAMES: 默认为768。一段视频的最大抽帧数。
- 🔥QWENVL_BBOX_FORMAT: grounding格式使用'legacy'还是'new'。'legacy'格式为:`<|object_ref_start|>一只狗<|object_ref_end|><|box_start|>(432,991),(1111,2077)<|box_end|>`,'new'格式参考:[Qwen3-VL cookbook](https://github.com/QwenLM/Qwen3-VL/blob/main/cookbooks/2d_grounding.ipynb),并参考[grounding数据集格式文档](../Customization/Custom-dataset.md#grounding)。默认为'legacy'。
- 注意:该环境变量适配Qwen2/2.5/3-VL和Qwen2.5/3-Omni系列模型。
### qwen2_audio
- SAMPLING_RATE: 默认为16000。
### qwen3_vl, qwen3_5
参数含义与`qwen_vl_utils>=0.0.14`库中的含义一致,可以查看[这里](https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L24)。通过传入以下环境变量,可以修改该库的全局变量默认值。(也兼容使用`qwen2_5_vl`的环境变量,例如:`MAX_PIXELS`、`VIDEO_MAX_PIXELS`,会做自动转换。)
- SPATIAL_MERGE_SIZE: 默认为2。
- IMAGE_MIN_TOKEN_NUM: 默认为`4`,代表一张图片最小图像tokens的个数。
- 🔥IMAGE_MAX_TOKEN_NUM: 默认为`16384`,代表一张图片最大图像tokens的个数。(用于避免OOM)
- 提示:等价最大图像像素为`IMAGE_MAX_TOKEN_NUM * 32 *32`。
- VIDEO_MIN_TOKEN_NUM: 默认为`128`,代表视频中一帧的最小视频tokens的个数。
- 🔥VIDEO_MAX_TOKEN_NUM: 默认为`768`,代表视频中一帧的最大视频tokens的个数。(用于避免OOM)
- MAX_RATIO: 默认为200。
- FRAME_FACTOR: 默认为2。
- FPS: 默认为2.0。
- FPS_MIN_FRAMES: 默认为4。代表一段视频的最小抽帧数。
- 🔥FPS_MAX_FRAMES: 默认为768,代表一段视频的最大抽帧数。(用于避免OOM)
### qwen2_5_omni, qwen3_omni
qwen2_5_omni除了包含qwen2_5_vl和qwen2_audio的模型特定参数外,还包含以下参数:(注意:qwen3_omni包含的是**qwen3_vl**, qwen2_audio的模型特定参数)
- USE_AUDIO_IN_VIDEO: 默认为False。是否使用video中的音频信息。
- 🔥ENABLE_AUDIO_OUTPUT: 默认为None,即使用`config.json`中的值。若使用zero3进行训练,请设置为False。
- 提示:ms-swift只对thinker部分进行微调,建议设置为False以降低显存占用(只创建thinker部分的模型结构)。
### qwen3_vl_emb, qwen3_vl_reranker
参数含义与`qwen3_vl`相同,见上面的描述。以下为对默认值的覆盖:
- IMAGE_MAX_TOKEN_NUM: qwen3_vl_emb默认为1800, qwen3_vl_reranker默认为1280。具体参考这里:[qwen3_vl_embedding](https://modelscope.cn/models/Qwen/Qwen3-VL-Embedding-2B/file/view/master/scripts%2Fqwen3_vl_embedding.py?status=1#L26), [qwen3_vl_reranker](https://modelscope.cn/models/Qwen/Qwen3-VL-Reranker-2B/file/view/master/scripts%2Fqwen3_vl_reranker.py?status=1#L16)。
- FPS: 默认为1。
- FPS_MAX_FRAMES: 默认为64。
### internvl, internvl_phi3
参数含义可以查看[这里](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-2B-V1-5)。
- MAX_NUM: 默认为12。
- INPUT_SIZE: 默认为448。
### internvl2, internvl2_phi3, internvl2_5, internvl3, internvl3_5
参数含义可以查看[这里](https://modelscope.cn/models/OpenGVLab/InternVL2_5-2B)。
- MAX_NUM: 默认为12。
- INPUT_SIZE: 默认为448。
- VIDEO_MAX_NUM: 默认为1。视频的MAX_NUM。
- VIDEO_SEGMENTS: 默认为8。
### minicpmv2_6, minicpmv4, minicpmo
- MAX_SLICE_NUMS: 默认为9,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6/file/view/master?fileName=config.json&status=1)。
- VIDEO_MAX_SLICE_NUMS: 默认为1,视频的MAX_SLICE_NUMS,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6)。
- MAX_NUM_FRAMES: 默认为64,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6)。
### minicpmo
- INIT_TTS: 默认为False。
- INIT_AUDIO: 默认为False。
### ovis1_6, ovis2
- MAX_PARTITION: 默认为9,参考[这里](https://github.com/AIDC-AI/Ovis/blob/d248e34d755a95d24315c40e2489750a869c5dbc/ovis/model/modeling_ovis.py#L312)。
### ovis2_5
以下参数含义可以在[这里](https://modelscope.cn/models/AIDC-AI/Ovis2.5-2B)的示例代码中找到。
- MIN_PIXELS: int类型,默认为`448 * 448`。
- MAX_PIXELS: int类型,默认为`1344 * 1792`。若出现OOM,可以调小该值。
- VIDEO_MAX_PIXELS: int类型,默认为`896 * 896`。
- NUM_FRAMES: 默认为8。用于视频抽帧。
### mplug_owl3, mplug_owl3_241101
- MAX_NUM_FRAMES: 默认为16,参考[这里](https://modelscope.cn/models/iic/mPLUG-Owl3-7B-240728)。
### xcomposer2_4khd
- HD_NUM: 默认为55,参考[这里](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b)。
### xcomposer2_5
- HD_NUM: 图片数量为1时,默认值为24。大于1,默认为6。参考[这里](https://modelscope.cn/models/AI-ModelScope/internlm-xcomposer2d5-7b/file/view/master?fileName=modeling_internlm_xcomposer2.py&status=1#L254)。
### video_cogvlm2
- NUM_FRAMES: 默认为24,参考[这里](https://github.com/zai-org/CogVLM2/blob/main/video_demo/inference.py#L22)。
### phi3_vision
- NUM_CROPS: 默认为4,参考[这里](https://modelscope.cn/models/LLM-Research/Phi-3.5-vision-instruct)。
### llama3_1_omni
- N_MELS: 默认为128,参考[这里](https://github.com/ictnlp/LLaMA-Omni/blob/544d0ff3de8817fdcbc5192941a11cf4a72cbf2b/omni_speech/infer/infer.py#L57)。
### video_llava
- NUM_FRAMES: 默认为16。
## 其他环境变量
- CUDA_VISIBLE_DEVICES: 控制使用哪些GPU卡。默认使用所有卡。
- ASCEND_RT_VISIBLE_DEVICES: 控制使用哪些NPU卡(只对ASCEND卡生效)。默认使用所有卡。
- MODELSCOPE_CACHE: 控制缓存路径。(多机训练时建议设置该值,以确保不同节点使用相同的数据集缓存)
- PYTORCH_CUDA_ALLOC_CONF: 推荐设置为`'expandable_segments:True'`,这将减少GPU内存碎片,具体请参考[torch文档](https://docs.pytorch.org/docs/stable/notes/cuda.html#cuda-memory-management)。
- NPROC_PER_NODE: torchrun中`--nproc_per_node`的参数透传。默认为1。若设置了`NPROC_PER_NODE`或者`NNODES`环境变量,则使用torchrun启动训练或推理。
- MASTER_PORT: torchrun中`--master_port`的参数透传。默认为29500。
- MASTER_ADDR: torchrun中`--master_addr`的参数透传。
- NNODES: torchrun中`--nnodes`的参数透传。
- NODE_RANK: torchrun中`--node_rank`的参数透传。
- LOG_LEVEL: 日志的level,默认为'INFO',你可以设置为'WARNING', 'ERROR'等。
- SWIFT_DEBUG: 在`engine.infer(...)`时,若设置为'1',TransformersEngine将会打印input_ids和generate_ids的内容方便进行调试与对齐。
- VLLM_USE_V1: 用于切换vLLM使用V0/V1版本。
- SWIFT_TIMEOUT: 若多模态数据集中存在图像URL,该参数用于控制获取图片的timeout,默认为20s。
- ROOT_IMAGE_DIR: 图像(多模态)资源的根目录。通过设置该参数,可以在数据集中使用相对于 `ROOT_IMAGE_DIR` 的相对路径。默认情况下,是相对于运行目录的相对路径。
- SWIFT_SINGLE_DEVICE_MODE: 单设备模式,可选值为"0"(默认值)/"1",在此模式下,每个进程只能看到一个设备
================================================
FILE: docs/source/Instruction/Evaluation.md
================================================
# 评测
SWIFT支持了eval(评测)能力,用于对原始模型和训练后的模型给出标准化的评测指标。
## 能力介绍
SWIFT的eval能力使用了魔搭社区[评测框架EvalScope](https://github.com/modelscope/eval-scope),并进行了高级封装以支持各类模型的评测需求。
> 注意:EvalScope支持许多其他的复杂能力,例如[模型的性能评测](https://evalscope.readthedocs.io/zh-cn/latest/user_guides/stress_test/quick_start.html),请直接使用EvalScope框架。
目前我们支持了**标准评测集**的评测流程,以及**用户自定义**评测集的评测流程。其中**标准评测集**由三个评测后端提供支持:
下面展示所支持的数据集名称,若需了解数据集的详细信息,请参考[所有支持的数据集](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/index.html)
1. Native(默认):
主要支持纯文本评测,同时**支持**评测结果可视化
```text
'arc', 'bbh', 'ceval', 'cmmlu', 'competition_math',
'general_qa', 'gpqa', 'gsm8k', 'hellaswag', 'humaneval',
'ifeval', 'iquiz', 'mmlu', 'mmlu_pro',
'race', 'trivia_qa', 'truthful_qa'
```
2. OpenCompass:
主要支持纯文本评测,暂**不支持**评测结果可视化
```text
'obqa', 'cmb', 'AX_b', 'siqa', 'nq', 'mbpp', 'winogrande', 'mmlu', 'BoolQ', 'cluewsc', 'ocnli', 'lambada',
'CMRC', 'ceval', 'csl', 'cmnli', 'bbh', 'ReCoRD', 'math', 'humaneval', 'eprstmt', 'WSC', 'storycloze',
'MultiRC', 'RTE', 'chid', 'gsm8k', 'AX_g', 'bustm', 'afqmc', 'piqa', 'lcsts', 'strategyqa', 'Xsum', 'agieval',
'ocnli_fc', 'C3', 'tnews', 'race', 'triviaqa', 'CB', 'WiC', 'hellaswag', 'summedits', 'GaokaoBench',
'ARC_e', 'COPA', 'ARC_c', 'DRCD'
```
3. VLMEvalKit:
主要支持多模态评测,暂**不支持**评测结果可视化
```text
'COCO_VAL', 'MME', 'HallusionBench', 'POPE', 'MMBench_DEV_EN', 'MMBench_TEST_EN', 'MMBench_DEV_CN', 'MMBench_TEST_CN',
'MMBench', 'MMBench_CN', 'MMBench_DEV_EN_V11', 'MMBench_TEST_EN_V11', 'MMBench_DEV_CN_V11',
'MMBench_TEST_CN_V11', 'MMBench_V11', 'MMBench_CN_V11', 'SEEDBench_IMG', 'SEEDBench2',
'SEEDBench2_Plus', 'ScienceQA_VAL', 'ScienceQA_TEST', 'MMT-Bench_ALL_MI', 'MMT-Bench_ALL',
'MMT-Bench_VAL_MI', 'MMT-Bench_VAL', 'AesBench_VAL', 'AesBench_TEST', 'CCBench', 'AI2D_TEST', 'MMStar',
'RealWorldQA', 'MLLMGuard_DS', 'BLINK', 'OCRVQA_TEST', 'OCRVQA_TESTCORE', 'TextVQA_VAL', 'DocVQA_VAL',
'DocVQA_TEST', 'InfoVQA_VAL', 'InfoVQA_TEST', 'ChartQA_TEST', 'MathVision', 'MathVision_MINI',
'MMMU_DEV_VAL', 'MMMU_TEST', 'OCRBench', 'MathVista_MINI', 'LLaVABench', 'MMVet', 'MTVQA_TEST',
'MMLongBench_DOC', 'VCR_EN_EASY_500', 'VCR_EN_EASY_100', 'VCR_EN_EASY_ALL', 'VCR_EN_HARD_500',
'VCR_EN_HARD_100', 'VCR_EN_HARD_ALL', 'VCR_ZH_EASY_500', 'VCR_ZH_EASY_100', 'VCR_ZH_EASY_ALL',
'VCR_ZH_HARD_500', 'VCR_ZH_HARD_100', 'VCR_ZH_HARD_ALL', 'MMDU', 'MMBench-Video', 'Video-MME'
```
## 环境准备
```shell
pip install ms-swift[eval] -U
```
或从源代码安装:
```shell
git clone https://github.com/modelscope/ms-swift.git
cd ms-swift
pip install -e '.[eval]'
```
## 评测
支持纯文本评测、多模态评测、url评测、自定义数据集评测四种方式
**基本示例**
```shell
CUDA_VISIBLE_DEVICES=0 \
swift eval \
--model Qwen/Qwen2.5-0.5B-Instruct \
--eval_backend Native \
--infer_backend transformers \
--eval_limit 10 \
--eval_dataset gsm8k
```
其中:
- model: 可指定本地模型路径或者modelscope上的模型ID
- eval_backend: 可选 Native, OpenCompass, VLMEvalKit,默认为 Native
- infer_backend: 可选 transformers, vllm, sglang, lmdeploy,默认为 transformers
- eval_limit: 每个评测集的采样数,默认为None,表示使用全部数据,可用于快速验证
- eval_dataset: 评测数据集,可设置多个数据集,用空格分割
**复杂评测示例**
```shell
CUDA_VISIBLE_DEVICES=0 \
swift eval \
--model Qwen/Qwen2.5-0.5B-Instruct \
--eval_backend Native \
--infer_backend transformers \
--eval_limit 10 \
--eval_dataset gsm8k \
--eval_dataset_args '{"gsm8k": {"few_shot_num": 0, "filters": {"remove_until": " "}}}' \
--eval_generation_config '{"max_tokens": 512, "temperature": 0}' \
--extra_eval_args '{"ignore_errors": true, "debug": true}'
```
详细评测的参数列表可以参考[这里](Command-line-parameters.md#评测参数)。
## 训练中评测
SWIFT支持在训练过程中使用EvalScope对当前的模型进行评测,以便及时了解模型的训练效果。
**基本示例**
```shell
CUDA_VISIBLE_DEVICES=0 \
swift sft \
--model "Qwen/Qwen2.5-0.5B-Instruct" \
--tuner_type "lora" \
--dataset "AI-ModelScope/alpaca-gpt4-data-zh#100" \
--torch_dtype "bfloat16" \
--num_train_epochs "1" \
--per_device_train_batch_size "1" \
--learning_rate "1e-4" \
--lora_rank "8" \
--lora_alpha "32" \
--target_modules "all-linear" \
--gradient_accumulation_steps "16" \
--save_steps "50" \
--save_total_limit "5" \
--logging_steps "5" \
--max_length "2048" \
--eval_strategy "steps" \
--eval_steps "5" \
--per_device_eval_batch_size "5" \
--eval_use_evalscope \
--eval_dataset "gsm8k" \
--eval_dataset_args '{"gsm8k": {"few_shot_num": 0}}' \
--eval_limit "10"
```
注意启动命令为`sft`,其中eval相关的参数有:
- eval_strategy: 评估策略。默认为None,跟随`save_strategy`的策略
- eval_steps: 默认为None,如果存在评估数据集,则跟随`save_steps`的策略
- eval_use_evalscope: 是否使用evalscope进行评测,需要设置该参数来开启评测
- eval_dataset: 评测数据集,可设置多个数据集,用空格分割
- eval_dataset_args: 评测数据集参数,json格式,可设置多个数据集的参数
- eval_limit: 评测数据集采样数
- eval_generation_config: 评测时模型推理配置,json格式,默认为`{'max_tokens': 512}`
更多评测的样例可以参考[examples](https://github.com/modelscope/ms-swift/tree/main/examples/eval)
## 自定义评测集
本框架支持选择题和问答题,两种预定义的数据集格式,使用流程如下:
*注意:使用自定义评测时,eval_backend参数必须为Native*
### 选择题格式(MCQ)
适用于评测数据为选择题的场景,评测指标为准确率(accuracy)。
**数据准备**
准备选择题格式的csv文件,该目录结构如下:
```text
mcq/
├── example_dev.csv # (可选)文件名组成为`{subset_name}_dev.csv`,用于fewshot评测
└── example_val.csv # 文件名组成为`{subset_name}_val.csv`,用于实际评测的数据
```
其中csv文件需要为下面的格式:
```text
id,question,A,B,C,D,answer
1,通常来说,组成动物蛋白质的氨基酸有____,4种,22种,20种,19种,C
2,血液内存在的下列物质中,不属于代谢终产物的是____。,尿素,尿酸,丙酮酸,二氧化碳,C
```
其中:
- `id`是序号(可选)
- `question`是问题
- `A`, `B`, `C`, `D`等是可选项,最大支持10个选项
- `answer`是正确选项
**启动评测**
运行下面的命令:
```bash
CUDA_VISIBLE_DEVICES=0 \
swift eval \
--model Qwen/Qwen2.5-0.5B-Instruct \
--eval_backend Native \
--infer_backend transformers \
--eval_dataset general_mcq \
--eval_dataset_args '{"general_mcq": {"local_path": "/path/to/mcq", "subset_list": ["example"]}}'
```
其中:
- `eval_dataset` 需要设置为 `general_mcq`
- `eval_dataset_args` 需要设置
- `local_path` 自定义数据集文件夹路径
- `subset_list` 评测数据集名称,上述 `*_dev.csv` 中的 `*`
**运行结果**
```text
+---------------------+-------------+-----------------+----------+-------+---------+---------+
| Model | Dataset | Metric | Subset | Num | Score | Cat.0 |
+=====================+=============+=================+==========+=======+=========+=========+
| Qwen2-0.5B-Instruct | general_mcq | AverageAccuracy | example | 12 | 0.5833 | default |
+---------------------+-------------+-----------------+----------+-------+---------+---------+
```
### 问答题格式(QA)
适用于评测数据为问答题的场景,评测指标是`ROUGE`和`BLEU`。
**数据准备**
准备一个问答题格式的jsonline文件,该目录包含了一个文件:
```text
qa/
└── example.jsonl
```
该jsonline文件需要为下面的格式:
```json
{"query": "中国的首都是哪里?", "response": "中国的首都是北京"}
{"query": "世界上最高的山是哪座山?", "response": "是珠穆朗玛峰"}
{"query": "为什么北极见不到企鹅?", "response": "因为企鹅大多生活在南极"}
```
**启动评测**
运行下面的命令:
```bash
CUDA_VISIBLE_DEVICES=0 \
swift eval \
--model Qwen/Qwen2.5-0.5B-Instruct \
--eval_backend Native \
--infer_backend transformers \
--eval_dataset general_qa \
--eval_dataset_args '{"general_qa": {"local_path": "/path/to/qa", "subset_list": ["example"]}}'
```
其中:
- `eval_dataset` 需要设置为 `general_qa`
- `eval_dataset_args` 是一个json字符串,需要设置:
- `local_path` 自定义数据集文件夹路径
- `subset_list` 评测数据集名称,上述 `*.jsonl` 中的 `*`
**运行结果**
```text
+---------------------+-------------+-----------------+----------+-------+---------+---------+
| Model | Dataset | Metric | Subset | Num | Score | Cat.0 |
+=====================+=============+=================+==========+=======+=========+=========+
| Qwen2-0.5B-Instruct | general_qa | bleu-1 | default | 12 | 0.2324 | default |
+---------------------+-------------+-----------------+----------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | bleu-2 | default | 12 | 0.1451 | default |
+---------------------+-------------+-----------------+----------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | bleu-3 | default | 12 | 0.0625 | default |
+---------------------+-------------+-----------------+----------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | bleu-4 | default | 12 | 0.0556 | default |
+---------------------+-------------+-----------------+----------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | rouge-1-f | default | 12 | 0.3441 | default |
+---------------------+-------------+-----------------+----------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | rouge-1-p | default | 12 | 0.2393 | default |
+---------------------+-------------+-----------------+----------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | rouge-1-r | default | 12 | 0.8889 | default |
+---------------------+-------------+-----------------+----------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | rouge-2-f | default | 12 | 0.2062 | default |
+---------------------+-------------+-----------------+----------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | rouge-2-p | default | 12 | 0.1453 | default |
+---------------------+-------------+-----------------+----------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | rouge-2-r | default | 12 | 0.6167 | default |
+---------------------+-------------+-----------------+----------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | rouge-l-f | default | 12 | 0.333 | default |
+---------------------+-------------+-----------------+----------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | rouge-l-p | default | 12 | 0.2324 | default |
+---------------------+-------------+-----------------+----------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | rouge-l-r | default | 12 | 0.8889 | default |
+---------------------+-------------+-----------------+----------+-------+---------+---------+
```
================================================
FILE: docs/source/Instruction/Export-and-push.md
================================================
# 导出与推送
## Merge LoRA
- 查看[这里](https://github.com/modelscope/ms-swift/blob/main/examples/export/merge_lora.sh)。
## 量化
SWIFT支持AWQ、GPTQ、FP8、BNB模型的量化导出。其中使用AWQ、GPTQ需使用校准数据集,量化性能较好但量化耗时较长;而FP8、BNB无需校准数据集,量化耗时较短。
| 量化技术 | 多模态 | 推理加速 | 继续训练 |
| -------- | ------ | -------- | -------- |
| GPTQ | ✅ | ✅ | ✅ |
| AWQ | ✅ | ✅ | ✅ |
| BNB | ❌ | ✅ | ✅ |
除SWIFT安装外,需要安装以下额外依赖:
```shell
# 使用awq量化:
# autoawq和cuda版本有对应关系,请按照`https://github.com/casper-hansen/AutoAWQ`选择版本
# 如果出现torch依赖冲突,请额外增加指令`--no-deps`
pip install autoawq -U
# 使用gptq量化:
# auto_gptq和cuda版本有对应关系,请按照`https://github.com/PanQiWei/AutoGPTQ#quick-installation`选择版本
pip install auto_gptq optimum -U
# 使用gptq v2量化:
pip install gptqmodel optimum -U
# 使用bnb量化:
pip install bitsandbytes -U
```
我们提供了一系列脚本展现SWIFT的量化导出能力:
- 支持[AWQ](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/awq.sh)/[GPTQ](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/gptq.sh)/[GPTQ v2](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/gptq_v2.sh)/[BNB](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/bnb.sh)量化导出。
- 多模态量化: 支持使用GPTQ和AWQ对多模态模型进行量化,其中AWQ支持的多模态模型有限。参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize/mllm)。
- 更多系列模型的支持: 支持[Bert](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize/bert),[Reward Model](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize/reward_model)的量化导出。
- 使用SWIFT量化导出的模型支持使用vllm/sglang/lmdeploy进行推理加速;也支持使用QLoRA继续进行SFT/RLHF。
## 推送模型
SWIFT支持将训练/量化的模型重新推送到ModelScope/HuggingFace。默认推送到ModelScope,你可以指定`--use_hf true`推送到HuggingFace。
```shell
swift export \
--model output/vx-xxx/checkpoint-xxx \
--push_to_hub true \
--hub_model_id '<model-id>' \
--hub_token '<sdk-token>' \
--use_hf false
```
小贴士:
- 你可以使用`--model <checkpoint-dir>`或者`--adapters <checkpoint-dir>`指定需要推送的checkpoint目录,这两种写法在推送模型场景没有差异。
- 推送到ModelScope时,你需要确保你已经注册了魔搭账号,你的SDK token可以在[该页面](https://www.modelscope.cn/my/myaccesstoken)中获取。推送模型需确保sdk token的账号具有model_id对应组织的编辑权限。推送模型将自动创建对应model_id的模型仓库(如果该模型仓库不存在),你可以使用`--hub_private_repo true`来自动创建私有的模型仓库。
================================================
FILE: docs/source/Instruction/Frequently-asked-questions.md
================================================
# 常见问题整理
下面是SWIFT使用过程中遇到的一些常见问题。
## 训练
SWIFT支持的训练方法包括预训练、指令监督微调、偏好学习、GRPO、Embedding、Reranker、序列分类任务等,详见[主页](https://github.com/modelscope/ms-swift/blob/main/README_CN.md)。
### Q1: SWIFT支持的模型有哪些?如何设置本地模型路径?
支持的模型详见文档[支持的模型和数据集](https://swift.readthedocs.io/zh-cn/latest/Instruction/Supported-models-and-datasets.html)。如果模型已经下载到了本地,设置`--model <本地模型路径>`即可。对于离线环境训练,同时设置`--model 本地路径`,`--check_model false`,如果提示git clone相关报错,需要clone repo,然后通过`local_repo_path`指定,详见[命令行参数](https://swift.readthedocs.io/zh-cn/latest/Instruction/Command-line-parameters.html)。从ModelScope下载的模型,可以配置环境变量`MODELSCOPE_CACHE=your_path`将原始的模型存到指定路径;如果用ModelScope SDK下载,通过`cache_dir="本地地址"`指定;也可以使用`modelscope download`命令行工具或`git`下载,详见modelscope文档[模型下载](https://modelscope.cn/docs/models/download)。如果需要从Hugging Face下载模型,设置环境变量`USE_HF=1`。
SWIFT会自动匹配model_type,也可以查看文档[支持的模型和数据集](https://swift.readthedocs.io/zh-cn/latest/Instruction/Supported-models-and-datasets.html),手动指定。
### Q2: SWIFT支持的数据集有哪些?如何使用自定义数据集?
支持的数据集详见文档[支持的模型和数据集](https://swift.readthedocs.io/zh-cn/latest/Instruction/Supported-models-and-datasets.html)。自定义数据集格式及使用方法见文档[自定义数据集](https://swift.readthedocs.io/zh-cn/latest/Customization/Custom-dataset.html),符合这些格式的数据集会自动使用swift内置的数据预处理器。如果与文档中的格式不一致,请自行转换格式,或者参考已支持的数据集接入。若自定义数据集中有额外的字段,这些字段默认不会被使用,可以通过[命令行参数remove_unused_columns](https://swift.readthedocs.io/zh-cn/latest/Instruction/Command-line-parameters.html#id4)进行设置。
需要将数据集下载到本地,然后通过路径指定,请查看[自定义数据集文档](https://swift.readthedocs.io/zh-cn/latest/Customization/Custom-dataset.html#dataset-info-json)。`git clone`下载到本地,然后通过dataset_info.json文件中的`dataset_path`字段指定就行。
数据随机详见[命令行参数dataset_shuffle](https://swift.readthedocs.io/zh-cn/latest/Instruction/Command-line-parameters.html)。
强制重新下载数据集,设置命令行参数`--download_mode`。对数据集进行错误检查,请设置命令行参数`strict`。需要数据集质检工具时,可以查看另一个库[data-juicer](https://github.com/modelscope/data-juicer)。
由于datasets的底层pyarrow对于类型管控比较严格,图像grounding数据集的objects部分、agent数据集的tools部分等,因为这个原因要用str,要不pyarrow就会报错:你每行的类型不一致。
训练中遇到报错`AttributeError: 'TrainerState' object has no attribute 'last_model_checkpoint'`,是因为数据集太小,数据数量不足一个step,请增加一些数据。另外,切分的验证集数据很少时也会有类似报错。
下面是一个assistant字段为空导致的报错:
```text
File "/mnt/workspace/swift/swift/llm/dataset/preprocessor/core.py", line 69, in _check_messages
    raise ValueError(f'assistant_message: {assistant_message}')
ValueError: assistant_message: {'role': 'assistant', 'content': ''}
```
```shell
CUDA_VISIBLE_DEVICES=0 NPROC_PER_NODE=1 MAX_PIXELS=1003520 swift sft --model Qwen/Qwen2.5-VL-7B-Instruct --tuner_type lora --dataset /mnt/workspace/data.json --deepspeed zero2 --max_length 16384
```
该报错是因为数据集中assistant字段为空。如果数据用于推理,请把这个空的assistant消息删掉;由于空的assistant内容会导致训练时出现NaN,因此SWIFT会对其进行检查。
### Q3: 从缓存加载数据集相关问题
设置命令行参数`--load_from_cache_file true`,可以加快数据集加载速度,尤其是在多模态数据集、数据量较大等场景。在debug或修改preprocessor时,设置为false,更多说明请在[命令行参数文档](https://swift.readthedocs.io/zh-cn/latest/Instruction/Command-line-parameters.html)中搜索该参数。
### Q4: 如何搭建SWIFT环境?有镜像可以使用吗?
环境搭建详见[SWIFT安装文档](https://swift.readthedocs.io/zh-cn/latest/GetStarted/SWIFT-installation.html),一些常见依赖的推荐版本可以在[主页](https://github.com/modelscope/ms-swift/blob/main/README_CN.md)上找到。文档中提供了镜像,用`docker run`命令启动容器即可,如:`docker run --gpus all -p 8000:8000 -it -d --name ms modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.8.1-py311-torch2.9.0-vllm0.13.0-modelscope1.33.0-swift3.12.5 /bin/bash`,启动容器后拉最新代码安装swift。
### Q5: 多模态模型训练数据格式、参数冻结、优化器设置相关问题
多模态模型训练的[例子](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal)。支持纯文本、图文数据训练,也可以两种数据混合训练。图像、视频、音频相关的参数,如,最大像素、fps等请查看[特定模型参数](https://swift.readthedocs.io/zh-cn/latest/Instruction/Command-line-parameters.html#id19)。
Grounding任务中通用数据格式支持了一个物体对应多个bbox,参考文档[自定义数据集](https://swift.readthedocs.io/zh-cn/latest/Customization/Custom-dataset.html#grounding)。videos可以是图片列表,使用文件目录的方式。
SWIFT按max_pixels对图像进行调整,会保存预处理前和后的图像,然后对bbox进行调整,不过推理没有这样的调整,需要提前手动处理图像。
VLM模型训练减少显存使用,请配置`--freeze_vit true`,以及限制最大像素的参数`--max_pixels`。`--freeze_vit`,`--freeze_aligner`,`--freeze_llm`这几个参数详见[命令行参数文档](https://swift.readthedocs.io/zh-cn/latest/Instruction/Command-line-parameters.html#tuner)。如果ViT没有训练,那么出现warning: `none of the inputs have requires_grad=True`是正常的;如果训练了ViT,则不应该出现该warning。
使用全参数微调visual encoder同时使用LoRA微调LLM,参考这里[例子](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal/lora_llm_full_vit)。
### Q6: template相关问题
由于jinja chat template没有labels,所以不支持训练。
多模态数据集如果需要在加载数据之后做动态数据增强,例如,给输入数据随机添加噪声等,请在template中修改encode方法。
### Q7: SWIFT训练如何debug?
详见[预训练与微调文档](https://swift.readthedocs.io/zh-cn/latest/Instruction/Pre-training-and-Fine-tuning.html)。
### Q8: SWIFT如何使用python脚本训练?
参考[notebook例子](https://github.com/modelscope/ms-swift/tree/main/examples/notebook)。
### Q9: SWIFT如何使用UI界面训练?
使用`swift web-ui`命令,界面训练与命令行一致,界面上的参数请查看命令行参数文档。自定义数据集的使用与上面Q2一致。Megatron-SWIFT不支持UI界面训练。
### Q10: 单机多卡训练相关问题
SWIFT多卡训练底层依赖torchrun。`deepspeed` 和 `device_map`不兼容,两个只能选1个。更多细节请查看代码库中的[例子](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-gpu)。
### Q11: 多机多卡训练相关问题
请查看[多机多卡例子](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-node)。多机多卡训练时,只有主节点有日志。
多机训练速度缓慢,如,使用DeepSpeed ZeRO3训练会出现严重的速度下降,请查看[issue](https://github.com/modelscope/ms-swift/issues/1825)。
### Q12: 大规模数据集相关问题
数据集太大了,然后每次tokenize都需要很久,请使用`lazy_tokenize`或流式读取`streaming`,详见[命令行参数](https://swift.readthedocs.io/zh-cn/latest/Instruction/Command-line-parameters.html)。
### Q13: 断点续训相关问题
先前训练脚本中的参数不变,加上`--resume_from_checkpoint output/xxx/vx-xxx/checkpoint-xxx`,详见[命令行参数](https://swift.readthedocs.io/zh-cn/latest/Instruction/Command-line-parameters.html)。如果数据集发生了改动,仅加载模型,请同时设置`--resume_only_model`。更复杂的场景,请在命令行参数文档中搜索resume。
### Q14: 数据集流式加载相关问题
流式加载`--streaming true`,一边训练一边加载,需要设置max_steps,详见`streaming`参数说明,[命令行参数文档](https://swift.readthedocs.io/zh-cn/latest/Instruction/Command-line-parameters.html#id4)。
注意:streaming是不随机的,也不划分验证集,验证集通过命令行参数`val_dataset`指定。
断点续训时,流式只能往前索引,不能随机索引,跳过已经训练的数据耗时特别长,不建议用流式。
### Q15: packing相关问题
packing要和flash_attn一起使用,不然会有误差,attention_mask会出问题。packing_cache这个参数,在多机训练时,需要设置为共享的磁盘路径。
Qwen3.5模型中的linear-attention不支持var_len,不建议开启packing。
开启packing,多模态数据会有两次map,map完一次后还会进行第二次mapping,一次是数据集的,一次是template的。如果速度非常慢,可以设置`OMP_NUM_THREADS=14`加速,或者可以把packing去掉,就不会有第二次了。
### Q16: 数据集多进程处理
数据集map过程比较慢时,设置参数`--dataset_num_proc`可以开多进程。多模态数据集map比较慢是正常的。
### Q17: 当前训练完默认保存多少个checkpoint?
默认保存所有的checkpoint,详见[命令行参数 save_total_limit](https://swift.readthedocs.io/zh-cn/latest/Instruction/Command-line-parameters.html)。
### Q18: 训练过程的loss和acc
自定义的损失函数在plugin中加就可以。如果需要不同数据集的loss曲线,请设置`--enable_channel_loss`。
如果eval得到的acc和对应保存的ckpt去重新推理一遍计算得到的acc不是一致的,是因为训练时候的eval_acc和推理时候的acc计算方式不一样导致的。`acc_strategy`: 默认为`'token'`, 可选择的值包括: `'token'`,`'seq'`。
训练过程中没有token_acc是因为有些模型`logits`和`labels`数量对不上,就不算的。
可以在[这里](https://github.com/modelscope/ms-swift/blob/main/swift/loss/mapping.py)查看当前支持的loss或添加新的loss。
检查某些特殊token是否参与损失计算,可以在命令行日志中查看打印的labels。
训练agent时,tool_call就是应该算loss,tool_response不算loss。
### Q19: 模型参数freeze相关问题
训练的过程中,冻结某些层时导致某些参数未参与梯度回传,请配置参数`--ddp_find_unused_parameters true`。
freeze_parameters和freeze_vit/freeze_aligner/freeze_llm:先freeze parameters再activate parameters。`freeze vit/freeze aligner/freeze llm`这三个参数会对freeze parameters和trainable parameters进行调整。因为有些模型的ViT中包含`aligner`,所以会将`aligner`单独加入trainable_parameters。
freeze_parameters_ratio这个参数的机制是从embedding开始从下往上freeze。
### Q20: 序列并行相关问题
序列并行支持pt、sft、dpo和grpo。参考这个例子[sequence_parallel](https://github.com/modelscope/ms-swift/tree/main/examples/train/sequence_parallel)。
VLM模型目前仅支持flash-attn,纯文本模型支持flash-attn和sdpa。
sequence parallel可以和Liger kernel同时使用。
sequence parallel和自定义loss冲突时,由于sequence parallel在自己的代码中定制了loss,可以自己改下[这里](https://github.com/modelscope/ms-swift/blob/main/swift/trainers/sequence_parallel/ulysses.py)。
### Q21: 扩充词表
用SWIFT框架扩充词表需要设置命令行参数`new_special_tokens`,`--modules_to_save embed_tokens lm_head`,详见[例子](https://github.com/modelscope/ms-swift/tree/main/examples/train/new_special_tokens)。
### Q22: tuners相关问题
SWIFT中的LlamaPro对多模态做了适配。
LongLoRA只有LLaMA系列模型能用。
LoRA训练和`--trainable_parameters`参数不兼容,LoRA模块之外其他的可训练参数用modules_to_save。
### Q23: embedding/reranker训练
[embedding训练例子](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding)。
[reranker训练例子](https://github.com/modelscope/ms-swift/tree/main/examples/train/reranker)。
数据格式见[自定义数据集](https://swift.readthedocs.io/zh-cn/latest/Customization/Custom-dataset.html)。
### Q24: 分类任务训练
SWIFT支持多标签分类,自定义数据集文档有数据格式,在命令行参数文档中搜索`problem_type`,其他和回归是一样的。
注意:label字段和message字段同级。
### Q25: thinking模型训练
查看这个[issue](https://github.com/modelscope/ms-swift/issues/4030)。
### Q26: 想问一下,SWIFT支持蒸馏吗?
参考这个[例子](https://github.com/modelscope/ms-swift/blob/main/examples/sampler/distill/distill.sh)。
### Q27: gkd训练student model和teacher model的model_type需要一致吗,一个dense一个moe可以吗?
可以的,只需要词表一样,不过带MoE就会比较慢。
### Q28: GRPO训练相关问题
SWIFT现在支持多模态的GRPO训练。GRPO训练过程中loss接近0是正常情况,参考[issue](https://github.com/huggingface/open-r1/issues/239#issuecomment-2646297851)。
设置sleep_mode,推理结束VllmEngine释放显存。下次调用时,再加载,而不是一直占用。
GRPO训练时不想引入KL项,可以通过命令行参数beta设置。
LoRA微调后继续做GRPO训练,请在命令行参数文档中搜索`--adapters`。
由于算entropy会有额外的一点开销,所以默认没有记录熵曲线。如果需要,请设置`--log_entropy true`。
colocate模式不支持use_async_engine。
GRPO不支持channel_loss。
Liger kernel和padding free没法在GRPO阶段一起开。如果一起开,需要改liger grpo loss的实现,在liger kernel库中,不方便改。
如果训练集有不同的task,请查看[多任务训练](https://swift.readthedocs.io/zh-cn/latest/Instruction/GRPO/DeveloperGuide/multi_task.html)。
### Q29: reward函数(模型)相关问题
reward_model和reward_funcs可以一起使用。
自定义reward函数参考[examples/train/grpo/plugin](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin)。
针对math问题,要从数据集里面传solution,不然不好算accuracy。
如果在ORM的自定义奖励函数中需要传入数据集中的某个列,请将该列放到messages之外的其他列。
在GRPO训练的过程中如果需要指定一个llm-judge模型来做打分,请参考奖励模型的文档。
### Q30: rollout相关问题
Rollout应该是不兼容pipeline parallel。
vLLM推理引擎默认trust_remote_code为true。
### Q31: 请教一个问题,grpo脚本中的save_steps指的是step还是global step?目前本地训练显示的global step是18, wandb上显示的step是628。
`global_step`,本地tqdm显示的。
### Q32: 默认只用 num_iterations=1 的话,clip 就失去作用了吧?dapo 的 clip higher 也没用。我看 veRL 有个 micro batch 可以设置单轮小批次更新 policy model 来使得 clip 项生效,ms-swift 的 mini batch 看源码貌似只是做了梯度累加?
是的,需要num_iterations>1。
### Q33: 请问gspo训练支持传入参数top_entropy_quantile吗?传入了--importance_sampling_level sequence后,还能实现对熵分布前x%的token的优化吗?
支持,顺序是先正常计算loss(受importance_sampling_level影响),再根据top_entropy_quantile mask掉loss。
### Q34: GRPO文档中的faq
更多GRPO相关的FAQ,请查看[GRPO文档](https://swift.readthedocs.io/zh-cn/latest/Instruction/GRPO/GetStarted/GRPO.html#faq)
### Q35: ppo等偏好训练相关问题
PPO训练不支持梯度裁剪。
目前PPO还只支持RM和policy是同一系列的模型(tokenizer/template)。
不支持多轮的DPO。
### Q36: MoE模型训练相关问题
MoE模型LoRA训练,如果aux-loss基本没变化,将all-router也加到target_modules。
LoRA训练中,路由器模块是否参与训练看gate是否是nn.Linear实现,如果是nn.Parameter就不训练,详见命令行参数[target_parameters](https://swift.readthedocs.io/zh-cn/latest/Instruction/Command-line-parameters.html#tuner)。
### Q37: Megatron-SWIFT训练相关问题
Checkpoint保存,参考命令行参数[save_strategy](https://swift.readthedocs.io/zh-cn/latest/Megatron-SWIFT/Command-line-parameters.html)。
Megatron多机训练时,因为pp并行只有在pp last rank有完整的信息, 日志在last rank打印,而不是从master node打印。
Megatron-SWIFT支持了save_total_limit,支持了SwanLab监控训练,详见[Megatron-SWIFT命令行参数文档](https://swift.readthedocs.io/zh-cn/latest/Megatron-SWIFT/Command-line-parameters.html)
ViT用的是transformers的模型结构,目前没有并行,训练遇到OOM时降低`decoder_first_pipeline_num_layers`。
Megatron-SWIFT支持新的模型,目前没有教程,请查看新增模型的PR。
sequence_parallel的并行数等于tp数。
FP8训练支持block wise,参考[examples/megatron/fp8例子](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/fp8)。
### Q38: 请问Megatron-SWIFT如何配置断点续训?
配置`--mcore_model`加载checkpoint,另外根据需要配置这几个参数,`--finetune`,`--no_load_optim`,`--no_load_rng`。如果是LoRA断点续训,配置`--mcore_adapter`,其他同全参数训练,详见[Megatron-SWIFT命令行参数文档](https://swift.readthedocs.io/zh-cn/latest/Megatron-SWIFT/Command-line-parameters.html)。
### Q39: mtp相关问题
需要MTP训练,请设置命令行参数`mtp_num_layers`。
如果base模型不附带MTP结构,可以从头初始化训练MTP。
多模态的MTP目前还没支持。
### Q40: 有个关于Megatron GKD的问题请教,如果teacher是Qwen3-235B,student是Qwen3-30BA3B,之前SFT 235B都是pp8然后decoder first和decoder last设为11。如果我在GKD的时候也设置decoder first last,会不会影响student的并行?
现在两个模型的并行参数是共用的,不同并行的设置会在v4版本后支持。
### Q41: 量化模型训练相关问题
QLoRA微调参考[例子](https://github.com/modelscope/ms-swift/tree/main/examples/train/qlora)。
量化模型不能全参数微调,GPTQ模型的int型参数无法参与求导,只能附着LoRA等额外结构参与更新。
QLoRA训练后的模型merge参考[QLoRA例子](https://github.com/modelscope/ms-swift/tree/main/examples/train/qlora)。
Megatron-SWIFT不支持QLoRA训练。
### Q42: 一些特殊模型的训练
SWIFT目前不支持MiniCPM-O使用音频模态输入的训练。
微调DeepSeek-VL-2,transformers用4.42以前的版本,`peft==0.11.*`。
Moonlight-16B-A3B-Instruct微调。因为模型文件中禁止了训练, 参考DeepSeek-VL-2的解决方案,issue中搜索。
微调Ovis2这个模型有点特殊,需要padding到max_length。设置一下`--max_length`。
Qwen2.5-Omni目前不支持talker训练,只有thinker。
Qwen2-Audio的sft不支持packing。
### Q43: 请问在不支持flash attention的设备上attention implementation默认是什么呢?文档中默认是none
默认使用sdpa。
### Q44: 请问默认模型训练都是left padding是吧?
训练可以选择使用左padding还是右padding。默认是右padding, `batch infer`都是左padding。
### Q45: 请问下MoE的参数有哪些,参数表里关键字搜索不到?专家数量,专家路由这些参数怎么设置?
直接用config.json中的参数。
### Q46: SWIFT能够支持设置最小的learning rate吗,感觉最后减到太小了
可以设置,`--lr_scheduler_type cosine_with_min_lr --lr_scheduler_kwargs '{"min_lr": 1e-6}'`。
### Q47: 目前支持用yaml文件配置grpo和sft吗?
都支持的,该配置是在main.py中直接处理成命令行。
### Q48: 请问现在是不支持use_liger_kernel和log_entropy一起用吗?
不支持。
### Q49: 请问下,遇到这个报错,怎么处理?安装了apex也不行
```text
RuntimeError: ColumnParallelLinear was called with gradient_accumulation_fusion set to True but the custom CUDA extension fused_weight_gradient_mlp_cuda module is not found. To use gradient_accumulation_fusion you must install APEX with --cpp_ext and --cuda_ext. For example: pip install --global-option="--cpp_ext" --global-option="--cuda_ext ." Note that the extension requires CUDA>=11. Otherwise, you must turn off gradient accumulation fusion.
```
设置一下`--gradient_accumulation_fusion false`。
### Q50: 几个任务一起finetune vlm,不同任务视频采样规则不一致,ms-swift是否支持?在哪里配置?
[命令行参数文档](https://swift.readthedocs.io/zh-cn/latest/Instruction/Command-line-parameters.html)看下`interleave_prob`。
### Q51: 想问一个问题,多模态packing预训练每次pytorch allocator cache flushes since last step后,显存使用好像就会增长一点,步数多了容易oom
加个环境变量`PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'`。
### Q52: use_logits_to_keep 现在多模态大模型上可以用吗?
如果多模态token的展开在模型的forward内会报错。
### Q53: 请问一下为什么训练到会有好几次显存大幅度增加,已经50step或者100step
设置环境变量`PYTORCH_CUDA_ALLOC_CONF`,具体查看PyTorch文档。
### Q54: 从qwen base模型微调成chat模型有没有实践文档,有什么要特别配置的吗?
`swift sft`,没有其他需要特别配置的,参考[例子](https://github.com/modelscope/ms-swift/tree/main/examples/train/base_to_chat)。
### Q55: 模型训练后,回复重复了很多内容
参考[预训练与微调](https://swift.readthedocs.io/zh-cn/latest/Instruction/Pre-training-and-Fine-tuning.html)。如果训练过程中出现重复的情况,请多训练几个epoch, 清洗数据, 全参数训练, 采用RLHF的方式缓解。
### Q56: 请问为什么 --torch_dtype float16 (卡不能使用bf16)会出现报错:lib/python3.12/site-packages/torch/amp/grad_scaler.py", line 260, in _unscale_grads_ raise ValueError("Attempting to unscale FP16 gradients.") ValueError: Attempting to unscale FP16 gradients.
全参数,不能fp16训练的。
### Q57: 请问下,lora参数合并报错,目前peft是0.11.0,这个是因为peft版本需要升级吗
```text
File "/opt/conda/lib/python3.9/site-packages/peft/config.py", line 118, in from_peft_type
return config_cls(**kwargs)
TypeError: __init__() got an unexpected keyword argument 'corda_config'
```
训练和合并的peft版本不一致导致的。
### Q58: 请问这个问题如何解决?safetensors_rust.SafetensorError: Error while deserializing header: HeaderTooLarge
磁盘空间不足了,模型没有保存完整。
### Q59: 这个错误为什么会出现在这,numpy.object找不到在哪?
`numpy==1.26.3`,尝试一下。
### Q60: unsloth训练,报错:assert(type(target_modules) in (list,tuple,))。配置的参数是--target_modules all-linear
别用`all-linear`,改为具体的模块列表,比如`--target_modules q k v`。
### Q61: 请问对于qwen2.5-omni来说--freeze_vit false意味着视觉编码器和音频编码器都打开了,有什么办法可以只打开音频编码器不打开视觉编码器吗?
`--target_regex`写一下。
## 推理
SWIFT支持python脚本、命令行、ui界面推理,详见[推理和部署](https://swift.readthedocs.io/zh-cn/latest/Instruction/Inference-and-deployment.html)。
### Q1:SWIFT推理如何设置模型?
如果是全参数训练的模型、LoRA训练后合并的模型或者从model hub下载的模型,设置命令行参数`--model <model_id_or_path>`;LoRA训练后未合并的模型,用`--adapters`设置,同时可通过`--model`指定基模路径。
### Q2: SWIFT如何使用数据集进行推理?推理结果保存在哪儿?
`--val_dataset <dataset_id_or_path>`,指定数据集。对于训练后的模型也可以设置参数`--load_data_args true`。推理结果保存路径通过`--result_path your_path`设置,日志中会打印路径。详见文档[命令行参数](https://swift.readthedocs.io/zh-cn/latest/Instruction/Command-line-parameters.html)。
如果需要保留推理数据集中额外的字段,请设置`--remove_unused_columns false`。
### Q3: SWIFT如何设置批量推理?
如果infer_backend为`transformers`,设置命令行参数`--max_batch_size 16`,或[python脚本](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo.py)。这里max_batch_size指的是每张卡上的batch_size。
### Q4: SWIFT如何设置流式推理?
`--stream true`,此时推理结果将逐条写入jsonl文件。需要注意的是,流式推理不支持ddp。
### Q5: vLLM和SGLang推理后端相关的问题
对于LoRA训练的模型,请查看vLLM和SGLang文档,如果支持LoRA推理则不需要合并。此外,SGLang推理目前不支持多模态。
### Q6: 生成参数相关的问题
temperature等参数默认从generation_config.json中读取。设置`--temperature 0`或者`--top_k 1`可以取消推理随机性。
### Q7: 如何将system_prompt置空?命令行不设置system参数,但是它会加上默认的system。
设置`--system ''`。
### Q8: 推理时如何计算acc/rouge等指标?
参考[推理参数metric](https://swift.readthedocs.io/zh-cn/latest/Instruction/Command-line-parameters.html#id14)。
### Q9: 模型推理的时候如果需要在特定前缀下继续推理的话是设置哪个参数?
参数`--response_prefix`。
### Q10: 数据answer里面已经包含了部分prompt,希望补全answer,应该怎么修改inference?
```text
{"messages": [{"role": "system", "content": ""}, {"role": "user", "content": ""}, {"role": "assistant", "content": "answer1, "}]}
```
用swift3.0以后的版本是可以的,参考[examples/infer/demo_agent](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_agent.py)。
### Q11: 多模态模型推理时如何限制最大像素,以减少显存占用?
设置命令行参数`--max_pixels xxx`、环境变量`MAX_PIXELS=xxx`、或特定模型参数`--model_kwargs '{"max_pixels": xxx}'`,其中环境变量仅对文档中对应的模型生效,详见文档[特定模型参数](https://swift.readthedocs.io/zh-cn/latest/Instruction/Command-line-parameters.html#id18)。
### Q12: SWIFT推理如何输出概率值logprobs参数?
命令行推理设置`--logprobs true`,python脚本推理设置`request_config = RequestConfig(..., logprobs=True, top_logprobs=2)`,参考[test_logprobs.py](https://github.com/modelscope/ms-swift/blob/main/tests/infer/test_logprobs.py)。
### Q13: SWIFT推理如何输出last_hidden_state?
没有例子,可以参考GRPO trainer的`_get_last_hidden_state`方法。
### Q14: transformers,vllm,ollama等推理结果不一致问题
SWIFT的template是对齐transformers的。检查推理参数是否对齐。此外,VllmEngine和TransformersEngine是有差异的。
### Q15: embedding/reranker模型推理
embedding模型推理参考这里的[例子](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_embedding.py)。reranker模型推理参考这里的[例子](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_reranker.py)。
### Q16: 请问在使用python脚本推理时,如何使用cpu?
设置环境变量,`os.environ['CUDA_VISIBLE_DEVICES'] = '-1'`。
### Q17: 使用swift infer命令进行推理,支持多机推理吗?
如果单节点放得下模型,外面封装k8s就行。如果单节点放不下那就不支持。
### Q18: swift sample的时候,好像不支持batch?好像是for循环一个个例子sample,有点慢
有一个[脚本](https://github.com/modelscope/ms-swift/blob/main/examples/train/rft/rft.py),可以用多进程对数据集拆分采样。
### Q19: 特殊模型依赖版本相关问题
Qwen2-Audio推理结果出现混乱,请使用transformers4.48。
transformers4.55.2训练的LoRA不能使用小于4.52的版本加载了,详见[issue#5440](https://github.com/modelscope/ms-swift/issues/5440)。
swift对不同版本的qwen-vl-utils做了兼容,使用qwen2.5-vl和qwen3-vl模型时不需要切换该依赖版本。
### Q20: 报错,safetensors_rust.SafetensorError: Error while deserializing header:MetadataIncompleteBuffer
模型权重损坏了。
## 导出
### Q1: autoawq相关的报错
如果推理没有涉及AWQ量化模型,但出现了autoawq相关的报错,可以尝试卸载autoawq再进行推理。不支持AWQ量化的模型,尝试用GPTQ进行量化。
### Q2: SWIFT量化模型时,一张卡上放不下模型的情况
尝试设置`--device_map cpu`。或者多卡加载模型,单卡量化。
### Q3: 想问一下用swift export对qwen2.5 72B模型进行gptq int4量化,max model length=32768用的是默认值,给的校准数据集有128个样本,但是量化的时候报错了,报错日志是:factorization could not be completed because the input is not positive-definite(the leading minor of order 18145 is not positive-definite)。是什么原因?
海森矩阵不正定的问题,试试其他的数据集。
### Q4: swift export的时候传入自定义的template_type,是不是就可以永久改掉template_type了?如果swift export --template_type 自定义,是不是就可以把模型对应的template改掉
不会被修改,swift中的template是定义在swift内部的,不是以jinja方式保存的。
### Q5: 模型训练完能直接转gguf格式吗?
目前只支持导出ModelFile,详见文档[命令行参数](https://swift.readthedocs.io/zh-cn/latest/Instruction/Command-line-parameters.html)。
## 部署
### Q1: SWIFT部署如何设置模型?
同上面推理Q1。
### Q2: SWIFT如何进行多卡部署?
详见[例子](https://github.com/modelscope/ms-swift/tree/main/examples/deploy)。如果是transformers engine,不支持DDP,不能多卡部署。此外,不支持异构部署,如不同型号的显卡、各显卡设置不同的存储占比等。
### Q3: 通过--system参数指定system prompt与数据集中每个数据前加system prompt以及template的system prompt是不是有一个就行?这些方式对模型来说,是不是一样的?
system优先级:数据集中的>命令行的>template中默认的。
### Q4: 客户端多模态输入相关问题
客户端传入图片、音频等,见[客户端例子](https://github.com/modelscope/ms-swift/tree/main/examples/deploy/client/mllm)。
如果图片url非法,可以设置请求的超时时间,环境变量`SWIFT_TIMEOUT`,或者`InferClient`中可以传参数。
### Q5: 生成参数设置相关问题
temperature等参数推理只能启动前设置,部署可以在启动时设置默认值,之后在客户端继续设置,覆盖默认值。
### Q6: SWIFT部署的模型怎么设置流式生成?
客户端控制的,查看[examples/deploy/client](https://github.com/modelscope/ms-swift/tree/main/examples/deploy/client)。
### Q7: SWIFT部署如何输出token的概率?
服务端设置`--logprobs true`,要客户端传参数,`request_config = RequestConfig(..., logprobs=True, top_logprobs=2)`。
### Q8: 部署模型时,thinking相关问题
如果需要禁止思考,目前只能在swift deploy启动的时候禁止thinking。查看这个[issue](https://github.com/modelscope/ms-swift/issues/4030)。
### Q9: 部署时,设置什么参数可以实现一次输出多个结果?
`RequestConfig`参数`n`。
### Q10: SWIFT部署,指定--infer_backend vllm,和直接使用vllm部署相关问题
如果两者推理结果相差较多,可能是template没对齐。如果推理速度相差较多,可能是图像分辨率不一致。swift默认使用V1 engine,可以通过环境变量`VLLM_USE_V1=1`控制。
### Q11: 特殊模型和依赖版本相关问题
如果遇到报错没有“model.language_model.embed_tokens.weight”,训练前后的transformers版本不一致。
qwen2.5使用fp16推理如果遇到返回乱码,尝试bf16。
### Q12: 有个问题想问一下,qwen2-7b部署后使用客户端时,调用openai的api要使用client.completions.create,不能使用client.chat.completions.create,但是使用qwen2-7b-instruct-q5_k_m.gguf的时候可以使用client.chat.completions.create,这是为什么呀?
base模型可以用client.chat.completions.create的,不过这个是兼容行为。
## 评测
### Q1: SWIFT支持的评测集有哪些?以及如何使用自定义评测集?
标准评测集和用户自定义评测集的使用详见文档[评测](https://swift.readthedocs.io/zh-cn/latest/Instruction/Evaluation.html)。
### Q2: 官方支持的评测数据集手动下载后,swift eval能配置本地路径评测吗?
离线评测请参考EvalScope文档[快速上手](https://evalscope.readthedocs.io/zh-cn/latest/get_started/basic_usage.html)
### Q3: eval微调后的模型,总是会在固定的百分比停掉,但是vllm服务看着一直是有在正常运行的。模型越大,断开的越早。
`SWIFT_TIMEOUT`环境变量设置为-1。
### Q4: 评估的时候可不可以控制数据集条数?评估一个mmlu需要一个多小时,也太慢了。
配置参数`--eval_limit`,这里的`--eval_limit`是控制了每个subset的条数,比如mmlu有50多个subset,每个limit10条,那就是500多条。
### Q5: 问一下评估swift eval里,模型最多生成1024token就结束了,这个如何修改?设置--max_new_tokens 5000,看起来没起作用
查看命令行参数[eval_generation_config](https://swift.readthedocs.io/zh-cn/latest/Instruction/Command-line-parameters.html#id16)
### Q6: 请教一下,想使用OpenCompass的后端评测,如何从本地加载下载好的数据集?
OpenCompass后端不支持设置`data_args`。
### Q7: swift eval 来评估模型,--eval_backend OpenCompass不支持自定义数据集吗?
```text
ValueError: eval_dataset: /mnt/workspace/data.jsonl is not supported.
eval_backend: OpenCompass supported datasets: ['C3', 'summedits', 'WiC', 'csl', 'lambada', 'mbpp', 'hellaswag', 'ARC_e', 'math', 'nq', 'race', 'MultiRC', 'cmb', 'ceval', 'GaokaoBench', 'mmlu', 'winogrande', 'tnews', 'triviaqa', 'CB', 'cluewsc', 'humaneval', 'AX_g', 'DRCD', 'RTE', 'ocnli_fc', 'gsm8k', 'obqa', 'ReCoRD', 'Xsum', 'ocnli', 'WSC', 'siqa', 'agieval', 'piqa', 'cmnli', 'cmmlu', 'eprstmt', 'storycloze', 'AX_b', 'afqmc', 'strategyqa', 'bustm', 'BoolQ', 'COPA', 'ARC_c', 'PMMEval', 'chid', 'CMRC', 'lcsts']
```
OpenCompass不支持自定义数据集,用native可以自定义模式。
### Q8: evalscope原生是可以生成报告的,其他后端如opencompass是不支持生成报告可视化是吗?
目前只支持native的可视化,其他后端还不支持。
### Q9: 请问一下评测ifeval报这个错是什么原因?
```text
[Errno 20] Not a directory: '/root/nltk_data/tokenizers/punkt_tab.zip/punkt_tab/english/collocations.tab'
```
解压这个文件,`unzip /path/to/nltk_data/tokenizers/punkt_tab.zip`。
### Q10: 请问评测时eval_backend='OpenCompass',怎么指定离线数据集路径?
查看[数据准备教程](https://evalscope.readthedocs.io/zh-cn/latest/user_guides/backend/opencompass_backend.html#id3),下载数据集并解压。不用指定`dataset-args`,将数据集文件夹(即data文件夹)放置在当前工作路径下即可。
### Q11: 用evalscope报这个错是什么原因?
```text
unzip: cannot find or open /root/nltk_data/tokenizers/punkt_tab.zip, /root/nltk_data/tokenizers/punkt_tab.zip.zip or /root/nltk_data/tokenizers/punkt_tab.zip.ZIP
```
这是在下载nltk的依赖,手动下载[punkt_tab.zip](https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip),解压到`~/nltk_data/tokenizers`下面。
### Q12: 为啥纯文本没问题,测多模态我们指定路径了,但他还是检测不到数据集,会去下载?
VLMEvalKit流程跟native不一样,会自己下载数据放到`~/LMUData/`下面,详见[文档](https://evalscope.readthedocs.io/zh-cn/latest/user_guides/backend/vlmevalkit_backend.html#id2)。
### Q13: 请问一下swift eval做benchmark评测的时候,是否可以指定llm作为judge, 参数应该怎么传进去?
支持,使用swift得从`extra_eval_args`去传递`judge-model-args`参数,包括`api_key,api_url,model_id`,整体是一个json字符串。
### Q14: 请问在执行eval的时候出现了多卡显存分配不均是什么原因?
```shell
NPROC_PER_NODE=8
ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\ MAX_PIXELS=802816\ swift eval\
--model "$MODEL_PATH” \$EXTRA_ARGS \
--eval_backend Native \ --infer_backend transformers\ --device_map auto \
--eval_limit"$EVAL_LIMIT"\ --eval_dataset general_qa\
--dataset_args "{\"general_qa\": {\"local_path\": \"${DATA_PATH}\", \"subset_list\": [\"${SUBSET_NAME}\"]}}" \ --host 127.0.0.1\> "$LOG_FILE" 2>&1
```
swift eval不支持DDP方式启动。
### Q15: 请问哪里可以看到swift评测的时候送入的query除了问题之外还有哪些额外的字段呢?
最简单的方法是看输出的reviews文件中的input字段,是输入给模型的内容转换后的Markdown格式。如果用backend是opencompass的话没有这些,需要用native backend。
ms-swift的eval能力使用了魔搭社区评测框架EvalScope, 复杂能力请直接使用[EvalScope框架](https://evalscope.readthedocs.io/zh-cn/latest/get_started/introduction.html)。
================================================
FILE: docs/source/Instruction/GKD.md
================================================
# GKD
GKD(Generalized Knowledge Distillation,广义知识蒸馏)训练算法由论文 [On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes](https://arxiv.org/pdf/2306.13649) 提出。该算法通过结合离线(off-policy)和在线(on-policy)学习策略,将教师模型的知识迁移到学生模型中。
## 损失函数
当给定输入序列 $x$ 与输出序列 $y$,GKD 的损失函数可以写为:
$$
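\mathcal{L}_{\text{GKD}}(x, y) = \sum_{t=1}^{|y|} D_{\text{JSD}(\beta)}\left(P_{\text{teacher}}(\cdot | x, y_{<t}),\ P_{\text{student}}(\cdot | x, y_{<t})\right)
$$
其中 $y_{<t} = (y_1, \ldots, y_{t-1})$ 表示前 $t-1$ 个 token,$D_{\text{JSD}(\beta)}$ 为广义 Jensen-Shannon 散度,通过混合分布 $M$ 定义:
$$
D_{\text{JSD}(\beta)}(P_T, P_S) = \beta \cdot \text{KL}(P_T \| M) + (1-\beta) \cdot \text{KL}(P_S \| M), \quad M = \beta \cdot P_T + (1-\beta) \cdot P_S
$$
其中 $P_T$、$P_S$ 分别为教师模型和学生模型的输出分布。
> 对极端情况($\beta = 0$ 或 $\beta = 1$),直接计算单个 KL 散度: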
> - 当 $\beta = 0$ 时:直接定义 $D = \text{KL}(P_{\text{teacher}} \| P_{\text{student}})$(Forward KL,Mode-covering)
> - 当 $\beta = 1$ 时:直接定义 $D = \text{KL}(P_{\text{student}} \| P_{\text{teacher}})$(Reverse KL,Mode-seeking)
> - 当 $0 < \beta < 1$ 时:使用上述混合分布公式进行插值
通过调节 $\beta$ 参数,可以在不同的散度度量之间进行插值,当 $\beta = 0.5$ 时,散度为标准的对称 JSD。
## 三种训练模式
GKD训练具有三种训练模式,区别在于输出序列 $y$ 的来源。
### 模式选择逻辑
训练时,每个样本按照以下优先级选择模式:
```python
# 伪代码:模式选择逻辑
if random() < lmbda:
# Mode 1: On-Policy 学习,由学生模型采样输出序列
y = student.generate(x)
source = "student"
elif seq_kd:
# Mode 2: Sequential KD,由教师模型采样输出序列
y = teacher.generate(x)
source = "teacher"
else:
# Mode 3: 使用数据集中的输出序列
y = y_ground_truth
source = "dataset"
# 相同的损失函数
loss = D_JSD(P_teacher(·|x,y), P_student(·|x,y))
```
### Mode 1: On-Policy 学习
设置参数`lmbda`, 以概率 $\lambda$ 触发,使用学生模型采样 $y \sim P_{\text{student}}(\cdot | x)$
- 学生模型从**自己生成的序列**中学习
- 暴露在自己可能犯的错误中,学会**自我纠正和错误恢复**
- 对齐训练分布与推理分布
- 提升模型的鲁棒性和实际应用表现
**适用场景**:
- 学生模型已有一定生成能力
- 希望提升模型在真实推理场景下的表现
### Mode 2: Sequential KD(`seq_kd=True` 且未触发 on-policy)
设置参数 `seq_kd=True`, 当未触发 on-policy 时,使用教师模型采样
**数据来源**:$y \sim P_{\text{teacher}}(\cdot | x)$
### Mode 3: 离线学习(其他情况)
**数据来源**:$y = y^* \sim \text{Dataset}$
- 学生模型从**数据集的标注序列**中学习
## 参数设置
我们可以通过设置以下参数进行 GKD 训练:
### 基础参数
| 参数 | 类型 | 默认值 | 取值范围 | 说明 |
|------|------|--------|---------|------|
| `--teacher_model` | str | None | - | 教师模型路径或模型 ID |
| `--beta` | float | 0.5 | [0.0, 1.0] | 散度插值系数 • 0.0: Forward KL • 0.5: JSD (平衡) • 1.0: Reverse KL |
| `--lmbda` | float | 0.5 | [0.0, 1.0] | On-Policy 学习触发概率 • 0.0: 离线学习 • 0.5: 混合策略 • 1.0: 纯 On-Policy |
| `--seq_kd` | bool | False | True/False | 是否使用教师生成序列 • False: 非 on-policy 时使用数据集 • True: 非 on-policy 时使用教师生成 |
| `--temperature` | float | 0.9 | > 0 | 生成采样温度,控制随机性 |
| `--sft_alpha` | float | 0 | >= 0 | 混合一定比例的sft loss,对非student生成结果生效 |
| `--max_completion_length` | int | 512 | > 0 | 生成时的最大 token 数 |
### Top-K KL 计算
默认情况下,GKD 使用完整词表计算 KL 散度,容易造成 OOM,这种情况下可以使用 **Top-K** 模式来减少显存占用和计算量。
| 参数 | 类型 | 默认值 | 说明 |
|------|------|--------|------|
| `--gkd_logits_topk` | int | None | Top-K logits 数量 • None: 使用完整词表(默认) • 正整数: 仅使用教师模型概率最高的 K 个 token 计算 KL |
**Top-K 模式原理**:
在 Top-K 模式下,选取**教师模型**输出概率最高的 K 个 token,在这个子集上计算两个模型分布的 KL 散度。
$$
D_{\text{JSD}(\beta)}^{\text{top-k}}(P_T, P_S) = \beta \cdot \text{KL}(\tilde{P}_T \| \tilde{M}) + (1-\beta) \cdot \text{KL}(\tilde{P}_S \| \tilde{M})
$$
其中 Top-K 索引来自教师模型:$\text{Top-K} = \text{argtop}_K(P_T)$,$\tilde{P}_T$ 和 $\tilde{P}_S$ 是在 Top-K 子集上**重新归一化**的概率分布:
$$
\tilde{P}_T(v) = \frac{P_T(v)}{\sum_{v' \in \text{Top-K}} P_T(v')}, \quad \tilde{P}_S(v) = \frac{P_S(v)}{\sum_{v' \in \text{Top-K}} P_S(v')}, \quad v \in \text{Top-K}
$$
**使用示例**:
```bash
swift rlhf \
--rlhf_type gkd \
--model Qwen/Qwen2.5-7B-Instruct \
--teacher_model Qwen/Qwen2.5-72B-Instruct \
--gkd_logits_topk 64 \
--dataset your_dataset \
...
```
> **注意**:Top-K 模式不能与 liger kernel 同时使用(`--use_liger_kernel`)。
### 外部教师模型 API
当设置 `gkd_logits_topk` 时,可以使用外部教师模型 API 服务来获取 logprobs,这样可以避免在训练进程中加载教师模型。
| 参数 | 类型 | 默认值 | 说明 |
|------|------|--------|------|
| `--teacher_model_server` | str | None | 教师模型服务地址 如:`http://localhost:8000` |
| `--gkd_logits_topk` | int | **必需** | 使用外部 API 时必须设置,对应 API 返回的 top_logprobs 数量 |
**步骤 1:部署教师模型服务**
```bash
# 使用 vllm serve 部署教师模型
CUDA_VISIBLE_DEVICES=0 vllm serve Qwen/Qwen2.5-14B-Instruct \
--port 8000 \
--max-logprobs 64 \
--gpu-memory-utilization 0.9
```
**步骤 2:启动 GKD 训练**
```bash
swift rlhf \
--rlhf_type gkd \
--model Qwen/Qwen2.5-7B \
--teacher_model_server http://localhost:8000 \
--gkd_logits_topk 64 \
--dataset your_dataset \
--lmbda 1.0 \
--beta 1.0 \
...
```
> **vLLM max_logprobs 限制**:
> - vLLM 默认 `max_logprobs=20`,可通过 `--max-logprobs N` 参数调整
> - `gkd_logits_topk` 不能超过服务端的 `max_logprobs` 设置
## 采样加速
在 GKD 训练中,涉及到两种在线采样的情况:
1. **学生模型采样**(当 `lmbda > 0`):以 $\lambda$ 概率触发学生模型采样
2. **教师模型采样**(当 `seq_kd=True`):以 $1-\lambda$ 概率触发教师模型采样
由于采样过程会显著减慢训练速度,可参考以下两种加速方案:
### 方案 1:学生模型采样加速
**要求**:swift >= 3.10.dev
使用 vLLM 作为推理后端来加速学生模型采样,支持两种部署模式,与 GRPO 一致,参考[GRPO文档](./GRPO/GetStarted/GRPO.md#集群支持), 相关参数参考[GRPO vLLM 参数](./Command-line-parameters.md#vllm_mode)
> **注意**:vLLM 加速仅适用于学生模型的 on-policy 采样(`lmbda > 0`)。教师模型的 sequential KD 采样(`seq_kd=True`)目前仍使用 Transformers,建议使用预采样方案。
训练脚本参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf/gkd/vllm_server.sh)
使用 Teacher Server 的训练脚本参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf/gkd/teacher_server.sh)
### 方案 2:教师模型预采样
对于教师模型采样(`seq_kd=True`),推荐使用 **预采样** 方式:先用教师模型离线生成高质量数据,再进行训练。
**步骤 1:使用教师模型生成数据**
```bash
export teacher_model='OpenGVLab/InternVL3-8B'
NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift infer \
--model $teacher_model \
--infer_backend vllm \
--val_dataset 'modelscope/coco_2014_caption:validation#5000' \
--vllm_gpu_memory_utilization 0.9 \
--vllm_max_model_len 8192 \
--max_new_tokens 2048 \
--write_batch_size 1000 \
--result_path teacher_generated_data.jsonl
```
**步骤 2:使用预生成数据训练**
```bash
swift rlhf \
--rlhf_type gkd \
--model OpenGVLab/InternVL3-2B-Pretrained \
--teacher_model $teacher_model \
--dataset 'teacher_generated_data.jsonl' \
--seq_kd false \
...
```
训练脚本参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal/rlhf/gkd/fast.sh)
## On-Policy Distillation
我们可以通过设置以下参数实现 Thinking Machines Lab 博客中的[On-Policy Distillation](https://thinkingmachines.ai/blog/on-policy-distillation/)训练。
```bash
--lmbda 1 # on-policy
--beta 1 # reverse
```
相关脚本可以参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/on_policy_distillation.sh)
## OPSD(On-Policy Self-Distillation)
OPSD([On-Policy Self-Distillation](https://arxiv.org/abs/2601.18734)) 是一种**单模型自蒸馏**方法,无需额外的教师模型。核心思想是:同一个模型同时扮演教师和学生,教师通过接收**特权信息**(如参考解答)来引导学生学习。
### 核心机制
- **学生**:仅看到问题,正常推理
- **教师**:看到问题 + 参考解答(通过 `teacher_prompt` 列提供特权信息),产出更优的概率分布
- **训练目标**:用 JSD 散度对齐学生和教师的输出分布
### 两种自蒸馏模式
| 模式 | 参数配置 | 教师权重 | 说明 |
|------|---------|---------|------|
| **Dynamic**(动态) | 不传 `--teacher_model` | 学生当前权重 | 教师随训练同步更新 |
| **Fixed**(固定) | `--teacher_model` 设为与学生相同的模型 | 初始教师权重 | 教师权重固定 |
### 数据格式
OPSD 需要数据集包含 `teacher_prompt` 列来提供教师的特权信息。可通过 `--external_plugins` 加载数据处理插件来构建该列。
以数学推理数据集 `open-r1/OpenThoughts-114k-math` 为例:
```python
from swift.dataset import DatasetMeta, RowPreprocessor, register_dataset
class OpenThoughtsOPSDPreprocessor(RowPreprocessor):
def preprocess(self, row):
if not row.get('correct', True):
return None
problem = row.get('problem', '')
solution = row.get('solution', '')
# 教师看到问题 + 参考解答
teacher_prompt = f'{problem}\n\nReference solution:\n{solution}\n\nNow articulate your own reasoning.'
messages = [
{'role': 'system', 'content': 'Please reason step by step, and put your final answer within \\boxed{}.'},
{'role': 'user', 'content': problem},
]
return {'messages': messages, 'teacher_prompt': teacher_prompt}
register_dataset(DatasetMeta(
ms_dataset_id='open-r1/OpenThoughts-114k-math',
preprocess_func=OpenThoughtsOPSDPreprocessor(),
tags=['math', 'opsd'],
))
```
### 参数设置
OPSD 复用 GKD 的所有参数,核心区别在于 `--teacher_model` 的配置:
| 参数 | Dynamic 模式 | Fixed 模式 |
|------|-------------|-----------|
| `--teacher_model` | 不设置 | 设为与 `--model` 相同的模型 |
训练脚本参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf/opsd/)
Megatron脚本参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/gkd/opsd.sh)
================================================
FILE: docs/source/Instruction/GRPO/AdvancedResearch/CHORD.md
================================================
# On-Policy RL Meets Off-Policy Experts: Harmonizing SFT and RL via Dynamic Weighting (CHORD)
**版本依赖**:ms-swift>=3.9
本文档介绍论文 [On-Policy RL Meets Off-Policy Experts: Harmonizing SFT and RL via Dynamic Weighting](https://arxiv.org/abs/2508.11408) 中提出的 CHORD 算法。CHORD 的核心思想是在强化学习过程中,动态融合专家数据(SFT),通过 全局权重 μ + token 级别权重 φ 的双重控制机制,在模仿与探索之间实现平衡。
## 算法概述
CHORD 算法通过在 GRPO loss 中引入 **SFT loss**,实现动态混合训练。总体目标函数为:
$$
\mathcal{L}_{\text{CHORD}} = (1 - \mu) \cdot \mathcal{L}_{\text{GRPO}} + \mu \cdot \mathcal{L}_{\text{SFT}}
$$
其中:
- $\mathcal{L}_{\text{GRPO}}$:基于 on-policy 采样的强化学习损失(类似 PPO)。
- $\mathcal{L}_{\text{SFT}}$:监督微调损失。
- $\mu \in [0, 1]$:全局平衡系数,控制 SFT 信号在总梯度中的贡献。
### 参数配置(数据与批量大小)
我们可以基于 GRPO 训练实现 CHORD 训练。
CHORD 需要在训练时指定额外的 SFT 数据集和批量大小:
- `chord_sft_dataset`: 用于提供专家数据的 SFT 数据集。
- `chord_sft_per_device_train_batch_size`: 每个设备的 SFT mini-batch 大小。
---
## 两种 CHORD 变体
论文提出了两种算法变体:**CHORD-µ** 和 **CHORD-ϕ**。
### CHORD-µ
通过在训练过程中逐步 **衰减 μ**,实现从模仿专家到自主探索的过渡。
**参数:**
- `chord_mu_peak`:μ 的峰值。
- `chord_mu_valley`:μ 的衰减终值。
- `chord_mu_warmup_steps`:μ 值上升至峰值的训练步数。
- `chord_mu_decay_steps`:μ 从峰值衰减到谷值的训练步数。
### CHORD-ϕ(Token 级加权)
**CHORD-ϕ** 通过 **token-wise 权重函数 φ** 动态控制每个专家 token 的梯度贡献。
**φ 定义:**
$$
\phi(y_t^\star, \pi_\theta) = p_t \cdot (1 - p_t)
$$
其中:
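- $p_t = \pi_\theta(y_t^\star \mid x, y_{<t}^\star)$:当前策略对专家 token $y_t^\star$ 的预测概率。
- 当 $p_t = 0.5$(模型最不确定)时 φ 取最大值;当 $p_t$ 接近 0 或 1 时 φ 趋近于 0,从而降低这些 token 的梯度贡献。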
mu值衰减与loss计算的代码实现,请参考`GRPOTrainer`的`_compute_chord_loss`方法。
训练参考该[脚本](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/internal/chord.sh)
================================================
FILE: docs/source/Instruction/GRPO/AdvancedResearch/CISPO.md
================================================
# Clipped Importance Sampling Policy Optimization (CISPO)
**版本依赖**:ms-swift>=3.11
Clipped Importance Sampling Policy Optimization (CISPO) 是 [MiniMax-M1](https://arxiv.org/abs/2506.13585) 论文中提出的一种强化学习算法。相比GRPO(Group Relative Policy Optimization)算法,CISPO 对重要性采样权重(importance sampling weights)本身进行裁剪。
## 算法原理
为便于理解,我们基于 GRPO 算法进行对比说明。
GRPO通过裁剪策略比率来限制策略更新幅度,其损失函数为:
$$
\mathcal{L}_{\text{GRPO}}(\theta) = -\mathbb{E}\left[\min\left(r_t(\theta) \cdot \hat{A}_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) \cdot \hat{A}_t\right)\right]
$$
其中 $r_t(\theta) = \frac{\pi_\theta(a_t|s_t)}{\pi_{\theta_{\text{old}}}(a_t|s_t)}$ 是重要性采样比。
在处理长推理链条时,这种裁剪方式可能导致以下问题:
**关键 Token 的梯度被抑制**:在复杂推理任务中,某些关键的低概率 token(如 *However, Recheck, Wait, Aha*)对于触发深度思考和推理纠错至关重要。这些 token 在旧策略 $\pi_{\theta_{\text{old}}}$ 中概率较低,当新策略试图提高其概率时,会导致较大的策略比率 $r_t(\theta)$,GRPO 的裁剪机制会将这些 token 丢弃。
### CISPO 的解决方案
CISPO 的核心思想是:裁剪重要性采样权重,保留梯度更新。具体来说,CISPO 的损失函数为:
$$
\mathcal{L}_{\text{CISPO}}(\theta) = -\mathbb{E}\left[\text{detach}\left(\min(r_t(\theta), \epsilon_{\text{high}})\right) \cdot \hat{A}_t \cdot \log \pi_\theta(a_t|s_t)\right]
$$
其中 $r_t(\theta) = \frac{\pi_\theta(a_t|s_t)}{\pi_{\theta_{\text{old}}}(a_t|s_t)}$ 是重要性采样比。
**关键机制**:
- 对重要性采样权重进行裁剪:$\min(r_t(\theta), \epsilon_{\text{high}})$
- **detach 操作**:裁剪后的权重不参与梯度计算,作为常数系数
- 梯度来自 $\log \pi_\theta(a_t|s_t)$ 项,保证所有 token 都有梯度贡献
## 实现细节
CISPO 的伪代码实现如下:
```python
log_ratio = per_token_logps - old_per_token_logps
importance_weights = torch.exp(log_ratio) # r_t(θ) = π_θ / π_θ_old
clamped_ratios = torch.clamp(importance_weights, max=epsilon_high).detach()
per_token_loss = -clamped_ratios * advantages.unsqueeze(1) * per_token_logps
```
## 参数设置
我们可以基于 `GRPOTrainer`,通过设置以下参数实现 CISPO 训练:
```bash
--loss_type cispo
--epsilon_high 5.0
```
> 相比其他算法, cispo 的 epsilon_high 一般取值较大,minimax论文中未给出具体的参数设置,这里的值参考论文[ScaleRL](https://arxiv.org/pdf/2510.13786)的实验设置
其他训练参数参考 [GRPO参数文档](../../Command-line-parameters.md#grpo参数)
================================================
FILE: docs/source/Instruction/GRPO/AdvancedResearch/DAPO.md
================================================
# DAPO: An Open-Source LLM Reinforcement Learning System at Scale
[Decoupled Clip and Dynamic sAmpling Policy Optimization (DAPO)](https://arxiv.org/abs/2503.14476)在GRPO的基础上设置了几种trick,分别是
- [Clip Higher](#clip-higher)
- [Dynamic Sampling](#dynamic-sampling)
- [Token level Loss](#token-level-loss)
- [Overlong Filtering](#overlong-filtering)
- [Soft Overlong Punishment](#soft-overlong-punishment)
## Clip Higher
PPO和GRPO使用对称裁剪范围(如±0.2)限制策略更新幅度,虽然保证了稳定性,但也制约了模型的探索能力。特别是当某些token在旧策略中概率极低时,即使当前梯度显示其应被强化(A>0),最大增幅也被严格限制。
DAPO使用非对称裁剪范围, 提高上裁剪范围来鼓励模型进行探索:
- 上界(鼓励侧)放宽至0.28
- 下界(抑制侧)保持0.2不变
GRPO中,默认使用`epsilon`设置对称裁剪范围。
使用参数
- `epsilon_high` 设置上裁剪范围,此时参数`epsilon` 为下裁剪范围
## Dynamic Sampling
GRPO对每个问题采样多个回答计算组间优势,
$$
\hat{A}_{i,t} = \frac{R_i - \text{mean}(\{R_j\}_{j=1}^G)}{\text{std}(\{R_j\}_{j=1}^G)}
$$
而当生成的所有输出 $\{o_i\}$ 获得相同奖励时,组间优势等于0,会出现梯度消失,导致训练效率下降。
DAPO引入动态采样策略解决这一问题:
- 采样阶段跳过组间奖励标准差为0的数据
- 持续生成样本直到填满批次
使用参数
- `dynamic_sample true` 来开启动态采样
- `max_resample_times` 设置最多重采样次数
## Token level Loss
GRPO 在归一化损失时采用句子级归一化,这会导致损失计算具有长度偏差。
DAPO 使用token级归一化,避免了回答长度在损失计算上的偏差。
使用参数
- loss_type bnpo/dapo 来使用token级归一化
> loss_type 计算公式可参考[文档](../DeveloperGuide/loss_types.md)
## Overlong Filtering
DAPO 认为被强制截断的回复的奖励噪声较大,可能会导致模型难以区分质量问题和长度问题。为此,DAPO 筛除了训练中被截断的数据,使其不参与损失计算。
使用参数
- overlong_filter 开启对超长样本的过滤
## Soft Overlong Punishment
语言模型常面临生成长度控制难题:
- 过长输出可能被截断,导致正确内容被误判
- 无约束生成长度影响实用性和计算效率
DAPO 设计了三段式长度惩罚函数:
$$
R_{\text{length}}(L) =
\begin{cases}
0, & L \leq L_{\text{max}} - L_{\text{cache}} \\[10pt]
\dfrac{(L_{\text{max}} - L_{\text{cache}}) - L}{L_{\text{cache}}}, & L_{\text{max}} - L_{\text{cache}} < L \leq L_{\text{max}} \\[10pt]
-1, & L > L_{\text{max}}
\end{cases}
$$
在长度位于 $(L_{\text{max}} - L_{\text{cache}} < L \leq L_{\text{max}})$ 区间时设置线性递增惩罚,在 $(L > L_{\text{max}})$ 时设置最大惩罚(-1)
使用参数
- `reward_funcs soft_overlong` 启用该奖励函数
- `soft_max_length` 设置L_max,默认等于模型的最长输出长度(max_completion_length)。
- `soft_cache_length`: 设置L_cache
## 参数设置
综上所述,我们可以基于GRPOTrainer,设置以下参数实现 DAPO 训练。
| 参数 | 类型 | 值 |
|----------------------|-----------|-------------|
| `--loss_type` | `str` | `bnpo`/`dapo`|
| `--epsilon_high` | `float` | `0.28` |
| `--dynamic_sample` | `bool` | `true` |
| `--max_resample_times` | `int` | `3` |
| `--overlong_filter` | `bool` | `true` |
| `--reward_funcs` | `str` | `soft_overlong`|
| `--soft_cache_length` | `int` | `4096`|
================================================
FILE: docs/source/Instruction/GRPO/AdvancedResearch/GSPO.md
================================================
# Group Sequence Policy Optimization
**版本依赖**:ms-swift>=3.7
[Group Sequence Policy Optimization](https://arxiv.org/abs/2507.18071)中指出GRPO在计算重要性采样权重时,是在token级别进行操作的。然而,这种做法由于每个token仅采样一次,无法实现有效的分布校正,反而会在模型训练过程中引入高方差噪声,极易导致模型的梯度估计不稳定,最终造成模型训练的崩塌。因此,论文认为,优化目标的单位应该与奖励的单位保持一致。由于奖励通常是在序列级别(即完整生成的回复)给出的,因此更合理的做法是将 off-policy 校正和优化也提升到序列级别,而非 token 级别。以下是三种计算策略对比:
1. GRPO
对每个 token 独立计算重要性采样比,具体公式为
$$
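w^{\mathrm{GRPO}}_{i,t} = \frac{\pi_\theta (y_{i, t} \mid x, y_{i, <t})}{\pi_{\theta_{\mathrm{old}}} (y_{i, t} \mid x, y_{i, <t})}
$$
2. GSPO (Sequence-Level)
在序列级别上计算重要性采样比,即对 token 级对数比值按长度取平均后取指数,具体公式为
$$
w^{\mathrm{GSPO}}_{i} = \left[ \frac{\pi_\theta (y_i \mid x)}{\pi_{\theta_{\mathrm{old}}} (y_i \mid x)} \right]^{\frac{1}{|y_i|}} = \exp\left( \frac{1}{|y_i|} \sum_{t=1}^{|y_i|} \log \frac{\pi_\theta (y_{i, t} \mid x, y_{i, <t})}{\pi_{\theta_{\mathrm{old}}} (y_{i, t} \mid x, y_{i, <t})} \right)
$$
3. GSPO-token
在序列级权重的基础上保留 token 级的梯度,具体公式为
$$
w^{\mathrm{GSPO\text{-}token}}_{i,t} = \mathrm{sg}\left[ w^{\mathrm{GSPO}}_{i} \right] \cdot \frac{\pi_\theta (y_{i, t} \mid x, y_{i, <t})}{\mathrm{sg}\left[ \pi_\theta (y_{i, t} \mid x, y_{i, <t}) \right]}
$$
其中 $\mathrm{sg}[\cdot]$ 表示梯度截断(即 `detach()`)。
> 注意:根据梯度推导(即论文中的公式(11)和(18)),当各 token 的 advantage 相同时,GSPO-token 与 GSPO 等价。当前的 GRPO 实现中,所有 token 的 advantage 实际上都是基于句子级 reward 并在 group 内进行归一化,因此在这种设置下,GSPO-token 和 GSPO 在理论上是等价的。不过,GSPO-token 为未来更细粒度(token 级别)的 advantage 提供了支持。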
伪代码实现
```python
log_ratio = per_token_logps - old_per_token_logps
# GRPO
log_importance_weights = log_ratio
# GSPO (Sequence-Level)
seq_weight = (log_ratio * mask).sum(-1) / mask.sum(-1)
log_importance_weights = seq_weight.unsqueeze(-1) # (B,1)
# GSPO-token
seq_weight = (log_ratio * mask).sum(-1) / mask.sum(-1)
log_importance_weights = seq_weight.detach().unsqueeze(-1) + (per_token_logps - per_token_logps.detach())
importance_weights = torch.exp(log_importance_weights)
```
我们可以在 GRPO 训练的基础上,通过参数 `--importance_sampling_level` 选择不同的算法:
- `importance_sampling_level token` (默认,GRPO 实现)
- `importance_sampling_level sequence` (GSPO)
- `importance_sampling_level sequence_token` (GSPO-token)
其中 sequence_token 要求 ms-swift >= 3.8
论文其他超参
```bash
--epsilon 3e-4 # from paper section 5.1
--epsilon_high 4e-4 # from paper section 5.1
--steps_per_generation 4 # from paper section 5.1 (each batch of rollout data is partitioned into four minibatches for gradient updates)
--beta 0 # zero kl regularization https://github.com/volcengine/verl/pull/2775#issuecomment-3131807306
```
训练可以参考该[脚本](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/internal/gspo.sh)
================================================
FILE: docs/source/Instruction/GRPO/AdvancedResearch/REINFORCEPP.md
================================================
# REINFORCE++: An Efficient RLHF Algorithm with Robustness to Both Prompt and Reward Models
**版本依赖**:ms-swift>=3.10
[REINFORCE++ Baseline](https://arxiv.org/abs/2501.03262) 是 REINFORCE++ 算法的简化版本,适用于 outcome rewards(response-level 标量奖励)。它与 GRPO 类似,对每个prompt输入采样多条模型输出,并使用组内 baseline 来估计优势函数,主要区别在于标准化时使用的统计量不同。
## 算法原理
为便于理解,我们基于 GRPO(Group Relative Policy Optimization)算法进行对比说明。
GRPO 和 REINFORCE++ Baseline 都采用组内对比的方式来估计优势函数,主要区别在于:
### 区别1:标准化时使用的统计量不同
**GRPO (Group Relative Policy Optimization)**
对每个 prompt 生成 $G$ 个响应样本,使用**组内所有样本的均值和标准差**进行标准化:
$$
\hat{A}_{i} = \frac{R_i - \text{mean}(\{R_j\}_{j=1}^G)}{\text{std}(\{R_j\}_{j=1}^G)}
$$
当设置 `scale_rewards='batch'` 时,使用**原始奖励的批次 std**:
$$
\hat{A}_{i} = \frac{R_i - \text{mean}(\{R_j\}_{j=1}^G)}{\text{std}(\{R_j\}_{j=1}^{N})}
$$
其中 $N$ 是批次中所有样本数。
**REINFORCE++ Baseline**
对每个 prompt 生成 $G$ 个响应样本,先减去组内均值,再使用**减去组内均值后的奖励**的标准差进行标准化:
$$
\begin{align}
\tilde{A}_{i} &= R_i - \text{mean}(\{R_j\}_{j=1}^G) \\
\hat{A}_{i} &= \frac{\tilde{A}_{i}}{\text{std}(\{\tilde{A}_k\}_{k=1}^{N})}
\end{align}
$$
其中 $N$ 是批次中所有样本数。
**关键区别**:
- **GRPO**:标准化时使用**原始奖励 $R$** 的标准差
- **REINFORCE++**:标准化时使用**减去组内均值后的奖励 $\tilde{A}$** 的标准差
### 区别2: KL 散度正则化
与 RLOO 类似,REINFORCE++ Baseline 将 KL 散度整合到奖励项中:
$$
R'_i = R_i - \beta \cdot \text{KL}(\pi_\theta || \pi_{\text{ref}})
$$
其中 $\beta$ 是 KL 散度的权重系数(对应参数 `beta`),$\pi_{\text{ref}}$ 是参考策略。
## 参数设置
我们可以基于 `GRPOTrainer`,通过设置以下参数实现 REINFORCE++ Baseline 训练:
```bash
--advantage_estimator reinforce_plus_plus
--scale_rewards batch
--kl_in_reward true
```
训练可以参考该[脚本](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/internal/reinforce_plus_plus.sh)
### 重要参数说明
- **`--advantage_estimator`**:选择优势函数估计方法
- `grpo`(默认):标准化时使用原始奖励的标准差
- `reinforce_plus_plus`:标准化时使用减去组内均值后的奖励的标准差
- **`--kl_in_reward`**:控制 KL 散度正则化项的处理位置
- `false`:KL 散度作为损失函数的独立正则化项(GRPO 默认)
- `true`:KL 散度直接从奖励中扣除(REINFORCE++ 原始实现)
- **`--scale_rewards`**:控制标准化方式
- `group`(默认):组内标准化
- `batch`:全局批次标准化(REINFORCE++原始实现)
- `none`:不进行标准化
- **`--num_generations`**:每个 prompt 生成的样本数量 $G$
- **`--beta`**:KL 散度正则化系数 $\beta$
其他参数参考 [GRPO参数](../../Command-line-parameters.md#grpo参数)
================================================
FILE: docs/source/Instruction/GRPO/AdvancedResearch/RLOO.md
================================================
# REINFORCE Leave-One-Out (RLOO)
**版本依赖**:ms-swift>=3.10
[REINFORCE Leave-One-Out (RLOO)](https://arxiv.org/abs/2402.14740) 基于经典的 REINFORCE 策略梯度方法,通过留一法(Leave-One-Out)构造无偏的优势函数基线。
## 算法原理
为便于理解,我们基于 GRPO(Group Relative Policy Optimization)算法进行对比说明。
GRPO 和 RLOO 都采用组内对比的方式来估计优势函数,避免了全局基线估计带来的高方差问题。两者的核心区别主要体现在以下两个方面:
### 区别1:优势函数基线的构造方法
**1. GRPO (Group Relative Policy Optimization)**
GRPO 对每个 prompt 生成 $G$ 个响应样本,使用**组内所有样本的均值和标准差**进行标准化:
$$
\hat{A}_{i} = \frac{R_i - \text{mean}(\{R_j\}_{j=1}^G)}{\text{std}(\{R_j\}_{j=1}^G)}
$$
其中:
- $R_i$ 是第 $i$ 个样本的奖励值
- $\text{mean}(\{R_j\}_{j=1}^G) = \frac{1}{G}\sum_{j=1}^G R_j$ 是组内均值
- $\text{std}(\{R_j\}_{j=1}^G)$ 是组内标准差
**2. RLOO (REINFORCE Leave-One-Out)**
RLOO 对每个 prompt 生成 $K$ 个响应样本,使用 **留一法(Leave-One-Out)** 构造基线,即第 $i$ 个样本的基线为除自己外的其他 $K-1$ 个样本的均值:
$$
\hat{A}_{i} = R_i - \frac{1}{K-1}\sum_{j \neq i} R_j
$$
这个公式可以等价地改写为:
$$
\hat{A}_{i} = \frac{K}{K-1} \left(R_i - \bar{R}\right)
$$
其中 $\bar{R} = \frac{1}{K}\sum_{j=1}^K R_j$ 是组内所有样本的均值。
> **说明**:这里使用 $K$ 对齐论文符号,与 GRPO 中的 $G$ 含义一致,均对应配置参数 `num_generations`
**为什么使用留一法?**
留一法的关键优势在于**无偏性**。对于第 $i$ 个样本,其奖励 $R_i$ 和基线 $\frac{1}{K-1}\sum_{j \neq i} R_j$ 是独立的,因此优势估计是无偏的。相比之下,如果使用包含自身的均值作为基线,会引入偏差。
### 区别2:KL 散度正则化项的处理方式
为防止策略偏离参考策略过远,两种算法都引入了 KL 散度正则化,但处理方式不同:
**GRPO**:将 KL 散度作为独立的正则化项添加到[损失函数](../GetStarted/GRPO.md#算法原理)中:
$$
\mathcal{L}(\theta) = -\mathbb{E}\left[\hat{A}_i \log \pi_\theta(a_i|s_i)\right] + \beta \cdot \text{KL}(\pi_\theta || \pi_{\text{ref}})
$$
**RLOO**:将 KL 散度直接整合到奖励项中,构造修正后的奖励:
$$
R'_i = R_i - \beta \cdot \text{KL}(\pi_\theta || \pi_{\text{ref}})
$$
其中 $\beta$ 是 KL 散度的权重系数(对应参数 `beta`),$\pi_{\text{ref}}$ 是参考策略(通常是 SFT 模型或初始策略)。
## 参数设置
我们可以基于 `GRPOTrainer`,通过设置以下参数实现 RLOO 训练:
```bash
# 基本 RLOO 配置
--advantage_estimator rloo # 使用 RLOO 的留一法优势函数计算
--kl_in_reward true # 将 KL 散度项整合到奖励中(RLOO 默认方式)
```
训练可以参考该[脚本](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/internal/rloo.sh)
### 重要参数说明
- **`--advantage_estimator`**:选择优势函数估计方法
- `grpo`(默认):使用组内均值和标准差进行标准化
- `rloo`:使用留一法(Leave-One-Out)构造基线
- **`--kl_in_reward`**:控制 KL 散度正则化项的处理位置
- `false`:KL 散度作为损失函数的独立正则化项(GRPO 方式)
- `true`:KL 散度直接从奖励中扣除,构造修正后的奖励(RLOO 方式)
- **`--num_generations`**:每个 prompt 生成的样本数量 $K$
- **`--beta`**:KL 散度正则化系数 $\beta$
- 控制策略更新的保守程度
其他参数与 [GRPO参数](../../Command-line-parameters.md#grpo参数)一致
================================================
FILE: docs/source/Instruction/GRPO/AdvancedResearch/SAPO.md
================================================
# Soft Adaptive Policy Optimization (SAPO)
**版本依赖**:ms-swift>=3.11
[Soft Adaptive Policy Optimization (SAPO)](https://arxiv.org/abs/2511.20347) 针对 GRPO 中硬裁剪(hard clipping)带来的问题,提出了一种基于温度控制的软门控(soft gate)机制,用于平滑地衰减离策略更新,同时保留有用的学习信号。
## 背景与动机
在强化学习训练 LLM 时,GRPO 通过计算 token 级别的重要性采样比(Importance Sampling Ratio)来处理 off-policy 训练:
$$
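r_t = \frac{\pi_\theta(y_t|x, y_{<t})}{\pi_{\theta_{\mathrm{old}}}(y_t|x, y_{<t})}
$$
SAPO 使用由温度控制的软门控函数代替硬裁剪,并根据优势 $A$ 的符号选用不同的温度参数。
对于正向优势($A > 0$),使用正向门控: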
$$
g^{+}_t = \sigma\left( \tau_{\mathrm{pos}} \cdot (r_t - 1) \right) \cdot \frac{4}{\tau_{\mathrm{pos}}}
$$
对于负向优势($A < 0$),使用负向门控:
$$
g^{-}_t = \sigma\left( \tau_{\mathrm{neg}} \cdot (r_t - 1) \right) \cdot \frac{4}{\tau_{\mathrm{neg}}}
$$
其中:
- $\sigma(\cdot)$ 是 sigmoid 函数
- $\tau_{\mathrm{pos}}$ 和 $\tau_{\mathrm{neg}}$ 是温度参数,控制门控函数的斜率
- $r_t$ 是重要性采样比
### SAPO 损失函数
$$
L^{\mathrm{SAPO}} = -g_t \cdot A
$$
其中 $g_t = g^{+}_t$ 当 $A > 0$,$g_t = g^{-}_t$ 当 $A < 0$。
### 温度参数
温度参数 $\tau$ 控制软门控函数的衰减速率,数值越大,衰减越快。

论文指出正向优势会提升采样token的logit,并降低所有未采样token的logit;负向优势相反,提高许多未采样token的logit,可能会扩散到大量无关token上,带来一定的不稳定性。所以论文推荐设置温度 $\tau_\text{neg} > \tau_\text{pos}$,来使负向奖励的token梯度衰减更快,提升训练的稳定性和性能。
论文默认推荐 $\tau_{\mathrm{pos}} = 1.0$,$\tau_{\mathrm{neg}} = 1.05$。
## 参数设置
| 参数 | 类型 | 默认值 | 说明 |
|------|------|--------|------|
| `--loss_type` | `str` | - | 设置为 `sapo` |
| `--tau_pos` | `float` | `1.0` | 正向优势的温度参数,控制门控斜率 |
| `--tau_neg` | `float` | `1.05` | 负向优势的温度参数,控制门控斜率 |
```bash
swift rlhf \
--rlhf_type grpo \
--loss_type sapo \
--tau_pos 1.0 \
--tau_neg 1.05 \
# ... 其他参数
```
训练脚本参考
- [swift](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/internal/sapo.sh)
- [megatron swift](https://github.com/modelscope/ms-swift/blob/main/examples/megatron/grpo/sapo.sh)
> SAPO 的软门控机制仅在 off-policy 训练下生效。
> SAPO 中的重要性采样粒度为 token 级别(即 importance_sampling_level 默认设置为 token),与 GSPO 冲突。
================================================
FILE: docs/source/Instruction/GRPO/AdvancedResearch/deepeyes.md
================================================
# DeepEyes: Incentivizing "Thinking with Images" via Reinforcement Learning
**版本依赖**:ms-swift>=3.7
## 原理介绍
[DeepEyes论文](https://arxiv.org/abs/2505.14362) 提出了一种利用强化学习使模型具备“think with images”(以图辅助思考)能力的方法。该方法通过端到端的强化学习,模型能力自发涌现,无需额外的 SFT(监督微调)过程。模型内置图像定位能力,能够主动调用“图像放大工具”:在推理过程中,模型会自动选取图片中的具体区域进行放大和裁剪,将处理后的区域信息进行进一步推理,实现视觉与文本的链式推理。

## 最佳实践
**数据集下载与注册**
下载 DeepEyes 官方训练数据集到本地
```bash
# modelscope
modelscope download --dataset Lixiang/ChenShawn-DeepEyes-Datasets-47k
# huggingface
huggingface-cli download ChenShawn/DeepEyes-Datasets-47k --repo-type=dataset
```
数据集内有三个 parquet 文件,需要在 `swift/dataset/data/dataset_info.json` 文件中分别进行注册,并将数据集中的 `prompt` 列重命名为 `messages`
```json
{
"ms_dataset_id": "path/to/data_0.1.2_visual_toolbox_v2.parquet",
"columns": {
"prompt": "messages"
}
},
{
"ms_dataset_id": "path/to/data/data_thinklite_reasoning_acc.parquet",
"columns": {
"prompt": "messages"
}
},
{
"ms_dataset_id": "path/to/data/data_v0.8_visual_toolbox_v2.parquet",
"columns": {
"prompt": "messages"
}
}
```
在本地注册论文中所用到的奖励函数和工具调用逻辑,实现可以参考[DeepEyes实现示例](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin/deepeyes/deepeyes_plugin.py)
**部署验证模型**
Deepeyes 的奖励函数依赖生成式奖励模型对模型生成结果与标准答案进行对比评估,为了加速这一环节,推荐对模型进行部署。
假设使用 Qwen2.5-VL-72B-Instruct 模型进行评估,参考以下部署命令
```bash
# 4*80G
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift deploy \
--model Qwen/Qwen2.5-VL-72B-Instruct \
--infer_backend vllm \
--vllm_tensor_parallel_size 4
```
在 plugin 文件中,使用OpenAI接口进行调用,参考[奖励模型文档](../DeveloperGuide/reward_model.md#外部部署)
训练参考该[脚本](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin/deepeyes/deepeyes.sh)
## 实现细节
[DeepEyes实现示例](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin/deepeyes/deepeyes_plugin.py)参考[官方实现](https://github.com/Visual-Agent/DeepEyes/blob/main/verl/utils/reward_score/vl_agent.py) 给出了 DeepEyes 训练插件的样例代码,涵盖了奖励函数与多轮交互调用的相关逻辑。
**数据集数据**如下
| 数据集文件名 | data_source | 对应评分函数 | 工具调用 |
|------------------------------------------|-----------------------|----------------------------------|------------------|
| data_v0.8_visual_toolbox_v2.parquet | chart | vl_agent.compute_score | True (image_zoom_in_tool) |
| data_0.1.2_visual_toolbox_v2.parquet | vstar | vl_agent.compute_score | True (image_zoom_in_tool) |
| data_thinklite_reasoning_acc.parquet | thinklite_eureka | vl_agent.compute_score_math | False |
**注意**:多模态大模型在处理图像输入时,可能会对图像进行预处理(例如受 max_pixels 参数限制的裁剪或缩放等操作)。当调用图像放大工具 image_zoom_in_tool 时,模型会根据输入图像输出裁剪后的 bbox。因此,在调用图像放大工具时,需要确保输入的是经过预处理后的图像。示例代码展示了 Qwen2.5-VL 系列模型的实现方式:
```python
from qwen_vl_utils import fetch_image
# 这里的images尚未经过图像处理
infer_request.images
# 通过加载为PIL.Image格式,进行裁剪(使用环境变量MAX_PIXELS时的处理)
img = fetch_image({'image': load_pil_image(infer_request.images[0])})
```
**工具奖励**
论文中指出当最终答案正确,且轨迹至少使用一个工具时给予工具奖励。为了避免模型生成的工具调用是无效的,我们通过图像数量而不是 `<tool_call>` 等 token 进行判断。
```python
tool_reward = 1.0 if num_image > 1 and acc_reward > 0.5 else 0.0
```
================================================
FILE: docs/source/Instruction/GRPO/AdvancedResearch/entropy_mask.md
================================================
# Beyond the 80/20 Rule: High-Entropy Minority Tokens Drive Effective Reinforcement Learning for LLM Reasoning
**版本依赖**:ms-swift>=3.7
[论文](https://arxiv.org/abs/2506.01939)发现在以 RLVR等方法训练大型语言模型推理能力时,驱动学习进步的关键在于一小部分高熵“少数 token”,而并非大多数信息熵低的 token。
论文指出,在模型推理的 token 分布中,只有极少数信息熵较高的 token 起到了主导作用。这些 token 往往出现在推理和决策路径分歧最大的关键节点(如 "wait"、"since" 等),决定了模型能否习得复杂推理任务。而大多数熵低的 token 对模型推理能力的提升作用有限。论文提出只对高熵 token 计算策略梯度、舍弃低熵 token 的梯度。
token 熵公式如下
$$
H_t := -\sum_{j=1}^{V} p_{t,j} \log p_{t,j}, \qquad \text{where } (p_{t,1}, \cdots, p_{t,V}) = \mathbf{p}_t = \pi_\theta(\cdot \mid \mathbf{q}, \mathbf{o}_{<t})
$$
> "Token entropy" $H_t$ 始终指向位置 $t$ 的生成分布 $\mathbf{p}_t$ 的不确定性,而非 token $o_t$ 本身的属性。即 $H_t$ 是位置 $t$ 对应分布 $\mathbf{p}_t$ 的熵,与采样得到的 token $o_t$ 无关。
在实践中,我们可以在 GRPO 训练中通过参数 `top_entropy_quantile` 控制训练范围。论文实验设置该参数为 0.2,即每次仅对处于熵分布前 20% 的 token 进行训练优化。
同时使用参数`log_entropy`,可以记录训练过程中的熵值变化,参考[文档](../GetStarted/GRPO.md#logged-metrics)
================================================
FILE: docs/source/Instruction/GRPO/AdvancedResearch/index.rst
================================================
Advanced Research
=================
.. toctree::
:maxdepth: 1
entropy_mask.md
CISPO.md
DAPO.md
deepeyes.md
GSPO.md
CHORD.md
RLOO.md
REINFORCEPP.md
SAPO.md
training_inference_mismatch.md
treepo.md
================================================
FILE: docs/source/Instruction/GRPO/AdvancedResearch/training_inference_mismatch.md
================================================
# Training-Inference-Mismatch
**版本依赖**:ms-swift>=3.11
**TL;DR**: GRPO 引入 vLLM 加速采样过程的同时,也引入了训练-推理不一致(Training-Inference Mismatch)的问题,从而可能影响训练稳定性。本文将解释这个问题的背景、原因以及相应的解决方案。
## Background
### GRPO 的基本假设
GRPO (Group Relative Policy Optimization) 的训练目标可以表示为:
$$
\mathcal{L}_{\text{GRPO}} = - \mathbb{E}_{y \sim \pi_\theta} \left[ \min \left( r_t(\theta) \hat{A}_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) \hat{A}_t \right) \right]
$$
其中:
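- $r_t(\theta) = \frac{\pi_\theta(y_t|x, y_{<t})}{\pi_{\theta_{\text{old}}}(y_t|x, y_{<t})}$ 是重要性采样比
- $\hat{A}_t$ 是优势函数
- $\epsilon$ 是裁剪阈值
当设置了阈值 $\tau$ 时,同时满足以下两个条件的序列会被掩码,不参与损失计算:
1. $\frac{1}{|y_i|}\sum_{t=1}^{|y_i|} \log \frac{\pi_{\text{old}}(y_{i,t}|x, y_{i,<t})}{\pi_\theta(y_{i,t}|x, y_{i,<t})} > \tau$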
2. **且** $\hat{A}_i < 0$
其中:
- $\pi_{\text{old}}$ 优先使用 `rollout_per_token_logps`(rollout/行为策略的 logprobs),若不存在则使用 `old_per_token_logps`
- $\tau$ 是用户设置的阈值(`--off_policy_sequence_mask_delta`,默认 None 表示关闭)
参考资料
1. https://yingru.notion.site/When-Speed-Kills-Stability-Demystifying-RL-Collapse-from-the-Training-Inference-Mismatch-271211a558b7808d8b12d403fd15edda
2. https://fengyao.notion.site/off-policy-rl
3. https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/rollout_corr_helper.py
4. [DeepSeek-V3.2: Pushing the Frontier of Open Large Language Models](https://arxiv.org/abs/2512.02556)
================================================
FILE: docs/source/Instruction/GRPO/AdvancedResearch/treepo.md
================================================
# TreePO: Bridging the Gap of Policy Optimization and Efficacy and Inference Efficiency with Heuristic Tree-based Modeling
**版本依赖**:ms-swift>=3.11
作者: [li2zhi](https://github.com/li2zhi)
## 原理介绍
[TreePO论文](https://arxiv.org/abs/2508.17445) 提出了一种树状结构建模方法。该方法将序列生成组织为分段式的树结构搜索,通过动态分支、回退与提前终止机制,显著提高KV缓存复用率,从而降低计算开销,同时保持甚至增强了探索的多样性。

## 实现细节
[TreePO实现示例](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin/treepo/tree_rollout_plugin.py)参考[官方实现](https://github.com/multimodal-art-projection/TreePO/blob/main/recipe/treepo/vllm_rollout_tree.py) 给出了 TreePO 训练插件的样例代码,涵盖了多轮交互、终止判断,与分支回退等相关逻辑。
**注意**:在实际使用中,你需要根据自己的场景需求,重写step、check_finished等方法的逻辑,以确保其能够在自定义场景下按照预期执行。而关于自定义奖励的设计与使用,你可以参考[DeepEyes](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin/deepeyes/deepeyes_plugin.py)的实现。
训练参考该[脚本](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin/treepo/tree_rollout.sh)
## 测试数据
> model: Qwen/Qwen2.5-0.5B
> dataset: AI-MO/NuminaMath-TIR
> subset size: 1,000 samples
> 1 GPU for training, 1 GPU for inference
| \ | batch_size | num_generation | max_tree_depth | global_step | total inference calls | saving ratio | train_speed(iter/s) | improvement rate |
| ----------------------- | ---------- | -------------- | -------------- | ----------- | --------------------- | ------------ | ------------------- | ---------------- |
| original implementation | 8 | 8 | 4 | 200 | 5965 | 0.00% | 0.292436 | 0.00% |
| tree(max_divergence=3) | 8 | 8 | 4 | 200 | 3678 | 38.34% | 0.31819 | 8.81% |
| | | | | | | | | |
| original implementation | 8 | 8 | 5 | 105 | 4312 | 0.00% | 0.261324 | 0.00% |
| tree(max_divergence=2) | 8 | 8 | 5 | 105 | 2513 | 52.69% | 0.336639 | 28.82% |
| tree(max_divergence=3) | 8 | 8 | 5 | 105 | 2990 | 30.66% | 0.308791 | 18.16% |
| | | | | | | | | |
| original implementation | 8 | 8 | 6 | 105 | 5202 | 0.00% | 0.24832 | 0.00% |
| tree(max_divergence=2) | 8 | 8 | 6 | 105 | 3348 | 35.64% | 0.27755 | 11.77% |
| tree(max_divergence=3) | 8 | 8 | 6 | 105 | 3888 | 25.26% | 0.272339 | 9.67% |
================================================
FILE: docs/source/Instruction/GRPO/DeveloperGuide/gym_env.md
================================================
# GYM环境训练
**注意** GYM环境训练逻辑已在 ms-swift 3.8 中进行重构,如果您的 ms-swift 版本低于该版本,请参考对应版本的文档。
## Gym接口
GYM源自于[OpenAI Gym](https://github.com/openai/gym),是一个抽象的强化学习环境接口,基于现在Model as Agent的趋势,我们在swift中定义了类似的一个接口,为Agent提供端到端的强化学习训练。
```python
class Env(ABC):
def __init__(self, env_config):
"""
Args:
env_config: 环境配置,比如可用工具等
"""
self.env_config = env_config
@abstractmethod
async def reset(self, config: RolloutInferRequest) -> Tuple[str, Dict[str, Any], str]:
"""
Args:
config: 环境初始化信息,应该放在
Returns:
- observation: 第一个user消息作为初始观察或者环境信息,会作为user message
- info: 用于DEBUG和日志的额外信息,会在completions.jsonl中记录
- system_message: 用户当前环境采样的系统提示词
"""
pass
@abstractmethod
async def step(self, action: Messages) -> Tuple[str, float, bool, Dict[str, Any]]:
"""
Args:
action: 所有对话消息,最后一个消息为当前采样回复
Returns:
- next_observation: 环境响应,将作为user message返回
- reward: 奖励
- done: 是否结束
- info: 用于DEBUG和日志的额外信息,会在completions.jsonl中记录
"""
pass
@abstractmethod
async def close(self):
"""Clean up environment resources."""
pass
```
除此之外,根据[Kimi-Researcher的实践](https://moonshotai.github.io/Kimi-Researcher/),我们还额外提供了一个`ContextManager`接口,方便你动态地管理当前的Agent上下文。
**ContextManager指定(非必需)**
1. 在数据集中提供 [ctx_config](#注意事项) 列中的 name 键指定, 初始化相关的参数放在其他键中
2. 使用参数 `--context_manager ctx_name` 指定
```python
class ContextManager(ABC):
def __init__(self,ctx_config):
self.ctx_config = ctx_config
@abstractmethod
def manage_context(self, history: Messages,trajectory_id:str) -> Messages:
"""动态调整当前agent的上下文
Args:
history: 当前的消息历史
Returns:
调整后的消息历史
"""
pass
```
入参示例
```python
infer_request
"""
RolloutInferRequest(
messages=[
{'role': 'system', 'content': 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>\n'}, {'role': 'user', 'content': 'What is the value of $\\sqrt{36 \\times \\sqrt{16}}$?'},
{'role': 'assistant', 'content': 'To find the value of \\(\\sqrt{36 \\times \\sqrt{16}}\\), we will break down the problem step-by-step.\n\nFirst, we need to evaluate the inner square root:\n\\[\n\\sqrt{16}\n\\]\nWe know that:\n\\[\n4^2 = 16 \\implies \\sqrt{16} = 4\n\\]\n\nNext, we substitute this result back into the original expression:\n\\[\n\\sqrt{36 \\times \\sqrt{16}} = \\sqrt{36 \\times 4}\n\\]\n\nNow, we need to evaluate the product inside the square root:\n\\[\n36 \\times 4 = 144\n\\]\n\nSo, the expression simplifies to:\n\\[\n\\sqrt{144}\n\\]\n\nFinally, we determine the square root of 144:\n\\[\n\\sqrt{144} = 12\n\\]\n\nThus, the value of \\(\\sqrt{36 \\times \\sqrt{16}}\\) is:\n\\[\n\\boxed{12}\n\\]'}
],
images=[],
audios=[],
videos=[],
tools=None,
objects={},
data_dict={
'problem': 'What is the value of $\\sqrt{36 \\times \\sqrt{16}}$?',
'solution': "To solve the problem, we need to evaluate the expression \\(\\sqrt{36 \\times \\sqrt{16}}\\).\n\nWe can break down the steps as follows:\n\n1. Evaluate the inner square root: \\(\\sqrt{16}\\).\n2. Multiply the result by 36.\n3. Take the square root of the product obtained in step 2.\n\nLet's compute this step by step using Python code for accuracy.\n```python\nimport math\n\n# Step 1: Evaluate the inner square root\ninner_sqrt = math.sqrt(16)\n\n# Step 2: Multiply the result by 36\nproduct = 36 * inner_sqrt\n\n# Step 3: Take the square root of the product\nfinal_result = math.sqrt(product)\nprint(final_result)\n```\n```output\n12.0\n```\nThe value of \\(\\sqrt{36 \\times \\sqrt{16}}\\) is /\\(\\boxed{12}\\)."
}
)
"""
result
"""
RolloutResponseChoice(
index=0,
message=ChatMessage(
role='assistant',
content='To find the value of \\(\\sqrt{36 \\times \\sqrt{16}}\\), we will break down the problem step-by-step.\n\nFirst, we need to evaluate the inner square root:\n\\[\n\\sqrt{16}\n\\]\nWe know that:\n\\[\n4^2 = 16 \\implies \\sqrt{16} = 4\n\\]\n\nNext, we substitute this result back into the original expression:\n\\[\n\\sqrt{36 \\times \\sqrt{16}} = \\sqrt{36 \\times 4}\n\\]\n\nNow, we need to evaluate the product inside the square root:\n\\[\n36 \\times 4 = 144\n\\]\n\nSo, the expression simplifies to:\n\\[\n\\sqrt{144}\n\\]\n\nFinally, we determine the square root of 144:\n\\[\n\\sqrt{144} = 12\n\\]\n\nThus, the value of \\(\\sqrt{36 \\times \\sqrt{16}}\\) is:\n\\[\n\\boxed{12}\n\\]', tool_calls=None),
finish_reason='stop',
logprobs=None,
messages=None)
"""
```
GYM环境训练可以视作一种特殊的多轮训练,区别在于使用GYM环境训练,奖励信息通过环境直接获取。
在 `rollout` 命令中使用参数 `use_gym_env` 来指定使用gym作为训练的环境接口。我们提供了兼容GYM环境的多轮规划器参考实现,见[内置多轮调度器实现](https://github.com/modelscope/ms-swift/blob/main/swift/rollout/multi_turn.py)中的 GymScheduler 类
```bash
CUDA_VISIBLE_DEVICES=0 \
swift rollout \
--model xxx \
--use_gym_env true \
--multi_turn_scheduler gym_scheduler \
--max_turns xxx
```
> 注意
**环境选择**
1. 在数据集中需要提供 [env_config](#注意事项) 列中的name键指定, 初始化相关的参数放在其他键中
2. 使用参数 `--gym_env env_name` 指定
## 最佳实践
- [训练脚本](../../../../../examples/train/grpo/external/vllm_gym.sh)
通过参数`external_plugins`, 我们可以将本地的`Env`和`ContextManager`注册进 ms-swift 中,具体实现参考[代码](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/plugin/plugin.py)
## 注意事项
1. 参考训练数据格式
```json
{"messages": [{"role": "system", "content": "你是个有用无害的助手"}, {"role": "user", "content": "告诉我明天的天气"}],"env_config":{"name":"custom_env","other_config":"xxxx"},"ctx_config":{"name":"custom_ctx","other_config":"xxxx"}}
```
2. 默认仅对最后一轮response进行训练,如果gym涉及到多轮response生成,使用参数`--loss_scale default`对所有轮次的response进行训练,具体参考[文档](./multi_turn.md#损失掩码)
3. 数据流程
整个gym数据流程如下:
4. 奖励日志
由于gym的奖励是在step函数内计算完成,所以需要手动通过`info`返回日志,最终的记录会放在completions.jsonl中的`trajectory_infos`字段.
================================================
FILE: docs/source/Instruction/GRPO/DeveloperGuide/index.rst
================================================
Developer Guide
===============
.. toctree::
:maxdepth: 1
loss_types.md
multi_turn.md
multi_task.md
reward_function.md
reward_model.md
gym_env.md
================================================
FILE: docs/source/Instruction/GRPO/DeveloperGuide/loss_types.md
================================================
# Loss Types
GRPO训练支持多种不同的loss类型,主要区别在于归一化的维度和梯度处理方式上有所不同。
## 损失函数
token 级别上,GRPO 训练使用以下损失函数
$$\mathcal{L}_{i,t} = -\min\left(\rho_{i,t} A_{i,t}, \text{clip}(\rho_{i,t}, 1-\epsilon, 1+\epsilon) A_{i,t}\right)$$
当设置`loss_type cispo`时,使用 cispo 损失
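$$\mathcal{L}_{i,t}^{\text{CISPO}} = -\text{detach}\left(\min(\rho_{i,t}, \epsilon_{\text{high}})\right) \cdot A_{i,t} \cdot \log \pi_\theta(y_{i,t}|y_{i,<t})$$
================================================
FILE: docs/source/Instruction/GRPO/DeveloperGuide/multi_turn.md
================================================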
多轮规划器主要承担两大核心功能:
- **终止条件判断**:通过 `check_finished` 方法判断当前轮次推理是否应该结束
- **推理请求构造**:通过 `step` 方法构建下一轮推理的请求对象
抽象基类 `MultiTurnScheduler` 的核心方法如下:
```python
class MultiTurnScheduler(ABC):
def __init__(self, max_turns: Optional[int] = None, *args, **kwargs):
self.max_turns = max_turns
def step(self, infer_request: 'RolloutInferRequest', response_choice: 'ChatCompletionResponseChoice',
current_turn: int) -> Dict:
"""
处理对话轮次之间的转换。
Args:
infer_request: 当前推理请求
response_choice: 当前轮次的响应
current_turn: 当前轮次数
Returns:
Dict[str, Any]: 包含推理结果的字典,结构如下:
- infer_request (必需): 下一轮的推理请求对象
- response_token_ids (可选): 每个 rollout 轮次的响应 token IDs
- response_loss_mask (可选): 每个 rollout 轮次响应的损失掩码
- rollout_logprobs (可选): 每个 rollout 轮次的响应对应的 logps
- rollout_infos (可选): 额外信息数据
"""
raise NotImplementedError
def check_finished(self, infer_request: 'RolloutInferRequest', response_choice: 'ChatCompletionResponseChoice',
current_turn: int) -> bool:
"""
检查多轮 rollout 是否应该结束的默认终止逻辑。
默认终止条件:
1. 当响应达到长度限制时 (finish_reason == 'length')
2. 当对话达到最大轮数时 (如果设置了 max_turns)
Args:
infer_request: 推理请求对象
response_choice: 包含生成结果的响应选择,包括 finish_reason
current_turn: 当前对话轮数
Returns:
bool: True 表示终止对话,False 表示继续
"""
if response_choice.finish_reason == 'length':
return True
if self.max_turns and current_turn >= self.max_turns:
return True
return False
```
`step` 和 `check_finished` 方法接收的参数说明:
- **infer_request**: 当前的推理请求
- **response_choice**: 当前轮次的推理结果
- **current_turn**: 当前推理轮次(从 1 开始)
入参示例(点击展开)
```python
infer_request
"""
RolloutInferRequest(
messages=[
{'role': 'system', 'content': 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>\n'}, {'role': 'user', 'content': 'What is the value of $\\sqrt{36 \\times \\sqrt{16}}$?'},
{'role': 'assistant', 'content': 'To find the value of \\(\\sqrt{36 \\times \\sqrt{16}}\\), we will break down the problem step-by-step.\n\nFirst, we need to evaluate the inner square root:\n\\[\n\\sqrt{16}\n\\]\nWe know that:\n\\[\n4^2 = 16 \\implies \\sqrt{16} = 4\n\\]\n\nNext, we substitute this result back into the original expression:\n\\[\n\\sqrt{36 \\times \\sqrt{16}} = \\sqrt{36 \\times 4}\n\\]\n\nNow, we need to evaluate the product inside the square root:\n\\[\n36 \\times 4 = 144\n\\]\n\nSo, the expression simplifies to:\n\\[\n\\sqrt{144}\n\\]\n\nFinally, we determine the square root of 144:\n\\[\n\\sqrt{144} = 12\n\\]\n\nThus, the value of \\(\\sqrt{36 \\times \\sqrt{16}}\\) is:\n\\[\n\\boxed{12}\n\\]'}
],
images=[],
audios=[],
videos=[],
tools=None,
objects={},
data_dict={
'problem': 'What is the value of $\\sqrt{36 \\times \\sqrt{16}}$?',
'solution': "To solve the problem, we need to evaluate the expression \\(\\sqrt{36 \\times \\sqrt{16}}\\).\n\nWe can break down the steps as follows:\n\n1. Evaluate the inner square root: \\(\\sqrt{16}\\).\n2. Multiply the result by 36.\n3. Take the square root of the product obtained in step 2.\n\nLet's compute this step by step using Python code for accuracy.\n```python\nimport math\n\n# Step 1: Evaluate the inner square root\ninner_sqrt = math.sqrt(16)\n\n# Step 2: Multiply the result by 36\nproduct = 36 * inner_sqrt\n\n# Step 3: Take the square root of the product\nfinal_result = math.sqrt(product)\nprint(final_result)\n```\n```output\n12.0\n```\nThe value of \\(\\sqrt{36 \\times \\sqrt{16}}\\) is /\\(\\boxed{12}\\)."
}
)
"""
response_choice
"""
ChatCompletionResponseChoice(
index=0,
message=ChatMessage(
role='assistant',
content='To find the value of \\(\\sqrt{36 \\times \\sqrt{16}}\\), we will break down the problem step-by-step.\n\nFirst, we need to evaluate the inner square root:\n\\[\n\\sqrt{16}\n\\]\nWe know that:\n\\[\n4^2 = 16 \\implies \\sqrt{16} = 4\n\\]\n\nNext, we substitute this result back into the original expression:\n\\[\n\\sqrt{36 \\times \\sqrt{16}} = \\sqrt{36 \\times 4}\n\\]\n\nNow, we need to evaluate the product inside the square root:\n\\[\n36 \\times 4 = 144\n\\]\n\nSo, the expression simplifies to:\n\\[\n\\sqrt{144}\n\\]\n\nFinally, we determine the square root of 144:\n\\[\n\\sqrt{144} = 12\n\\]\n\nThus, the value of \\(\\sqrt{36 \\times \\sqrt{16}}\\) is:\n\\[\n\\boxed{12}\n\\]', tool_calls=None),
finish_reason='stop',
logprobs=None,
messages=None)
"""
# response_choice.messages will be copied at the end of multi-turn inference.
```
默认的 `check_finished` 逻辑会在以下两种情况下停止推理:
- 模型回复被截断,即超出了 `max_completion_length`
- 模型推理轮数超出了限制的最大轮数
完整的默认多轮 rollout 逻辑请参考该类的 `run` 方法,我们也可以通过重载`run` 方法来实现自定义多轮逻辑。
## 设置多轮训练参数
在 swift rollout 命令中,设置 multi_turn_scheduler 参数指定规划器
```bash
swift rollout \
--model Qwen/Qwen3-1.7B \
--use_async_engine true \
--multi_turn_scheduler thinking_tips_scheduler \
--vllm_max_model_len 32768 \
--vllm_gpu_memory_utilization 0.8 \
--max_turns 3
```
> 通过参数 `external_plugins`,我们可以将本地的多轮规划器注册到 ms-swift 中,具体实现请参考[代码](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/plugin/plugin.py)。
多轮训练脚本请参考[脚本](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/external/vllm_multi_turn.sh)。
对于多轮 rollout,我们使用 AsyncEngine 来实现高效的批量数据异步多轮采样。AsyncEngine 在多轮推理时能够减少推理过程中的计算气泡:
在 `rollout` 命令中使用参数 `use_async_engine` 来指定 engine 的种类(默认使用 async engine):
> 注意: async engine 以及下面的自定义多轮交互逻辑 目前仅支持 server mode,对于 colocate mode 下的多轮交互逻辑,请参考 RolloutTrainerMixin 的 _colocate_multi_turn_infer 方法
## 高级设置
### 自定义多轮交互逻辑
在以上默认逻辑中,我们用一条轨迹来计算多轮 rollout 的损失,这里需要假设多轮交互的过程中,模型的历史信息没有受到改变。
而在一些多轮场景中,我们可能需要在多轮 rollout 过程中动态地修改模型的历史信息(比如压缩历史信息),此时,我们需要将每轮的 rollout 单独作为一条轨迹进行训练。
比较常见的一种场景是对于思考类模型,在实际推理过程中,模型通常只会保留最后一轮的思考内容,而忽略历史模型回复中的思考内容。
对于这类场景,我们需要重写多轮规划器中的交互逻辑,即重载 `run` 方法,从而单独返回每一轮的 Rollout 的结果。
框架内置的 `ThinkingModelTipsScheduler` 类展示了如何通过重写 `run()` 方法来实现完全自定义的多轮推理逻辑。请参考[内置多轮调度器实现](https://github.com/modelscope/ms-swift/blob/main/swift/rollout/multi_turn.py)
**注意**: 这种情况下,相同轨迹的数据会拆分为多条数据,在奖励相关的处理中,需要对相同轨迹的数据分配同样的reward。
可以在kwargs中获取 trajectory_inputs 获取完整轨迹的数据,具体实现参考[MultiTurnThinkingTips类](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/plugin/plugin.py)
### 多模态数据修改
在多模态多轮交互场景下,可能需要在对话过程中动态增删或修改多模态数据,并确保这些变更同步至 trainer。
实现方式:借助 rollout_infos,通过指定键值覆盖原始数据集的多模态内容。
现已支持覆盖的键:images、audios、videos。
具体请参考[DeepEyes Scheduler](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/plugin/deepeyes/deepeyes_plugin.py#L403-L404)
### 返回 response token ids
在默认的多轮交互流程中,规划器先把模型生成的文本字符串返回给 trainer,trainer 再将其重新 encode 为 token id,用于后续训练。为了避免这一步重复编码的开销,你可以让规划器直接返回 response_token_ids,省去 trainer 侧的再次 encode。
具体做法如下:
- 在 response_choice 对象中读取 token_ids 属性,即可获得本次 rollout 生成的 token 序列。
- 在 step/run 方法的返回值里加入 response_token_ids,trainer 便能直接使用这些 token id 参与训练,无需重新编码。
具体实现可以参考[ThinkingModelTipsScheduler](https://github.com/modelscope/ms-swift/blob/main/swift/rollout/multi_turn.py)类
### 损失掩码
在工具调用或环境交互返回结果时,若需将返回内容作为模型响应的一部分,建议对这些插入内容进行掩码处理,以确保模型在训练过程中不会对这些外部生成的内容计算损失。
我们可以通过两种方式设置损失掩码
**第一种:设置 loss_scale**
ms-swift 提供 loss_scale 参数来对模型回复部分的内容进行损失缩放设置。比如设置`--loss_scale last_round`,可以将非最后一轮的模型回复的损失置零。我们也可以实现自定义 loss_scale,具体请参考[定制化 loss_scale 文档](../../../Customization/Architecture.md#loss-scale)。
> 注:在GRPO中,loss_scale 只提供掩码功能,不提供缩放功能。
**第二种:设置loss_mask**
在`step`或者`run`方法中设置 response_loss_mask, 可以在规划器中自定义损失掩码。前提需要返回response token ids,返回的 response_loss_mask 需要与 response token ids等长。当返回 response_loss_mask 时,loss_scale 参数失效。
response_loss_mask 返回可以参考[ToolCallScheduler类](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/plugin/plugin.py)
### 奖励函数相关
在奖励函数中获取多轮 Rollout 中的信息
在`step`或者`run`方法中,返回 `rollout_infos` 对象,在奖励函数的 kwargs 中获取 `rollout_infos`:
```python
class Scheduler():
def step(self, infer_request: 'RolloutInferRequest', response_choice: 'ChatCompletionResponseChoice',
current_turn: int) -> Dict:
...
return {'infer_request': infer_request, 'rollout_infos': extra_dict}
class RewardFunction():
def __call__(self, completions, **kwargs):
infos = kwargs.get('rollout_infos', {})
...
```
### 在 Scheduler 中获取额外的数据集信息
在训练侧设置参数`--vllm_server_pass_dataset`,可将数据集中的其他列传入多轮规划器。在`infer_request.data_dict`中获取。
### 训推一致性兼容
swift >= 3.11 支持从 vLLM 侧返回 rollout 的 logps 用于纠正训推不一致问题,具体请参考该[文档](../AdvancedResearch/training_inference_mismatch.md)
在多轮训练中,如果启用了 `rollout_importance_sampling_mode`,框架会自动收集每轮 rollout 的 log probabilities,用于校正训推不一致带来的 off-policy 问题。
**默认行为**:
- 使用默认的 `run` 方法时,框架会自动从 `response_choice.logprobs` 中提取 log probabilities
- 这些 logprobs 会与 `response_token_ids` 和 `response_loss_mask` 一起传递给 trainer
**自定义 Scheduler 的注意事项**:
如果你在 `step` 方法中修改了 response(如截断、添加内容),需要同步返回对应的 `rollout_logprobs`
**关键规则**:
- `rollout_logprobs` 的长度应该等于 `response_loss_mask` 中值为 1 的数量
- 对于 `loss_mask=0` 的 token(如用户添加的提示、工具返回结果),不需要提供 logprobs
- 如果 `step` 方法没有返回 `rollout_logprobs`,框架会自动从 `response_choice.logprobs` 中提取
**重写 `run` 方法的场景**:
如果你完全重写了 `run` 方法,需要手动收集和传递 `rollout_logprobs`
具体的实现请参考[内置实现](https://github.com/modelscope/ms-swift/blob/main/swift/rollout/multi_turn.py)
================================================
FILE: docs/source/Instruction/GRPO/DeveloperGuide/reward_function.md
================================================
# 奖励函数
## 自定义奖励函数
奖励函数接受模型生成的文本 completions、数据集中的其他列以及训练器状态作为参数(kwargs)进行打分,其中[训练器状态](https://huggingface.co/docs/transformers/main/main_classes/callback#transformers.TrainerState)包含训练的步数等信息。
注意:模型输入相关的列(比如query,response)会被处理为 messages 键,原数据集中的 assistant response 会被舍弃,请使用额外的列进行保留。
相关处理的列名参考[文档](../../../Customization/Custom-dataset.md#query-response格式)
以下是一个示例,展示了如何实现一个简单的长度奖励函数。该函数会在模型生成的文本长度超过 1024 时,给予 1.0 的奖励信号;否则,奖励信号为 0.0。
```python
from swift.rewards import ORM, orms
class DummyLengthRewardFunction(ORM):
    def __call__(self, completions, **kwargs):
        return [1.0 if len(completion) > 1024 else 0.0 for completion in completions]
orms['dummy']= DummyLengthRewardFunction
```
**获取数据集中的其他列**
比如奖励函数需要获取数据集`solution`列、当前训练步数和总步数作为辅助计算,以下是两种获取方式
第一种:在__call__入参中显式定义列名
```python
def __call__(self, completions, solution, trainer_state, **kwargs):
print(solution)
global_step = trainer_state.global_step
max_steps = trainer_state.max_steps
...
```
第二种:在kwargs中获取
```python
def __call__(self, completions, **kwargs):
solution = kwargs.get('solution')
trainer_state = kwargs.get('trainer_state')
global_step = trainer_state.global_step
max_steps = trainer_state.max_steps
...
```
**使用自定义奖励函数**
可以在`swift/examples/train/grpo/plugin/plugin.py`中加入该奖励函数,使用参数`--external_plugins examples/train/grpo/plugin/plugin.py`进行注册,并通过 reward_funcs 参数进行指定
执行脚本参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin/run_external_reward_func.sh)
## 异步奖励函数
**版本依赖**:ms-swift>=3.12.1
对于涉及 I/O 操作的奖励函数(如 API 调用、数据库查询等),可以使用异步(async)奖励函数来提高性能。异步奖励函数使用 `asyncio.gather` 并行执行,可以显著加速奖励计算。
```python
from swift.rewards import AsyncORM, orms
import asyncio
class AsyncAPIReward(AsyncORM):
async def __call__(self, completions, **kwargs):
import aiohttp
async def score_single(session, text):
async with session.post(
'https://api.example.com/score',
json={'text': text}
) as resp:
result = await resp.json()
return result['score']
async with aiohttp.ClientSession() as session:
# 使用 asyncio.gather 并行发送所有请求
tasks = [score_single(session, c) for c in completions]
rewards = await asyncio.gather(*tasks)
return list(rewards)
orms['async_api'] = AsyncAPIReward
```
swift 支持同时使用同步和异步奖励函数。训练器会自动检测奖励函数的类型:
- 同步奖励函数按顺序执行
- 异步奖励函数使用 `asyncio.gather` 并行执行
[plugin](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/plugin/plugin.py)文件中提供了一个调用`swift deploy`服务的生成式奖励模型的例子(async_genrm)
## 内置奖励函数
swift内置了五种基于规则的奖励函数(代码见swift/rewards/orm.py)
| 奖励函数 | 论文 |
|----------------|----------------------------------------------------------------------------|
| accuracy | [DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via RL](https://arxiv.org/abs/2501.12948) |
| format | 同上 |
| cosine | [Demystifying Long Chain-of-Thought Reasoning in LLMs](https://arxiv.org/abs/2502.03373) |
| repetition | 同上 |
| soft_overlong | [Decoupled Clip and Dynamic sAmpling Policy Optimization (DAPO)](https://arxiv.org/abs/2503.14476) |
### 1. **accuracy**
该函数将模型的生成结果与数据集中的 solution 列进行比较,计算准确率分数。如果生成结果与标准答案一致,则得分为 1.0;否则为 0.0。
注意:该奖励函数使用`math_verify`库解析生成结果和solution中的答案,可能只适用于特定的数学数据集。
### 2. **format**
论文中使用以下system prompt要求模型按照固定格式进行返回
```
A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>
```
该函数检查模型是否按照 `<think>think content</think><answer>answer content</answer>` 的格式进行生成。如果生成文本符合格式要求,则得分为 1.0;否则为 0.0。
### 3. **cosine**
论文发现,仅使用 accuracy 奖励函数进行训练会导致模型的生成长度趋于超长,从而影响训练效果。cosine 奖励函数通过控制模型的生成长度来优化训练过程:
- 对于生成正确答案的文本,奖励值随长度增加而递减,鼓励模型生成简洁的回答。
- 对于生成错误答案的文本,奖励值随长度增加而递增,鼓励模型进行更深入的思考。
使用余弦函数平滑地调整奖励值,确保奖励变化在合理范围内。余弦函数的参数包括生成文本的长度、最大长度限制以及奖励的最小值和最大值。
参数
- cosine_min_len_value_wrong(默认值:-0.5):生成错误答案时,最小长度对应的奖励值。
- cosine_max_len_value_wrong(默认值:0.0):生成错误答案时,最大长度对应的奖励值。
- cosine_min_len_value_correct(默认值:1.0):生成正确答案时,最小长度对应的奖励值。
- cosine_max_len_value_correct(默认值:0.5):生成正确答案时,最大长度对应的奖励值。
- cosine_max_len(默认值等于模型生成的最大长度):生成文本的最大长度限制。
### 4. **repetition**
惩罚模型生成文本中的重复内容,通过检测生成文本中的重复 n-gram 模式来评估重复程度,并给予相应的惩罚。
函数将生成文本分割为单词,并提取指定大小的 n-gram(默认为 3-gram)。通过统计不同 n-gram 的数量与总 n-gram 数量的比例,计算重复比例。如果生成文本中重复的 n-gram 比例较高,则给予较大的负奖励(惩罚)。惩罚值通过重复比例和最大惩罚值(默认为 -1.0)计算得出。
参数
- repetition_n_grams(默认值:3):用于检测重复的 n-gram 大小。
- repetition_max_penalty(默认值:-1.0):最大惩罚值,用于控制惩罚的强度。
### 5. **soft overlong punishment**
定义长度惩罚区间。在这个区间内,给予[-1,0]的线性惩罚。
参数
- soft_max_length: 论文中的L_max,模型的最大生成长度,默认等于max_completion_length
- soft_cache_length: 论文中的L_cache,控制长度惩罚区间,区间为[soft_max_length-soft_cache_length, soft_max_length]
## 注意事项
如果需要在奖励函数中加载模型,默认会使用训练的 deepspeed 插件(transformers逻辑),zero3下可能会导致模型无法正常推理,参考该[issue](https://github.com/modelscope/ms-swift/issues/4580)来跳过 deepspeed 初始化环境
================================================
FILE: docs/source/Instruction/GRPO/DeveloperGuide/reward_model.md
================================================
# 奖励模型
默认情况下,奖励模型是指具有分类头数值输出的模型,通常称为结果奖励模型(Outcome Reward Model,ORM)。这些模型会对其他模型的输出进行评分,从而生成一个标量值,表示模型响应的质量。
我们可以通过使用参数 `reward_models` 来加载具有分类头的奖励模型,或者加载经过[奖励建模](../../RLHF.md#rm)训练的奖励模型,进而使用模型的logits作为奖励。
## 自定义奖励模型
对于生成式奖励模型,有两种常见的调用方式:一种是在 Trainer 内部直接使用 reward_model_plugin 定义奖励模型的逻辑,可以使用TransformersEngine对奖励模型进行推理,另一种是通过外部部署的模型服务进行调用。
- 使用 reward_model_plugin 调用奖励模型时,模型会被内嵌在 Trainer 内部,无需额外占用计算资源。该方式优点是方便集成,但生成速度相对较慢,更适合参数量较小的奖励模型场景。
- 外部部署奖励模型时,可以通过诸如 swift deploy 或 vllm serve 等命令将模型服务部署于独立设备,大幅提升推理速度,适合参数量较大的模型。但这样需要预留额外的硬件资源。
### 内部插件
我们可以在 reward_model_plugin 中灵活地自定义奖励模型的处理逻辑。这使得实现诸如生成式奖励模型等技术成为可能,包括:
- 自定义模型的系统提示:定义特定的指令和上下文以指导评估过程。
- 处理模型交互历史:管理对话上下文,以提供有意义且具有上下文感知的评估。
- 定义自定义评估标准:设置独特的标准和度量,用于评估模型的响应,超越默认的准确性和相关性衡量标准。
通过reward_model_plugin,开发者可以针对其应用的特定需求定制奖励评估过程。这种灵活性允许更细致和有效的基于奖励的训练策略。
奖励模型通过plugin的`__call__`方法进行调用,该方法接受 `inputs` 作为参数,包含了模型输入输出的 messages 和数据集中的其他列
```python
def __call__(self, inputs):
    print(inputs)
    """
    [
        {
            'messages': [
                {'role': 'system', 'content': 'system prompt'},
                {'role': 'user', 'content': 'query'},
                {'role': 'assistant', 'content': 'completions1'},
            ],
            'solution': "abc",
        },
        {
            'messages': [
                {'role': 'system', 'content': 'system prompt'},
                {'role': 'user', 'content': 'query'},
                {'role': 'assistant', 'content': 'completions2'},
            ],
            'solution': "abc",
        }
    ]
    """
```
在插件中使用 TransformersEngine 进行奖励模型的推理, 我们只需构造 messages ,并通过 infer 接口调用:
```python
class RMPlugin(DefaultRMPlugin):

    def __init__(self, model, template):
        super().__init__(model, template)
        # initialize TransformersEngine to infer
        self.engine = TransformersEngine(self.model, template=self.template, max_batch_size=0)

    def __call__(self, inputs):
        system_prompt = ...
        query = ...
        messages = [{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': query}]
        result = self.engine.infer([messages], self.request_config, use_tqdm=False)
        rewards = ...
        return rewards
```
我们在 [rm_plugin.py](https://github.com/modelscope/ms-swift/blob/main/swift/rewards/rm_plugin.py) 中提供了一个简单的生成式奖励模型示例(GenRMPlugin)。
在 [plugin.py](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/plugin/plugin.py) 中自定义奖励模型插件,并使用 `external_plugins` 参数进行注册。
注意:
1. 在 GRPOTrainer 中,reward_model 会依次append到 reward_funcs 中。因此,reward_weights 的顺序对应 [reward_funcs, reward_model]。
2. reward_model_plugin 默认为 default,即使用 ORM 处理逻辑。
3. 对于参数量较大的模型,TransformersEngine 生成速度较慢,请使用[外部部署](#外部部署)方法
对于 BERT 这类无法通过 reward_model 加载的模型,我们可以内置在 reward_function 中进行加载,参考[issue](https://github.com/modelscope/ms-swift/issues/4580)
### 外部部署
**示例 2:使用 swift deploy 部署奖励模型并进行远程调用**
这类方法则不需要使用 reward_model_plugin , 而是直接在奖励函数中进行调用即可
首先用如下命令启动模型服务:
```bash
# 注意部署的设备不要与训练设备重叠
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift deploy \
--model Qwen/Qwen2.5-72B-Instruct \
--vllm_tensor_parallel_size 4
# [INFO:swift] model_list: ['Qwen2.5-72B-Instruct']
# INFO: Started server process [xxxxxx]
# INFO: Waiting for application startup.
# INFO: Application startup complete.
# INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
```
在奖励函数中通过 OpenAI 库初始化客户端,指定模型服务的地址和端口,示例代码如下:
```python
from typing import List

from openai import OpenAI

from swift.rewards import ORM
class RMReward(ORM):
def __init__(self):
super().__init__()
try:
self.client = OpenAI(
api_key='EMPTY',
base_url='http://127.0.0.1:8000/v1', # 若在本地部署则为 127.0.0.1
)
self.verify_model_name = self.client.models.list().data[0].id
except Exception as e:
raise RuntimeError('Failed to connect to the model service. Please deploy the model '
"using 'swift deploy' or 'vllm serve'.") from e
def __call__(self, completions, messages, **kwargs) -> List[float]:
rewards = []
for completion, message in zip(completions, messages):
rm_prompt = ... # 构建 reward model 的prompt
chat_response = self.client.chat.completions.create(
model=self.verify_model_name,
messages=[
{
'role': 'system',
'content': 'You are a helpful assistant.'
},
{
'role': 'user',
'content': rm_prompt
},
],
)
response = chat_response.choices[0].message.content.strip()
reward = ... # 根据奖励模型生成结果提取奖励值
rewards.append(reward)
return rewards
```
================================================
FILE: docs/source/Instruction/GRPO/GetStarted/GRPO.md
================================================
# GRPO
GRPOTrainer在ms-swift3.5进行了代码重构,如果你使用的swift版本<3.5, 请参考[stable文档](https://github.com/modelscope/ms-swift/blob/v3.4.1/docs/source/Instruction/GRPO.md)
[GRPO(Group Relative Policy Optimization)](https://arxiv.org/abs/2402.03300) 算法利用组内相对优势计算来替代 PPO 算法中独立的价值模型,并直接在损失函数中加入 KL 散度惩罚来提高训练稳定性。
## 算法原理
GRPO 目标函数
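$
{\scriptstyle
\begin{aligned}
\mathcal{J}_{G R P O}(\theta) & =\mathbb{E}_{\left[q \sim P(Q),\left\{o_i\right\}_{i=1}^G \sim \pi_{\theta_{o l d}}(O \mid q)\right]} \\
& \frac{1}{G} \sum_{i=1}^G \frac{1}{\left|o_i\right|} \sum_{t=1}^{\left|o_i\right|}\left\{\min \left[\frac{\pi_\theta\left(o_{i, t} \mid q, o_{i, < t}\right)}{\pi_{\theta_{o l d}}\left(o_{i, t} \mid q, o_{i, < t}\right)} \hat{A}_{i, t}, \operatorname{clip}\left(\frac{\pi_\theta\left(o_{i, t} \mid q, o_{i, < t}\right)}{\pi_{\theta_{o l d}}\left(o_{i, t} \mid q, o_{i, < t}\right)}, 1-\epsilon, 1+\epsilon\right) \hat{A}_{i, t}\right]-\beta \mathbb{D}_{K L}\left[\pi_\theta \| \pi_{r e f}\right]\right\}
\end{aligned}
}
$

其中优势函数定义为

$
\hat{A}_{i, t}=\frac{R_i-\operatorname{mean}\left(\left\{R_j\right\}_{j=1}^G\right)}{\operatorname{std}\left(\left\{R_j\right\}_{j=1}^G\right)}
$

GRPO算法伪代码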
```python
# ========== 1. Rollout Generation Phase ==========
prompt = "Question: Which is bigger? 9.11 or 9.9?"
# Generate multiple completions through parallel sampling
completions = rollout_function(
model=current_policy_model,
prompt=prompt,
num_generations=8, # Hyperparameter: number of samples per prompt
temperature=1.0 # Hyperparameter: sampling diversity
)
"""
completions = [
    (completion 1) "The larger number is 9.9...",
    (completion 2) "9.11 is bigger than...",
    ...
    (completion 8) "After calculation, 9.9..."
]
"""
# ========== 2. Reward Calculation Phase ==========
# Evaluate generated completions using reward model
rewards = reward_function(
    completions=completions,
    ground_truth="9.9" # Expected correct answer
)
"""
rewards = [
(reward 1) 1.0, # Correct answer
(reward 2) 0.0, # Incorrect
...
(reward 8) 1.0 # Correct
]
"""
# Normalize rewards to advantages
rewards_mean = mean(rewards) # μ = 0.5
rewards_std = std(rewards) # σ = 0.5 (binary rewards with mean 0.5)
advantages = (rewards - rewards_mean) / (rewards_std + 1e-8) # Standardization
"""
advantages = [
    (advantage 1) 1.0, # (1.0 - 0.5)/0.5
    (advantage 2) -1.0,
    ...
    (advantage 8) 1.0
]
"""
# ========== 3. Policy Optimization Phase ==========
# Get token-level log probabilities from different models
current_logps = get_per_token_logps(current_policy_model, prompt, completions) # π_θ
old_logps = get_per_token_logps(old_policy_model, prompt, completions) # π_θ_old
ref_logps = get_per_token_logps(reference_model, prompt, completions) # π_ref
# PPO Clipped Objective
is_ratio = exp(current_logps - old_logps) # Importance sampling ratio: π_θ / π_θ_old = e^(logπ_θ - logπ_θ_old)
clipped_ratio = clip(is_ratio, 1-ε, 1+ε) # ε=0.2 typically
# Policy gradient term (dual form)
policy_loss = -mean(
minimum(is_ratio * advantages, # Unclipped objective
clipped_ratio * advantages) # Clipped objective
)
# KL Divergence Penalty (K3 estimator)
# KL(π_θ||π_ref) ≈ e^(logπ_ref - logπ_θ) - (logπ_ref - logπ_θ) - 1
kl_penalty = beta * mean(
exp(ref_logps - current_logps) -
(ref_logps - current_logps) - 1
)
# Total Loss = Policy Loss + KL Penalty
total_loss = policy_loss + kl_penalty
# ========== 4. Update Rule ==========
# Apply gradient descent to minimize total_loss
optimizer.zero_grad()
total_loss.backward()
optimizer.step()
```
训练脚本示例参考[examples](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo)
GRPO参数参考[文档](../../../Instruction/Command-line-parameters.md#grpo参数)
## 集群支持

GRPO 训练框架支持集成高性能推理引擎(如 vLLM)来加速采样过程,提供以下两种部署模式:
### 1. Colocate(Internal) Mode
训练与推理共享GPU资源,在 Trainer 内部启动推理服务。
启动参数
```bash
--use_vllm true \
--vllm_mode colocate
```
#### Colocate 模式下的显存优化方案
在 Colocate 模式下运行时,容易出现显存不足(OOM)的情况。以下是几种有效的显存优化方法和参数配置:
1. 降低`vllm_gpu_memory_utilization` 参数
2. 在训练阶段,释放 vLLM 占用的显存:
```bash
--sleep_level 1
```
3. 在vLLM 推理阶段,释放模型和优化器占用的显存:
```bash
--offload_optimizer true \
--offload_model true \
```
4. 在vLLM中使用 Tensor Parallel 技术:
```bash
--vllm_tensor_parallel_size [tp_size]
```
5. 分批 Gather 模型权重(zero3下同步 vLLM 权重时):
```bash
--move_model_batches [批次数量]
```
6. 将 Megatron 导出的用于 vLLM 更新的 HF 格式权重存放在 CPU 主存中,以降低 GPU 显存占用:
```bash
--offload_bridge true
```
### 2. Async(External) Mode
训练与推理资源分离,启动单独的推理服务器
使用`swift rollout`命令部署vLLM 服务器, 现仅支持vLLM backend
```bash
CUDA_VISIBLE_DEVICES=0 \
swift rollout \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--vllm_tensor_parallel_size 1 \
--vllm_data_parallel_size 1
CUDA_VISIBLE_DEVICES=0,1 \
swift rollout \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--vllm_tensor_parallel_size 2 \
--vllm_data_parallel_size 1
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift rollout \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--vllm_tensor_parallel_size 2 \
--vllm_data_parallel_size 2
```
更多 rollout 参数参考[vLLM参数](../../../Instruction/Command-line-parameters.md#vllm参数)和[rollout 参数](../../../Instruction/Command-line-parameters.md#rollout参数)
注意:在使用 use_async_engine 时,仅开启 DP 可能会导致错误,相关问题参考: [vllm issue](https://github.com/vllm-project/vllm/issues/18567)。如果出现错误,请尝试同时启用 TP 和 DP,或升级vLLM
训练使用以下参数配置外部 vLLM 服务器
```bash
--use_vllm true \
--vllm_mode server \
--vllm_server_host <服务器IP> \
--vllm_server_port <服务端口> \
--vllm_server_timeout <超时时间> \
```
#### 权重同步加速
swift 3.10 优化了权重同步,设置以下参数可以进一步优化 LoRA 训练的权重同步速度。
```bash
# rollout(server mode)
swift rollout \
--vllm_enable_lora true \
--vllm_max_lora_rank xxx # 与训练脚本lora_rank一致
...
# grpo(colocate mode)
swift rlhf \
--rlhf_type grpo \
--vllm_mode colocate \
--vllm_enable_lora true \
...
```
注意:以下情况无法使用该优化:
- 训练多模态模型的ViT层(freeze_vit false)
- MoE 模型
优化实现细节请参考该[PR](https://github.com/modelscope/ms-swift/pull/5773)
## logged metrics
- completions/mean_length:生成的 completion 的平均长度。
- completions/min_length:生成的 completion 的最小长度。
- completions/max_length:生成的 completion 的最大长度。
- completions/clipped_ratio:被长度截断的 completion 占比。
- reward/{reward_func_name}/mean:某个特定 reward function 的平均奖励值。
- reward/{reward_func_name}/std:某个特定 reward function 的奖励标准差。
> 注意, 上述两个指标是在所有 completions 范围内统计得到的。
- reward:加权 reward_weights 后的整体平均奖励。
- reward_std:加权 reward_weights 后,每个 batch 内整体奖励的标准差。
> 注意:上述两个指标是先在每个组内分别计算均值/std,然后再对各组的结果取平均。
- frac_reward_zero_std:在生成 batch 中,reward 标准差为零的样本比例,意味着该 prompt 上的答案几乎无多样性(所有回答奖励一致)。
- kl:生成的 completion 上,模型与参考模型之间的平均 KL 散度。仅当 beta 非零时记录。
- clip_ratio/region_mean:不同句子中被 CLIP 的 token 平均比例
- clip_ratio/low_mean:不同句子中被下界 CLIP 的 token 平均比例
- clip_ratio/low_min:不同句子中被下界 CLIP 的 token 最小比例
- clip_ratio/high_mean:不同句子中被上界 CLIP 的 token 平均比例
- clip_ratio/high_max:不同句子中被上界 CLIP 的 token 最大比例
> 注意:如果开启`overlong_filter`, kl 和 clip_ratio 指标会过滤超长的样本
如果设置了`log_entropy`参数,则会额外记录entropy相关指标,包括
- entropy/mean: 不同句子中的 entropy 均值
- entropy/max: 不同句子中的 entropy 最大值
- entropy/min: 不同句子中的 entropy 最小值
> 注意这里的 句子 entropy 指 completion 中的 token entropy 均值
如果设置了`top_entropy_quantile`参数<1.0, 则会记录entropy threshold的值
- entropy/threshold: 分位点处的 entropy 值,小于该值的 token 将不会被计算 loss
训推一致性指标,前缀为rollout_correction (ms-swift>=3.11),需设置`log_rollout_offpolicy_metrics=true`或`rollout_importance_sampling_mode`:
- `kl` / `k3_kl`:训练策略与 rollout 策略之间的 KL 散度(直接估计器 / K3 估计器)
- `training_ppl` / `rollout_ppl`:训练策略和 rollout 策略的困惑度
- `log_ppl_diff`:log PPL 差异,反映分布偏移程度
- `ppl_ratio`:PPL 比率
- `chi2_token` / `chi2_seq`:Token/Sequence 级别的 χ² 散度
IS 校正指标(需设置`rollout_importance_sampling_mode`):
- `is_weight_mean`:平均重要性采样权重
- `ess`:有效样本大小(Effective Sample Size)
- `clipped_frac`:被截断或屏蔽的样本比例
> 训推一致性指标详细说明请参考文档 [Training-Inference-Mismatch](../AdvancedResearch/training_inference_mismatch.md)
如果设置了`log_completions`, 将保存训练动态在output对应文件夹中,包括
- step:记录时的训练步数
- prompt:模型输入
- completion:模型采样回答
- {reward_func_name}:特定奖励
- entropy:entropy token 均值,在设置`log_entropy`时记录
设置 `report_to wandb/swanlab` 将训练动态Table推送到对应的平台
如果需要在Table中额外记录其他列,请在 `GRPOTrainer._generate_and_score_completions` 方法中,设置 metrics_to_gather 字典。
默认自动检测
- `image`:视觉数据集图像输入。(暂时只支持wandb)
- `solution`:数据集中的 solution 列。
## FAQ
**1. 训练过程中 loss 等于0 / 接近0 / 小于0**
正常情况, 参考[issue](https://github.com/huggingface/open-r1/issues/239#issuecomment-2646297851)
**2. num_generations / 批量大小相关**
在 GRPO 中,batch_size 以 completion(模型生成结果) 为单位。例如,设置 per_device_train_batch_size=8 表示每张 GPU 在训练过程中会同时处理 8 个 completion 的 loss 计算。
训练阶段,在一次完整的梯度累计 batch 中,总的批量大小等于:
```
effective_batch_size = num_processes * per_device_train_batch_size * gradient_accumulation_steps
```
采样阶段,总的批量大小 (completion-level) 数量等于:
1. 设置 generation_batch_size 下,等于 generation_batch_size
2. 设置 steps_per_generation 下,等于 per_device_train_batch_size * steps_per_generation * num_processes
3. 默认情况下,steps_per_generation = gradient_accumulation_steps,generation_batch_size = per_device_train_batch_size * steps_per_generation * num_processes = per_device_train_batch_size * gradient_accumulation_steps * num_processes = effective_batch_size
在评估阶段,completion 的数量等于:
```
num_processes * per_device_eval_batch_size
```
以上采样阶段和评估阶段的总批量大小必须能够被参数 `num_generations` 整除,以保证生成任务可以均匀分配到各个设备上。
**示例**
- num_processes = 8
- per_device_train_batch_size = 4
- gradient_accumulation_steps = 8
- generation_batch_size = 512
- num_generations = 64
1. 采样需要的总数据(prompt)量等于 512 / 64 = 8
2. 每次采样 512 条模型回复
3. 每次更新模型权重的批量大小为 8 * 4 * 8 = 256(num_processes * per_device_train_batch_size * gradient_accumulation_steps)
**3. 为什么 KL 出现了NaN**
开启 overlong_filter 后,某一卡上的所有 completion 都被截断
**4. 训练的steps怎么计算?**
参考[issue](https://github.com/modelscope/ms-swift/issues/3912)
**5. clip_ratio为什么总是0?**
Clip机制的核心目的是限制策略更新的幅度,防止因单次更新过大而导致策略性能崩溃(即策略更新后表现急剧下降)。
Clip操作的具体公式如下:
$
L_{\text{CLIP}}(\theta) = \mathbb{E}_{t} \left[ \min\left(r_{t}(\theta) \hat{A}_{t}, \text{clip}(r_{t}(\theta), 1 - \epsilon, 1 + \epsilon) \hat{A}_{t} \right) \right]
$
其中:$r_{t}(\theta) = \frac{\pi_{\theta}(a_{t} \mid s_{t})}{\pi_{\text{old}}(a_{t} \mid s_{t})}$ 是重要性采样比,衡量新旧策略的差异。$\hat{A}_{t}$ 是优势函数(advantage function),表示动作的相对收益。$\epsilon$ 用于限制 $r_{t}(\theta)$ 的偏离范围。
在 on-policy 训练过程中,由于每次更新都使用最新策略生成的数据,新旧策略相同,即 $\pi_{\theta} = \pi_{\text{old}}$
因此重要性采样比恒为 1,此时,clip 操作不会生效。
在设置以下参数情况下,算法为off-policy (near-on-policy)
1. num_iterations > 1, 或者
2. gradient_accumulation_steps % steps_per_generation != 0
参考[issue](https://github.com/huggingface/open-r1/issues/239#issuecomment-2646297851)
**6. 为什么没有设置val_dataset,仍然有验证过程,如何取消**
当没有显式传入`val_dataset`时,参数`split_dataset_ratio`负责切分部分`dataset`为验证数据集,默认切分1%数据(在"ms-swift>=3.6"中,`split_dataset_ratio`的默认值将从0.01修改为0.)
通过设置`--split_dataset_ratio 0` 来取消验证过程
**7. 如何设置训练的 `mini-batch size`**
在 GRPO 训练中,我们可以通过以下两种方式配置 mini-batch 更新:
- 设置 `generation_batch_size` 为训练 global batch size (effective_batch_size) 的整数倍
- 或设置 `steps_per_generation` 为 `gradient_accumulation_steps` 的整数倍
典型配置示例:
- 当配置 steps_per_generation = 16、gradient_accumulation_steps = 8 时,mini-batch 的数量为 steps_per_generation / gradient_accumulation_steps = 2,即 1 次 rollout 的结果将拆分成 2 批 mini-batch 进行更新。
**8. swift deploy 与 swift rollout 的区别**
- swift deploy 主要用于模型的部署和推理,支持 PT、vLLM、SGLang 等多种引擎,兼容流式推理与 OpenAI API 的调用格式。
- swift rollout 则专注于 GRPO 推理加速,目前仅支持 vLLM 引擎,并内置了权重自动同步的功能。
**9. 如何取消 KL 项损失**
将参数设置为 `--beta 0`,即可关闭 KL 损失的计算,并且不会加载参考模型(ref model)。
## RL微信群
================================================
FILE: docs/source/Instruction/GRPO/GetStarted/index.rst
================================================
Get Started
===============
.. toctree::
:maxdepth: 1
GRPO.md
================================================
FILE: docs/source/Instruction/GRPO/index.rst
================================================
GRPO
===============
.. toctree::
:maxdepth: 2
:caption: Get Started
GetStarted/index.rst
.. toctree::
:maxdepth: 2
:caption: Developer Guide
DeveloperGuide/index.rst
.. toctree::
:maxdepth: 2
:caption: Advanced Research
AdvancedResearch/index.rst
================================================
FILE: docs/source/Instruction/Inference-and-deployment.md
================================================
# 推理和部署
以下为swift支持的推理引擎以及接入部分的相应能力,三种推理加速引擎为SWIFT的推理、部署、评测模块提供推理加速:
| 推理加速引擎 | OpenAI API | 多模态 | 量化模型 | 多LoRA | QLoRA | Batch推理 | 并行技术 |
| ------------ | -------------- | ---------- | ------ | -------- | ------ | ----- | ----- |
| transformers | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/deploy/client/llm/chat/openai_client.py) | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/app/mllm.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_lora.py) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/infer/transformers/batch_ddp.sh) |DDP/device_map |
| [vllm](https://github.com/vllm-project/vllm) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/infer/vllm/mllm_tp.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/deploy/lora/server.sh) | ❌ | ✅ | TP/PP/DP |
| [sglang](https://github.com/sgl-project/sglang) | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | TP/PP/DP/EP |
| [lmdeploy](https://github.com/InternLM/lmdeploy) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/infer/lmdeploy/mllm_tp.sh) | ✅ | ❌ | ❌ | ✅ | TP/DP |
## 推理
ms-swift使用了分层式的设计思想,用户可以使用命令行界面、Web-UI界面和直接使用Python的方式进行推理。
如果要查看LoRA微调后模型的推理,可以参考[预训练与微调文档](./Pre-training-and-Fine-tuning.md#推理微调后模型)。
### 使用CLI
**全参数模型:**
```shell
CUDA_VISIBLE_DEVICES=0 swift infer \
--model Qwen/Qwen2.5-7B-Instruct \
--stream true \
--infer_backend transformers \
--max_new_tokens 2048
```
**LoRA模型:**
```shell
CUDA_VISIBLE_DEVICES=0 swift infer \
--model Qwen/Qwen2.5-7B-Instruct \
--adapters swift/test_lora \
--stream true \
--infer_backend transformers \
--temperature 0 \
--max_new_tokens 2048
```
**命令行推理指令**
以上为交互式命令行界面推理,脚本运行后仅需在terminal中输入query即可。你也可以输入以下特殊指令:
- `multi-line`: 切换到多行模式,在输入中支持换行输入,以`#`代表输入结束。
- `single-line`: 切换到单行模式,以换行代表输入结束。
- `reset-system`: 重置system并清空历史记录。
- `clear`: 清除历史记录。
- `quit` or `exit`: 退出对话。
**多模态模型**
```shell
CUDA_VISIBLE_DEVICES=0 \
MAX_PIXELS=1003520 \
VIDEO_MAX_PIXELS=50176 \
FPS_MAX_FRAMES=12 \
swift infer \
--model Qwen/Qwen2.5-VL-3B-Instruct \
--stream true \
--infer_backend transformers \
--max_new_tokens 2048
```
如果要进行多模态模型的推理,可以在query中添加`<image>/<video>/<audio>`等标签(代表图像表征在`inputs_embeds`中的位置),例如输入`<image><image>这两张图有啥区别`,`<video>描述这段视频`。然后根据提示输入相应的图像/视频/音频即可。
以下为一个推理的示例:
```
<<< <image><image>这两张图有什么区别
Input an image path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png
Input an image path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png
这两张图片的区别在于它们所展示的动物和场景。
1. **第一张图片**:
- 展示了一只小猫。
- 小猫有大大的眼睛,表情显得有些困惑或好奇。
- 背景是模糊的,可能是室内环境。
2. **第二张图片**:
- 展示了一群羊。
- 羊们站在草地上,背景是绿色的山丘和蓝天白云。
- 羊的表情看起来很平静,似乎在享受大自然的环境。
总结来说,第一张图片是一只小猫,而第二张图片是一群羊。
--------------------------------------------------
<<< clear
<<< <video>描述这段视频
Input a video path or URL <<< https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4
The video shows a baby wearing sunglasses sitting on a bed and reading a book. The baby is holding the book with both hands and appears to be focused on the pages. The baby's feet are visible in the frame, and they are moving slightly as they read. The background of the video shows a room with a bed and some furniture.
```
**数据集推理:**
```shell
CUDA_VISIBLE_DEVICES=0 swift infer \
--model Qwen/Qwen2.5-7B-Instruct \
--stream true \
--infer_backend transformers \
--val_dataset AI-ModelScope/alpaca-gpt4-data-zh \
--max_new_tokens 2048
```
以上提供了全参数和LoRA流式推理的例子,以下介绍更多SWIFT中的推理技术:
- 界面推理:你可以将`swift infer`改成`swift app`。
- batch推理:`infer_backend=transformers`可以指定`--max_batch_size`对大模型和多模态大模型进行batch推理,具体参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/infer/transformers/batch_ddp.sh)。在进行batch推理时,你不能设置`--stream true`。
- DDP/device_map推理:`infer_backend=transformers`支持使用DDP/device_map技术进行并行推理,具体参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/infer/transformers/mllm_device_map.sh)。
- 推理加速:swift支持使用vllm/sglang/lmdeploy对推理、部署和评测模块进行推理加速,只需要额外指定`--infer_backend vllm/sglang/lmdeploy`即可。可以参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/infer/vllm/mllm_ddp.sh)。
- 多模态模型:我们提供了[transformers](https://github.com/modelscope/ms-swift/blob/main/examples/infer/transformers/mllm_device_map.sh)/[vllm](https://github.com/modelscope/ms-swift/blob/main/examples/infer/vllm/mllm_tp.sh)/[lmdeploy](https://github.com/modelscope/ms-swift/blob/main/examples/infer/lmdeploy/mllm_tp.sh)对多模态模型进行多GPU推理的shell脚本。
- 量化模型:直接选择GPTQ、AWQ、BNB量化的模型,例如:`--model Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4`即可。
- 更多模型类型:我们提供了[bert](https://github.com/modelscope/ms-swift/blob/main/examples/infer/transformers/bert.sh)、[reward_model](https://github.com/modelscope/ms-swift/blob/main/examples/infer/transformers/reward_model.sh)、[prm](https://github.com/modelscope/ms-swift/blob/main/examples/infer/transformers/prm.sh)的推理脚本。
**小贴士:**
- SWIFT会将推理结果保存起来,你可以通过`--result_path`指定保存路径。
- 如果要输出logprobs,只需要在推理时指定`--logprobs true`即可,SWIFT会将其一并保存。注意,设置`--stream true`时将不会存储。
- infer_backend为'transformers'支持所有swift已支持模型的推理,而infer_backend为vllm/sglang/lmdeploy只支持部分模型,具体请参考[vllm](https://docs.vllm.ai/en/latest/models/supported_models.html)、[sglang](https://docs.sglang.ai/supported_models/generative_models.html)、[lmdeploy](https://lmdeploy.readthedocs.io/en/latest/supported_models/supported_models.html)文档。
- 使用`--infer_backend vllm`出现OOM,可以通过降低`--vllm_max_model_len`,`--vllm_max_num_seqs`,选择合适的`--vllm_gpu_memory_utilization`,设置`--vllm_enforce_eager true`。或者使用tensor并行`--vllm_tensor_parallel_size`来解决。
- 使用`--infer_backend vllm`推理多模态模型时,如果需要传入多张图片,可以设置`--vllm_limit_mm_per_prompt`解决,例如:`--vllm_limit_mm_per_prompt '{"image": 10, "video": 5}'`。
- 推理qwen2-vl/qwen2.5-vl出现OOM,可以通过设置`MAX_PIXELS`、`VIDEO_MAX_PIXELS`、`FPS_MAX_FRAMES`解决,可以参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/app/mllm.sh)。
- swift内置对话模板与使用transformers运行的对话模板对齐,测试参考[这里](https://github.com/modelscope/ms-swift/blob/main/tests/test_align/test_template/test_vision.py)。如果出现未对齐情况,欢迎提issue/PR修正。
### 使用Web-UI
如果你要使用界面的方式进行推理,可以查看[Web-UI文档](../GetStarted/Web-UI.md)。
### 使用Python
文本模型:
```python
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from swift.infer_engine import TransformersEngine, RequestConfig, InferRequest
model = 'Qwen/Qwen2.5-0.5B-Instruct'
# 加载推理引擎
engine = TransformersEngine(model, max_batch_size=2)
request_config = RequestConfig(max_tokens=512, temperature=0)
# 这里使用了2个infer_request来展示batch推理
infer_requests = [
InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}]),
InferRequest(messages=[{'role': 'user', 'content': '浙江的省会在哪?'},
{'role': 'assistant', 'content': '浙江省的省会是杭州。'},
{'role': 'user', 'content': '这里有什么好玩的地方'},]),
]
resp_list = engine.infer(infer_requests, request_config)
query0 = infer_requests[0].messages[0]['content']
print(f'response0: {resp_list[0].choices[0].message.content}')
print(f'response1: {resp_list[1].choices[0].message.content}')
```
多模态模型:
```python
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['MAX_PIXELS'] = '1003520'
os.environ['VIDEO_MAX_PIXELS'] = '50176'
os.environ['FPS_MAX_FRAMES'] = '12'
from swift.infer_engine import TransformersEngine, RequestConfig, InferRequest
model = 'Qwen/Qwen2.5-VL-3B-Instruct'
# 加载推理引擎
engine = TransformersEngine(model, max_batch_size=2)
request_config = RequestConfig(max_tokens=512, temperature=0)
# 这里使用了3个infer_request来展示batch推理
infer_requests = [
InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}]),
InferRequest(messages=[{'role': 'user', 'content': '两张图的区别是什么?'}],
images=['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png',
'http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png']),
InferRequest(messages=[{'role': 'user', 'content': 'describe the video'}],
videos=['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4']),
]
resp_list = engine.infer(infer_requests, request_config)
query0 = infer_requests[0].messages[0]['content']
print(f'response0: {resp_list[0].choices[0].message.content}')
print(f'response1: {resp_list[1].choices[0].message.content}')
print(f'response2: {resp_list[2].choices[0].message.content}')
```
我们也提供了更多使用python推理的demo:
- 使用流式推理以及`VllmEngine`、`SglangEngine`、`LmdeployEngine`进行推理加速,可以参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo.py)。
- 多模态推理:除了上述多模态输入格式外,swift兼容OpenAI的多模态输入格式,参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_mllm.py)。
- grounding任务:对多模态模型进行Grounding任务画框,可以参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_grounding.py)。
- 多LoRA推理:参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_lora.py)。
- agent推理:参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_agent.py)。
- 异步接口:使用`engine.infer_async`进行python方式推理,参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo.py)。
## 部署
如果要查看LoRA微调后模型的部署,可以参考[预训练与微调文档](./Pre-training-and-Fine-tuning.md#部署微调后模型)。
这里主要介绍对多模态模型的部署和调用,文本大模型我们仅给出部署和调用的简单案例:
服务端部署:
```shell
CUDA_VISIBLE_DEVICES=0 swift deploy \
--model Qwen/Qwen2.5-7B-Instruct \
--infer_backend vllm \
--max_new_tokens 2048 \
--served_model_name Qwen2.5-7B-Instruct
```
客户端调用测试:
```shell
curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen2.5-7B-Instruct",
"messages": [{"role": "user", "content": "晚上睡不着觉怎么办?"}],
"max_tokens": 256,
"temperature": 0
}'
```
### 服务端
```shell
# test env: pip install transformers==4.51.3 vllm==0.8.5.post1
CUDA_VISIBLE_DEVICES=0 \
MAX_PIXELS=1003520 \
VIDEO_MAX_PIXELS=50176 \
FPS_MAX_FRAMES=12 \
swift deploy \
--model Qwen/Qwen2.5-VL-3B-Instruct \
--infer_backend vllm \
--vllm_gpu_memory_utilization 0.9 \
--vllm_max_model_len 8192 \
--max_new_tokens 2048 \
--vllm_limit_mm_per_prompt '{"image": 5, "video": 2}' \
--served_model_name Qwen2.5-VL-3B-Instruct
```
### 客户端
这里介绍3种调用客户端的方式,分别是curl、openai库和swift客户端。
方案一: curl
```shell
curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen2.5-VL-3B-Instruct",
"messages": [{"role": "user", "content": [
{"type": "image", "image": "http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png"},
{"type": "image", "image": "http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png"},
{"type": "text", "text": "两张图的区别是什么?"}
]}],
"max_tokens": 256,
"temperature": 0
}'
```
方案二: openai库
```python
from openai import OpenAI
client = OpenAI(
api_key='EMPTY',
base_url=f'http://127.0.0.1:8000/v1',
)
model = client.models.list().data[0].id
print(f'model: {model}')
messages = [{'role': 'user', 'content': [
{'type': 'video', 'video': 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4'},
{'type': 'text', 'text': 'describe the video'}
]}]
resp = client.chat.completions.create(model=model, messages=messages, max_tokens=512, temperature=0)
query = messages[0]['content']
response = resp.choices[0].message.content
print(f'query: {query}')
print(f'response: {response}')
# base64
import base64
import requests
resp = requests.get('https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4')
base64_encoded = base64.b64encode(resp.content).decode('utf-8')
messages = [{'role': 'user', 'content': [
{'type': 'video', 'video': f'data:video/mp4;base64,{base64_encoded}'},
{'type': 'text', 'text': 'describe the video'}
]}]
gen = client.chat.completions.create(model=model, messages=messages, stream=True, temperature=0)
print(f'query: {query}\nresponse: ', end='')
for chunk in gen:
if chunk is None:
continue
print(chunk.choices[0].delta.content, end='', flush=True)
print()
```
方案三:swift客户端
```python
from swift import InferRequest, InferClient, RequestConfig, InferStats
engine = InferClient(host='127.0.0.1', port=8000)
print(f'models: {engine.models}')
metric = InferStats()
request_config = RequestConfig(max_tokens=512, temperature=0)
# 这里使用了3个infer_request来展示batch推理
# 支持传入本地路径、base64和url
infer_requests = [
InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}]),
InferRequest(messages=[{'role': 'user', 'content': '两张图的区别是什么?'}],
images=['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png',
'http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png']),
InferRequest(messages=[{'role': 'user', 'content': 'describe the video'}],
videos=['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4']),
]
resp_list = engine.infer(infer_requests, request_config, metrics=[metric])
print(f'response0: {resp_list[0].choices[0].message.content}')
print(f'response1: {resp_list[1].choices[0].message.content}')
print(f'response2: {resp_list[2].choices[0].message.content}')
print(metric.compute())
metric.reset()
# base64
import base64
import requests
resp = requests.get('https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4')
base64_encoded = base64.b64encode(resp.content).decode('utf-8')
messages = [{'role': 'user', 'content': [
{'type': 'video', 'video': f'data:video/mp4;base64,{base64_encoded}'},
{'type': 'text', 'text': 'describe the video'}
]}]
infer_request = InferRequest(messages=messages)
request_config = RequestConfig(max_tokens=512, temperature=0, stream=True)
gen_list = engine.infer([infer_request], request_config, metrics=[metric])
print(f'response0: ', end='')
for chunk in gen_list[0]:
if chunk is None:
continue
print(chunk.choices[0].delta.content, end='', flush=True)
print()
print(metric.compute())
```
我们也提供了更多部署的demo:
- 多LoRA部署与调用: 参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/deploy/lora)。
- Base模型的部署与调用: 参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/deploy/client/llm/base)。
- 更多模型类型: 我们提供了[bert](https://github.com/modelscope/ms-swift/tree/main/examples/deploy/bert)、[reward_model](https://github.com/modelscope/ms-swift/tree/main/examples/deploy/reward_model)的部署脚本。
================================================
FILE: docs/source/Instruction/Pre-training-and-Fine-tuning.md
================================================
# 预训练与微调
训练能力:
| 方法 | 全参数 | LoRA | QLoRA | Deepspeed | 多机 | 多模态 |
| ------ | ------ |---------------------------------------------------------------------------------------------| ----- | ------ | ------ |----------------------------------------------------------------------------------------------|
| [预训练](https://github.com/modelscope/ms-swift/blob/main/examples/train/pretrain) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [指令监督微调](https://github.com/modelscope/ms-swift/blob/main/examples/train/lora_sft.sh) | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/full/train.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/qlora) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-gpu/deepspeed) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-node) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal) |
| [GRPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [GKD](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/gkd) | ✅ | ✅ | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/gkd) |
| [PPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/ppo) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| [DPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/dpo) | ✅ | ✅ | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/dpo) |
| [KTO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/kto.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/kto.sh) |
| [奖励模型](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/rm.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [CPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/cpo.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [SimPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/simpo.sh) | ✅ | ✅ | ✅ | ✅| ✅ | ✅ |
| [ORPO](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/orpo.sh) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Embedding](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Reranker](https://github.com/modelscope/ms-swift/tree/main/examples/train/reranker) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [序列分类](https://github.com/modelscope/ms-swift/blob/main/examples/train/seq_cls) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
## 环境准备
推荐的第三方库版本参考[SWIFT安装文档](../GetStarted/SWIFT-installation.md)
```bash
pip install ms-swift -U
# 若使用deepspeed zero2/zero3
pip install deepspeed -U
```
## 预训练
预训练使用`swift pt`命令,这将自动使用生成式而非对话式的template,即将`use_chat_template`设置为False(其他所有的命令,例如`swift sft/rlhf/infer`,都默认将`use_chat_template`设置为True)。此外,`swift pt`与`swift sft`相比,具有不同的数据集格式,可以参考[自定义数据集文档](../Customization/Custom-dataset.md)。
使用CLI进行预训练的脚本可以参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/pretrain/train.sh)。更多训练技术的介绍可以参考微调章节。
小贴士:
- `swift pt`与`swift sft --use_chat_template false --loss_scale all`等价。
## 微调
ms-swift使用了分层式的设计思想,用户可以使用命令行界面、Web-UI界面和直接使用Python的方式进行微调。
### 使用CLI
我们提供了10分钟在单卡3090上对Qwen2.5-7B-Instruct进行自我认知微调的最佳实践,具体参考[这里](../GetStarted/Quick-start.md),这可以帮助您快速了解SWIFT。
此外,我们给出了一系列脚本帮助您了解SWIFT的训练能力:
- 轻量化训练:SWIFT支持的轻量微调示例可以参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/tuners)。(注意:这些方式预训练也可以使用,但预训练通常使用全参数训练)。
- 分布式训练:SWIFT支持的分布式训练技术包括:DDP、device_map、DeepSpeed ZeRO2/ZeRO3、FSDP。
- device_map: 简易模型并行。如果存在多GPU,device_map会自动开启。这会将模型按层均匀地划分到可见的GPU中,显著降低显存消耗,但是训练速度通常会降低,因为是串行的。
- DDP+device_map:将按组对模型进行device_map划分,参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/multi-gpu/ddp_device_map/train.sh)。
- DeepSpeed ZeRO2/ZeRO3: 节约显存资源,但训练速度下降。ZeRO2将对优化器状态、模型梯度进行分片。ZeRO3在ZeRO2基础上,对模型参数进行分片,更加节约显存,但训练速度更慢。参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-gpu/deepspeed)。
- FSDP+QLoRA: 双卡3090运行70B模型的训练,参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-gpu/fsdp_qlora/train.sh)。
- 多机多卡训练: 我们书写了使用swift、torchrun、dlc、deepspeed、accelerate启动多节点运行的shell脚本示例。除了dlc和deepspeed,其他启动脚本都需要在所有节点中启动才可运行。具体参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/multi-node)。
- 量化训练:支持使用GPTQ、AWQ、AQLM、BNB、HQQ、EETQ量化技术的QLoRA训练。微调7B模型只需要9GB显存资源。具体参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/qlora)。
- 多模态训练:SWIFT支持多模态模型的预训练、微调和RLHF。支持Caption、VQA、OCR、[Grounding](https://github.com/modelscope/ms-swift/blob/main/examples/notebook/qwen2_5-vl-grounding/zh.ipynb)任务。支持图像、视频和音频三种模态。具体参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal)。多模态自定义数据集格式参考[自定义数据集文档](../Customization/Custom-dataset.md)。
- 对ViT/Aligner使用全参数训练,LLM使用LoRA训练,并采用不同学习率的例子参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal/lora_llm_full_vit)。
- 多模态模型packing,增加训练速度,例子参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/packing)。
- RLHF训练:参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf)。多模态模型参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal/rlhf)。GRPO训练参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/internal)。强化微调查看[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/rft)。
- Megatron训练:支持使用Megatron的并行技术来加速大模型的训练,包括数据并行、张量并行、流水线并行、序列并行,上下文并行。参考[Megatron-SWIFT训练文档](../Megatron-SWIFT/Quick-start.md)。
- 序列分类模型训练:参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/seq_cls)。
- Embedding模型训练:参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/embedding)。
- Agent训练:参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/agent)。
- Any-to-Any模型训练:参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/all_to_all)。
- 其他能力:
- 数据流式读取: 在数据量较大时减少内存使用。参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/streaming/streaming.sh)。
- packing: 将多个序列拼成一个,让每个训练样本尽可能接近max_length,提高显卡利用率,参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/packing)。
- 长文本训练: 参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/sequence_parallel)。
- lazy tokenize: 在训练期间对数据进行tokenize而不是在训练前tokenize(多模态模型可以避免在训练前读入所有多模态资源),这可以避免预处理等待并节约内存。参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/streaming/lazy_tokenize.sh)。
小贴士:
- 在使用`swift sft`通过LoRA技术微调base模型为chat模型时,有时需要手动设置模板。通过添加`--template default`参数来避免base模型因未见过对话模板中的特殊字符而无法正常停止的情况。具体参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/base_to_chat)。
- 如果需要在**断网**环境下进行训练,请设置`--model <model_dir>`和`--check_model false`。如果对应的模型需要`git clone` GitHub的仓库,例如`deepseek-ai/Janus-Pro-7B`,请手动下载该仓库,并设置`--local_repo_path <path>`。具体参数含义请参考[命令行参数文档](Command-line-parameters.md)。
- 无法对QLoRA训练的模型进行Merge LoRA,因而无法在推理和部署时使用vLLM/Sglang/LMDeploy进行推理加速,所以不建议使用QLoRA进行微调。建议使用LoRA/全参数进行微调,合并为完整权重后再使用GPTQ/AWQ/BNB进行[量化](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize)。
- 如果使用NPU进行训练,只需要将shell中的`CUDA_VISIBLE_DEVICES`修改为`ASCEND_RT_VISIBLE_DEVICES`。
- SWIFT默认在训练时设置`--gradient_checkpointing true`来节约显存,这会略微降低训练速度。
- 若使用DDP进行训练,出现报错:`RuntimeError: Expected to mark a variable ready only once.`,请额外设置参数`--gradient_checkpointing_kwargs '{"use_reentrant": false}'`或者使用DeepSpeed进行训练。
- 如果要使用deepspeed,你需要安装deepspeed:`pip install deepspeed -U`。使用deepspeed可以节约显存,但会略微降低训练速度。
- 如果您的机器是A100等高性能显卡,且模型支持flash-attn,推荐你安装[flash-attn](https://github.com/Dao-AILab/flash-attention/releases),并设置`--attn_impl flash_attn`,这将会加快训练和推理的速度并略微降低显存占用。
**如何debug:**
你可以使用以下方式进行debug,这与使用命令行微调是等价的,但此方式不支持分布式。微调命令行运行入口可以查看[这里](https://github.com/modelscope/ms-swift/blob/main/swift/cli/sft.py)。
```python
from swift import sft_main, SftArguments
result = sft_main(SftArguments(
model='Qwen/Qwen2.5-7B-Instruct',
tuner_type='lora',
dataset=['AI-ModelScope/alpaca-gpt4-data-zh#500',
'AI-ModelScope/alpaca-gpt4-data-en#500',
'swift/self-cognition#500'],
torch_dtype='bfloat16',
# ...
))
```
### 使用Web-UI
如果你要使用界面的方式进行训练,可以查看[Web-UI文档](../GetStarted/Web-UI.md)。
### 使用python
- Qwen2.5自我认知微调notebook查看[这里](https://github.com/modelscope/ms-swift/blob/main/examples/notebook/qwen2_5-self-cognition/self-cognition-sft.ipynb)。
- Qwen2VL进行OCR任务notebook查看[这里](https://github.com/modelscope/ms-swift/blob/main/examples/notebook/qwen2vl-ocr/ocr-sft.ipynb)。
## Merge LoRA
- 查看[这里](https://github.com/modelscope/ms-swift/blob/main/examples/export/merge_lora.sh)。
## 推理(微调后模型)
使用CLI对LoRA训练的checkpoint进行推理:
```shell
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--adapters output/vx-xxx/checkpoint-xxx \
--infer_backend transformers \
--stream true \
--temperature 0 \
--max_new_tokens 2048
```
- adapters文件夹中包含了训练的参数文件`args.json`,因此不需要额外指定`--model`,`--system`,swift会自动读取这些参数。如果要关闭此行为,可以设置`--load_args false`。
- 如果使用全参数训练,请使用`--model`替代`--adapters`指定训练的checkpoint目录。更多参考[推理和部署文档](./Inference-and-deployment.md#推理)。
- 你可以使用`swift app`替代`swift infer`进行界面推理。
- 你可以选择对LoRA进行merge(额外指定`--merge_lora true`),然后指定`--infer_backend vllm/sglang/lmdeploy`进行推理加速。
对数据集中的验证集进行批量推理:
```shell
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--adapters output/vx-xxx/checkpoint-xxx \
--infer_backend transformers \
--temperature 0 \
--max_new_tokens 2048 \
--load_data_args true \
--max_batch_size 1
```
- 你可以设置`--max_batch_size 8`,从而使用`--infer_backend transformers`进行批量处理。若使用`--infer_backend vllm/sglang/lmdeploy`则无需指定,会进行自动batch。
- `--load_data_args true`会额外读取训练存储参数文件`args.json`中的数据参数。
若想对额外的测试集进行推理,而不使用训练时的验证集,使用`--val_dataset <dataset-path>`进行推理:
```shell
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--adapters output/vx-xxx/checkpoint-xxx \
--infer_backend transformers \
--temperature 0 \
--max_new_tokens 2048 \
--val_dataset <dataset-path> \
--max_batch_size 1
```
使用Python对训练后LoRA推理的例子如下:
```python
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from swift.infer_engine import TransformersEngine, RequestConfig, InferRequest
from swift import get_model_processor, get_template
from swift.utils import safe_snapshot_download
from peft import PeftModel
# 请调整下面几行
model = 'Qwen/Qwen2.5-7B-Instruct'
lora_checkpoint = safe_snapshot_download('swift/test_lora') # 修改成checkpoint_dir
template_type = None # None: 使用对应模型默认的template_type
default_system = "You are a helpful assistant." # None: 使用对应模型默认的default_system
# 加载模型和对话模板
model, tokenizer = get_model_processor(model)
if lora_checkpoint is not None:
model = PeftModel.from_pretrained(model, lora_checkpoint)
template_type = template_type or model.model_meta.template
template = get_template(tokenizer, template_type=template_type, default_system=default_system)
engine = TransformersEngine(model, template=template, max_batch_size=2)
request_config = RequestConfig(max_tokens=512, temperature=0)
# 这里使用了2个infer_request来展示batch推理
infer_requests = [
InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}]),
InferRequest(messages=[{'role': 'user', 'content': '浙江的省会在哪?'},
{'role': 'assistant', 'content': '浙江的省会是杭州。'},
{'role': 'user', 'content': '这里有什么好吃的'},]),
]
resp_list = engine.infer(infer_requests, request_config)
query0 = infer_requests[0].messages[0]['content']
print(f'response0: {resp_list[0].choices[0].message.content}')
print(f'response1: {resp_list[1].choices[0].message.content}')
```
多模态模型的LoRA推理示例如下:
```python
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from swift.infer_engine import TransformersEngine, RequestConfig, InferRequest
from swift import get_model_processor, get_template
from swift.utils import safe_snapshot_download
from peft import PeftModel
# 请调整下面几行
model = 'Qwen/Qwen2.5-VL-7B-Instruct'
lora_checkpoint = safe_snapshot_download('swift/test_grounding') # 修改成checkpoint_dir
template_type = None # None: 使用对应模型默认的template_type
default_system = None # None: 使用对应模型默认的default_system
# 加载模型和对话模板
model, tokenizer = get_model_processor(model)
if lora_checkpoint is not None:
model = PeftModel.from_pretrained(model, lora_checkpoint)
template_type = template_type or model.model_meta.template
template = get_template(tokenizer, template_type=template_type, default_system=default_system)
engine = TransformersEngine(model, template=template, max_batch_size=2)
request_config = RequestConfig(max_tokens=512, temperature=0)
# 这里使用了2个infer_request来展示batch推理
infer_requests = [
InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}]),
InferRequest(messages=[{'role': 'user', 'content': 'Task: Object Detection'}],
images=['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png']),
]
resp_list = engine.infer(infer_requests, request_config)
query0 = infer_requests[0].messages[0]['content']
print(f'response0: {resp_list[0].choices[0].message.content}')
print(f'response1: {resp_list[1].choices[0].message.content}')
```
如果使用ms-swift训练的模型,可以通过以下方式获取训练的配置:
```python
from swift import safe_snapshot_download, BaseArguments
lora_adapters = safe_snapshot_download('swift/test_lora')
args = BaseArguments.from_pretrained(lora_adapters)
print(f'args.model: {args.model}')
print(f'args.model_type: {args.model_type}')
print(f'args.template_type: {args.template}')
print(f'args.default_system: {args.system}')
```
- 对全参数训练的checkpoint进行推理,将`model`设置为checkpoint_dir,并将lora_checkpoint设置为None即可。更多参考[推理和部署文档](./Inference-and-deployment.md#推理)。
- 使用流式推理以及`VllmEngine`、`SglangEngine`、`LmdeployEngine`进行推理加速,可以参考[大模型](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo.py)和[多模态大模型](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_mllm.py)推理示例。
- 微调后的模型使用huggingface transformers/peft生态推理,可以参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_hf.py)。
- 若训练了多个LoRA,要进行多LoRA切换,可以参考[推理](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_lora.py)、[部署](https://github.com/modelscope/ms-swift/tree/main/examples/deploy/lora)样例。
- 对多模态模型进行Grounding任务的画框,可以参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_grounding.py)。
- 对LoRA微调后的Bert进行推理,可以参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_bert.py)。
## 部署(微调后模型)
使用以下命令启动部署服务端。如果权重使用全参数训练,请使用`--model`替代`--adapters`指定训练的checkpoint目录。你可以参考[推理和部署文档](./Inference-and-deployment.md#部署)介绍的客户端调用方式:curl、openai库和swift客户端进行调用。
```shell
CUDA_VISIBLE_DEVICES=0 \
swift deploy \
--adapters output/vx-xxx/checkpoint-xxx \
--infer_backend transformers \
--temperature 0 \
--max_new_tokens 2048 \
--served_model_name '<model-name>'
```
这里将给出使用vLLM对多LoRA进行部署并调用的完整例子。
### 服务端
首先你需要安装vLLM:`pip install vllm -U`,并在部署时使用`--infer_backend vllm`,这通常可以显著加速推理速度。
我们预先训练了2个基模型为`Qwen/Qwen2.5-7B-Instruct`的不同自我认知LoRA增量权重(可以直接跑通),我们可以在[args.json](https://modelscope.cn/models/swift/test_lora/file/view/master)中找到相关信息。你只需要在部署时修改`--adapters`,指定训练好的LoRA权重本地路径即可。
```bash
CUDA_VISIBLE_DEVICES=0 \
swift deploy \
--adapters lora1=swift/test_lora lora2=swift/test_lora2 \
--infer_backend vllm \
--temperature 0 \
--max_new_tokens 2048
```
### 客户端
这里只介绍使用openai库进行调用。使用curl、swift客户端调用的例子可以参考[推理和部署文档](./Inference-and-deployment.md#部署)。
```python
from openai import OpenAI
client = OpenAI(
api_key='EMPTY',
base_url=f'http://127.0.0.1:8000/v1',
)
models = [model.id for model in client.models.list().data]
print(f'models: {models}')
query = 'who are you?'
messages = [{'role': 'user', 'content': query}]
resp = client.chat.completions.create(model=models[1], messages=messages, max_tokens=512, temperature=0)
query = messages[0]['content']
response = resp.choices[0].message.content
print(f'query: {query}')
print(f'response: {response}')
gen = client.chat.completions.create(model=models[2], messages=messages, stream=True, temperature=0)
print(f'query: {query}\nresponse: ', end='')
for chunk in gen:
if chunk is None:
continue
print(chunk.choices[0].delta.content, end='', flush=True)
print()
"""
models: ['Qwen2.5-7B-Instruct', 'lora1', 'lora2']
query: who are you?
response: I am an artificial intelligence model named swift-robot, developed by swift. I can answer your questions, provide information, and engage in conversation. If you have any inquiries or need assistance, feel free to ask me at any time.
query: who are you?
response: I am an artificial intelligence model named Xiao Huang, developed by ModelScope. I can answer your questions, provide information, and engage in conversation. If you have any inquiries or need assistance, feel free to ask me at any time.
"""
```
================================================
FILE: docs/source/Instruction/RLHF.md
================================================
# 人类对齐
本文档提供了各种人类偏好对齐算法的训练脚本。若您希望深入了解更详尽的算法信息及其选择方法,请参考[文档](https://github.com/modelscope/modelscope-classroom/blob/main/LLM-tutorial/M.%E4%BA%BA%E7%B1%BB%E5%81%8F%E5%A5%BD%E5%AF%B9%E9%BD%90%E8%AE%AD%E7%BB%83.md)
## 数据集
PPO与GRPO算法所需的数据仅为模型输入,也就是system prompt(可选)加上query。其中GRPO中的奖励函数可能需要额外的数据列,比如计算准确率需要`solution`列作为参考答案。
RM和DPO类算法如ORPO,CPO,SimPO,则需要 $(x,y_w,y_l)$ 格式的数据,其中 $x$ 表示模型输入,$y_w,y_l$ 分别表示符合人类偏好的偏好回答和不符合人类偏好的拒绝回答,比如
而KTO算法的数据比较特殊,只需要 $(x,y,\text{label})$ 格式的数据,其中 $x$ 表示模型输入,$y$ 表示模型输出,label表示回答是否符合人类偏好
比如
使用自定义数据集对文本模型或者多模态大模型进行RLHF训练可以参考[自定义数据集文档](../Customization/Custom-dataset.md#rlhf)。
## GRPO
[论文arXiv](https://arxiv.org/abs/2402.03300)
训练脚本参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo).
## DPO
[论文arXiv](https://arxiv.org/abs/2305.18290)
超参
- beta:KL正则系数,值越大表示对偏离参考模型的惩罚越强。默认为0.1。
- loss_type:不同DPO算法变种,可选值参考[文档](https://huggingface.co/docs/trl/main/en/dpo_trainer#loss-functions),默认为 'sigmoid'。
- (可选) loss_weights: 多种 loss 混合时的权重设置。
- (可选) ld_alpha: 来自[LD-DPO 论文](https://arxiv.org/abs/2409.06411),对超出公共前缀部分的logps加权 $\alpha$ 抑制长度偏好。
- (可选) discopop_tau: 来自 [DiscoPOP 论文](https://arxiv.org/abs/2406.08414)的温度参数 $\tau$ ,用于缩放 log-ratio。默认值0.05。在 loss_type 为 discopop 时生效。
建议在开始 DPO 训练前,先用偏好数据集中用户偏好答案部分进行一次 SFT 训练,以确保数据分布更符合 DPO 算法的要求。
如需同时混合多个 loss(如用于 [MPO](https://arxiv.org/abs/2411.10442) 训练),可指定多个 loss_type,并通过 loss_weights 设置各自权重。
通过设置超参数 `rpo_alpha`,可在 loss 中混合一定比例的 SFT loss,从而提升训练的稳定性。
训练脚本参考
- [DPO脚本](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf/dpo).
- [MPO脚本](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf/mpo.sh).
## RM
[论文arXiv](https://arxiv.org/abs/2203.02155)
RLHF中的Reward Modeling阶段
使用sft训练后的base model或者instruct model作为基底模型, 增加value head, 使用偏好数据集训练为reward model
增加的value head权重会保存在`value_head.safetensors` 或 `value_head.bin`文件中
RM损失函数如下
$$
\text{loss} = -\log \sigma \left( r^{(c)} - r^{(r)} - m \right) + \lambda \left( r^{(c)} + r^{(r)} \right)^2
$$
- $r^{(c)}$: 模型对 chosen response 的打分
- $r^{(r)}$: 模型对 rejected response 的打分
- $\lambda$: L2正则项系数,鼓励模型输出接近0,使用参数`center_rewards_coefficient`进行设置,来自[论文](https://arxiv.org/pdf/2307.09288), 默认为0
- $m$: margin项,鼓励模型根据不同难度的样本进行区分,需要数据集中提供`margin`列,默认为0,来自[论文](https://arxiv.org/pdf/2307.09288)
训练脚本参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf/rm.sh).
## PPO
[论文arXiv](https://arxiv.org/abs/2203.02155)
RLHF中的PPO(proximal policy optimization)阶段, 涉及到四个模型
- model: 训练模型, sft训练后的base model或者instruct model
- ref_model: 参考模型, 默认为 model
- reward_model: 奖励模型, 由RM阶段训练得到
- value_model: 价值模型, 由reward_model初始化, 在训练中同步更新
超参
- local_rollout_forward_batch_size: 每次数据采样的批量大小, 默认为64
- whiten_rewards: 对奖励进行归一化处理, 默认为False
- kl_coef: KL散度项的系数, 默认为0.05
- cliprange: PPO策略损失函数中的clip范围, 默认为0.2
- vf_coef: 价值损失函数系数, 默认为0.1
- cliprange_value: PPO价值损失函数中的clip范围, 默认为0.2
- gamma: 累计奖励的折扣因子, 默认为1.0
- lam: [GAE](https://arxiv.org/abs/1506.02438)中的lambda系数, 默认为0.95
- num_sample_generations: 训练过程中生成的调试样本数量, 默认为10
注意: 训练base model时, 需要先sft后再进行rlhf, 指定chat template, `tuner_type` 建议使用full
训练中的指标解释参考[文档](https://huggingface.co/docs/trl/ppov2_trainer#explanation-of-the-logged-metrics)
## KTO
[论文arXiv](https://arxiv.org/abs/2402.01306)
超参
- beta: KL正则系数,值越大表示对偏离参考模型的惩罚越大。默认为0.1
- desirable_weight :损失函数中的$\lambda_D$项,偏好回答样本的损失权重, 默认为1.0
- undesirable_weight :损失函数中的$\lambda_U$项,拒绝回答样本的损失权重,默认为1.0
用 $n_D$ 和 $n_U$ 分别表示数据集中偏好回答和拒绝回答的样本数量,对于超参 $\lambda_D$ 和 $\lambda_U$ ,作者推荐设置 $\frac{\lambda_Dn_D}{\lambda_Un_U}\in[1,\frac{4}{3}]$
训练脚本
使用 $(x,y,\text{label})$ 格式数据训练
训练脚本参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf/kto.sh).
## CPO
[论文arXiv](https://arxiv.org/abs/2401.08417)
超参
- beta:隐含奖励前的系数,默认为0.1
- cpo_alpha: nll loss系数, 默认为1.0
训练脚本参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf/cpo.sh).
## ORPO
[论文arXiv](https://arxiv.org/abs/2403.07691)
超参
- lambda: Odds Ratio loss系数
注意:ORPO使用参数`--beta`传入超参`lambda`
训练脚本参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf/orpo.sh).
## SimPO
[论文arXiv](https://arxiv.org/abs/2405.14734)
超参
- beta:隐含奖励前的系数,默认为2.0
- simpo_gamma:reward margin项,默认为1.0
- cpo_alpha: 混合CPO nll loss提高训练稳定性, 默认为1.0, 设置0.0使用原始SimPO算法
训练脚本参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf/simpo.sh).
================================================
FILE: docs/source/Instruction/Ray.md
================================================
# ray的支持
SWIFT已经支持使用ray来进行多卡或多节点训练。已有功能中对ray的支持情况如下:
| 功能 | 支持ray | 例子 | 可分配角色 |
|----------|-------|--------------------------------------------------------------------------------|-----------------|
| pt/sft | ✅ | https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-node/ray | default |
| dpo | ❌ | | |
| grpo | ❌ | | |
| ppo | ❌ | | |
| megatron | ❌ | | |
| sampling | ✅ | https://github.com/modelscope/ms-swift/tree/main/examples/sampler/sample | sampler/prm/orm |
| distill | ✅ | https://github.com/modelscope/ms-swift/tree/main/examples/sampler/distill | sampler/prm/orm |
## 技术细节
在叙述参数设置之前,我们有必要先行讲一下技术细节。由于SWIFT的内部当前使用了大量transformers和trl的已有实现,像veRL或ROLL一样拆解为不同的ray角色是不现实的,而且拆解后会以ray为中心,对非ray场景的支持会变差。
因此SWIFT采取了装饰器为主的技术方案,以函数级别定义了不同角色,这些角色可以在参数中被定义如何使用。看下面的例子:
```python
from swift.ray import RayHelper
@RayHelper.worker(group=['model1', 'model2'])
class MyTrainer:
def __init__(self, args):
self._prepare_model1()
self._prepare_model2()
self._prepare_datasets()
@RayHelper.function(group='model1')
def _prepare_model1(self):
...
@RayHelper.function(group='model2')
def _prepare_model2(self):
...
@RayHelper.function(group='model1')
def rollout(self, inputs):
return self.model1.generate(inputs)
@RayHelper.function(group='model2')
def forward_model2(self, inputs):
loss = self.model2.forward(inputs)
loss.backward()
def _prepare_datasets(self):
self.dataset = ...
def train(self):
for batch in DataLoader(self.dataset):
generated = self.rollout(batch)
self.forward_model2(generated)
...
if __name__ == '__main__':
...
MyTrainer(args).train()
```
RayHelper会将被装饰的方法分配到不同的硬件集群中,本地调用会被平滑转换到ray集群中进行远程调用。也可以以类为中心进行划分:
```python
@RayHelper.worker(group=['model1'])
class Model1:
...
@RayHelper.function(group='model1')
def rollout(self):
...
@RayHelper.worker(group=['model2'])
class Model2:
...
@RayHelper.function(group='model2')
def forward_and_optimize(self):
...
class Trainer:
...
```
SWIFT对ray的支持本质上是@worker和@function两个装饰器的组合使用:worker指定ray集群的角色,function指定如何分配数据。
function装饰器有额外的几个参数:
```python
@staticmethod
def function(group: str,
dispatch: Union[Literal['slice', 'all'], Callable] = 'all',
execute: Literal['first', 'all'] = 'all',
collect: Union[Literal['none', 'flatten'], Callable] = 'none'):
```
- dispatch: 如何分配调用入参
- slice:对入参切分,也就是worker负载均衡执行
- all:各个worker入参完全相同
- 自定义切分方式,格式为:
```python
def my_custom_slice(n, i, data):
# n是worker数量,i是当前worker索引,data是原始入参
# 返回第i个的入参
```
- execute: 如何执行
- first: rank0执行,此时slice和Callable方式切分无效
- all: 全部执行
- collect: 如何收集返回数据
- none:原样返回,格式为各个worker返回值的列表
- flatten: 将worker返回的结果进行拉平,支持tuple的拉平
- Callable: 自定义collect方式,格式为:
```python
def my_custom_collect(result):
# result是各个worker返回的列表
# 返回你想要的格式
```
## 参数设置
讲完技术细节后,可以讲参数配置了。开发者可以根据不同的流程中的角色列表,设置不同的硬件搭配方式,例如采样功能中,共有三个角色,sampler、prm、orm,可以这样配置:
```yaml
device_groups:
nproc_per_node: 4
sample_group:
device: GPU
ranks: list(range(0, 2))
workers:
- sampler
rm_group:
device: GPU
ranks: list(range(2, 4))
workers:
- prm
- orm
```
- nproc_per_node: ray集群中需要的每个node的最小卡数。
- xxx_group: 每个ray组的名称,可以随意指定
- device: 设备类型,当前支持GPU/CPU等。
- ranks: 当前组分配到哪些ranks上。如果是CPU,ranks只能为整数,代表共需要多少进程,如果是GPU,可以为`[0,1,2,3]`, `4`, `list(range(0, 4))`等格式。
- workers: 哪些角色分配到当前组中。
所有可用的角色可以见本文最上面的表。
如果使用命令行,device_groups也可以以`--device_groups xxx`方式传入,xxx为JSON字符串。为了配置的简便,我们强烈推荐使用yaml方式搭配ray使用。
================================================
FILE: docs/source/Instruction/Reinforced-Fine-tuning.md
================================================
# 强化微调
强化微调是目前模型训练非常重要的功能之一,它本身的实现是多种多样的,SWIFT目前已经支持了强化微调所需要的原子能力,如采样、强化学习和微调。目前我们提供了拒绝采样微调的一个具体示例,可以查看[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/rft/rft.py)。
## 强化微调的概念
强化微调是从2022年开始(甚至更早)就被提出的概念。其方式一般有下列流程:
1. 使用某个模型生成数据,或进行原始数据扩充
2. 使用数据训练目标模型
3. 如果有必要,重复上述过程
步骤1:
- 如果生成数据的模型是更大的模型,如GPT、Qwen-Max、DeepSeek-V3/R1等,则该强化微调可以理解为蒸馏
- 如果生成数据的模型是本模型,则可以理解为自我提升(self-improvement)微调
- 如果采样过程是采样一个batch,然后通过KL散度和reward进行拟合训练并不断循环,则可以理解为PPO、GRPO等on-policy算法
- 采样数据的算法包含蒙特卡洛采样、do_sample采样、group beam search、dvts等
- 采样过程可以引入ORM(结果判断),PRM(过程打分),多样性过滤,语种过滤等
步骤2:
- 如果使用SFT,则称为拒绝采样微调
- 如果是强化学习,则称为强化学习微调
步骤3:
- 如果使用更大的模型蒸馏,例如更大模型的蒙特卡洛采样蒸馏,一般不会有循环
- 如果使用本模型进行采样,或者PPO等算法,则会有循环
泛泛来说,常见强化微调的方式有下面几种:
1. 蒸馏:使用蒙特卡洛、do_sample等方式从超大模型中采样大量优质数据,训练小模型
2. 自我提升:从本模型中采样部分优质数据,筛选后训练本模型,循环执行
3. on-policy RL:使用PPO、GRPO等方式循环训练
采样过程一般很漫长,比训练过程漫长得多。如果使用GPT等模型蒸馏数据,则需要购买token。因此,强化微调的时间成本和花费成本比较高,所以一般作为微调的补充机制出现,当然也有特例,例如最近的DeepSeek-R1。
DeepSeek-R1使用了GRPO算法从零使base模型涌现CoT能力,该方法需要大规模集群支持,且模型需要足够大才能发生能力涌现,在本文中不详细讨论。如果需要了解该过程,请查看[论文解析](https://zhuanlan.zhihu.com/p/19714987272)。
有关强化微调的一些论文:
- 拒绝采样微调:https://arxiv.org/pdf/2308.01825
- ReST:https://arxiv.org/pdf/2308.08998
- B-STAR:https://arxiv.org/pdf/2412.17256
- DeepSeekMath:https://arxiv.org/pdf/2402.03300
- Qwen-math-PRM:https://arxiv.org/pdf/2501.07301
- DeepSeek-R1:https://github.com/deepseek-ai/DeepSeek-R1/tree/main
## 什么时候使用强化微调
在LLaMA3之后,我们发现一个非常明显但却是不常被提及的特点:使用某个含有CoT的train数据集训练Instruct模型,再通过对应的test集进行评测,会发现test集评测效果变差。例如,使用gsm8k训练集训练llama3.1-8b-instruct,对生成的ckpt使用test集进行评测,会发现掉点。
这个特性主要来源于模型的知识遗忘问题。在模型厂商的微调中,会加入非常多的CoT数据集,模型在解决数学任务的时候,用到的能力很有可能不是来自于math数据集,而是来自arc数据集,这个推论有[一些工作可以证明](https://zhuanlan.zhihu.com/p/19269451950)。在继续训练通用任务后,知识遗忘破坏了模型原有能力,导致了掉点。
然而,优先使用微调方式训练模型总是正确的。微调可以使模型快速适应数据集的分布,并且微调的成本很低。当有如下条件之一时使用强化微调:
1. 已经微调过模型,能力不满足需求
2. 需要更强的CoT能力
3. 对基模型训练通用能力,而原始数据集已经导致模型效果无法提升
4. 对应query的输出结果可以相对准确地评估好坏,例如结果清晰(数学,代码),过程清晰(翻译,风格)等
强化微调非常依赖于reward评估是否准确。如果评估结果不准确,可能导致模型训练原地震荡,甚至越训越差。
## SWIFT的实现
SWIFT支持sample命令,该命令就是用于模型采样。目前支持的采样方式有:
- sample:以generate方式对模型进行采样
目前我们给出了一个较为通用的[RFT脚本](https://github.com/modelscope/ms-swift/tree/main/examples/train/rft/rft.py)。该脚本适用于自我提升方式的训练,且支持动态调整采样温度值、PRM阈值等超参数,并且训练方式灵活可变(微调、DPO等;或者每次迭代重新训练原模型或继续训练上个迭代的模型,甚至加载上个迭代的所有训练状态等)。开发者可以在该脚本中增加其他数据过滤(生成的数据集中,id相同的行来自同一个query),例如多样性判断、语种判断等。
## 实验结果
我们对该RFT脚本针对数学领域使用competition_math数据集进行了训练和评测,结果如下:
| 模型 | MATH指标 | 训练方式 | 迭代次数 | 训练后MATH指标 |
| ------------------------ | -------- | -------- | -------- | --------------------- |
| LLaMA3.1_8b | 12.0 | SFT | 3 | 25.2(LLaMA3.1_8b_sft) |
| LLaMA3.1_8b_sft | 25.2 | RFT | 2 | 32.4 |
| LLaMA3.1_8b_instruct | 52.2 | SFT | 2 | 39.0 |
| LLaMA3.1_8b_instruct | 52.2 | RFT | 3 | 58 |
| Qwen2.5_math_7b_instruct | 79.6 | RFT | 2 | 83.2 |
可以看到,使用competition_math直接SFT后,instruct模型的掉点十分严重。而RFT后模型能力有提升,即使对Qwen2.5_math_7b_instruct这个SOTA的math模型也同样有一定提升空间。
特别地,针对Qwen2.5_math_7b_instruct我们测试了gsm8k的指标:
| 模型 | gsm8k指标 | RFT后gsm8k指标 |
| ------------------------ | --------- | -------------- |
| Qwen2.5_math_7b_instruct | 92.8 | 91.6 |
可以看到,RFT训练后gsm8k指标变化不大,并没有出现前述的掉点现象。
================================================
FILE: docs/source/Instruction/Sample.md
================================================
# 采样
采样是SWIFT新支持的重要能力之一,这部分可以理解为`test-time compute`的落地实现。同时,该能力对RFT(强化微调)的实现也至关重要。
## 能力介绍
SWIFT的sample能力可以使用下面的例子进行:
```shell
swift sample --model LLM-Research/Meta-Llama-3.1-8B-Instruct --sampler_engine transformers --num_return_sequences 5 --dataset AI-ModelScope/alpaca-gpt4-data-zh#5
```
在当前文件夹的`sample_output`目录下,会生成以时间戳为文件名的jsonl文件,该文件应该包含25行,每一行都是一个完整`messages`格式的数据。
采样的参数列表请参考[这里](Command-line-parameters.md)。
## 环境准备
```shell
pip install ms-swift[llm] -U
```
或从源代码安装:
```shell
git clone https://github.com/modelscope/ms-swift.git
cd ms-swift
pip install -e '.[llm]'
```
## 使用PRM和ORM进行结果过滤
采样重要的能力就是对过程和结果进行监督,这可以通过设置额外参数来支持。
```shell
swift sample --model LLM-Research/Meta-Llama-3.1-8B-Instruct --sampler_engine lmdeploy --num_return_sequences 5 --n_best_to_keep 2 --dataset tastelikefeet/competition_math#5 --prm_model AI-ModelScope/GRM-llama3.2-3B-rewardmodel-ft --orm_model math
```
在当前文件夹的`sample_output`目录下,会生成以时间戳为文件名的jsonl文件,该文件**至多包含**10行,每一行都是一个完整`messages`格式的数据。
> 之所以至多包含10行,是因为虽然设置了共处理5个数据,每个数据保留2个(n_best_to_keep),但是orm可能会校验失败,失败数据不会保留到文件中。
> 另外,增加了--prm_model或--orm_model后文件格式有所不同,包含了rejected_response key,内容来自于prm评分最低的行。
## 自定义PRM或ORM
PRM和ORM的自定义可以在plugin中按照现有代码增加一个新的实现。例如:
```python
class CustomPRM:
# 构造需要是无参的
def __init__(self):
# init here
pass
def __call__(self, infer_requests: List[InferRequest], ground_truths: List[str], **kwargs) -> List[Union[float, List[float]]]:
...
prms = {'custom': CustomPRM}
```
之后在命令行中使用`--prm_model custom`即可。
## 显存控制
如果被采样模型和PRM共同加载进显存,则可能出现OOM的问题。因此采样可以分为两段进行:
- 第一段指定`--model`和`--sampler_engine`,同时不指定`--orm_model`和`--prm_model`,仅进行采样,并存储为文件
- 第二段指定`--sampler_engine no`,指定`--orm_model`和`--prm_model`,并同时指定`--cache_files`,仅进行RM数据过滤,不重新采样
通过两段方式可以每次仅加载一个模型,防止OOM。
## 实际例子
请参考[强化微调脚本](https://github.com/modelscope/ms-swift/tree/main/examples/train/rft/rft.py)。该脚本给出了使用采样进行强化微调的实际例子。
> 注意:该脚本的实际效果和模型、数据、RM的质量强相关,因此仅作为样例出现,用户请自行修改该脚本并训练自己的RM和generator模型。
## 大模型蒸馏采样
SWIFT的sample支持使用OpenAI API的方式,用大模型蒸馏数据,如下示例:
```shell
OPENAI_API_KEY="your_api_key" \
swift sample \
--sampler_type distill \
--sampler_engine client \
--model deepseek-r1 \
--stream true \
--dataset tastelikefeet/competition_math#5 \
--num_return_sequences 1 \
--temperature 0.6 \
--top_p 0.95 \
--engine_kwargs '{"base_url":"https://dashscope.aliyuncs.com/compatible-mode/v1"}'
```
在以上示例中,base_url和model分别是api地址和模型名称,stream表示发起请求的stream参数。
注意,对于Deepseek-R1系列模型,输出会被格式化为:`<think>{reasoning_content}</think>\n\n<answer>{content}</answer>`。
================================================
FILE: docs/source/Instruction/Supported-models-and-datasets.md
================================================
# 支持的模型和数据集
> [!WARNING]
> 本文档适配swift4.0.0,若要查看swift3.x模型对应的model_type和template,请查看[v3.12文档](https://swift.readthedocs.io/zh-cn/v3.12/Instruction/Supported-models-and-datasets.html)
## 模型
下表介绍了ms-swift接入的模型的相关信息:
- Model ID: ModelScope模型id
- HF Model ID: HuggingFace模型id
- Model Type: 模型类型
- Default Template: 默认对话模板
- Requires: 使用该模型的额外依赖(**如果训练时遇到transformers版本不匹配的问题,你可以查看模型`config.json`文件中的`transformers_version`字段**)
- Support Megatron: 是否支持Megatron-SWIFT训练
- Tags: 模型的tags
### 大语言模型
| Model ID | Model Type | Default Template | Requires | Support Megatron | Tags | HF Model ID |
| -------- | -----------| ---------------- | -------- | ---------------- | ---- | ----------- |
|[Qwen/Qwen-1_8B-Chat](https://modelscope.cn/models/Qwen/Qwen-1_8B-Chat)|qwen|qwen|-|✘|-|[Qwen/Qwen-1_8B-Chat](https://huggingface.co/Qwen/Qwen-1_8B-Chat)|
|[Qwen/Qwen-7B-Chat](https://modelscope.cn/models/Qwen/Qwen-7B-Chat)|qwen|qwen|-|✘|-|[Qwen/Qwen-7B-Chat](https://huggingface.co/Qwen/Qwen-7B-Chat)|
|[Qwen/Qwen-14B-Chat](https://modelscope.cn/models/Qwen/Qwen-14B-Chat)|qwen|qwen|-|✘|-|[Qwen/Qwen-14B-Chat](https://huggingface.co/Qwen/Qwen-14B-Chat)|
|[Qwen/Qwen-72B-Chat](https://modelscope.cn/models/Qwen/Qwen-72B-Chat)|qwen|qwen|-|✘|-|[Qwen/Qwen-72B-Chat](https://huggingface.co/Qwen/Qwen-72B-Chat)|
|[Qwen/Qwen-1_8B](https://modelscope.cn/models/Qwen/Qwen-1_8B)|qwen|qwen|-|✘|-|[Qwen/Qwen-1_8B](https://huggingface.co/Qwen/Qwen-1_8B)|
|[Qwen/Qwen-7B](https://modelscope.cn/models/Qwen/Qwen-7B)|qwen|qwen|-|✘|-|[Qwen/Qwen-7B](https://huggingface.co/Qwen/Qwen-7B)|
|[Qwen/Qwen-14B](https://modelscope.cn/models/Qwen/Qwen-14B)|qwen|qwen|-|✘|-|[Qwen/Qwen-14B](https://huggingface.co/Qwen/Qwen-14B)|
|[Qwen/Qwen-72B](https://modelscope.cn/models/Qwen/Qwen-72B)|qwen|qwen|-|✘|-|[Qwen/Qwen-72B](https://huggingface.co/Qwen/Qwen-72B)|
|[Qwen/Qwen-1_8B-Chat-Int4](https://modelscope.cn/models/Qwen/Qwen-1_8B-Chat-Int4)|qwen|qwen|-|✘|-|[Qwen/Qwen-1_8B-Chat-Int4](https://huggingface.co/Qwen/Qwen-1_8B-Chat-Int4)|
|[Qwen/Qwen-7B-Chat-Int4](https://modelscope.cn/models/Qwen/Qwen-7B-Chat-Int4)|qwen|qwen|-|✘|-|[Qwen/Qwen-7B-Chat-Int4](https://huggingface.co/Qwen/Qwen-7B-Chat-Int4)|
|[Qwen/Qwen-14B-Chat-Int4](https://modelscope.cn/models/Qwen/Qwen-14B-Chat-Int4)|qwen|qwen|-|✘|-|[Qwen/Qwen-14B-Chat-Int4](https://huggingface.co/Qwen/Qwen-14B-Chat-Int4)|
|[Qwen/Qwen-72B-Chat-Int4](https://modelscope.cn/models/Qwen/Qwen-72B-Chat-Int4)|qwen|qwen|-|✘|-|[Qwen/Qwen-72B-Chat-Int4](https://huggingface.co/Qwen/Qwen-72B-Chat-Int4)|
|[Qwen/Qwen-1_8B-Chat-Int8](https://modelscope.cn/models/Qwen/Qwen-1_8B-Chat-Int8)|qwen|qwen|-|✘|-|[Qwen/Qwen-1_8B-Chat-Int8](https://huggingface.co/Qwen/Qwen-1_8B-Chat-Int8)|
|[Qwen/Qwen-7B-Chat-Int8](https://modelscope.cn/models/Qwen/Qwen-7B-Chat-Int8)|qwen|qwen|-|✘|-|[Qwen/Qwen-7B-Chat-Int8](https://huggingface.co/Qwen/Qwen-7B-Chat-Int8)|
|[Qwen/Qwen-14B-Chat-Int8](https://modelscope.cn/models/Qwen/Qwen-14B-Chat-Int8)|qwen|qwen|-|✘|-|[Qwen/Qwen-14B-Chat-Int8](https://huggingface.co/Qwen/Qwen-14B-Chat-Int8)|
|[Qwen/Qwen-72B-Chat-Int8](https://modelscope.cn/models/Qwen/Qwen-72B-Chat-Int8)|qwen|qwen|-|✘|-|[Qwen/Qwen-72B-Chat-Int8](https://huggingface.co/Qwen/Qwen-72B-Chat-Int8)|
|[TongyiFinance/Tongyi-Finance-14B-Chat](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat)|qwen|qwen|-|✘|financial|[jxy/Tongyi-Finance-14B-Chat](https://huggingface.co/jxy/Tongyi-Finance-14B-Chat)|
|[TongyiFinance/Tongyi-Finance-14B](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B)|qwen|qwen|-|✘|financial|-|
|[TongyiFinance/Tongyi-Finance-14B-Chat-Int4](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat-Int4)|qwen|qwen|-|✘|financial|[jxy/Tongyi-Finance-14B-Chat-Int4](https://huggingface.co/jxy/Tongyi-Finance-14B-Chat-Int4)|
|[Qwen/Qwen1.5-0.5B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-0.5B-Chat)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen1.5-0.5B-Chat](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat)|
|[Qwen/Qwen1.5-1.8B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-1.8B-Chat)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen1.5-1.8B-Chat](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat)|
|[Qwen/Qwen1.5-4B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-4B-Chat)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen1.5-4B-Chat](https://huggingface.co/Qwen/Qwen1.5-4B-Chat)|
|[Qwen/Qwen1.5-7B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-7B-Chat)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen1.5-7B-Chat](https://huggingface.co/Qwen/Qwen1.5-7B-Chat)|
|[Qwen/Qwen1.5-14B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-14B-Chat)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen1.5-14B-Chat](https://huggingface.co/Qwen/Qwen1.5-14B-Chat)|
|[Qwen/Qwen1.5-32B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-32B-Chat)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen1.5-32B-Chat](https://huggingface.co/Qwen/Qwen1.5-32B-Chat)|
|[Qwen/Qwen1.5-72B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-72B-Chat)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen1.5-72B-Chat](https://huggingface.co/Qwen/Qwen1.5-72B-Chat)|
|[Qwen/Qwen1.5-110B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-110B-Chat)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen1.5-110B-Chat](https://huggingface.co/Qwen/Qwen1.5-110B-Chat)|
|[Qwen/Qwen1.5-0.5B](https://modelscope.cn/models/Qwen/Qwen1.5-0.5B)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen1.5-0.5B](https://huggingface.co/Qwen/Qwen1.5-0.5B)|
|[Qwen/Qwen1.5-1.8B](https://modelscope.cn/models/Qwen/Qwen1.5-1.8B)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen1.5-1.8B](https://huggingface.co/Qwen/Qwen1.5-1.8B)|
|[Qwen/Qwen1.5-4B](https://modelscope.cn/models/Qwen/Qwen1.5-4B)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen1.5-4B](https://huggingface.co/Qwen/Qwen1.5-4B)|
|[Qwen/Qwen1.5-7B](https://modelscope.cn/models/Qwen/Qwen1.5-7B)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen1.5-7B](https://huggingface.co/Qwen/Qwen1.5-7B)|
|[Qwen/Qwen1.5-14B](https://modelscope.cn/models/Qwen/Qwen1.5-14B)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen1.5-14B](https://huggingface.co/Qwen/Qwen1.5-14B)|
|[Qwen/Qwen1.5-32B](https://modelscope.cn/models/Qwen/Qwen1.5-32B)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen1.5-32B](https://huggingface.co/Qwen/Qwen1.5-32B)|
|[Qwen/Qwen1.5-72B](https://modelscope.cn/models/Qwen/Qwen1.5-72B)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen1.5-72B](https://huggingface.co/Qwen/Qwen1.5-72B)|
|[Qwen/Qwen1.5-110B](https://modelscope.cn/models/Qwen/Qwen1.5-110B)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen1.5-110B](https://huggingface.co/Qwen/Qwen1.5-110B)|
|[Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4)|
|[Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4)|
|[Qwen/Qwen1.5-4B-Chat-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen1.5-4B-Chat-GPTQ-Int4)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-4B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-4B-Chat-GPTQ-Int4)|
|[Qwen/Qwen1.5-7B-Chat-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen1.5-7B-Chat-GPTQ-Int4)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int4)|
|[Qwen/Qwen1.5-14B-Chat-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen1.5-14B-Chat-GPTQ-Int4)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-14B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-14B-Chat-GPTQ-Int4)|
|[Qwen/Qwen1.5-32B-Chat-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen1.5-32B-Chat-GPTQ-Int4)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-32B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-32B-Chat-GPTQ-Int4)|
|[Qwen/Qwen1.5-72B-Chat-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen1.5-72B-Chat-GPTQ-Int4)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-72B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-72B-Chat-GPTQ-Int4)|
|[Qwen/Qwen1.5-110B-Chat-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen1.5-110B-Chat-GPTQ-Int4)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-110B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-110B-Chat-GPTQ-Int4)|
|[Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8)|
|[Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8)|
|[Qwen/Qwen1.5-4B-Chat-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen1.5-4B-Chat-GPTQ-Int8)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-4B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-4B-Chat-GPTQ-Int8)|
|[Qwen/Qwen1.5-7B-Chat-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen1.5-7B-Chat-GPTQ-Int8)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-7B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int8)|
|[Qwen/Qwen1.5-14B-Chat-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen1.5-14B-Chat-GPTQ-Int8)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-14B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-14B-Chat-GPTQ-Int8)|
|[Qwen/Qwen1.5-72B-Chat-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen1.5-72B-Chat-GPTQ-Int8)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-72B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-72B-Chat-GPTQ-Int8)|
|[Qwen/Qwen1.5-0.5B-Chat-AWQ](https://modelscope.cn/models/Qwen/Qwen1.5-0.5B-Chat-AWQ)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-0.5B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-AWQ)|
|[Qwen/Qwen1.5-1.8B-Chat-AWQ](https://modelscope.cn/models/Qwen/Qwen1.5-1.8B-Chat-AWQ)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-1.8B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-AWQ)|
|[Qwen/Qwen1.5-4B-Chat-AWQ](https://modelscope.cn/models/Qwen/Qwen1.5-4B-Chat-AWQ)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-4B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-4B-Chat-AWQ)|
|[Qwen/Qwen1.5-7B-Chat-AWQ](https://modelscope.cn/models/Qwen/Qwen1.5-7B-Chat-AWQ)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-7B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-AWQ)|
|[Qwen/Qwen1.5-14B-Chat-AWQ](https://modelscope.cn/models/Qwen/Qwen1.5-14B-Chat-AWQ)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-14B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-14B-Chat-AWQ)|
|[Qwen/Qwen1.5-32B-Chat-AWQ](https://modelscope.cn/models/Qwen/Qwen1.5-32B-Chat-AWQ)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-32B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-32B-Chat-AWQ)|
|[Qwen/Qwen1.5-72B-Chat-AWQ](https://modelscope.cn/models/Qwen/Qwen1.5-72B-Chat-AWQ)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-72B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-72B-Chat-AWQ)|
|[Qwen/Qwen1.5-110B-Chat-AWQ](https://modelscope.cn/models/Qwen/Qwen1.5-110B-Chat-AWQ)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen1.5-110B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-110B-Chat-AWQ)|
|[Qwen/CodeQwen1.5-7B](https://modelscope.cn/models/Qwen/CodeQwen1.5-7B)|qwen2|qwen|transformers>=4.37|✔|coding|[Qwen/CodeQwen1.5-7B](https://huggingface.co/Qwen/CodeQwen1.5-7B)|
|[Qwen/CodeQwen1.5-7B-Chat](https://modelscope.cn/models/Qwen/CodeQwen1.5-7B-Chat)|qwen2|qwen|transformers>=4.37|✔|coding|[Qwen/CodeQwen1.5-7B-Chat](https://huggingface.co/Qwen/CodeQwen1.5-7B-Chat)|
|[Qwen/CodeQwen1.5-7B-Chat-AWQ](https://modelscope.cn/models/Qwen/CodeQwen1.5-7B-Chat-AWQ)|qwen2|qwen|transformers>=4.37|✘|coding|[Qwen/CodeQwen1.5-7B-Chat-AWQ](https://huggingface.co/Qwen/CodeQwen1.5-7B-Chat-AWQ)|
|[Qwen/Qwen2-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-0.5B-Instruct)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct)|
|[Qwen/Qwen2-1.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-1.5B-Instruct)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct)|
|[Qwen/Qwen2-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-7B-Instruct)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct)|
|[Qwen/Qwen2-72B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-72B-Instruct)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen2-72B-Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct)|
|[Qwen/Qwen2-0.5B](https://modelscope.cn/models/Qwen/Qwen2-0.5B)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen2-0.5B](https://huggingface.co/Qwen/Qwen2-0.5B)|
|[Qwen/Qwen2-1.5B](https://modelscope.cn/models/Qwen/Qwen2-1.5B)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen2-1.5B](https://huggingface.co/Qwen/Qwen2-1.5B)|
|[Qwen/Qwen2-7B](https://modelscope.cn/models/Qwen/Qwen2-7B)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen2-7B](https://huggingface.co/Qwen/Qwen2-7B)|
|[Qwen/Qwen2-72B](https://modelscope.cn/models/Qwen/Qwen2-72B)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen2-72B](https://huggingface.co/Qwen/Qwen2-72B)|
|[Qwen/Qwen2-0.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-0.5B-Instruct-GPTQ-Int4)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen2-0.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen2-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-7B-Instruct-GPTQ-Int4)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen2-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-7B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen2-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-72B-Instruct-GPTQ-Int4)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen2-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-72B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen2-0.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-0.5B-Instruct-GPTQ-Int8)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen2-0.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct-GPTQ-Int8)|
|[Qwen/Qwen2-1.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-1.5B-Instruct-GPTQ-Int8)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen2-1.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct-GPTQ-Int8)|
|[Qwen/Qwen2-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-7B-Instruct-GPTQ-Int8)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen2-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-7B-Instruct-GPTQ-Int8)|
|[Qwen/Qwen2-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-72B-Instruct-GPTQ-Int8)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen2-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-72B-Instruct-GPTQ-Int8)|
|[Qwen/Qwen2-0.5B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-0.5B-Instruct-AWQ)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen2-0.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct-AWQ)|
|[Qwen/Qwen2-1.5B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-1.5B-Instruct-AWQ)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen2-1.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct-AWQ)|
|[Qwen/Qwen2-7B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-7B-Instruct-AWQ)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen2-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-7B-Instruct-AWQ)|
|[Qwen/Qwen2-72B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-72B-Instruct-AWQ)|qwen2|qwen|transformers>=4.37|✘|-|[Qwen/Qwen2-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-72B-Instruct-AWQ)|
|[Qwen/Qwen2-Math-1.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Math-1.5B-Instruct)|qwen2|qwen|transformers>=4.37|✔|math|[Qwen/Qwen2-Math-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-Math-1.5B-Instruct)|
|[Qwen/Qwen2-Math-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Math-7B-Instruct)|qwen2|qwen|transformers>=4.37|✔|math|[Qwen/Qwen2-Math-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Math-7B-Instruct)|
|[Qwen/Qwen2-Math-72B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Math-72B-Instruct)|qwen2|qwen|transformers>=4.37|✔|math|[Qwen/Qwen2-Math-72B-Instruct](https://huggingface.co/Qwen/Qwen2-Math-72B-Instruct)|
|[Qwen/Qwen2-Math-1.5B](https://modelscope.cn/models/Qwen/Qwen2-Math-1.5B)|qwen2|qwen|transformers>=4.37|✔|math|[Qwen/Qwen2-Math-1.5B](https://huggingface.co/Qwen/Qwen2-Math-1.5B)|
|[Qwen/Qwen2-Math-7B](https://modelscope.cn/models/Qwen/Qwen2-Math-7B)|qwen2|qwen|transformers>=4.37|✔|math|[Qwen/Qwen2-Math-7B](https://huggingface.co/Qwen/Qwen2-Math-7B)|
|[Qwen/Qwen2-Math-72B](https://modelscope.cn/models/Qwen/Qwen2-Math-72B)|qwen2|qwen|transformers>=4.37|✔|math|[Qwen/Qwen2-Math-72B](https://huggingface.co/Qwen/Qwen2-Math-72B)|
|[Qwen/Qwen2.5-7B-Instruct-1M](https://modelscope.cn/models/Qwen/Qwen2.5-7B-Instruct-1M)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen2.5-7B-Instruct-1M](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-1M)|
|[Qwen/Qwen2.5-14B-Instruct-1M](https://modelscope.cn/models/Qwen/Qwen2.5-14B-Instruct-1M)|qwen2|qwen|transformers>=4.37|✔|-|[Qwen/Qwen2.5-14B-Instruct-1M](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-1M)|
|[PowerInfer/SmallThinker-3B-Preview](https://modelscope.cn/models/PowerInfer/SmallThinker-3B-Preview)|qwen2|qwen|transformers>=4.37|✔|-|[PowerInfer/SmallThinker-3B-Preview](https://huggingface.co/PowerInfer/SmallThinker-3B-Preview)|
|[Qwen/Qwen2.5-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B-Instruct)|qwen2|qwen2_5|transformers>=4.37|✔|-|[Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct)|
|[Qwen/Qwen2.5-1.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-1.5B-Instruct)|qwen2|qwen2_5|transformers>=4.37|✔|-|[Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct)|
|[Qwen/Qwen2.5-3B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-3B-Instruct)|qwen2|qwen2_5|transformers>=4.37|✔|-|[Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct)|
|[Qwen/Qwen2.5-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-7B-Instruct)|qwen2|qwen2_5|transformers>=4.37|✔|-|[Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)|
|[Qwen/Qwen2.5-14B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-14B-Instruct)|qwen2|qwen2_5|transformers>=4.37|✔|-|[Qwen/Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct)|
|[Qwen/Qwen2.5-32B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-32B-Instruct)|qwen2|qwen2_5|transformers>=4.37|✔|-|[Qwen/Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct)|
|[Qwen/Qwen2.5-72B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-72B-Instruct)|qwen2|qwen2_5|transformers>=4.37|✔|-|[Qwen/Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct)|
|[Qwen/Qwen2.5-0.5B](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B)|qwen2|qwen2_5|transformers>=4.37|✔|-|[Qwen/Qwen2.5-0.5B](https://huggingface.co/Qwen/Qwen2.5-0.5B)|
|[Qwen/Qwen2.5-1.5B](https://modelscope.cn/models/Qwen/Qwen2.5-1.5B)|qwen2|qwen2_5|transformers>=4.37|✔|-|[Qwen/Qwen2.5-1.5B](https://huggingface.co/Qwen/Qwen2.5-1.5B)|
|[Qwen/Qwen2.5-3B](https://modelscope.cn/models/Qwen/Qwen2.5-3B)|qwen2|qwen2_5|transformers>=4.37|✔|-|[Qwen/Qwen2.5-3B](https://huggingface.co/Qwen/Qwen2.5-3B)|
|[Qwen/Qwen2.5-7B](https://modelscope.cn/models/Qwen/Qwen2.5-7B)|qwen2|qwen2_5|transformers>=4.37|✔|-|[Qwen/Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B)|
|[Qwen/Qwen2.5-14B](https://modelscope.cn/models/Qwen/Qwen2.5-14B)|qwen2|qwen2_5|transformers>=4.37|✔|-|[Qwen/Qwen2.5-14B](https://huggingface.co/Qwen/Qwen2.5-14B)|
|[Qwen/Qwen2.5-32B](https://modelscope.cn/models/Qwen/Qwen2.5-32B)|qwen2|qwen2_5|transformers>=4.37|✔|-|[Qwen/Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B)|
|[Qwen/Qwen2.5-72B](https://modelscope.cn/models/Qwen/Qwen2.5-72B)|qwen2|qwen2_5|transformers>=4.37|✔|-|[Qwen/Qwen2.5-72B](https://huggingface.co/Qwen/Qwen2.5-72B)|
|[Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8)|
|[Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8)|
|[Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8)|
|[Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8)|
|[Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8)|
|[Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8)|
|[Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8)|
|[Qwen/Qwen2.5-0.5B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B-Instruct-AWQ)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-0.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-AWQ)|
|[Qwen/Qwen2.5-1.5B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-1.5B-Instruct-AWQ)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-1.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-AWQ)|
|[Qwen/Qwen2.5-3B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-3B-Instruct-AWQ)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-3B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-AWQ)|
|[Qwen/Qwen2.5-7B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-7B-Instruct-AWQ)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-AWQ)|
|[Qwen/Qwen2.5-14B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-14B-Instruct-AWQ)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-14B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-AWQ)|
|[Qwen/Qwen2.5-32B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-32B-Instruct-AWQ)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-32B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-AWQ)|
|[Qwen/Qwen2.5-72B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-72B-Instruct-AWQ)|qwen2|qwen2_5|transformers>=4.37|✘|-|[Qwen/Qwen2.5-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-AWQ)|
|[Qwen/Qwen2.5-Coder-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-0.5B-Instruct)|qwen2|qwen2_5|transformers>=4.37|✔|coding|[Qwen/Qwen2.5-Coder-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct)|
|[Qwen/Qwen2.5-Coder-1.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-1.5B-Instruct)|qwen2|qwen2_5|transformers>=4.37|✔|coding|[Qwen/Qwen2.5-Coder-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct)|
|[Qwen/Qwen2.5-Coder-3B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-3B-Instruct)|qwen2|qwen2_5|transformers>=4.37|✔|coding|[Qwen/Qwen2.5-Coder-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct)|
|[Qwen/Qwen2.5-Coder-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-7B-Instruct)|qwen2|qwen2_5|transformers>=4.37|✔|coding|[Qwen/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct)|
|[Qwen/Qwen2.5-Coder-14B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-14B-Instruct)|qwen2|qwen2_5|transformers>=4.37|✔|coding|[Qwen/Qwen2.5-Coder-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct)|
|[Qwen/Qwen2.5-Coder-32B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-32B-Instruct)|qwen2|qwen2_5|transformers>=4.37|✔|coding|[Qwen/Qwen2.5-Coder-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct)|
|[Qwen/Qwen2.5-Coder-0.5B](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-0.5B)|qwen2|qwen2_5|transformers>=4.37|✔|coding|[Qwen/Qwen2.5-Coder-0.5B](https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B)|
|[Qwen/Qwen2.5-Coder-1.5B](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-1.5B)|qwen2|qwen2_5|transformers>=4.37|✔|coding|[Qwen/Qwen2.5-Coder-1.5B](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B)|
|[Qwen/Qwen2.5-Coder-3B](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-3B)|qwen2|qwen2_5|transformers>=4.37|✔|coding|[Qwen/Qwen2.5-Coder-3B](https://huggingface.co/Qwen/Qwen2.5-Coder-3B)|
|[Qwen/Qwen2.5-Coder-7B](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-7B)|qwen2|qwen2_5|transformers>=4.37|✔|coding|[Qwen/Qwen2.5-Coder-7B](https://huggingface.co/Qwen/Qwen2.5-Coder-7B)|
|[Qwen/Qwen2.5-Coder-14B](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-14B)|qwen2|qwen2_5|transformers>=4.37|✔|coding|[Qwen/Qwen2.5-Coder-14B](https://huggingface.co/Qwen/Qwen2.5-Coder-14B)|
|[Qwen/Qwen2.5-Coder-32B](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-32B)|qwen2|qwen2_5|transformers>=4.37|✔|coding|[Qwen/Qwen2.5-Coder-32B](https://huggingface.co/Qwen/Qwen2.5-Coder-32B)|
|[Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ)|qwen2|qwen2_5|transformers>=4.37|✘|coding|[Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ)|
|[Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ)|qwen2|qwen2_5|transformers>=4.37|✘|coding|[Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ)|
|[Qwen/Qwen2.5-Coder-3B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-3B-Instruct-AWQ)|qwen2|qwen2_5|transformers>=4.37|✘|coding|[Qwen/Qwen2.5-Coder-3B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct-AWQ)|
|[Qwen/Qwen2.5-Coder-7B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-7B-Instruct-AWQ)|qwen2|qwen2_5|transformers>=4.37|✘|coding|[Qwen/Qwen2.5-Coder-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-AWQ)|
|[Qwen/Qwen2.5-Coder-14B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-14B-Instruct-AWQ)|qwen2|qwen2_5|transformers>=4.37|✘|coding|[Qwen/Qwen2.5-Coder-14B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct-AWQ)|
|[Qwen/Qwen2.5-Coder-32B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-32B-Instruct-AWQ)|qwen2|qwen2_5|transformers>=4.37|✘|coding|[Qwen/Qwen2.5-Coder-32B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct-AWQ)|
|[Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4)|qwen2|qwen2_5|transformers>=4.37|✘|coding|[Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8)|qwen2|qwen2_5|transformers>=4.37|✘|coding|[Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8)|
|[Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4)|qwen2|qwen2_5|transformers>=4.37|✘|coding|[Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8)|qwen2|qwen2_5|transformers>=4.37|✘|coding|[Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8)|
|[Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4)|qwen2|qwen2_5|transformers>=4.37|✘|coding|[Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8)|qwen2|qwen2_5|transformers>=4.37|✘|coding|[Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8)|
|[Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4)|qwen2|qwen2_5|transformers>=4.37|✘|coding|[Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8)|qwen2|qwen2_5|transformers>=4.37|✘|coding|[Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8)|
|[Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4)|qwen2|qwen2_5|transformers>=4.37|✘|coding|[Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8)|qwen2|qwen2_5|transformers>=4.37|✘|coding|[Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8)|
|[Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4)|qwen2|qwen2_5|transformers>=4.37|✘|coding|[Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8)|qwen2|qwen2_5|transformers>=4.37|✘|coding|[Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8)|
|[moonshotai/Kimi-Dev-72B](https://modelscope.cn/models/moonshotai/Kimi-Dev-72B)|qwen2|qwen2_5|transformers>=4.37|✔|-|[moonshotai/Kimi-Dev-72B](https://huggingface.co/moonshotai/Kimi-Dev-72B)|
|[Qwen/Qwen2.5-Math-1.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-Math-1.5B-Instruct)|qwen2|qwen2_5_math|transformers>=4.37|✔|math|[Qwen/Qwen2.5-Math-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct)|
|[Qwen/Qwen2.5-Math-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-Math-7B-Instruct)|qwen2|qwen2_5_math|transformers>=4.37|✔|math|[Qwen/Qwen2.5-Math-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct)|
|[Qwen/Qwen2.5-Math-72B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-Math-72B-Instruct)|qwen2|qwen2_5_math|transformers>=4.37|✔|math|[Qwen/Qwen2.5-Math-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct)|
|[Qwen/Qwen2.5-Math-1.5B](https://modelscope.cn/models/Qwen/Qwen2.5-Math-1.5B)|qwen2|qwen2_5_math|transformers>=4.37|✔|math|[Qwen/Qwen2.5-Math-1.5B](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B)|
|[Qwen/Qwen2.5-Math-7B](https://modelscope.cn/models/Qwen/Qwen2.5-Math-7B)|qwen2|qwen2_5_math|transformers>=4.37|✔|math|[Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B)|
|[Qwen/Qwen2.5-Math-72B](https://modelscope.cn/models/Qwen/Qwen2.5-Math-72B)|qwen2|qwen2_5_math|transformers>=4.37|✔|math|[Qwen/Qwen2.5-Math-72B](https://huggingface.co/Qwen/Qwen2.5-Math-72B)|
|[AIDC-AI/Marco-o1](https://modelscope.cn/models/AIDC-AI/Marco-o1)|qwen2|marco_o1|transformers>=4.37|✔|-|[AIDC-AI/Marco-o1](https://huggingface.co/AIDC-AI/Marco-o1)|
|[Qwen/QwQ-32B-Preview](https://modelscope.cn/models/Qwen/QwQ-32B-Preview)|qwen2|qwq_preview|transformers>=4.37|✔|-|[Qwen/QwQ-32B-Preview](https://huggingface.co/Qwen/QwQ-32B-Preview)|
|[Qwen/QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B)|qwen2|qwq|transformers>=4.37|✔|-|[Qwen/QwQ-32B](https://huggingface.co/Qwen/QwQ-32B)|
|[Qwen/QwQ-32B-AWQ](https://modelscope.cn/models/Qwen/QwQ-32B-AWQ)|qwen2|qwq|transformers>=4.37|✘|-|[Qwen/QwQ-32B-AWQ](https://huggingface.co/Qwen/QwQ-32B-AWQ)|
|[deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B)|qwen2|deepseek_r1|transformers>=4.37|✔|-|[deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B)|
|[deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B)|qwen2|deepseek_r1|transformers>=4.37|✔|-|[deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B)|
|[deepseek-ai/DeepSeek-R1-Distill-Qwen-14B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B)|qwen2|deepseek_r1|transformers>=4.37|✔|-|[deepseek-ai/DeepSeek-R1-Distill-Qwen-14B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B)|
|[deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B)|qwen2|deepseek_r1|transformers>=4.37|✔|-|[deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B)|
|[iic/QwenLong-L1-32B](https://modelscope.cn/models/iic/QwenLong-L1-32B)|qwen2|deepseek_r1|transformers>=4.37|✔|-|[Tongyi-Zhiwen/QwenLong-L1-32B](https://huggingface.co/Tongyi-Zhiwen/QwenLong-L1-32B)|
|[Qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B-Chat)|qwen2_moe|qwen|transformers>=4.40|✔|-|[Qwen/Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat)|
|[Qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B)|qwen2_moe|qwen|transformers>=4.40|✔|-|[Qwen/Qwen1.5-MoE-A2.7B](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B)|
|[Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4)|qwen2_moe|qwen|transformers>=4.40|✘|-|[Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4)|
|[Qwen/Qwen2-57B-A14B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-57B-A14B-Instruct)|qwen2_moe|qwen|transformers>=4.40|✔|-|[Qwen/Qwen2-57B-A14B-Instruct](https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct)|
|[Qwen/Qwen2-57B-A14B](https://modelscope.cn/models/Qwen/Qwen2-57B-A14B)|qwen2_moe|qwen|transformers>=4.40|✔|-|[Qwen/Qwen2-57B-A14B](https://huggingface.co/Qwen/Qwen2-57B-A14B)|
|[Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4)|qwen2_moe|qwen|transformers>=4.40|✘|-|[Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen3-0.6B-Base](https://modelscope.cn/models/Qwen/Qwen3-0.6B-Base)|qwen3|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-0.6B-Base](https://huggingface.co/Qwen/Qwen3-0.6B-Base)|
|[Qwen/Qwen3-1.7B-Base](https://modelscope.cn/models/Qwen/Qwen3-1.7B-Base)|qwen3|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-1.7B-Base](https://huggingface.co/Qwen/Qwen3-1.7B-Base)|
|[Qwen/Qwen3-4B-Base](https://modelscope.cn/models/Qwen/Qwen3-4B-Base)|qwen3|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-4B-Base](https://huggingface.co/Qwen/Qwen3-4B-Base)|
|[Qwen/Qwen3-8B-Base](https://modelscope.cn/models/Qwen/Qwen3-8B-Base)|qwen3|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-8B-Base](https://huggingface.co/Qwen/Qwen3-8B-Base)|
|[Qwen/Qwen3-14B-Base](https://modelscope.cn/models/Qwen/Qwen3-14B-Base)|qwen3|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-14B-Base](https://huggingface.co/Qwen/Qwen3-14B-Base)|
|[Qwen/Qwen3-0.6B](https://modelscope.cn/models/Qwen/Qwen3-0.6B)|qwen3|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B)|
|[Qwen/Qwen3-1.7B](https://modelscope.cn/models/Qwen/Qwen3-1.7B)|qwen3|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B)|
|[Qwen/Qwen3-4B](https://modelscope.cn/models/Qwen/Qwen3-4B)|qwen3|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B)|
|[Qwen/Qwen3-8B](https://modelscope.cn/models/Qwen/Qwen3-8B)|qwen3|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B)|
|[Qwen/Qwen3-14B](https://modelscope.cn/models/Qwen/Qwen3-14B)|qwen3|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-14B](https://huggingface.co/Qwen/Qwen3-14B)|
|[Qwen/Qwen3-32B](https://modelscope.cn/models/Qwen/Qwen3-32B)|qwen3|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B)|
|[Qwen/Qwen3-0.6B-FP8](https://modelscope.cn/models/Qwen/Qwen3-0.6B-FP8)|qwen3|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-0.6B-FP8](https://huggingface.co/Qwen/Qwen3-0.6B-FP8)|
|[Qwen/Qwen3-1.7B-FP8](https://modelscope.cn/models/Qwen/Qwen3-1.7B-FP8)|qwen3|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-1.7B-FP8](https://huggingface.co/Qwen/Qwen3-1.7B-FP8)|
|[Qwen/Qwen3-4B-FP8](https://modelscope.cn/models/Qwen/Qwen3-4B-FP8)|qwen3|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-4B-FP8](https://huggingface.co/Qwen/Qwen3-4B-FP8)|
|[Qwen/Qwen3-8B-FP8](https://modelscope.cn/models/Qwen/Qwen3-8B-FP8)|qwen3|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-8B-FP8](https://huggingface.co/Qwen/Qwen3-8B-FP8)|
|[Qwen/Qwen3-14B-FP8](https://modelscope.cn/models/Qwen/Qwen3-14B-FP8)|qwen3|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-14B-FP8](https://huggingface.co/Qwen/Qwen3-14B-FP8)|
|[Qwen/Qwen3-32B-FP8](https://modelscope.cn/models/Qwen/Qwen3-32B-FP8)|qwen3|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-32B-FP8](https://huggingface.co/Qwen/Qwen3-32B-FP8)|
|[Qwen/Qwen3-4B-AWQ](https://modelscope.cn/models/Qwen/Qwen3-4B-AWQ)|qwen3|qwen3|transformers>=4.51|✘|-|[Qwen/Qwen3-4B-AWQ](https://huggingface.co/Qwen/Qwen3-4B-AWQ)|
|[Qwen/Qwen3-8B-AWQ](https://modelscope.cn/models/Qwen/Qwen3-8B-AWQ)|qwen3|qwen3|transformers>=4.51|✘|-|[Qwen/Qwen3-8B-AWQ](https://huggingface.co/Qwen/Qwen3-8B-AWQ)|
|[Qwen/Qwen3-14B-AWQ](https://modelscope.cn/models/Qwen/Qwen3-14B-AWQ)|qwen3|qwen3|transformers>=4.51|✘|-|[Qwen/Qwen3-14B-AWQ](https://huggingface.co/Qwen/Qwen3-14B-AWQ)|
|[Qwen/Qwen3-32B-AWQ](https://modelscope.cn/models/Qwen/Qwen3-32B-AWQ)|qwen3|qwen3|transformers>=4.51|✘|-|[Qwen/Qwen3-32B-AWQ](https://huggingface.co/Qwen/Qwen3-32B-AWQ)|
|[swift/Qwen3-32B-AWQ](https://modelscope.cn/models/swift/Qwen3-32B-AWQ)|qwen3|qwen3|transformers>=4.51|✘|-|-|
|[deepseek-ai/DeepSeek-R1-0528-Qwen3-8B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-0528-Qwen3-8B)|qwen3|deepseek_r1|transformers>=4.51|✔|-|[deepseek-ai/DeepSeek-R1-0528-Qwen3-8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528-Qwen3-8B)|
|[Qwen/Qwen3Guard-Gen-0.6B](https://modelscope.cn/models/Qwen/Qwen3Guard-Gen-0.6B)|qwen3|qwen3_guard|transformers>=4.51|✔|-|[Qwen/Qwen3Guard-Gen-0.6B](https://huggingface.co/Qwen/Qwen3Guard-Gen-0.6B)|
|[Qwen/Qwen3Guard-Gen-4B](https://modelscope.cn/models/Qwen/Qwen3Guard-Gen-4B)|qwen3|qwen3_guard|transformers>=4.51|✔|-|[Qwen/Qwen3Guard-Gen-4B](https://huggingface.co/Qwen/Qwen3Guard-Gen-4B)|
|[Qwen/Qwen3Guard-Gen-8B](https://modelscope.cn/models/Qwen/Qwen3Guard-Gen-8B)|qwen3|qwen3_guard|transformers>=4.51|✔|-|[Qwen/Qwen3Guard-Gen-8B](https://huggingface.co/Qwen/Qwen3Guard-Gen-8B)|
|[Alibaba-AAIG/YuFeng-XGuard-Reason-0.6B](https://modelscope.cn/models/Alibaba-AAIG/YuFeng-XGuard-Reason-0.6B)|qwen3|yufeng_xguard|transformers>=4.51|✔|-|[Alibaba-AAIG/YuFeng-XGuard-Reason-0.6B](https://huggingface.co/Alibaba-AAIG/YuFeng-XGuard-Reason-0.6B)|
|[Alibaba-AAIG/YuFeng-XGuard-Reason-8B](https://modelscope.cn/models/Alibaba-AAIG/YuFeng-XGuard-Reason-8B)|qwen3|yufeng_xguard|transformers>=4.51|✔|-|[Alibaba-AAIG/YuFeng-XGuard-Reason-8B](https://huggingface.co/Alibaba-AAIG/YuFeng-XGuard-Reason-8B)|
|[Qwen/Qwen3-4B-Thinking-2507](https://modelscope.cn/models/Qwen/Qwen3-4B-Thinking-2507)|qwen3|qwen3_thinking|transformers>=4.51|✔|-|[Qwen/Qwen3-4B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507)|
|[Qwen/Qwen3-4B-Thinking-2507-FP8](https://modelscope.cn/models/Qwen/Qwen3-4B-Thinking-2507-FP8)|qwen3|qwen3_thinking|transformers>=4.51|✔|-|[Qwen/Qwen3-4B-Thinking-2507-FP8](https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507-FP8)|
|[Qwen/Qwen3-4B-Instruct-2507](https://modelscope.cn/models/Qwen/Qwen3-4B-Instruct-2507)|qwen3|qwen3_nothinking|transformers>=4.51|✔|-|[Qwen/Qwen3-4B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507)|
|[Qwen/Qwen3-4B-Instruct-2507-FP8](https://modelscope.cn/models/Qwen/Qwen3-4B-Instruct-2507-FP8)|qwen3|qwen3_nothinking|transformers>=4.51|✔|-|[Qwen/Qwen3-4B-Instruct-2507-FP8](https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507-FP8)|
|[Qwen/Qwen3-30B-A3B-Base](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Base)|qwen3_moe|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-30B-A3B-Base](https://huggingface.co/Qwen/Qwen3-30B-A3B-Base)|
|[Qwen/Qwen3-30B-A3B](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B)|qwen3_moe|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-30B-A3B](https://huggingface.co/Qwen/Qwen3-30B-A3B)|
|[Qwen/Qwen3-235B-A22B](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B)|qwen3_moe|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B)|
|[Qwen/Qwen3-30B-A3B-FP8](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-FP8)|qwen3_moe|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-30B-A3B-FP8](https://huggingface.co/Qwen/Qwen3-30B-A3B-FP8)|
|[Qwen/Qwen3-235B-A22B-FP8](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B-FP8)|qwen3_moe|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-235B-A22B-FP8](https://huggingface.co/Qwen/Qwen3-235B-A22B-FP8)|
|[swift/Qwen3-30B-A3B-AWQ](https://modelscope.cn/models/swift/Qwen3-30B-A3B-AWQ)|qwen3_moe|qwen3|transformers>=4.51|✘|-|[cognitivecomputations/Qwen3-30B-A3B-AWQ](https://huggingface.co/cognitivecomputations/Qwen3-30B-A3B-AWQ)|
|[swift/Qwen3-235B-A22B-AWQ](https://modelscope.cn/models/swift/Qwen3-235B-A22B-AWQ)|qwen3_moe|qwen3|transformers>=4.51|✘|-|[cognitivecomputations/Qwen3-235B-A22B-AWQ](https://huggingface.co/cognitivecomputations/Qwen3-235B-A22B-AWQ)|
|[iic/Tongyi-DeepResearch-30B-A3B](https://modelscope.cn/models/iic/Tongyi-DeepResearch-30B-A3B)|qwen3_moe|qwen3|transformers>=4.51|✔|-|[Alibaba-NLP/Tongyi-DeepResearch-30B-A3B](https://huggingface.co/Alibaba-NLP/Tongyi-DeepResearch-30B-A3B)|
|[Qwen/Qwen3-30B-A3B-Instruct-2507](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Instruct-2507)|qwen3_moe|qwen3_nothinking|transformers>=4.51|✔|-|[Qwen/Qwen3-30B-A3B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507)|
|[Qwen/Qwen3-30B-A3B-Instruct-2507-FP8](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Instruct-2507-FP8)|qwen3_moe|qwen3_nothinking|transformers>=4.51|✔|-|[Qwen/Qwen3-30B-A3B-Instruct-2507-FP8](https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507-FP8)|
|[Qwen/Qwen3-235B-A22B-Instruct-2507](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B-Instruct-2507)|qwen3_moe|qwen3_nothinking|transformers>=4.51|✔|-|[Qwen/Qwen3-235B-A22B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507)|
|[Qwen/Qwen3-235B-A22B-Instruct-2507-FP8](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B-Instruct-2507-FP8)|qwen3_moe|qwen3_nothinking|transformers>=4.51|✔|-|[Qwen/Qwen3-235B-A22B-Instruct-2507-FP8](https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507-FP8)|
|[swift/Qwen3-235B-A22B-Instruct-2507-AWQ](https://modelscope.cn/models/swift/Qwen3-235B-A22B-Instruct-2507-AWQ)|qwen3_moe|qwen3_nothinking|transformers>=4.51|✘|-|-|
|[Qwen/Qwen3-Coder-30B-A3B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-Coder-30B-A3B-Instruct)|qwen3_moe|qwen3_coder|transformers>=4.51|✔|coding|[Qwen/Qwen3-Coder-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Coder-30B-A3B-Instruct)|
|[Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8)|qwen3_moe|qwen3_coder|transformers>=4.51|✔|coding|[Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8)|
|[Qwen/Qwen3-Coder-480B-A35B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-Coder-480B-A35B-Instruct)|qwen3_moe|qwen3_coder|transformers>=4.51|✔|coding|[Qwen/Qwen3-Coder-480B-A35B-Instruct](https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct)|
|[Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8)|qwen3_moe|qwen3_coder|transformers>=4.51|✔|coding|[Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8)|
|[swift/Qwen3-Coder-480B-A35B-Instruct-AWQ](https://modelscope.cn/models/swift/Qwen3-Coder-480B-A35B-Instruct-AWQ)|qwen3_moe|qwen3_coder|transformers>=4.51|✘|coding|-|
|[iic/QwenLong-L1.5-30B-A3B](https://modelscope.cn/models/iic/QwenLong-L1.5-30B-A3B)|qwen3_moe|qwen3_thinking|transformers>=4.51|✔|-|[Tongyi-Zhiwen/QwenLong-L1.5-30B-A3B](https://huggingface.co/Tongyi-Zhiwen/QwenLong-L1.5-30B-A3B)|
|[Qwen/Qwen3-30B-A3B-Thinking-2507](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Thinking-2507)|qwen3_moe|qwen3_thinking|transformers>=4.51|✔|-|[Qwen/Qwen3-30B-A3B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507)|
|[Qwen/Qwen3-30B-A3B-Thinking-2507-FP8](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Thinking-2507-FP8)|qwen3_moe|qwen3_thinking|transformers>=4.51|✔|-|[Qwen/Qwen3-30B-A3B-Thinking-2507-FP8](https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507-FP8)|
|[Qwen/Qwen3-235B-A22B-Thinking-2507](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B-Thinking-2507)|qwen3_moe|qwen3_thinking|transformers>=4.51|✔|-|[Qwen/Qwen3-235B-A22B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507)|
|[Qwen/Qwen3-235B-A22B-Thinking-2507-FP8](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B-Thinking-2507-FP8)|qwen3_moe|qwen3_thinking|transformers>=4.51|✔|-|[Qwen/Qwen3-235B-A22B-Thinking-2507-FP8](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507-FP8)|
|[swift/Qwen3-235B-A22B-Thinking-2507-AWQ](https://modelscope.cn/models/swift/Qwen3-235B-A22B-Thinking-2507-AWQ)|qwen3_moe|qwen3_thinking|transformers>=4.51|✘|-|-|
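|[Qwen/Qwen3-Next-80B-A3B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-Next-80B-A3B-Instruct)|qwen3_next|qwen3_nothinking|transformers>=4.57|✔|-|[Qwen/Qwen3-Next-80B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct)|
|[Qwen/Qwen3-Next-80B-A3B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-Next-80B-A3B-Instruct-FP8)|qwen3_next|qwen3_nothinking|transformers>=4.57|✔|-|[Qwen/Qwen3-Next-80B-A3B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct-FP8)|
|[Qwen/Qwen3-Next-80B-A3B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-Next-80B-A3B-Thinking)|qwen3_next|qwen3_thinking|transformers>=4.57|✔|-|[Qwen/Qwen3-Next-80B-A3B-Thinking](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Thinking)|
|[Qwen/Qwen3-Next-80B-A3B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-Next-80B-A3B-Thinking-FP8)|qwen3_next|qwen3_thinking|transformers>=4.57|✔|-|[Qwen/Qwen3-Next-80B-A3B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Thinking-FP8)|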
|[Qwen/Qwen3-Coder-Next-Base](https://modelscope.cn/models/Qwen/Qwen3-Coder-Next-Base)|qwen3_next|qwen3_coder|transformers>=4.57|✔|coding|[Qwen/Qwen3-Coder-Next-Base](https://huggingface.co/Qwen/Qwen3-Coder-Next-Base)|
|[Qwen/Qwen3-Coder-Next](https://modelscope.cn/models/Qwen/Qwen3-Coder-Next)|qwen3_next|qwen3_coder|transformers>=4.57|✔|coding|[Qwen/Qwen3-Coder-Next](https://huggingface.co/Qwen/Qwen3-Coder-Next)|
|[Qwen/Qwen3-Coder-Next-FP8](https://modelscope.cn/models/Qwen/Qwen3-Coder-Next-FP8)|qwen3_next|qwen3_coder|transformers>=4.57|✔|coding|[Qwen/Qwen3-Coder-Next-FP8](https://huggingface.co/Qwen/Qwen3-Coder-Next-FP8)|
|[Qwen/Qwen3-Embedding-0.6B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-0.6B)|qwen3_emb|qwen3_emb|-|✔|-|[Qwen/Qwen3-Embedding-0.6B](https://huggingface.co/Qwen/Qwen3-Embedding-0.6B)|
|[Qwen/Qwen3-Embedding-4B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-4B)|qwen3_emb|qwen3_emb|-|✔|-|[Qwen/Qwen3-Embedding-4B](https://huggingface.co/Qwen/Qwen3-Embedding-4B)|
|[Qwen/Qwen3-Embedding-8B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-8B)|qwen3_emb|qwen3_emb|-|✔|-|[Qwen/Qwen3-Embedding-8B](https://huggingface.co/Qwen/Qwen3-Embedding-8B)|
|[Qwen/Qwen3-Reranker-0.6B](https://modelscope.cn/models/Qwen/Qwen3-Reranker-0.6B)|qwen3_reranker|qwen3_reranker|-|✔|-|[Qwen/Qwen3-Reranker-0.6B](https://huggingface.co/Qwen/Qwen3-Reranker-0.6B)|
|[Qwen/Qwen3-Reranker-4B](https://modelscope.cn/models/Qwen/Qwen3-Reranker-4B)|qwen3_reranker|qwen3_reranker|-|✔|-|[Qwen/Qwen3-Reranker-4B](https://huggingface.co/Qwen/Qwen3-Reranker-4B)|
|[Qwen/Qwen3-Reranker-8B](https://modelscope.cn/models/Qwen/Qwen3-Reranker-8B)|qwen3_reranker|qwen3_reranker|-|✔|-|[Qwen/Qwen3-Reranker-8B](https://huggingface.co/Qwen/Qwen3-Reranker-8B)|
|[iic/gte_Qwen2-1.5B-instruct](https://modelscope.cn/models/iic/gte_Qwen2-1.5B-instruct)|qwen2_gte|dummy|-|✘|-|[Alibaba-NLP/gte-Qwen2-1.5B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct)|
|[iic/gte_Qwen2-7B-instruct](https://modelscope.cn/models/iic/gte_Qwen2-7B-instruct)|qwen2_gte|dummy|-|✘|-|[Alibaba-NLP/gte-Qwen2-7B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct)|
|[codefuse-ai/CodeFuse-QWen-14B](https://modelscope.cn/models/codefuse-ai/CodeFuse-QWen-14B)|codefuse_qwen|codefuse|-|✘|coding|[codefuse-ai/CodeFuse-QWen-14B](https://huggingface.co/codefuse-ai/CodeFuse-QWen-14B)|
|[iic/ModelScope-Agent-7B](https://modelscope.cn/models/iic/ModelScope-Agent-7B)|modelscope_agent|modelscope_agent|-|✘|-|-|
|[iic/ModelScope-Agent-14B](https://modelscope.cn/models/iic/ModelScope-Agent-14B)|modelscope_agent|modelscope_agent|-|✘|-|-|
|[modelscope/Llama-2-7b-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-ms)|llama|llama|-|✔|-|[meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf)|
|[modelscope/Llama-2-13b-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-ms)|llama|llama|-|✔|-|[meta-llama/Llama-2-13b-hf](https://huggingface.co/meta-llama/Llama-2-13b-hf)|
|[modelscope/Llama-2-70b-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-ms)|llama|llama|-|✔|-|[meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf)|
|[modelscope/Llama-2-7b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms)|llama|llama|-|✔|-|[meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)|
|[modelscope/Llama-2-13b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms)|llama|llama|-|✔|-|[meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)|
|[modelscope/Llama-2-70b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms)|llama|llama|-|✔|-|[meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf)|
|[AI-ModelScope/chinese-llama-2-1.3b](https://modelscope.cn/models/AI-ModelScope/chinese-llama-2-1.3b)|llama|llama|-|✔|-|[hfl/chinese-llama-2-1.3b](https://huggingface.co/hfl/chinese-llama-2-1.3b)|
|[AI-ModelScope/chinese-llama-2-7b](https://modelscope.cn/models/AI-ModelScope/chinese-llama-2-7b)|llama|llama|-|✔|-|[hfl/chinese-llama-2-7b](https://huggingface.co/hfl/chinese-llama-2-7b)|
|[AI-ModelScope/chinese-llama-2-7b-16k](https://modelscope.cn/models/AI-ModelScope/chinese-llama-2-7b-16k)|llama|llama|-|✔|-|[hfl/chinese-llama-2-7b-16k](https://huggingface.co/hfl/chinese-llama-2-7b-16k)|
|[AI-ModelScope/chinese-llama-2-7b-64k](https://modelscope.cn/models/AI-ModelScope/chinese-llama-2-7b-64k)|llama|llama|-|✔|-|[hfl/chinese-llama-2-7b-64k](https://huggingface.co/hfl/chinese-llama-2-7b-64k)|
|[AI-ModelScope/chinese-llama-2-13b](https://modelscope.cn/models/AI-ModelScope/chinese-llama-2-13b)|llama|llama|-|✔|-|[hfl/chinese-llama-2-13b](https://huggingface.co/hfl/chinese-llama-2-13b)|
|[AI-ModelScope/chinese-llama-2-13b-16k](https://modelscope.cn/models/AI-ModelScope/chinese-llama-2-13b-16k)|llama|llama|-|✔|-|[hfl/chinese-llama-2-13b-16k](https://huggingface.co/hfl/chinese-llama-2-13b-16k)|
|[AI-ModelScope/chinese-alpaca-2-1.3b](https://modelscope.cn/models/AI-ModelScope/chinese-alpaca-2-1.3b)|llama|llama|-|✔|-|[hfl/chinese-alpaca-2-1.3b](https://huggingface.co/hfl/chinese-alpaca-2-1.3b)|
|[AI-ModelScope/chinese-alpaca-2-7b](https://modelscope.cn/models/AI-ModelScope/chinese-alpaca-2-7b)|llama|llama|-|✔|-|[hfl/chinese-alpaca-2-7b](https://huggingface.co/hfl/chinese-alpaca-2-7b)|
|[AI-ModelScope/chinese-alpaca-2-7b-16k](https://modelscope.cn/models/AI-ModelScope/chinese-alpaca-2-7b-16k)|llama|llama|-|✔|-|[hfl/chinese-alpaca-2-7b-16k](https://huggingface.co/hfl/chinese-alpaca-2-7b-16k)|
|[AI-ModelScope/chinese-alpaca-2-7b-64k](https://modelscope.cn/models/AI-ModelScope/chinese-alpaca-2-7b-64k)|llama|llama|-|✔|-|[hfl/chinese-alpaca-2-7b-64k](https://huggingface.co/hfl/chinese-alpaca-2-7b-64k)|
|[AI-ModelScope/chinese-alpaca-2-13b](https://modelscope.cn/models/AI-ModelScope/chinese-alpaca-2-13b)|llama|llama|-|✔|-|[hfl/chinese-alpaca-2-13b](https://huggingface.co/hfl/chinese-alpaca-2-13b)|
|[AI-ModelScope/chinese-alpaca-2-13b-16k](https://modelscope.cn/models/AI-ModelScope/chinese-alpaca-2-13b-16k)|llama|llama|-|✔|-|[hfl/chinese-alpaca-2-13b-16k](https://huggingface.co/hfl/chinese-alpaca-2-13b-16k)|
|[AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf)|llama|llama|transformers>=4.38, aqlm, torch>=2.2.0|✘|-|[ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf)|
|[FlagAlpha/Atom-7B](https://modelscope.cn/models/FlagAlpha/Atom-7B)|llama|atom|-|✔|-|[FlagAlpha/Atom-7B](https://huggingface.co/FlagAlpha/Atom-7B)|
|[FlagAlpha/Atom-7B-Chat](https://modelscope.cn/models/FlagAlpha/Atom-7B-Chat)|llama|atom|-|✔|-|[FlagAlpha/Atom-7B-Chat](https://huggingface.co/FlagAlpha/Atom-7B-Chat)|
|[langboat/Mengzi3-13B-Base](https://modelscope.cn/models/langboat/Mengzi3-13B-Base)|llama|mengzi|-|✔|-|[Langboat/Mengzi3-13B-Base](https://huggingface.co/Langboat/Mengzi3-13B-Base)|
|[AI-ModelScope/NuminaMath-7B-TIR](https://modelscope.cn/models/AI-ModelScope/NuminaMath-7B-TIR)|llama|numina|-|✔|math|[AI-MO/NuminaMath-7B-TIR](https://huggingface.co/AI-MO/NuminaMath-7B-TIR)|
|[Fengshenbang/Ziya2-13B-Base](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Base)|llama|ziya|-|✔|-|[IDEA-CCNL/Ziya2-13B-Base](https://huggingface.co/IDEA-CCNL/Ziya2-13B-Base)|
|[Fengshenbang/Ziya2-13B-Chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat)|llama|ziya|-|✔|-|[IDEA-CCNL/Ziya2-13B-Chat](https://huggingface.co/IDEA-CCNL/Ziya2-13B-Chat)|
|[InfiniAI/Megrez-3b-Instruct](https://modelscope.cn/models/InfiniAI/Megrez-3b-Instruct)|llama|megrez|-|✔|-|[Infinigence/Megrez-3B-Instruct](https://huggingface.co/Infinigence/Megrez-3B-Instruct)|
|[deepseek-ai/deepseek-llm-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base)|llama|deepseek|-|✔|-|[deepseek-ai/deepseek-llm-7b-base](https://huggingface.co/deepseek-ai/deepseek-llm-7b-base)|
|[deepseek-ai/deepseek-llm-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat)|llama|deepseek|-|✔|-|[deepseek-ai/deepseek-llm-7b-chat](https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat)|
|[deepseek-ai/deepseek-llm-67b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base)|llama|deepseek|-|✔|-|[deepseek-ai/deepseek-llm-67b-base](https://huggingface.co/deepseek-ai/deepseek-llm-67b-base)|
|[deepseek-ai/deepseek-llm-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat)|llama|deepseek|-|✔|-|[deepseek-ai/deepseek-llm-67b-chat](https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat)|
|[deepseek-ai/deepseek-math-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-base)|llama|deepseek|-|✔|math|[deepseek-ai/deepseek-math-7b-base](https://huggingface.co/deepseek-ai/deepseek-math-7b-base)|
|[deepseek-ai/deepseek-math-7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-instruct)|llama|deepseek|-|✔|math|[deepseek-ai/deepseek-math-7b-instruct](https://huggingface.co/deepseek-ai/deepseek-math-7b-instruct)|
|[deepseek-ai/deepseek-math-7b-rl](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-rl)|llama|deepseek|-|✔|math|[deepseek-ai/deepseek-math-7b-rl](https://huggingface.co/deepseek-ai/deepseek-math-7b-rl)|
|[deepseek-ai/deepseek-coder-1.3b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base)|llama|deepseek|-|✔|coding|[deepseek-ai/deepseek-coder-1.3b-base](https://huggingface.co/deepseek-ai/deepseek-coder-1.3b-base)|
|[deepseek-ai/deepseek-coder-1.3b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct)|llama|deepseek|-|✔|coding|[deepseek-ai/deepseek-coder-1.3b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-1.3b-instruct)|
|[deepseek-ai/deepseek-coder-6.7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base)|llama|deepseek|-|✔|coding|[deepseek-ai/deepseek-coder-6.7b-base](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base)|
|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct)|llama|deepseek|-|✔|coding|[deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct)|
|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base)|llama|deepseek|-|✔|coding|[deepseek-ai/deepseek-coder-33b-base](https://huggingface.co/deepseek-ai/deepseek-coder-33b-base)|
|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct)|llama|deepseek|-|✔|coding|[deepseek-ai/deepseek-coder-33b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct)|
|[gongjy/MiniMind2](https://modelscope.cn/models/gongjy/MiniMind2)|llama|minimind|transformers>=4.57.1|✔|-|[jingyaogong/MiniMind2](https://huggingface.co/jingyaogong/MiniMind2)|
|-|llama|minimind|transformers>=4.57.1|✔|-|[jingyaogong/MiniMind2-Small](https://huggingface.co/jingyaogong/MiniMind2-Small)|
|[LLM-Research/Meta-Llama-3-8B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct)|llama|llama3|-|✔|-|[meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)|
|[LLM-Research/Meta-Llama-3-70B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-70B-Instruct)|llama|llama3|-|✔|-|[meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct)|
|[LLM-Research/Meta-Llama-3-8B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B)|llama|llama3|-|✔|-|[meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)|
|[LLM-Research/Meta-Llama-3-70B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-70B)|llama|llama3|-|✔|-|[meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B)|
|[swift/Meta-Llama-3-8B-Instruct-GPTQ-Int4](https://modelscope.cn/models/swift/Meta-Llama-3-8B-Instruct-GPTQ-Int4)|llama|llama3|-|✘|-|[study-hjt/Meta-Llama-3-8B-Instruct-GPTQ-Int4](https://huggingface.co/study-hjt/Meta-Llama-3-8B-Instruct-GPTQ-Int4)|
|[swift/Meta-Llama-3-8B-Instruct-GPTQ-Int8](https://modelscope.cn/models/swift/Meta-Llama-3-8B-Instruct-GPTQ-Int8)|llama|llama3|-|✘|-|[study-hjt/Meta-Llama-3-8B-Instruct-GPTQ-Int8](https://huggingface.co/study-hjt/Meta-Llama-3-8B-Instruct-GPTQ-Int8)|
|[swift/Meta-Llama-3-8B-Instruct-AWQ](https://modelscope.cn/models/swift/Meta-Llama-3-8B-Instruct-AWQ)|llama|llama3|-|✘|-|[study-hjt/Meta-Llama-3-8B-Instruct-AWQ](https://huggingface.co/study-hjt/Meta-Llama-3-8B-Instruct-AWQ)|
|[swift/Meta-Llama-3-70B-Instruct-GPTQ-Int4](https://modelscope.cn/models/swift/Meta-Llama-3-70B-Instruct-GPTQ-Int4)|llama|llama3|-|✘|-|[study-hjt/Meta-Llama-3-70B-Instruct-GPTQ-Int4](https://huggingface.co/study-hjt/Meta-Llama-3-70B-Instruct-GPTQ-Int4)|
|[swift/Meta-Llama-3-70B-Instruct-GPTQ-Int8](https://modelscope.cn/models/swift/Meta-Llama-3-70B-Instruct-GPTQ-Int8)|llama|llama3|-|✘|-|[study-hjt/Meta-Llama-3-70B-Instruct-GPTQ-Int8](https://huggingface.co/study-hjt/Meta-Llama-3-70B-Instruct-GPTQ-Int8)|
|[swift/Meta-Llama-3-70B-Instruct-AWQ](https://modelscope.cn/models/swift/Meta-Llama-3-70B-Instruct-AWQ)|llama|llama3|-|✘|-|[study-hjt/Meta-Llama-3-70B-Instruct-AWQ](https://huggingface.co/study-hjt/Meta-Llama-3-70B-Instruct-AWQ)|
|[ChineseAlpacaGroup/llama-3-chinese-8b-instruct](https://modelscope.cn/models/ChineseAlpacaGroup/llama-3-chinese-8b-instruct)|llama|llama3|-|✔|-|[hfl/llama-3-chinese-8b-instruct](https://huggingface.co/hfl/llama-3-chinese-8b-instruct)|
|[ChineseAlpacaGroup/llama-3-chinese-8b](https://modelscope.cn/models/ChineseAlpacaGroup/llama-3-chinese-8b)|llama|llama3|-|✔|-|[hfl/llama-3-chinese-8b](https://huggingface.co/hfl/llama-3-chinese-8b)|
|[LLM-Research/Meta-Llama-3.1-8B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-8B-Instruct)|llama|llama3_2|transformers>=4.43|✔|-|[meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)|
|[LLM-Research/Meta-Llama-3.1-70B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-70B-Instruct)|llama|llama3_2|transformers>=4.43|✔|-|[meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct)|
|[LLM-Research/Meta-Llama-3.1-405B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-405B-Instruct)|llama|llama3_2|transformers>=4.43|✔|-|[meta-llama/Meta-Llama-3.1-405B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct)|
|[LLM-Research/Meta-Llama-3.1-8B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-8B)|llama|llama3_2|transformers>=4.43|✔|-|[meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B)|
|[LLM-Research/Meta-Llama-3.1-70B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-70B)|llama|llama3_2|transformers>=4.43|✔|-|[meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B)|
|[LLM-Research/Meta-Llama-3.1-405B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-405B)|llama|llama3_2|transformers>=4.43|✔|-|[meta-llama/Meta-Llama-3.1-405B](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B)|
|[LLM-Research/Meta-Llama-3.1-70B-Instruct-FP8](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-70B-Instruct-FP8)|llama|llama3_2|transformers>=4.43|✘|-|[meta-llama/Meta-Llama-3.1-70B-Instruct-FP8](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct-FP8)|
|[LLM-Research/Meta-Llama-3.1-405B-Instruct-FP8](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-405B-Instruct-FP8)|llama|llama3_2|transformers>=4.43|✘|-|[meta-llama/Meta-Llama-3.1-405B-Instruct-FP8](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct-FP8)|
|[LLM-Research/Meta-Llama-3.1-8B-Instruct-BNB-NF4](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-8B-Instruct-BNB-NF4)|llama|llama3_2|transformers>=4.43|✘|-|[hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4](https://huggingface.co/hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4)|
|[LLM-Research/Meta-Llama-3.1-70B-Instruct-bnb-4bit](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-70B-Instruct-bnb-4bit)|llama|llama3_2|transformers>=4.43|✘|-|[unsloth/Meta-Llama-3.1-70B-Instruct-bnb-4bit](https://huggingface.co/unsloth/Meta-Llama-3.1-70B-Instruct-bnb-4bit)|
|[LLM-Research/Meta-Llama-3.1-405B-Instruct-BNB-NF4](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-405B-Instruct-BNB-NF4)|llama|llama3_2|transformers>=4.43|✘|-|[hugging-quants/Meta-Llama-3.1-405B-Instruct-BNB-NF4](https://huggingface.co/hugging-quants/Meta-Llama-3.1-405B-Instruct-BNB-NF4)|
|[LLM-Research/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4)|llama|llama3_2|transformers>=4.43|✘|-|[hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4](https://huggingface.co/hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4)|
|[LLM-Research/Meta-Llama-3.1-70B-Instruct-GPTQ-INT4](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-70B-Instruct-GPTQ-INT4)|llama|llama3_2|transformers>=4.43|✘|-|[hugging-quants/Meta-Llama-3.1-70B-Instruct-GPTQ-INT4](https://huggingface.co/hugging-quants/Meta-Llama-3.1-70B-Instruct-GPTQ-INT4)|
|[LLM-Research/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4)|llama|llama3_2|transformers>=4.43|✘|-|[hugging-quants/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4](https://huggingface.co/hugging-quants/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4)|
|[LLM-Research/Meta-Llama-3.1-8B-Instruct-AWQ-INT4](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-8B-Instruct-AWQ-INT4)|llama|llama3_2|transformers>=4.43|✘|-|[hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4](https://huggingface.co/hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4)|
|[LLM-Research/Meta-Llama-3.1-70B-Instruct-AWQ-INT4](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-70B-Instruct-AWQ-INT4)|llama|llama3_2|transformers>=4.43|✘|-|[hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4](https://huggingface.co/hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4)|
|[LLM-Research/Meta-Llama-3.1-405B-Instruct-AWQ-INT4](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-405B-Instruct-AWQ-INT4)|llama|llama3_2|transformers>=4.43|✘|-|[hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4](https://huggingface.co/hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4)|
|[AI-ModelScope/Llama-3.1-Nemotron-70B-Instruct-HF](https://modelscope.cn/models/AI-ModelScope/Llama-3.1-Nemotron-70B-Instruct-HF)|llama|llama3_2|transformers>=4.43|✔|-|[nvidia/Llama-3.1-Nemotron-70B-Instruct-HF](https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF)|
|[AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B](https://modelscope.cn/models/AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B)|llama|skywork_o1|transformers>=4.43|✔|-|[Skywork/Skywork-o1-Open-Llama-3.1-8B](https://huggingface.co/Skywork/Skywork-o1-Open-Llama-3.1-8B)|
|[LLM-Research/Llama-3.2-1B](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B)|llama|llama3_2|transformers>=4.43|✔|-|[meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B)|
|[LLM-Research/Llama-3.2-3B](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B)|llama|llama3_2|transformers>=4.43|✔|-|[meta-llama/Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)|
|[LLM-Research/Llama-3.2-1B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B-Instruct)|llama|llama3_2|transformers>=4.43|✔|-|[meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)|
|[LLM-Research/Llama-3.2-3B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B-Instruct)|llama|llama3_2|transformers>=4.43|✔|-|[meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)|
|[LLM-Research/Llama-3.3-70B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.3-70B-Instruct)|llama|llama3_2|transformers>=4.43|✔|-|[meta-llama/Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct)|
|[unsloth/Llama-3.3-70B-Instruct-bnb-4bit](https://modelscope.cn/models/unsloth/Llama-3.3-70B-Instruct-bnb-4bit)|llama|llama3_2|transformers>=4.43|✘|-|[unsloth/Llama-3.3-70B-Instruct-bnb-4bit](https://huggingface.co/unsloth/Llama-3.3-70B-Instruct-bnb-4bit)|
|[ZhipuAI/LongWriter-llama3.1-8b](https://modelscope.cn/models/ZhipuAI/LongWriter-llama3.1-8b)|llama|longwriter_llama|transformers>=4.43|✔|-|[zai-org/LongWriter-llama3.1-8b](https://huggingface.co/zai-org/LongWriter-llama3.1-8b)|
|[deepseek-ai/DeepSeek-R1-Distill-Llama-8B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Llama-8B)|llama|deepseek_r1|-|✔|-|[deepseek-ai/DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B)|
|[deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Llama-70B)|llama|deepseek_r1|-|✔|-|[deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B)|
|[LLM-Research/Reflection-Llama-3.1-70B](https://modelscope.cn/models/LLM-Research/Reflection-Llama-3.1-70B)|llama|reflection|transformers>=4.43|✔|-|[mattshumer/Reflection-Llama-3.1-70B](https://huggingface.co/mattshumer/Reflection-Llama-3.1-70B)|
|[01ai/Yi-6B](https://modelscope.cn/models/01ai/Yi-6B)|yi|chatml|-|✔|-|[01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B)|
|[01ai/Yi-6B-200K](https://modelscope.cn/models/01ai/Yi-6B-200K)|yi|chatml|-|✔|-|[01-ai/Yi-6B-200K](https://huggingface.co/01-ai/Yi-6B-200K)|
|[01ai/Yi-6B-Chat](https://modelscope.cn/models/01ai/Yi-6B-Chat)|yi|chatml|-|✔|-|[01-ai/Yi-6B-Chat](https://huggingface.co/01-ai/Yi-6B-Chat)|
|[01ai/Yi-6B-Chat-4bits](https://modelscope.cn/models/01ai/Yi-6B-Chat-4bits)|yi|chatml|-|✔|-|[01-ai/Yi-6B-Chat-4bits](https://huggingface.co/01-ai/Yi-6B-Chat-4bits)|
|[01ai/Yi-6B-Chat-8bits](https://modelscope.cn/models/01ai/Yi-6B-Chat-8bits)|yi|chatml|-|✔|-|[01-ai/Yi-6B-Chat-8bits](https://huggingface.co/01-ai/Yi-6B-Chat-8bits)|
|[01ai/Yi-9B](https://modelscope.cn/models/01ai/Yi-9B)|yi|chatml|-|✔|-|[01-ai/Yi-9B](https://huggingface.co/01-ai/Yi-9B)|
|[01ai/Yi-9B-200K](https://modelscope.cn/models/01ai/Yi-9B-200K)|yi|chatml|-|✔|-|[01-ai/Yi-9B-200K](https://huggingface.co/01-ai/Yi-9B-200K)|
|[01ai/Yi-34B](https://modelscope.cn/models/01ai/Yi-34B)|yi|chatml|-|✔|-|[01-ai/Yi-34B](https://huggingface.co/01-ai/Yi-34B)|
|[01ai/Yi-34B-200K](https://modelscope.cn/models/01ai/Yi-34B-200K)|yi|chatml|-|✔|-|[01-ai/Yi-34B-200K](https://huggingface.co/01-ai/Yi-34B-200K)|
|[01ai/Yi-34B-Chat](https://modelscope.cn/models/01ai/Yi-34B-Chat)|yi|chatml|-|✔|-|[01-ai/Yi-34B-Chat](https://huggingface.co/01-ai/Yi-34B-Chat)|
|[01ai/Yi-34B-Chat-4bits](https://modelscope.cn/models/01ai/Yi-34B-Chat-4bits)|yi|chatml|-|✔|-|[01-ai/Yi-34B-Chat-4bits](https://huggingface.co/01-ai/Yi-34B-Chat-4bits)|
|[01ai/Yi-34B-Chat-8bits](https://modelscope.cn/models/01ai/Yi-34B-Chat-8bits)|yi|chatml|-|✔|-|[01-ai/Yi-34B-Chat-8bits](https://huggingface.co/01-ai/Yi-34B-Chat-8bits)|
|[01ai/Yi-1.5-6B](https://modelscope.cn/models/01ai/Yi-1.5-6B)|yi|chatml|-|✔|-|[01-ai/Yi-1.5-6B](https://huggingface.co/01-ai/Yi-1.5-6B)|
|[01ai/Yi-1.5-6B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-6B-Chat)|yi|chatml|-|✔|-|[01-ai/Yi-1.5-6B-Chat](https://huggingface.co/01-ai/Yi-1.5-6B-Chat)|
|[01ai/Yi-1.5-9B](https://modelscope.cn/models/01ai/Yi-1.5-9B)|yi|chatml|-|✔|-|[01-ai/Yi-1.5-9B](https://huggingface.co/01-ai/Yi-1.5-9B)|
|[01ai/Yi-1.5-9B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-9B-Chat)|yi|chatml|-|✔|-|[01-ai/Yi-1.5-9B-Chat](https://huggingface.co/01-ai/Yi-1.5-9B-Chat)|
|[01ai/Yi-1.5-9B-Chat-16K](https://modelscope.cn/models/01ai/Yi-1.5-9B-Chat-16K)|yi|chatml|-|✔|-|[01-ai/Yi-1.5-9B-Chat-16K](https://huggingface.co/01-ai/Yi-1.5-9B-Chat-16K)|
|[01ai/Yi-1.5-34B](https://modelscope.cn/models/01ai/Yi-1.5-34B)|yi|chatml|-|✔|-|[01-ai/Yi-1.5-34B](https://huggingface.co/01-ai/Yi-1.5-34B)|
|[01ai/Yi-1.5-34B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat)|yi|chatml|-|✔|-|[01-ai/Yi-1.5-34B-Chat](https://huggingface.co/01-ai/Yi-1.5-34B-Chat)|
|[01ai/Yi-1.5-34B-Chat-16K](https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat-16K)|yi|chatml|-|✔|-|[01-ai/Yi-1.5-34B-Chat-16K](https://huggingface.co/01-ai/Yi-1.5-34B-Chat-16K)|
|[AI-ModelScope/Yi-1.5-6B-Chat-GPTQ](https://modelscope.cn/models/AI-ModelScope/Yi-1.5-6B-Chat-GPTQ)|yi|chatml|-|✘|-|[modelscope/Yi-1.5-6B-Chat-GPTQ](https://huggingface.co/modelscope/Yi-1.5-6B-Chat-GPTQ)|
|[AI-ModelScope/Yi-1.5-6B-Chat-AWQ](https://modelscope.cn/models/AI-ModelScope/Yi-1.5-6B-Chat-AWQ)|yi|chatml|-|✘|-|[modelscope/Yi-1.5-6B-Chat-AWQ](https://huggingface.co/modelscope/Yi-1.5-6B-Chat-AWQ)|
|[AI-ModelScope/Yi-1.5-9B-Chat-GPTQ](https://modelscope.cn/models/AI-ModelScope/Yi-1.5-9B-Chat-GPTQ)|yi|chatml|-|✘|-|[modelscope/Yi-1.5-9B-Chat-GPTQ](https://huggingface.co/modelscope/Yi-1.5-9B-Chat-GPTQ)|
|[AI-ModelScope/Yi-1.5-9B-Chat-AWQ](https://modelscope.cn/models/AI-ModelScope/Yi-1.5-9B-Chat-AWQ)|yi|chatml|-|✘|-|[modelscope/Yi-1.5-9B-Chat-AWQ](https://huggingface.co/modelscope/Yi-1.5-9B-Chat-AWQ)|
|[AI-ModelScope/Yi-1.5-34B-Chat-GPTQ](https://modelscope.cn/models/AI-ModelScope/Yi-1.5-34B-Chat-GPTQ)|yi|chatml|-|✘|-|[modelscope/Yi-1.5-34B-Chat-GPTQ](https://huggingface.co/modelscope/Yi-1.5-34B-Chat-GPTQ)|
|[AI-ModelScope/Yi-1.5-34B-Chat-AWQ](https://modelscope.cn/models/AI-ModelScope/Yi-1.5-34B-Chat-AWQ)|yi|chatml|-|✘|-|[modelscope/Yi-1.5-34B-Chat-AWQ](https://huggingface.co/modelscope/Yi-1.5-34B-Chat-AWQ)|
|[01ai/Yi-Coder-1.5B](https://modelscope.cn/models/01ai/Yi-Coder-1.5B)|yi|yi_coder|-|✔|coding|[01-ai/Yi-Coder-1.5B](https://huggingface.co/01-ai/Yi-Coder-1.5B)|
|[01ai/Yi-Coder-9B](https://modelscope.cn/models/01ai/Yi-Coder-9B)|yi|yi_coder|-|✔|coding|[01-ai/Yi-Coder-9B](https://huggingface.co/01-ai/Yi-Coder-9B)|
|[01ai/Yi-Coder-1.5B-Chat](https://modelscope.cn/models/01ai/Yi-Coder-1.5B-Chat)|yi|yi_coder|-|✔|coding|[01-ai/Yi-Coder-1.5B-Chat](https://huggingface.co/01-ai/Yi-Coder-1.5B-Chat)|
|[01ai/Yi-Coder-9B-Chat](https://modelscope.cn/models/01ai/Yi-Coder-9B-Chat)|yi|yi_coder|-|✔|coding|[01-ai/Yi-Coder-9B-Chat](https://huggingface.co/01-ai/Yi-Coder-9B-Chat)|
|[SUSTC/SUS-Chat-34B](https://modelscope.cn/models/SUSTC/SUS-Chat-34B)|yi|sus|-|✔|-|[SUSTech/SUS-Chat-34B](https://huggingface.co/SUSTech/SUS-Chat-34B)|
|[openai-mirror/gpt-oss-20b](https://modelscope.cn/models/openai-mirror/gpt-oss-20b)|gpt_oss|gpt_oss|transformers>=4.55|✔|-|[openai/gpt-oss-20b](https://huggingface.co/openai/gpt-oss-20b)|
|[openai-mirror/gpt-oss-120b](https://modelscope.cn/models/openai-mirror/gpt-oss-120b)|gpt_oss|gpt_oss|transformers>=4.55|✔|-|[openai/gpt-oss-120b](https://huggingface.co/openai/gpt-oss-120b)|
|[ByteDance-Seed/Seed-OSS-36B-Instruct](https://modelscope.cn/models/ByteDance-Seed/Seed-OSS-36B-Instruct)|seed_oss|seed_oss|transformers>=4.56|✘|-|[ByteDance-Seed/Seed-OSS-36B-Instruct](https://huggingface.co/ByteDance-Seed/Seed-OSS-36B-Instruct)|
|[ByteDance-Seed/Seed-OSS-36B-Base](https://modelscope.cn/models/ByteDance-Seed/Seed-OSS-36B-Base)|seed_oss|seed_oss|transformers>=4.56|✘|-|[ByteDance-Seed/Seed-OSS-36B-Base](https://huggingface.co/ByteDance-Seed/Seed-OSS-36B-Base)|
|[ByteDance-Seed/Seed-OSS-36B-Base-woSyn](https://modelscope.cn/models/ByteDance-Seed/Seed-OSS-36B-Base-woSyn)|seed_oss|seed_oss|transformers>=4.56|✘|-|[ByteDance-Seed/Seed-OSS-36B-Base-woSyn](https://huggingface.co/ByteDance-Seed/Seed-OSS-36B-Base-woSyn)|
|[codefuse-ai/CodeFuse-CodeLlama-34B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeLlama-34B)|codefuse_codellama|codefuse_codellama|-|✔|coding|[codefuse-ai/CodeFuse-CodeLlama-34B](https://huggingface.co/codefuse-ai/CodeFuse-CodeLlama-34B)|
|[ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b)|chatglm2|chatglm2|transformers<4.42|✘|-|[zai-org/chatglm2-6b](https://huggingface.co/zai-org/chatglm2-6b)|
|[ZhipuAI/chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k)|chatglm2|chatglm2|transformers<4.42|✘|-|[zai-org/chatglm2-6b-32k](https://huggingface.co/zai-org/chatglm2-6b-32k)|
|[ZhipuAI/codegeex2-6b](https://modelscope.cn/models/ZhipuAI/codegeex2-6b)|chatglm2|chatglm2|transformers<4.34|✘|coding|[zai-org/codegeex2-6b](https://huggingface.co/zai-org/codegeex2-6b)|
|[ZhipuAI/chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b)|chatglm3|chatglm4|transformers<4.42|✘|-|[zai-org/chatglm3-6b](https://huggingface.co/zai-org/chatglm3-6b)|
|[ZhipuAI/chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base)|chatglm3|chatglm4|transformers<4.42|✘|-|[zai-org/chatglm3-6b-base](https://huggingface.co/zai-org/chatglm3-6b-base)|
|[ZhipuAI/chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k)|chatglm3|chatglm4|transformers<4.42|✘|-|[zai-org/chatglm3-6b-32k](https://huggingface.co/zai-org/chatglm3-6b-32k)|
|[ZhipuAI/chatglm3-6b-128k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-128k)|chatglm3|chatglm4|transformers<4.42|✘|-|[zai-org/chatglm3-6b-128k](https://huggingface.co/zai-org/chatglm3-6b-128k)|
|[ZhipuAI/glm-4-9b-chat](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat)|chatglm4|chatglm4|transformers>=4.42|✘|-|[zai-org/glm-4-9b-chat](https://huggingface.co/zai-org/glm-4-9b-chat)|
|[ZhipuAI/glm-4-9b](https://modelscope.cn/models/ZhipuAI/glm-4-9b)|chatglm4|chatglm4|transformers>=4.42|✘|-|[zai-org/glm-4-9b](https://huggingface.co/zai-org/glm-4-9b)|
|[ZhipuAI/glm-4-9b-chat-1m](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-1m)|chatglm4|chatglm4|transformers>=4.42|✘|-|[zai-org/glm-4-9b-chat-1m](https://huggingface.co/zai-org/glm-4-9b-chat-1m)|
|[ZhipuAI/LongWriter-glm4-9b](https://modelscope.cn/models/ZhipuAI/LongWriter-glm4-9b)|chatglm4|chatglm4|transformers>=4.42|✘|-|[zai-org/LongWriter-glm4-9b](https://huggingface.co/zai-org/LongWriter-glm4-9b)|
|[ZhipuAI/GLM-4-9B-0414](https://modelscope.cn/models/ZhipuAI/GLM-4-9B-0414)|glm4|glm4|transformers>=4.51|✔|-|[zai-org/GLM-4-9B-0414](https://huggingface.co/zai-org/GLM-4-9B-0414)|
|[ZhipuAI/GLM-4-32B-0414](https://modelscope.cn/models/ZhipuAI/GLM-4-32B-0414)|glm4|glm4|transformers>=4.51|✔|-|[zai-org/GLM-4-32B-0414](https://huggingface.co/zai-org/GLM-4-32B-0414)|
|[ZhipuAI/GLM-4-32B-Base-0414](https://modelscope.cn/models/ZhipuAI/GLM-4-32B-Base-0414)|glm4|glm4|transformers>=4.51|✔|-|[zai-org/GLM-4-32B-Base-0414](https://huggingface.co/zai-org/GLM-4-32B-Base-0414)|
|[ZhipuAI/GLM-Z1-9B-0414](https://modelscope.cn/models/ZhipuAI/GLM-Z1-9B-0414)|glm4|glm4|transformers>=4.51|✔|-|[zai-org/GLM-Z1-9B-0414](https://huggingface.co/zai-org/GLM-Z1-9B-0414)|
|[ZhipuAI/GLM-Z1-32B-0414](https://modelscope.cn/models/ZhipuAI/GLM-Z1-32B-0414)|glm4|glm4|transformers>=4.51|✔|-|[zai-org/GLM-Z1-32B-0414](https://huggingface.co/zai-org/GLM-Z1-32B-0414)|
|[ZhipuAI/GLM-Z1-Rumination-32B-0414](https://modelscope.cn/models/ZhipuAI/GLM-Z1-Rumination-32B-0414)|glm4|glm4_z1_rumination|transformers>=4.51|✔|-|[zai-org/GLM-Z1-Rumination-32B-0414](https://huggingface.co/zai-org/GLM-Z1-Rumination-32B-0414)|
|[ZhipuAI/GLM-4.5-Air-Base](https://modelscope.cn/models/ZhipuAI/GLM-4.5-Air-Base)|glm4_moe|glm4_5|transformers>=4.54|✔|-|[zai-org/GLM-4.5-Air-Base](https://huggingface.co/zai-org/GLM-4.5-Air-Base)|
|[ZhipuAI/GLM-4.5-Air](https://modelscope.cn/models/ZhipuAI/GLM-4.5-Air)|glm4_moe|glm4_5|transformers>=4.54|✔|-|[zai-org/GLM-4.5-Air](https://huggingface.co/zai-org/GLM-4.5-Air)|
|[ZhipuAI/GLM-4.5-Air-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.5-Air-FP8)|glm4_moe|glm4_5|transformers>=4.54|✘|-|[zai-org/GLM-4.5-Air-FP8](https://huggingface.co/zai-org/GLM-4.5-Air-FP8)|
|[ZhipuAI/GLM-4.5-Base](https://modelscope.cn/models/ZhipuAI/GLM-4.5-Base)|glm4_moe|glm4_5|transformers>=4.54|✔|-|[zai-org/GLM-4.5-Base](https://huggingface.co/zai-org/GLM-4.5-Base)|
|[ZhipuAI/GLM-4.5](https://modelscope.cn/models/ZhipuAI/GLM-4.5)|glm4_moe|glm4_5|transformers>=4.54|✔|-|[zai-org/GLM-4.5](https://huggingface.co/zai-org/GLM-4.5)|
|[ZhipuAI/GLM-4.5-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.5-FP8)|glm4_moe|glm4_5|transformers>=4.54|✘|-|[zai-org/GLM-4.5-FP8](https://huggingface.co/zai-org/GLM-4.5-FP8)|
|[ZhipuAI/GLM-4.6](https://modelscope.cn/models/ZhipuAI/GLM-4.6)|glm4_moe|glm4_5|transformers>=4.54|✔|-|[zai-org/GLM-4.6](https://huggingface.co/zai-org/GLM-4.6)|
|[ZhipuAI/GLM-4.6-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.6-FP8)|glm4_moe|glm4_5|transformers>=4.54|✘|-|[zai-org/GLM-4.6-FP8](https://huggingface.co/zai-org/GLM-4.6-FP8)|
|[ZhipuAI/GLM-4.7](https://modelscope.cn/models/ZhipuAI/GLM-4.7)|glm4_moe|glm4_7|transformers>=4.54|✔|-|[zai-org/GLM-4.7](https://huggingface.co/zai-org/GLM-4.7)|
|[ZhipuAI/GLM-4.7-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.7-FP8)|glm4_moe|glm4_7|transformers>=4.54|✘|-|[zai-org/GLM-4.7-FP8](https://huggingface.co/zai-org/GLM-4.7-FP8)|
|[ZhipuAI/GLM-4.7-Flash](https://modelscope.cn/models/ZhipuAI/GLM-4.7-Flash)|glm4_moe_lite|glm4_7|transformers>=5.0.0.dev|✔|-|[zai-org/GLM-4.7-Flash](https://huggingface.co/zai-org/GLM-4.7-Flash)|
|[ZhipuAI/GLM-5](https://modelscope.cn/models/ZhipuAI/GLM-5)|glm_moe_dsa|glm4_7|transformers>=5.2.0|✔|-|[zai-org/GLM-5](https://huggingface.co/zai-org/GLM-5)|
|[ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat)|glm_edge|chatglm4|transformers>=4.46|✘|-|[zai-org/glm-edge-1.5b-chat](https://huggingface.co/zai-org/glm-edge-1.5b-chat)|
|[ZhipuAI/glm-edge-4b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-4b-chat)|glm_edge|chatglm4|transformers>=4.46|✘|-|[zai-org/glm-edge-4b-chat](https://huggingface.co/zai-org/glm-edge-4b-chat)|
|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeGeeX2-6B)|codefuse_codegeex2|codefuse|transformers<4.34|✘|coding|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://huggingface.co/codefuse-ai/CodeFuse-CodeGeeX2-6B)|
|[ZhipuAI/codegeex4-all-9b](https://modelscope.cn/models/ZhipuAI/codegeex4-all-9b)|codegeex4|codegeex4|transformers<4.42|✘|coding|[zai-org/codegeex4-all-9b](https://huggingface.co/zai-org/codegeex4-all-9b)|
|[Shanghai_AI_Laboratory/internlm-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b)|internlm|internlm|-|✘|-|[internlm/internlm-chat-7b](https://huggingface.co/internlm/internlm-chat-7b)|
|[Shanghai_AI_Laboratory/internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b)|internlm|internlm|-|✘|-|[internlm/internlm-7b](https://huggingface.co/internlm/internlm-7b)|
|[Shanghai_AI_Laboratory/internlm-chat-7b-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k)|internlm|internlm|-|✘|-|-|
|[Shanghai_AI_Laboratory/internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b)|internlm|internlm|-|✘|-|[internlm/internlm-20b](https://huggingface.co/internlm/internlm-20b)|
|[Shanghai_AI_Laboratory/internlm-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b)|internlm|internlm|-|✘|-|[internlm/internlm-chat-20b](https://huggingface.co/internlm/internlm-chat-20b)|
|[Shanghai_AI_Laboratory/internlm2-chat-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b)|internlm2|internlm2|transformers>=4.38|✘|-|[internlm/internlm2-chat-1_8b](https://huggingface.co/internlm/internlm2-chat-1_8b)|
|[Shanghai_AI_Laboratory/internlm2-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-1_8b)|internlm2|internlm2|transformers>=4.38|✘|-|[internlm/internlm2-1_8b](https://huggingface.co/internlm/internlm2-1_8b)|
|[Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft)|internlm2|internlm2|transformers>=4.38|✘|-|[internlm/internlm2-chat-1_8b-sft](https://huggingface.co/internlm/internlm2-chat-1_8b-sft)|
|[Shanghai_AI_Laboratory/internlm2-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-base-7b)|internlm2|internlm2|transformers>=4.38|✘|-|[internlm/internlm2-base-7b](https://huggingface.co/internlm/internlm2-base-7b)|
|[Shanghai_AI_Laboratory/internlm2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-7b)|internlm2|internlm2|transformers>=4.38|✘|-|[internlm/internlm2-7b](https://huggingface.co/internlm/internlm2-7b)|
|[Shanghai_AI_Laboratory/internlm2-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b)|internlm2|internlm2|transformers>=4.38|✘|-|[internlm/internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b)|
|[Shanghai_AI_Laboratory/internlm2-chat-7b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b-sft)|internlm2|internlm2|transformers>=4.38|✘|-|[internlm/internlm2-chat-7b-sft](https://huggingface.co/internlm/internlm2-chat-7b-sft)|
|[Shanghai_AI_Laboratory/internlm2-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-base-20b)|internlm2|internlm2|transformers>=4.38|✘|-|[internlm/internlm2-base-20b](https://huggingface.co/internlm/internlm2-base-20b)|
|[Shanghai_AI_Laboratory/internlm2-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-20b)|internlm2|internlm2|transformers>=4.38|✘|-|[internlm/internlm2-20b](https://huggingface.co/internlm/internlm2-20b)|
|[Shanghai_AI_Laboratory/internlm2-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-20b)|internlm2|internlm2|transformers>=4.38|✘|-|[internlm/internlm2-chat-20b](https://huggingface.co/internlm/internlm2-chat-20b)|
|[Shanghai_AI_Laboratory/internlm2-chat-20b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-20b-sft)|internlm2|internlm2|transformers>=4.38|✘|-|[internlm/internlm2-chat-20b-sft](https://huggingface.co/internlm/internlm2-chat-20b-sft)|
|[Shanghai_AI_Laboratory/internlm2-math-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-7b)|internlm2|internlm2|transformers>=4.38|✘|math|[internlm/internlm2-math-7b](https://huggingface.co/internlm/internlm2-math-7b)|
|[Shanghai_AI_Laboratory/internlm2-math-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-7b)|internlm2|internlm2|transformers>=4.38|✘|math|[internlm/internlm2-math-base-7b](https://huggingface.co/internlm/internlm2-math-base-7b)|
|[Shanghai_AI_Laboratory/internlm2-math-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-20b)|internlm2|internlm2|transformers>=4.38|✘|math|[internlm/internlm2-math-base-20b](https://huggingface.co/internlm/internlm2-math-base-20b)|
|[Shanghai_AI_Laboratory/internlm2-math-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-20b)|internlm2|internlm2|transformers>=4.38|✘|math|[internlm/internlm2-math-20b](https://huggingface.co/internlm/internlm2-math-20b)|
|[Shanghai_AI_Laboratory/internlm2_5-1_8b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2_5-1_8b-chat)|internlm2|internlm2|transformers>=4.38|✘|-|[internlm/internlm2_5-1_8b-chat](https://huggingface.co/internlm/internlm2_5-1_8b-chat)|
|[Shanghai_AI_Laboratory/internlm2_5-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2_5-1_8b)|internlm2|internlm2|transformers>=4.38|✘|-|[internlm/internlm2_5-1_8b](https://huggingface.co/internlm/internlm2_5-1_8b)|
|[Shanghai_AI_Laboratory/internlm2_5-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2_5-7b)|internlm2|internlm2|transformers>=4.38|✘|-|[internlm/internlm2_5-7b](https://huggingface.co/internlm/internlm2_5-7b)|
|[Shanghai_AI_Laboratory/internlm2_5-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2_5-7b-chat)|internlm2|internlm2|transformers>=4.38|✘|-|[internlm/internlm2_5-7b-chat](https://huggingface.co/internlm/internlm2_5-7b-chat)|
|[Shanghai_AI_Laboratory/internlm2_5-7b-chat-1m](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2_5-7b-chat-1m)|internlm2|internlm2|transformers>=4.38|✘|-|[internlm/internlm2_5-7b-chat-1m](https://huggingface.co/internlm/internlm2_5-7b-chat-1m)|
|[Shanghai_AI_Laboratory/internlm2_5-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2_5-20b)|internlm2|internlm2|transformers>=4.38|✘|-|[internlm/internlm2_5-20b](https://huggingface.co/internlm/internlm2_5-20b)|
|[Shanghai_AI_Laboratory/internlm2_5-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2_5-20b-chat)|internlm2|internlm2|transformers>=4.38|✘|-|[internlm/internlm2_5-20b-chat](https://huggingface.co/internlm/internlm2_5-20b-chat)|
|[Shanghai_AI_Laboratory/internlm3-8b-instruct](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm3-8b-instruct)|internlm3|internlm2|transformers>=4.48|✔|-|[internlm/internlm3-8b-instruct](https://huggingface.co/internlm/internlm3-8b-instruct)|
|[deepseek-ai/deepseek-moe-16b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-chat)|deepseek|deepseek|-|✔|-|[deepseek-ai/deepseek-moe-16b-chat](https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat)|
|[deepseek-ai/deepseek-moe-16b-base](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-base)|deepseek|deepseek|-|✔|-|[deepseek-ai/deepseek-moe-16b-base](https://huggingface.co/deepseek-ai/deepseek-moe-16b-base)|
|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Instruct)|deepseek_v2|deepseek|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct)|
|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct)|deepseek_v2|deepseek|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct)|
|[deepseek-ai/DeepSeek-Coder-V2-Base](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Base)|deepseek_v2|deepseek|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-Coder-V2-Base](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Base)|
|[deepseek-ai/DeepSeek-Coder-V2-Lite-Base](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Lite-Base)|deepseek_v2|deepseek|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-Coder-V2-Lite-Base](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Base)|
|[deepseek-ai/DeepSeek-V2-Lite](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Lite)|deepseek_v2|deepseek|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-V2-Lite](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite)|
|[deepseek-ai/DeepSeek-V2-Lite-Chat](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Lite-Chat)|deepseek_v2|deepseek|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-V2-Lite-Chat](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat)|
|[deepseek-ai/DeepSeek-V2](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2)|deepseek_v2|deepseek|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-V2](https://huggingface.co/deepseek-ai/DeepSeek-V2)|
|[deepseek-ai/DeepSeek-V2-Chat](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Chat)|deepseek_v2|deepseek|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-V2-Chat](https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat)|
|[deepseek-ai/DeepSeek-V2.5](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2.5)|deepseek_v2|deepseek_v2_5|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-V2.5](https://huggingface.co/deepseek-ai/DeepSeek-V2.5)|
|[deepseek-ai/DeepSeek-V2.5-1210](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2.5-1210)|deepseek_v2|deepseek_v2_5|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-V2.5-1210](https://huggingface.co/deepseek-ai/DeepSeek-V2.5-1210)|
|[deepseek-ai/DeepSeek-V3-Base](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3-Base)|deepseek_v3|deepseek_v2_5|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-V3-Base](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base)|
|[deepseek-ai/DeepSeek-V3](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3)|deepseek_v3|deepseek_v2_5|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-V3](https://huggingface.co/deepseek-ai/DeepSeek-V3)|
|[deepseek-ai/DeepSeek-V3-0324](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3-0324)|deepseek_v3|deepseek_v2_5|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-V3-0324](https://huggingface.co/deepseek-ai/DeepSeek-V3-0324)|
|[cognitivecomputations/DeepSeek-V3-awq](https://modelscope.cn/models/cognitivecomputations/DeepSeek-V3-awq)|deepseek_v3|deepseek_v2_5|transformers>=4.39.3|✘|-|[cognitivecomputations/DeepSeek-V3-AWQ](https://huggingface.co/cognitivecomputations/DeepSeek-V3-AWQ)|
|[cognitivecomputations/DeepSeek-V3-0324-AWQ](https://modelscope.cn/models/cognitivecomputations/DeepSeek-V3-0324-AWQ)|deepseek_v3|deepseek_v2_5|transformers>=4.39.3|✘|-|[cognitivecomputations/DeepSeek-V3-0324-AWQ](https://huggingface.co/cognitivecomputations/DeepSeek-V3-0324-AWQ)|
|[deepseek-ai/DeepSeek-Prover-V2-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-Prover-V2-7B)|deepseek_v3|deepseek_v2_5|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-Prover-V2-7B](https://huggingface.co/deepseek-ai/DeepSeek-Prover-V2-7B)|
|[deepseek-ai/DeepSeek-Prover-V2-671B](https://modelscope.cn/models/deepseek-ai/DeepSeek-Prover-V2-671B)|deepseek_v3|deepseek_v2_5|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-Prover-V2-671B](https://huggingface.co/deepseek-ai/DeepSeek-Prover-V2-671B)|
|[unsloth/DeepSeek-V3-bf16](https://modelscope.cn/models/unsloth/DeepSeek-V3-bf16)|deepseek_v3|deepseek_v2_5|transformers>=4.39.3|✔|-|[unsloth/DeepSeek-V3-bf16](https://huggingface.co/unsloth/DeepSeek-V3-bf16)|
|[unsloth/DeepSeek-V3-0324-BF16](https://modelscope.cn/models/unsloth/DeepSeek-V3-0324-BF16)|deepseek_v3|deepseek_v2_5|transformers>=4.39.3|✔|-|[unsloth/DeepSeek-V3-0324-BF16](https://huggingface.co/unsloth/DeepSeek-V3-0324-BF16)|
|[unsloth/DeepSeek-Prover-V2-671B-BF16](https://modelscope.cn/models/unsloth/DeepSeek-Prover-V2-671B-BF16)|deepseek_v3|deepseek_v2_5|transformers>=4.39.3|✔|-|[unsloth/DeepSeek-Prover-V2-671B-BF16](https://huggingface.co/unsloth/DeepSeek-Prover-V2-671B-BF16)|
|[deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1)|deepseek_v3|deepseek_r1|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1)|
|[deepseek-ai/DeepSeek-R1-Zero](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Zero)|deepseek_v3|deepseek_r1|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-R1-Zero](https://huggingface.co/deepseek-ai/DeepSeek-R1-Zero)|
|[deepseek-ai/DeepSeek-R1-0528](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-0528)|deepseek_v3|deepseek_r1|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-R1-0528](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528)|
|[cognitivecomputations/DeepSeek-R1-awq](https://modelscope.cn/models/cognitivecomputations/DeepSeek-R1-awq)|deepseek_v3|deepseek_r1|transformers>=4.39.3|✘|-|[cognitivecomputations/DeepSeek-R1-AWQ](https://huggingface.co/cognitivecomputations/DeepSeek-R1-AWQ)|
|[cognitivecomputations/DeepSeek-R1-0528-AWQ](https://modelscope.cn/models/cognitivecomputations/DeepSeek-R1-0528-AWQ)|deepseek_v3|deepseek_r1|transformers>=4.39.3|✘|-|[cognitivecomputations/DeepSeek-R1-0528-AWQ](https://huggingface.co/cognitivecomputations/DeepSeek-R1-0528-AWQ)|
|[unsloth/DeepSeek-R1-BF16](https://modelscope.cn/models/unsloth/DeepSeek-R1-BF16)|deepseek_v3|deepseek_r1|transformers>=4.39.3|✔|-|[unsloth/DeepSeek-R1-BF16](https://huggingface.co/unsloth/DeepSeek-R1-BF16)|
|[unsloth/DeepSeek-R1-Zero-BF16](https://modelscope.cn/models/unsloth/DeepSeek-R1-Zero-BF16)|deepseek_v3|deepseek_r1|transformers>=4.39.3|✔|-|[unsloth/DeepSeek-R1-Zero-BF16](https://huggingface.co/unsloth/DeepSeek-R1-Zero-BF16)|
|[unsloth/DeepSeek-R1-0528-BF16](https://modelscope.cn/models/unsloth/DeepSeek-R1-0528-BF16)|deepseek_v3|deepseek_r1|transformers>=4.39.3|✔|-|[unsloth/DeepSeek-R1-0528-BF16](https://huggingface.co/unsloth/DeepSeek-R1-0528-BF16)|
|[moonshotai/Moonlight-16B-A3B](https://modelscope.cn/models/moonshotai/Moonlight-16B-A3B)|deepseek_v3|moonlight|transformers<4.49|✔|-|[moonshotai/Moonlight-16B-A3B](https://huggingface.co/moonshotai/Moonlight-16B-A3B)|
|[moonshotai/Moonlight-16B-A3B-Instruct](https://modelscope.cn/models/moonshotai/Moonlight-16B-A3B-Instruct)|deepseek_v3|moonlight|transformers<4.49|✔|-|[moonshotai/Moonlight-16B-A3B-Instruct](https://huggingface.co/moonshotai/Moonlight-16B-A3B-Instruct)|
|[moonshotai/Kimi-K2-Base](https://modelscope.cn/models/moonshotai/Kimi-K2-Base)|deepseek_v3|kimi_k2|transformers>=4.39.3|✔|-|[moonshotai/Kimi-K2-Base](https://huggingface.co/moonshotai/Kimi-K2-Base)|
|[moonshotai/Kimi-K2-Instruct](https://modelscope.cn/models/moonshotai/Kimi-K2-Instruct)|deepseek_v3|kimi_k2|transformers>=4.39.3|✔|-|[moonshotai/Kimi-K2-Instruct](https://huggingface.co/moonshotai/Kimi-K2-Instruct)|
|[moonshotai/Kimi-K2-Instruct-0905](https://modelscope.cn/models/moonshotai/Kimi-K2-Instruct-0905)|deepseek_v3|kimi_k2|transformers>=4.39.3|✔|-|[moonshotai/Kimi-K2-Instruct-0905](https://huggingface.co/moonshotai/Kimi-K2-Instruct-0905)|
|[moonshotai/Kimi-K2-Thinking](https://modelscope.cn/models/moonshotai/Kimi-K2-Thinking)|deepseek_v3|kimi_k2|transformers>=4.39.3|✔|-|[moonshotai/Kimi-K2-Thinking](https://huggingface.co/moonshotai/Kimi-K2-Thinking)|
|[deepseek-ai/DeepSeek-V3.1-Base](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3.1-Base)|deepseek_v3|deepseek_v3_1|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-V3.1-Base](https://huggingface.co/deepseek-ai/DeepSeek-V3.1-Base)|
|[deepseek-ai/DeepSeek-V3.1](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3.1)|deepseek_v3|deepseek_v3_1|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-V3.1](https://huggingface.co/deepseek-ai/DeepSeek-V3.1)|
|[deepseek-ai/DeepSeek-V3.1-Terminus](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3.1-Terminus)|deepseek_v3|deepseek_v3_1|transformers>=4.39.3|✔|-|[deepseek-ai/DeepSeek-V3.1-Terminus](https://huggingface.co/deepseek-ai/DeepSeek-V3.1-Terminus)|
|[deepseek-ai/DeepSeek-V3.2](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3.2)|deepseek_v32|deepseek_v3_1|-|✔|-|[deepseek-ai/DeepSeek-V3.2](https://huggingface.co/deepseek-ai/DeepSeek-V3.2)|
|[deepseek-ai/DeepSeek-V3.2-Speciale](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3.2-Speciale)|deepseek_v32|deepseek_v3_1|-|✔|-|[deepseek-ai/DeepSeek-V3.2-Speciale](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Speciale)|
|[deepseek-ai/DeepSeek-V3.2-Exp](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3.2-Exp)|deepseek_v32|deepseek_v3_1|-|✔|-|[deepseek-ai/DeepSeek-V3.2-Exp](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp)|
|[deepseek-ai/DeepSeek-V3.2-Exp-Base](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3.2-Exp-Base)|deepseek_v32|deepseek_v3_1|-|✔|-|[deepseek-ai/DeepSeek-V3.2-Exp-Base](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp-Base)|
|[deepseek-ai/DeepSeek-Math-V2](https://modelscope.cn/models/deepseek-ai/DeepSeek-Math-V2)|deepseek_v32|deepseek_v3_1|-|✔|-|[deepseek-ai/DeepSeek-Math-V2](https://huggingface.co/deepseek-ai/DeepSeek-Math-V2)|
|[OpenBuddy/openbuddy-llama-65b-v8-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16)|openbuddy_llama|openbuddy|-|✔|-|[OpenBuddy/openbuddy-llama-65b-v8-bf16](https://huggingface.co/OpenBuddy/openbuddy-llama-65b-v8-bf16)|
|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16)|openbuddy_llama|openbuddy|-|✔|-|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://huggingface.co/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16)|
|[OpenBuddy/openbuddy-llama2-70b-v10.1-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16)|openbuddy_llama|openbuddy|-|✔|-|[OpenBuddy/openbuddy-llama2-70b-v10.1-bf16](https://huggingface.co/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16)|
|[OpenBuddy/openbuddy-deepseek-67b-v15.2](https://modelscope.cn/models/OpenBuddy/openbuddy-deepseek-67b-v15.2)|openbuddy_llama|openbuddy|-|✔|-|[OpenBuddy/openbuddy-deepseek-67b-v15.2](https://huggingface.co/OpenBuddy/openbuddy-deepseek-67b-v15.2)|
|[OpenBuddy/openbuddy-llama3-8b-v21.1-8k](https://modelscope.cn/models/OpenBuddy/openbuddy-llama3-8b-v21.1-8k)|openbuddy_llama|openbuddy2|-|✔|-|[OpenBuddy/openbuddy-llama3-8b-v21.1-8k](https://huggingface.co/OpenBuddy/openbuddy-llama3-8b-v21.1-8k)|
|[OpenBuddy/openbuddy-llama3-70b-v21.1-8k](https://modelscope.cn/models/OpenBuddy/openbuddy-llama3-70b-v21.1-8k)|openbuddy_llama|openbuddy2|-|✔|-|[OpenBuddy/openbuddy-llama3-70b-v21.1-8k](https://huggingface.co/OpenBuddy/openbuddy-llama3-70b-v21.1-8k)|
|[OpenBuddy/openbuddy-yi1.5-34b-v21.3-32k](https://modelscope.cn/models/OpenBuddy/openbuddy-yi1.5-34b-v21.3-32k)|openbuddy_llama|openbuddy2|-|✔|-|[OpenBuddy/openbuddy-yi1.5-34b-v21.3-32k](https://huggingface.co/OpenBuddy/openbuddy-yi1.5-34b-v21.3-32k)|
|[OpenBuddy/openbuddy-llama3.1-8b-v22.1-131k](https://modelscope.cn/models/OpenBuddy/openbuddy-llama3.1-8b-v22.1-131k)|openbuddy_llama|openbuddy2|transformers>=4.43|✔|-|[OpenBuddy/openbuddy-llama3.1-8b-v22.1-131k](https://huggingface.co/OpenBuddy/openbuddy-llama3.1-8b-v22.1-131k)|
|[OpenBuddy/openbuddy-nemotron-70b-v23.2-131k](https://modelscope.cn/models/OpenBuddy/openbuddy-nemotron-70b-v23.2-131k)|openbuddy_llama|openbuddy2|transformers>=4.43|✔|-|[OpenBuddy/openbuddy-nemotron-70b-v23.2-131k](https://huggingface.co/OpenBuddy/openbuddy-nemotron-70b-v23.2-131k)|
|[OpenBuddy/openbuddy-llama3.3-70b-v24.3-131k](https://modelscope.cn/models/OpenBuddy/openbuddy-llama3.3-70b-v24.3-131k)|openbuddy_llama|openbuddy2|transformers>=4.45|✔|-|[OpenBuddy/openbuddy-llama3.3-70b-v24.3-131k](https://huggingface.co/OpenBuddy/openbuddy-llama3.3-70b-v24.3-131k)|
|[OpenBuddy/openbuddy-mistral-7b-v17.1-32k](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v17.1-32k)|openbuddy_mistral|openbuddy|transformers>=4.34|✘|-|[OpenBuddy/openbuddy-mistral-7b-v17.1-32k](https://huggingface.co/OpenBuddy/openbuddy-mistral-7b-v17.1-32k)|
|[OpenBuddy/openbuddy-zephyr-7b-v14.1](https://modelscope.cn/models/OpenBuddy/openbuddy-zephyr-7b-v14.1)|openbuddy_mistral|openbuddy|transformers>=4.34|✘|-|[OpenBuddy/openbuddy-zephyr-7b-v14.1](https://huggingface.co/OpenBuddy/openbuddy-zephyr-7b-v14.1)|
|[OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k](https://modelscope.cn/models/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k)|openbuddy_mixtral|openbuddy|transformers>=4.36|✘|-|[OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k](https://huggingface.co/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k)|
|[baichuan-inc/Baichuan-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat)|baichuan|baichuan|transformers<4.34|✘|-|[baichuan-inc/Baichuan-13B-Chat](https://huggingface.co/baichuan-inc/Baichuan-13B-Chat)|
|[baichuan-inc/Baichuan-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base)|baichuan|baichuan|transformers<4.34|✘|-|[baichuan-inc/Baichuan-13B-Base](https://huggingface.co/baichuan-inc/Baichuan-13B-Base)|
|[baichuan-inc/baichuan-7B](https://modelscope.cn/models/baichuan-inc/baichuan-7B)|baichuan|baichuan|transformers<4.34|✘|-|[baichuan-inc/Baichuan-7B](https://huggingface.co/baichuan-inc/Baichuan-7B)|
|[baichuan-inc/Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat)|baichuan2|baichuan|-|✘|-|[baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat)|
|[baichuan-inc/Baichuan2-7B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base)|baichuan2|baichuan|-|✘|-|[baichuan-inc/Baichuan2-7B-Base](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base)|
|[baichuan-inc/Baichuan2-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat)|baichuan2|baichuan|-|✘|-|[baichuan-inc/Baichuan2-13B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat)|
|[baichuan-inc/Baichuan2-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base)|baichuan2|baichuan|-|✘|-|[baichuan-inc/Baichuan2-13B-Base](https://huggingface.co/baichuan-inc/Baichuan2-13B-Base)|
|[baichuan-inc/Baichuan2-7B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits)|baichuan2|baichuan|bitsandbytes<0.41.2, accelerate<0.26|✘|-|[baichuan-inc/Baichuan2-7B-Chat-4bits](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat-4bits)|
|[baichuan-inc/Baichuan2-13B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits)|baichuan2|baichuan|bitsandbytes<0.41.2, accelerate<0.26|✘|-|[baichuan-inc/Baichuan2-13B-Chat-4bits](https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat-4bits)|
|[baichuan-inc/Baichuan-M1-14B-Instruct](https://modelscope.cn/models/baichuan-inc/Baichuan-M1-14B-Instruct)|baichuan_m1|baichuan_m1|transformers>=4.48|✘|-|[baichuan-inc/Baichuan-M1-14B-Instruct](https://huggingface.co/baichuan-inc/Baichuan-M1-14B-Instruct)|
|[OpenBMB/MiniCPM-2B-sft-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-sft-fp32)|minicpm|minicpm|transformers>=4.36.0|✘|-|[openbmb/MiniCPM-2B-sft-fp32](https://huggingface.co/openbmb/MiniCPM-2B-sft-fp32)|
|[OpenBMB/MiniCPM-2B-dpo-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-dpo-fp32)|minicpm|minicpm|transformers>=4.36.0|✘|-|[openbmb/MiniCPM-2B-dpo-fp32](https://huggingface.co/openbmb/MiniCPM-2B-dpo-fp32)|
|[OpenBMB/MiniCPM-1B-sft-bf16](https://modelscope.cn/models/OpenBMB/MiniCPM-1B-sft-bf16)|minicpm|minicpm|transformers>=4.36.0|✘|-|[openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16)|
|[OpenBMB/MiniCPM-2B-128k](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-128k)|minicpm_chatml|chatml|transformers>=4.36|✘|-|[openbmb/MiniCPM-2B-128k](https://huggingface.co/openbmb/MiniCPM-2B-128k)|
|[OpenBMB/MiniCPM4-0.5B](https://modelscope.cn/models/OpenBMB/MiniCPM4-0.5B)|minicpm_chatml|chatml|transformers>=4.36|✘|-|[openbmb/MiniCPM4-0.5B](https://huggingface.co/openbmb/MiniCPM4-0.5B)|
|[OpenBMB/MiniCPM4-8B](https://modelscope.cn/models/OpenBMB/MiniCPM4-8B)|minicpm_chatml|chatml|transformers>=4.36|✘|-|[openbmb/MiniCPM4-8B](https://huggingface.co/openbmb/MiniCPM4-8B)|
|[OpenBMB/MiniCPM3-4B](https://modelscope.cn/models/OpenBMB/MiniCPM3-4B)|minicpm3|chatml|transformers>=4.36|✘|-|[openbmb/MiniCPM3-4B](https://huggingface.co/openbmb/MiniCPM3-4B)|
|[OpenBMB/MiniCPM-MoE-8x2B](https://modelscope.cn/models/OpenBMB/MiniCPM-MoE-8x2B)|minicpm_moe|minicpm|transformers>=4.36|✘|-|[openbmb/MiniCPM-MoE-8x2B](https://huggingface.co/openbmb/MiniCPM-MoE-8x2B)|
|[TeleAI/TeleChat-7B](https://modelscope.cn/models/TeleAI/TeleChat-7B)|telechat|telechat|-|✘|-|[Tele-AI/telechat-7B](https://huggingface.co/Tele-AI/telechat-7B)|
|[TeleAI/TeleChat-12B](https://modelscope.cn/models/TeleAI/TeleChat-12B)|telechat|telechat|-|✘|-|[Tele-AI/TeleChat-12B](https://huggingface.co/Tele-AI/TeleChat-12B)|
|[TeleAI/TeleChat-12B-v2](https://modelscope.cn/models/TeleAI/TeleChat-12B-v2)|telechat|telechat|-|✘|-|[Tele-AI/TeleChat-12B-v2](https://huggingface.co/Tele-AI/TeleChat-12B-v2)|
|[TeleAI/TeleChat-52B](https://modelscope.cn/models/TeleAI/TeleChat-52B)|telechat|telechat|-|✘|-|[Tele-AI/TeleChat-52B](https://huggingface.co/Tele-AI/TeleChat-52B)|
|[swift/TeleChat-12B-V2-GPTQ-Int4](https://modelscope.cn/models/swift/TeleChat-12B-V2-GPTQ-Int4)|telechat|telechat|-|✘|-|-|
|[TeleAI/TeleChat2-35B](https://modelscope.cn/models/TeleAI/TeleChat2-35B)|telechat|telechat|-|✘|-|[Tele-AI/TeleChat2-35B](https://huggingface.co/Tele-AI/TeleChat2-35B)|
|[TeleAI/TeleChat2-115B](https://modelscope.cn/models/TeleAI/TeleChat2-115B)|telechat|telechat|-|✘|-|[Tele-AI/TeleChat2-115B](https://huggingface.co/Tele-AI/TeleChat2-115B)|
|[TeleAI/TeleChat2-3B](https://modelscope.cn/models/TeleAI/TeleChat2-3B)|telechat2|telechat2|-|✘|-|[Tele-AI/TeleChat2-3B](https://huggingface.co/Tele-AI/TeleChat2-3B)|
|[TeleAI/TeleChat2-7B-32K](https://modelscope.cn/models/TeleAI/TeleChat2-7B-32K)|telechat2|telechat2|-|✘|-|[Tele-AI/TeleChat2-7B-32K](https://huggingface.co/Tele-AI/TeleChat2-7B-32K)|
|[TeleAI/TeleChat2-35B-32K](https://modelscope.cn/models/TeleAI/TeleChat2-35B-32K)|telechat2|telechat2|-|✘|-|[Tele-AI/TeleChat2-35B-32K](https://huggingface.co/Tele-AI/TeleChat2-35B-32K)|
|[TeleAI/TeleChat2-35B-Nov](https://modelscope.cn/models/TeleAI/TeleChat2-35B-Nov)|telechat2|telechat2|-|✘|-|[Tele-AI/TeleChat2-35B-Nov](https://huggingface.co/Tele-AI/TeleChat2-35B-Nov)|
|[AI-ModelScope/Mistral-7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1)|mistral|llama|transformers>=4.34|✘|-|[mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)|
|[AI-ModelScope/Mistral-7B-Instruct-v0.2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2)|mistral|llama|transformers>=4.34|✘|-|[mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)|
|[LLM-Research/Mistral-7B-Instruct-v0.3](https://modelscope.cn/models/LLM-Research/Mistral-7B-Instruct-v0.3)|mistral|llama|transformers>=4.34|✘|-|[mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)|
|[AI-ModelScope/Mistral-7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1)|mistral|llama|transformers>=4.34|✘|-|[mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)|
|[AI-ModelScope/Mistral-7B-v0.2-hf](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.2-hf)|mistral|llama|transformers>=4.34|✘|-|[alpindale/Mistral-7B-v0.2-hf](https://huggingface.co/alpindale/Mistral-7B-v0.2-hf)|
|[swift/Codestral-22B-v0.1](https://modelscope.cn/models/swift/Codestral-22B-v0.1)|mistral|llama|transformers>=4.34|✘|-|[mistralai/Codestral-22B-v0.1](https://huggingface.co/mistralai/Codestral-22B-v0.1)|
|[mistralai/Devstral-Small-2505](https://modelscope.cn/models/mistralai/Devstral-Small-2505)|devstral|devstral|transformers>=4.43, mistral-common>=1.5.5|✘|-|[mistralai/Devstral-Small-2505](https://huggingface.co/mistralai/Devstral-Small-2505)|
|[modelscope/zephyr-7b-beta](https://modelscope.cn/models/modelscope/zephyr-7b-beta)|zephyr|zephyr|transformers>=4.34|✘|-|[HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta)|
|[AI-ModelScope/Mixtral-8x7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1)|mixtral|llama|transformers>=4.36|✘|-|[mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)|
|[AI-ModelScope/Mixtral-8x7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1)|mixtral|llama|transformers>=4.36|✘|-|[mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)|
|[AI-ModelScope/Mixtral-8x22B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x22B-v0.1)|mixtral|llama|transformers>=4.36|✘|-|[mistral-community/Mixtral-8x22B-v0.1](https://huggingface.co/mistral-community/Mixtral-8x22B-v0.1)|
|[AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf)|mixtral|llama|transformers>=4.38, aqlm, torch>=2.2.0|✘|-|[ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf)|
|[AI-ModelScope/Mistral-Small-Instruct-2409](https://modelscope.cn/models/AI-ModelScope/Mistral-Small-Instruct-2409)|mistral_nemo|mistral_nemo|transformers>=4.43|✘|-|[mistralai/Mistral-Small-Instruct-2409](https://huggingface.co/mistralai/Mistral-Small-Instruct-2409)|
|[LLM-Research/Mistral-Large-Instruct-2407](https://modelscope.cn/models/LLM-Research/Mistral-Large-Instruct-2407)|mistral_nemo|mistral_nemo|transformers>=4.43|✘|-|[mistralai/Mistral-Large-Instruct-2407](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407)|
|[AI-ModelScope/Mistral-Nemo-Base-2407](https://modelscope.cn/models/AI-ModelScope/Mistral-Nemo-Base-2407)|mistral_nemo|mistral_nemo|transformers>=4.43|✘|-|[mistralai/Mistral-Nemo-Base-2407](https://huggingface.co/mistralai/Mistral-Nemo-Base-2407)|
|[AI-ModelScope/Mistral-Nemo-Instruct-2407](https://modelscope.cn/models/AI-ModelScope/Mistral-Nemo-Instruct-2407)|mistral_nemo|mistral_nemo|transformers>=4.43|✘|-|[mistralai/Mistral-Nemo-Instruct-2407](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407)|
|[AI-ModelScope/Ministral-8B-Instruct-2410](https://modelscope.cn/models/AI-ModelScope/Ministral-8B-Instruct-2410)|mistral_nemo|mistral_nemo|transformers>=4.46|✘|-|[mistralai/Ministral-8B-Instruct-2410](https://huggingface.co/mistralai/Ministral-8B-Instruct-2410)|
|[mistralai/Mistral-Small-24B-Base-2501](https://modelscope.cn/models/mistralai/Mistral-Small-24B-Base-2501)|mistral_2501|mistral_2501|-|✘|-|[mistralai/Mistral-Small-24B-Base-2501](https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501)|
|[mistralai/Mistral-Small-24B-Instruct-2501](https://modelscope.cn/models/mistralai/Mistral-Small-24B-Instruct-2501)|mistral_2501|mistral_2501|-|✘|-|[mistralai/Mistral-Small-24B-Instruct-2501](https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501)|
|[AI-ModelScope/WizardLM-2-7B-AWQ](https://modelscope.cn/models/AI-ModelScope/WizardLM-2-7B-AWQ)|wizardlm2|wizardlm2|transformers>=4.34|✘|-|[MaziyarPanahi/WizardLM-2-7B-AWQ](https://huggingface.co/MaziyarPanahi/WizardLM-2-7B-AWQ)|
|[AI-ModelScope/WizardLM-2-8x22B](https://modelscope.cn/models/AI-ModelScope/WizardLM-2-8x22B)|wizardlm2_moe|wizardlm2_moe|transformers>=4.36|✘|-|[alpindale/WizardLM-2-8x22B](https://huggingface.co/alpindale/WizardLM-2-8x22B)|
|[AI-ModelScope/phi-2](https://modelscope.cn/models/AI-ModelScope/phi-2)|phi2|default|-|✘|-|[microsoft/phi-2](https://huggingface.co/microsoft/phi-2)|
|[LLM-Research/Phi-3-small-8k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-small-8k-instruct)|phi3_small|phi3|transformers>=4.36|✘|-|[microsoft/Phi-3-small-8k-instruct](https://huggingface.co/microsoft/Phi-3-small-8k-instruct)|
|[LLM-Research/Phi-3-small-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-small-128k-instruct)|phi3_small|phi3|transformers>=4.36|✘|-|[microsoft/Phi-3-small-128k-instruct](https://huggingface.co/microsoft/Phi-3-small-128k-instruct)|
|[LLM-Research/Phi-3-mini-4k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-mini-4k-instruct)|phi3|phi3|transformers>=4.36|✘|-|[microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)|
|[LLM-Research/Phi-3-mini-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-mini-128k-instruct)|phi3|phi3|transformers>=4.36|✘|-|[microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct)|
|[LLM-Research/Phi-3-medium-4k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-medium-4k-instruct)|phi3|phi3|transformers>=4.36|✘|-|[microsoft/Phi-3-medium-4k-instruct](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct)|
|[LLM-Research/Phi-3-medium-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-medium-128k-instruct)|phi3|phi3|transformers>=4.36|✘|-|[microsoft/Phi-3-medium-128k-instruct](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct)|
|[LLM-Research/Phi-3.5-mini-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-mini-instruct)|phi3|phi3|transformers>=4.36|✘|-|[microsoft/Phi-3.5-mini-instruct](https://huggingface.co/microsoft/Phi-3.5-mini-instruct)|
|[LLM-Research/Phi-4-mini-instruct](https://modelscope.cn/models/LLM-Research/Phi-4-mini-instruct)|phi3|phi3|transformers>=4.36|✘|-|[microsoft/Phi-4-mini-instruct](https://huggingface.co/microsoft/Phi-4-mini-instruct)|
|[LLM-Research/Phi-3.5-MoE-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-MoE-instruct)|phi3_moe|phi3|transformers>=4.36|✘|-|[microsoft/Phi-3.5-MoE-instruct](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct)|
|[LLM-Research/phi-4](https://modelscope.cn/models/LLM-Research/phi-4)|phi4|phi4|transformers>=4.36|✘|-|[microsoft/phi-4](https://huggingface.co/microsoft/phi-4)|
|[MiniMax/MiniMax-Text-01](https://modelscope.cn/models/MiniMax/MiniMax-Text-01)|minimax|minimax|-|✘|-|[MiniMaxAI/MiniMax-Text-01](https://huggingface.co/MiniMaxAI/MiniMax-Text-01)|
|[MiniMax/MiniMax-M1-40k](https://modelscope.cn/models/MiniMax/MiniMax-M1-40k)|minimax_m1|minimax_m1|-|✘|-|[MiniMaxAI/MiniMax-M1-40k](https://huggingface.co/MiniMaxAI/MiniMax-M1-40k)|
|[MiniMax/MiniMax-M1-80k](https://modelscope.cn/models/MiniMax/MiniMax-M1-80k)|minimax_m1|minimax_m1|-|✘|-|[MiniMaxAI/MiniMax-M1-80k](https://huggingface.co/MiniMaxAI/MiniMax-M1-80k)|
|[MiniMax/MiniMax-M2](https://modelscope.cn/models/MiniMax/MiniMax-M2)|minimax_m2|minimax_m2|transformers==4.57.1|✔|-|[MiniMaxAI/MiniMax-M2](https://huggingface.co/MiniMaxAI/MiniMax-M2)|
|[MiniMax/MiniMax-M2.1](https://modelscope.cn/models/MiniMax/MiniMax-M2.1)|minimax_m2|minimax_m2_1|transformers==4.57.1|✔|-|[MiniMaxAI/MiniMax-M2.1](https://huggingface.co/MiniMaxAI/MiniMax-M2.1)|
|[MiniMax/MiniMax-M2.5](https://modelscope.cn/models/MiniMax/MiniMax-M2.5)|minimax_m2|minimax_m2_5|transformers==4.57.1|✔|-|[MiniMaxAI/MiniMax-M2.5](https://huggingface.co/MiniMaxAI/MiniMax-M2.5)|
|[AI-ModelScope/gemma-2b-it](https://modelscope.cn/models/AI-ModelScope/gemma-2b-it)|gemma|gemma|transformers>=4.38|✘|-|[google/gemma-2b-it](https://huggingface.co/google/gemma-2b-it)|
|[AI-ModelScope/gemma-2b](https://modelscope.cn/models/AI-ModelScope/gemma-2b)|gemma|gemma|transformers>=4.38|✘|-|[google/gemma-2b](https://huggingface.co/google/gemma-2b)|
|[AI-ModelScope/gemma-7b](https://modelscope.cn/models/AI-ModelScope/gemma-7b)|gemma|gemma|transformers>=4.38|✘|-|[google/gemma-7b](https://huggingface.co/google/gemma-7b)|
|[AI-ModelScope/gemma-7b-it](https://modelscope.cn/models/AI-ModelScope/gemma-7b-it)|gemma|gemma|transformers>=4.38|✘|-|[google/gemma-7b-it](https://huggingface.co/google/gemma-7b-it)|
|[LLM-Research/gemma-2-2b-it](https://modelscope.cn/models/LLM-Research/gemma-2-2b-it)|gemma2|gemma|transformers>=4.42|✘|-|[google/gemma-2-2b-it](https://huggingface.co/google/gemma-2-2b-it)|
|[LLM-Research/gemma-2-2b](https://modelscope.cn/models/LLM-Research/gemma-2-2b)|gemma2|gemma|transformers>=4.42|✘|-|[google/gemma-2-2b](https://huggingface.co/google/gemma-2-2b)|
|[LLM-Research/gemma-2-9b](https://modelscope.cn/models/LLM-Research/gemma-2-9b)|gemma2|gemma|transformers>=4.42|✘|-|[google/gemma-2-9b](https://huggingface.co/google/gemma-2-9b)|
|[LLM-Research/gemma-2-9b-it](https://modelscope.cn/models/LLM-Research/gemma-2-9b-it)|gemma2|gemma|transformers>=4.42|✘|-|[google/gemma-2-9b-it](https://huggingface.co/google/gemma-2-9b-it)|
|[LLM-Research/gemma-2-27b](https://modelscope.cn/models/LLM-Research/gemma-2-27b)|gemma2|gemma|transformers>=4.42|✘|-|[google/gemma-2-27b](https://huggingface.co/google/gemma-2-27b)|
|[LLM-Research/gemma-2-27b-it](https://modelscope.cn/models/LLM-Research/gemma-2-27b-it)|gemma2|gemma|transformers>=4.42|✘|-|[google/gemma-2-27b-it](https://huggingface.co/google/gemma-2-27b-it)|
|[LLM-Research/gemma-3-1b-pt](https://modelscope.cn/models/LLM-Research/gemma-3-1b-pt)|gemma3_text|gemma3_text|transformers>=4.49|✘|-|[google/gemma-3-1b-pt](https://huggingface.co/google/gemma-3-1b-pt)|
|[LLM-Research/gemma-3-1b-it](https://modelscope.cn/models/LLM-Research/gemma-3-1b-it)|gemma3_text|gemma3_text|transformers>=4.49|✘|-|[google/gemma-3-1b-it](https://huggingface.co/google/gemma-3-1b-it)|
|[google/gemma-3-270m](https://modelscope.cn/models/google/gemma-3-270m)|gemma3_text|gemma3_text|transformers>=4.49|✘|-|[google/gemma-3-270m](https://huggingface.co/google/gemma-3-270m)|
|[google/gemma-3-270m-it](https://modelscope.cn/models/google/gemma-3-270m-it)|gemma3_text|gemma3_text|transformers>=4.49|✘|-|[google/gemma-3-270m-it](https://huggingface.co/google/gemma-3-270m-it)|
|[google/medgemma-27b-text-it](https://modelscope.cn/models/google/medgemma-27b-text-it)|gemma3_text|gemma3_text|transformers>=4.49|✘|-|[google/medgemma-27b-text-it](https://huggingface.co/google/medgemma-27b-text-it)|
|[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base)|skywork|skywork|-|✘|-|[skywork/Skywork-13B-base](https://huggingface.co/skywork/Skywork-13B-base)|
|[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat)|skywork|skywork|-|✘|-|-|
|[inclusionAI/Ling-lite](https://modelscope.cn/models/inclusionAI/Ling-lite)|ling|ling|-|✘|-|[inclusionAI/Ling-lite](https://huggingface.co/inclusionAI/Ling-lite)|
|[inclusionAI/Ling-plus](https://modelscope.cn/models/inclusionAI/Ling-plus)|ling|ling|-|✘|-|[inclusionAI/Ling-plus](https://huggingface.co/inclusionAI/Ling-plus)|
|[inclusionAI/Ling-lite-base](https://modelscope.cn/models/inclusionAI/Ling-lite-base)|ling|ling|-|✘|-|[inclusionAI/Ling-lite-base](https://huggingface.co/inclusionAI/Ling-lite-base)|
|[inclusionAI/Ling-plus-base](https://modelscope.cn/models/inclusionAI/Ling-plus-base)|ling|ling|-|✘|-|[inclusionAI/Ling-plus-base](https://huggingface.co/inclusionAI/Ling-plus-base)|
|[inclusionAI/Ling-mini-2.0](https://modelscope.cn/models/inclusionAI/Ling-mini-2.0)|ling2|ling2|-|✘|-|[inclusionAI/Ling-mini-2.0](https://huggingface.co/inclusionAI/Ling-mini-2.0)|
|[inclusionAI/Ling-mini-base-2.0](https://modelscope.cn/models/inclusionAI/Ling-mini-base-2.0)|ling2|ling2|-|✘|-|[inclusionAI/Ling-mini-base-2.0](https://huggingface.co/inclusionAI/Ling-mini-base-2.0)|
|[inclusionAI/Ring-mini-2.0](https://modelscope.cn/models/inclusionAI/Ring-mini-2.0)|ring2|ring2|-|✘|-|[inclusionAI/Ring-mini-2.0](https://huggingface.co/inclusionAI/Ring-mini-2.0)|
|[IEITYuan/Yuan2.0-2B-hf](https://modelscope.cn/models/IEITYuan/Yuan2.0-2B-hf)|yuan2|yuan|-|✘|-|[IEITYuan/Yuan2-2B-hf](https://huggingface.co/IEITYuan/Yuan2-2B-hf)|
|[IEITYuan/Yuan2.0-51B-hf](https://modelscope.cn/models/IEITYuan/Yuan2.0-51B-hf)|yuan2|yuan|-|✘|-|[IEITYuan/Yuan2-51B-hf](https://huggingface.co/IEITYuan/Yuan2-51B-hf)|
|[IEITYuan/Yuan2.0-102B-hf](https://modelscope.cn/models/IEITYuan/Yuan2.0-102B-hf)|yuan2|yuan|-|✘|-|[IEITYuan/Yuan2-102B-hf](https://huggingface.co/IEITYuan/Yuan2-102B-hf)|
|[IEITYuan/Yuan2-2B-Janus-hf](https://modelscope.cn/models/IEITYuan/Yuan2-2B-Janus-hf)|yuan2|yuan|-|✘|-|[IEITYuan/Yuan2-2B-Janus-hf](https://huggingface.co/IEITYuan/Yuan2-2B-Janus-hf)|
|[IEITYuan/Yuan2-M32-hf](https://modelscope.cn/models/IEITYuan/Yuan2-M32-hf)|yuan2|yuan|-|✘|-|[IEITYuan/Yuan2-M32-hf](https://huggingface.co/IEITYuan/Yuan2-M32-hf)|
|[OrionStarAI/Orion-14B-Chat](https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat)|orion|orion|-|✘|-|[OrionStarAI/Orion-14B-Chat](https://huggingface.co/OrionStarAI/Orion-14B-Chat)|
|[OrionStarAI/Orion-14B-Base](https://modelscope.cn/models/OrionStarAI/Orion-14B-Base)|orion|orion|-|✘|-|[OrionStarAI/Orion-14B-Base](https://huggingface.co/OrionStarAI/Orion-14B-Base)|
|[xverse/XVERSE-7B-Chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat)|xverse|xverse|-|✘|-|[xverse/XVERSE-7B-Chat](https://huggingface.co/xverse/XVERSE-7B-Chat)|
|[xverse/XVERSE-7B](https://modelscope.cn/models/xverse/XVERSE-7B)|xverse|xverse|-|✘|-|[xverse/XVERSE-7B](https://huggingface.co/xverse/XVERSE-7B)|
|[xverse/XVERSE-13B](https://modelscope.cn/models/xverse/XVERSE-13B)|xverse|xverse|-|✘|-|[xverse/XVERSE-13B](https://huggingface.co/xverse/XVERSE-13B)|
|[xverse/XVERSE-13B-Chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat)|xverse|xverse|-|✘|-|[xverse/XVERSE-13B-Chat](https://huggingface.co/xverse/XVERSE-13B-Chat)|
|[xverse/XVERSE-65B](https://modelscope.cn/models/xverse/XVERSE-65B)|xverse|xverse|-|✘|-|[xverse/XVERSE-65B](https://huggingface.co/xverse/XVERSE-65B)|
|[xverse/XVERSE-65B-2](https://modelscope.cn/models/xverse/XVERSE-65B-2)|xverse|xverse|-|✘|-|[xverse/XVERSE-65B-2](https://huggingface.co/xverse/XVERSE-65B-2)|
|[xverse/XVERSE-65B-Chat](https://modelscope.cn/models/xverse/XVERSE-65B-Chat)|xverse|xverse|-|✘|-|[xverse/XVERSE-65B-Chat](https://huggingface.co/xverse/XVERSE-65B-Chat)|
|[xverse/XVERSE-13B-256K](https://modelscope.cn/models/xverse/XVERSE-13B-256K)|xverse|xverse|-|✘|-|[xverse/XVERSE-13B-256K](https://huggingface.co/xverse/XVERSE-13B-256K)|
|[xverse/XVERSE-MoE-A4.2B](https://modelscope.cn/models/xverse/XVERSE-MoE-A4.2B)|xverse_moe|xverse|-|✘|-|[xverse/XVERSE-MoE-A4.2B](https://huggingface.co/xverse/XVERSE-MoE-A4.2B)|
|[damo/nlp_seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m)|seggpt|default|-|✘|-|[DAMO-NLP/SeqGPT-560M](https://huggingface.co/DAMO-NLP/SeqGPT-560M)|
|[vivo-ai/BlueLM-7B-Chat-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat-32K)|bluelm|bluelm|-|✘|-|[vivo-ai/BlueLM-7B-Chat-32K](https://huggingface.co/vivo-ai/BlueLM-7B-Chat-32K)|
|[vivo-ai/BlueLM-7B-Chat](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat)|bluelm|bluelm|-|✘|-|[vivo-ai/BlueLM-7B-Chat](https://huggingface.co/vivo-ai/BlueLM-7B-Chat)|
|[vivo-ai/BlueLM-7B-Base-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base-32K)|bluelm|bluelm|-|✘|-|[vivo-ai/BlueLM-7B-Base-32K](https://huggingface.co/vivo-ai/BlueLM-7B-Base-32K)|
|[vivo-ai/BlueLM-7B-Base](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base)|bluelm|bluelm|-|✘|-|[vivo-ai/BlueLM-7B-Base](https://huggingface.co/vivo-ai/BlueLM-7B-Base)|
|[AI-ModelScope/c4ai-command-r-v01](https://modelscope.cn/models/AI-ModelScope/c4ai-command-r-v01)|c4ai|c4ai|transformers>=4.39|✘|-|[CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01)|
|[AI-ModelScope/c4ai-command-r-plus](https://modelscope.cn/models/AI-ModelScope/c4ai-command-r-plus)|c4ai|c4ai|transformers>=4.39|✘|-|[CohereForAI/c4ai-command-r-plus](https://huggingface.co/CohereForAI/c4ai-command-r-plus)|
|[AI-ModelScope/dbrx-base](https://modelscope.cn/models/AI-ModelScope/dbrx-base)|dbrx|dbrx|transformers>=4.36|✘|-|[databricks/dbrx-base](https://huggingface.co/databricks/dbrx-base)|
|[AI-ModelScope/dbrx-instruct](https://modelscope.cn/models/AI-ModelScope/dbrx-instruct)|dbrx|dbrx|transformers>=4.36|✘|-|[databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct)|
|[colossalai/grok-1-pytorch](https://modelscope.cn/models/colossalai/grok-1-pytorch)|grok|default|-|✘|-|[hpcai-tech/grok-1](https://huggingface.co/hpcai-tech/grok-1)|
|[AI-ModelScope/mamba-130m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-130m-hf)|mamba|default|transformers>=4.39.0|✘|-|[state-spaces/mamba-130m-hf](https://huggingface.co/state-spaces/mamba-130m-hf)|
|[AI-ModelScope/mamba-370m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-370m-hf)|mamba|default|transformers>=4.39.0|✘|-|[state-spaces/mamba-370m-hf](https://huggingface.co/state-spaces/mamba-370m-hf)|
|[AI-ModelScope/mamba-390m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-390m-hf)|mamba|default|transformers>=4.39.0|✘|-|[state-spaces/mamba-390m-hf](https://huggingface.co/state-spaces/mamba-390m-hf)|
|[AI-ModelScope/mamba-790m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-790m-hf)|mamba|default|transformers>=4.39.0|✘|-|[state-spaces/mamba-790m-hf](https://huggingface.co/state-spaces/mamba-790m-hf)|
|[AI-ModelScope/mamba-1.4b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-1.4b-hf)|mamba|default|transformers>=4.39.0|✘|-|[state-spaces/mamba-1.4b-hf](https://huggingface.co/state-spaces/mamba-1.4b-hf)|
|[AI-ModelScope/mamba-2.8b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-2.8b-hf)|mamba|default|transformers>=4.39.0|✘|-|[state-spaces/mamba-2.8b-hf](https://huggingface.co/state-spaces/mamba-2.8b-hf)|
|[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation)|polylm|default|-|✘|-|[DAMO-NLP-MT/polylm-13b](https://huggingface.co/DAMO-NLP-MT/polylm-13b)|
|[AI-ModelScope/aya-expanse-8b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-8b)|aya|aya|transformers>=4.44.0|✘|-|[CohereForAI/aya-expanse-8b](https://huggingface.co/CohereForAI/aya-expanse-8b)|
|[AI-ModelScope/aya-expanse-32b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-32b)|aya|aya|transformers>=4.44.0|✘|-|[CohereForAI/aya-expanse-32b](https://huggingface.co/CohereForAI/aya-expanse-32b)|
|[XiaomiMiMo/MiMo-7B-Base](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-Base)|mimo|qwen|transformers>=4.37|✔|-|[XiaomiMiMo/MiMo-7B-Base](https://huggingface.co/XiaomiMiMo/MiMo-7B-Base)|
|[XiaomiMiMo/MiMo-7B-SFT](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-SFT)|mimo|qwen|transformers>=4.37|✔|-|[XiaomiMiMo/MiMo-7B-SFT](https://huggingface.co/XiaomiMiMo/MiMo-7B-SFT)|
|[XiaomiMiMo/MiMo-7B-RL-Zero](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-RL-Zero)|mimo|qwen|transformers>=4.37|✔|-|[XiaomiMiMo/MiMo-7B-RL-Zero](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL-Zero)|
|[XiaomiMiMo/MiMo-7B-RL](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-RL)|mimo|qwen|transformers>=4.37|✔|-|[XiaomiMiMo/MiMo-7B-RL](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL)|
|[XiaomiMiMo/MiMo-7B-RL-0530](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-RL-0530)|mimo|mimo_rl|transformers>=4.37|✔|-|[XiaomiMiMo/MiMo-7B-RL-0530](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL-0530)|
|[rednote-hilab/dots.llm1.base](https://modelscope.cn/models/rednote-hilab/dots.llm1.base)|dots1|dots1|transformers>=4.53|✔|-|[rednote-hilab/dots.llm1.base](https://huggingface.co/rednote-hilab/dots.llm1.base)|
|[rednote-hilab/dots.llm1.inst](https://modelscope.cn/models/rednote-hilab/dots.llm1.inst)|dots1|dots1|transformers>=4.53|✔|-|[rednote-hilab/dots.llm1.inst](https://huggingface.co/rednote-hilab/dots.llm1.inst)|
|[Tencent-Hunyuan/Hunyuan-A13B-Instruct](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-A13B-Instruct)|hunyuan|hunyuan_moe|-|✘|-|[tencent/Hunyuan-A13B-Instruct](https://huggingface.co/tencent/Hunyuan-A13B-Instruct)|
|[Tencent-Hunyuan/Hunyuan-0.5B-Instruct](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-0.5B-Instruct)|hunyuan_v1_dense|hunyuan|transformers>=4.55.0.dev0|✘|-|[tencent/Hunyuan-0.5B-Instruct](https://huggingface.co/tencent/Hunyuan-0.5B-Instruct)|
|[Tencent-Hunyuan/Hunyuan-1.8B-Instruct](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-1.8B-Instruct)|hunyuan_v1_dense|hunyuan|transformers>=4.55.0.dev0|✘|-|[tencent/Hunyuan-1.8B-Instruct](https://huggingface.co/tencent/Hunyuan-1.8B-Instruct)|
|[Tencent-Hunyuan/Hunyuan-4B-Instruct](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-4B-Instruct)|hunyuan_v1_dense|hunyuan|transformers>=4.55.0.dev0|✘|-|[tencent/Hunyuan-4B-Instruct](https://huggingface.co/tencent/Hunyuan-4B-Instruct)|
|[Tencent-Hunyuan/Hunyuan-7B-Instruct](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-7B-Instruct)|hunyuan_v1_dense|hunyuan|transformers>=4.55.0.dev0|✘|-|[tencent/Hunyuan-7B-Instruct](https://huggingface.co/tencent/Hunyuan-7B-Instruct)|
|[Tencent-Hunyuan/Hunyuan-0.5B-Pretrain](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-0.5B-Pretrain)|hunyuan_v1_dense|hunyuan|transformers>=4.55.0.dev0|✘|-|[tencent/Hunyuan-0.5B-Pretrain](https://huggingface.co/tencent/Hunyuan-0.5B-Pretrain)|
|[Tencent-Hunyuan/Hunyuan-1.8B-Pretrain](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-1.8B-Pretrain)|hunyuan_v1_dense|hunyuan|transformers>=4.55.0.dev0|✘|-|[tencent/Hunyuan-1.8B-Pretrain](https://huggingface.co/tencent/Hunyuan-1.8B-Pretrain)|
|[Tencent-Hunyuan/Hunyuan-4B-Pretrain](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-4B-Pretrain)|hunyuan_v1_dense|hunyuan|transformers>=4.55.0.dev0|✘|-|[tencent/Hunyuan-4B-Pretrain](https://huggingface.co/tencent/Hunyuan-4B-Pretrain)|
|[Tencent-Hunyuan/Hunyuan-7B-Pretrain](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-7B-Pretrain)|hunyuan_v1_dense|hunyuan|transformers>=4.55.0.dev0|✘|-|[tencent/Hunyuan-7B-Pretrain](https://huggingface.co/tencent/Hunyuan-7B-Pretrain)|
|[Tencent-Hunyuan/Hunyuan-0.5B-Instruct-FP8](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-0.5B-Instruct-FP8)|hunyuan_v1_dense|hunyuan|transformers>=4.55.0.dev0|✘|-|[tencent/Hunyuan-0.5B-Instruct-FP8](https://huggingface.co/tencent/Hunyuan-0.5B-Instruct-FP8)|
|[Tencent-Hunyuan/Hunyuan-1.8B-Instruct-FP8](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-1.8B-Instruct-FP8)|hunyuan_v1_dense|hunyuan|transformers>=4.55.0.dev0|✘|-|[tencent/Hunyuan-1.8B-Instruct-FP8](https://huggingface.co/tencent/Hunyuan-1.8B-Instruct-FP8)|
|[Tencent-Hunyuan/Hunyuan-4B-Instruct-FP8](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-4B-Instruct-FP8)|hunyuan_v1_dense|hunyuan|transformers>=4.55.0.dev0|✘|-|[tencent/Hunyuan-4B-Instruct-FP8](https://huggingface.co/tencent/Hunyuan-4B-Instruct-FP8)|
|[Tencent-Hunyuan/Hunyuan-7B-Instruct-FP8](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-7B-Instruct-FP8)|hunyuan_v1_dense|hunyuan|transformers>=4.55.0.dev0|✘|-|[tencent/Hunyuan-7B-Instruct-FP8](https://huggingface.co/tencent/Hunyuan-7B-Instruct-FP8)|
|[Tencent-Hunyuan/Hunyuan-0.5B-Instruct-AWQ-Int4](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-0.5B-Instruct-AWQ-Int4)|hunyuan_v1_dense|hunyuan|transformers>=4.55.0.dev0|✘|-|[tencent/Hunyuan-0.5B-Instruct-AWQ-Int4](https://huggingface.co/tencent/Hunyuan-0.5B-Instruct-AWQ-Int4)|
|[Tencent-Hunyuan/Hunyuan-1.8B-Instruct-AWQ-Int4](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-1.8B-Instruct-AWQ-Int4)|hunyuan_v1_dense|hunyuan|transformers>=4.55.0.dev0|✘|-|[tencent/Hunyuan-1.8B-Instruct-AWQ-Int4](https://huggingface.co/tencent/Hunyuan-1.8B-Instruct-AWQ-Int4)|
|[Tencent-Hunyuan/Hunyuan-4B-Instruct-AWQ-Int4](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-4B-Instruct-AWQ-Int4)|hunyuan_v1_dense|hunyuan|transformers>=4.55.0.dev0|✘|-|[tencent/Hunyuan-4B-Instruct-AWQ-Int4](https://huggingface.co/tencent/Hunyuan-4B-Instruct-AWQ-Int4)|
|[Tencent-Hunyuan/Hunyuan-7B-Instruct-AWQ-Int4](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-7B-Instruct-AWQ-Int4)|hunyuan_v1_dense|hunyuan|transformers>=4.55.0.dev0|✘|-|[tencent/Hunyuan-7B-Instruct-AWQ-Int4](https://huggingface.co/tencent/Hunyuan-7B-Instruct-AWQ-Int4)|
|[Tencent-Hunyuan/Hunyuan-0.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-0.5B-Instruct-GPTQ-Int4)|hunyuan_v1_dense|hunyuan|transformers>=4.55.0.dev0|✘|-|[tencent/Hunyuan-0.5B-Instruct-GPTQ-Int4](https://huggingface.co/tencent/Hunyuan-0.5B-Instruct-GPTQ-Int4)|
|[Tencent-Hunyuan/Hunyuan-1.8B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-1.8B-Instruct-GPTQ-Int4)|hunyuan_v1_dense|hunyuan|transformers>=4.55.0.dev0|✘|-|[tencent/Hunyuan-1.8B-Instruct-GPTQ-Int4](https://huggingface.co/tencent/Hunyuan-1.8B-Instruct-GPTQ-Int4)|
|[Tencent-Hunyuan/Hunyuan-4B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-4B-Instruct-GPTQ-Int4)|hunyuan_v1_dense|hunyuan|transformers>=4.55.0.dev0|✘|-|[tencent/Hunyuan-4B-Instruct-GPTQ-Int4](https://huggingface.co/tencent/Hunyuan-4B-Instruct-GPTQ-Int4)|
|[Tencent-Hunyuan/Hunyuan-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-7B-Instruct-GPTQ-Int4)|hunyuan_v1_dense|hunyuan|transformers>=4.55.0.dev0|✘|-|[tencent/Hunyuan-7B-Instruct-GPTQ-Int4](https://huggingface.co/tencent/Hunyuan-7B-Instruct-GPTQ-Int4)|
|[PaddlePaddle/ERNIE-4.5-0.3B-Base-PT](https://modelscope.cn/models/PaddlePaddle/ERNIE-4.5-0.3B-Base-PT)|ernie4_5|ernie|-|✔|-|[baidu/ERNIE-4.5-0.3B-Base-PT](https://huggingface.co/baidu/ERNIE-4.5-0.3B-Base-PT)|
|[PaddlePaddle/ERNIE-4.5-0.3B-PT](https://modelscope.cn/models/PaddlePaddle/ERNIE-4.5-0.3B-PT)|ernie4_5|ernie|-|✔|-|[baidu/ERNIE-4.5-0.3B-PT](https://huggingface.co/baidu/ERNIE-4.5-0.3B-PT)|
|[PaddlePaddle/ERNIE-4.5-21B-A3B-Base-PT](https://modelscope.cn/models/PaddlePaddle/ERNIE-4.5-21B-A3B-Base-PT)|ernie4_5_moe|ernie|-|✔|-|[baidu/ERNIE-4.5-21B-A3B-Base-PT](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Base-PT)|
|[PaddlePaddle/ERNIE-4.5-21B-A3B-PT](https://modelscope.cn/models/PaddlePaddle/ERNIE-4.5-21B-A3B-PT)|ernie4_5_moe|ernie|-|✔|-|[baidu/ERNIE-4.5-21B-A3B-PT](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-PT)|
|[PaddlePaddle/ERNIE-4.5-300B-A47B-Base-PT](https://modelscope.cn/models/PaddlePaddle/ERNIE-4.5-300B-A47B-Base-PT)|ernie4_5_moe|ernie|-|✔|-|[baidu/ERNIE-4.5-300B-A47B-Base-PT](https://huggingface.co/baidu/ERNIE-4.5-300B-A47B-Base-PT)|
|[PaddlePaddle/ERNIE-4.5-300B-A47B-PT](https://modelscope.cn/models/PaddlePaddle/ERNIE-4.5-300B-A47B-PT)|ernie4_5_moe|ernie|-|✔|-|[baidu/ERNIE-4.5-300B-A47B-PT](https://huggingface.co/baidu/ERNIE-4.5-300B-A47B-PT)|
|[PaddlePaddle/ERNIE-4.5-21B-A3B-Thinking](https://modelscope.cn/models/PaddlePaddle/ERNIE-4.5-21B-A3B-Thinking)|ernie4_5_moe|ernie_thinking|-|✔|-|[baidu/ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking)|
|[google/embeddinggemma-300m](https://modelscope.cn/models/google/embeddinggemma-300m)|gemma_emb|dummy|-|✘|-|[google/embeddinggemma-300m](https://huggingface.co/google/embeddinggemma-300m)|
|[meituan-longcat/LongCat-Flash-Chat](https://modelscope.cn/models/meituan-longcat/LongCat-Flash-Chat)|longchat|longchat|transformers>=4.54,<4.56|✘|-|[meituan-longcat/LongCat-Flash-Chat](https://huggingface.co/meituan-longcat/LongCat-Flash-Chat)|
|[meituan-longcat/LongCat-Flash-Chat-FP8](https://modelscope.cn/models/meituan-longcat/LongCat-Flash-Chat-FP8)|longchat|longchat|transformers>=4.54,<4.56|✘|-|[meituan-longcat/LongCat-Flash-Chat-FP8](https://huggingface.co/meituan-longcat/LongCat-Flash-Chat-FP8)|
|[IQuestLab/IQuest-Coder-V1-40B-Base-Stage1](https://modelscope.cn/models/IQuestLab/IQuest-Coder-V1-40B-Base-Stage1)|iquestcoder|iquestcoder|transformers==4.52.4|✘|-|[IQuestLab/IQuest-Coder-V1-40B-Base-Stage1](https://huggingface.co/IQuestLab/IQuest-Coder-V1-40B-Base-Stage1)|
|[IQuestLab/IQuest-Coder-V1-40B-Base](https://modelscope.cn/models/IQuestLab/IQuest-Coder-V1-40B-Base)|iquestcoder|iquestcoder|transformers==4.52.4|✘|-|[IQuestLab/IQuest-Coder-V1-40B-Base](https://huggingface.co/IQuestLab/IQuest-Coder-V1-40B-Base)|
|[IQuestLab/IQuest-Coder-V1-40B-Instruct](https://modelscope.cn/models/IQuestLab/IQuest-Coder-V1-40B-Instruct)|iquestcoder|iquestcoder|transformers==4.52.4|✘|-|[IQuestLab/IQuest-Coder-V1-40B-Instruct](https://huggingface.co/IQuestLab/IQuest-Coder-V1-40B-Instruct)|
|[Tencent-YouTu-Research/Youtu-LLM-2B](https://modelscope.cn/models/Tencent-YouTu-Research/Youtu-LLM-2B)|youtu_llm|youtu_llm|transformers>=4.56|✘|-|[tencent/Youtu-LLM-2B](https://huggingface.co/tencent/Youtu-LLM-2B)|
|[Tencent-YouTu-Research/Youtu-LLM-2B-Base](https://modelscope.cn/models/Tencent-YouTu-Research/Youtu-LLM-2B-Base)|youtu_llm|youtu_llm|transformers>=4.56|✘|-|[tencent/Youtu-LLM-2B-Base](https://huggingface.co/tencent/Youtu-LLM-2B-Base)|
|[iic/gte-reranker-modernbert-base](https://modelscope.cn/models/iic/gte-reranker-modernbert-base)|modern_bert_gte_reranker|bert|transformers>=4.48|✘|bert, reranker|[Alibaba-NLP/gte-reranker-modernbert-base](https://huggingface.co/Alibaba-NLP/gte-reranker-modernbert-base)|
|[BAAI/bge-reranker-base](https://modelscope.cn/models/BAAI/bge-reranker-base)|bge_reranker|bge_reranker|-|✘|-|[BAAI/bge-reranker-base](https://huggingface.co/BAAI/bge-reranker-base)|
|[BAAI/bge-reranker-v2-m3](https://modelscope.cn/models/BAAI/bge-reranker-v2-m3)|bge_reranker|bge_reranker|-|✘|-|[BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3)|
|[BAAI/bge-reranker-large](https://modelscope.cn/models/BAAI/bge-reranker-large)|bge_reranker|bge_reranker|-|✘|-|[BAAI/bge-reranker-large](https://huggingface.co/BAAI/bge-reranker-large)|
|[allenai/OLMoE-1B-7B-0125](https://modelscope.cn/models/allenai/OLMoE-1B-7B-0125)|olmoe|olmoe|-|✔|-|[allenai/OLMoE-1B-7B-0125](https://huggingface.co/allenai/OLMoE-1B-7B-0125)|
|[allenai/OLMoE-1B-7B-0125-Instruct](https://modelscope.cn/models/allenai/OLMoE-1B-7B-0125-Instruct)|olmoe|olmoe|-|✔|-|[allenai/OLMoE-1B-7B-0125-Instruct](https://huggingface.co/allenai/OLMoE-1B-7B-0125-Instruct)|
|[allenai/OLMoE-1B-7B-0924](https://modelscope.cn/models/allenai/OLMoE-1B-7B-0924)|olmoe|olmoe_0924|-|✔|-|[allenai/OLMoE-1B-7B-0924](https://huggingface.co/allenai/OLMoE-1B-7B-0924)|
|[allenai/OLMoE-1B-7B-0924-Instruct](https://modelscope.cn/models/allenai/OLMoE-1B-7B-0924-Instruct)|olmoe|olmoe_0924|-|✔|-|[allenai/OLMoE-1B-7B-0924-Instruct](https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct)|
|[allenai/OLMoE-1B-7B-0924-SFT](https://modelscope.cn/models/allenai/OLMoE-1B-7B-0924-SFT)|olmoe|olmoe_0924|-|✔|-|[allenai/OLMoE-1B-7B-0924-SFT](https://huggingface.co/allenai/OLMoE-1B-7B-0924-SFT)|
|[answerdotai/ModernBERT-base](https://modelscope.cn/models/answerdotai/ModernBERT-base)|modern_bert|dummy|transformers>=4.48|✘|bert|[answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)|
|[answerdotai/ModernBERT-large](https://modelscope.cn/models/answerdotai/ModernBERT-large)|modern_bert|dummy|transformers>=4.48|✘|bert|[answerdotai/ModernBERT-large](https://huggingface.co/answerdotai/ModernBERT-large)|
|[iic/gte-modernbert-base](https://modelscope.cn/models/iic/gte-modernbert-base)|modern_bert_gte|dummy|transformers>=4.48|✘|bert, embedding|[Alibaba-NLP/gte-modernbert-base](https://huggingface.co/Alibaba-NLP/gte-modernbert-base)|
|[iic/nlp_structbert_backbone_base_std](https://modelscope.cn/models/iic/nlp_structbert_backbone_base_std)|bert|dummy|-|✘|bert|-|
|[Shanghai_AI_Laboratory/internlm2-1_8b-reward](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-1_8b-reward)|internlm2_reward|internlm2_reward|transformers>=4.38|✘|-|[internlm/internlm2-1_8b-reward](https://huggingface.co/internlm/internlm2-1_8b-reward)|
|[Shanghai_AI_Laboratory/internlm2-7b-reward](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-7b-reward)|internlm2_reward|internlm2_reward|transformers>=4.38|✘|-|[internlm/internlm2-7b-reward](https://huggingface.co/internlm/internlm2-7b-reward)|
|[Shanghai_AI_Laboratory/internlm2-20b-reward](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-20b-reward)|internlm2_reward|internlm2_reward|transformers>=4.38|✘|-|[internlm/internlm2-20b-reward](https://huggingface.co/internlm/internlm2-20b-reward)|
|[Qwen/Qwen2-Math-RM-72B](https://modelscope.cn/models/Qwen/Qwen2-Math-RM-72B)|qwen2_reward|qwen|transformers>=4.37|✘|-|[Qwen/Qwen2-Math-RM-72B](https://huggingface.co/Qwen/Qwen2-Math-RM-72B)|
|[Qwen/Qwen2.5-Math-RM-72B](https://modelscope.cn/models/Qwen/Qwen2.5-Math-RM-72B)|qwen2_reward|qwen2_5_math|transformers>=4.37|✘|-|[Qwen/Qwen2.5-Math-RM-72B](https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B)|
|[Qwen/Qwen2.5-Math-PRM-7B](https://modelscope.cn/models/Qwen/Qwen2.5-Math-PRM-7B)|qwen2_5_prm|qwen2_5_math_prm|transformers>=4.37|✘|-|[Qwen/Qwen2.5-Math-PRM-7B](https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B)|
|[Qwen/Qwen2.5-Math-7B-PRM800K](https://modelscope.cn/models/Qwen/Qwen2.5-Math-7B-PRM800K)|qwen2_5_prm|qwen2_5_math_prm|transformers>=4.37|✘|-|[Qwen/Qwen2.5-Math-7B-PRM800K](https://huggingface.co/Qwen/Qwen2.5-Math-7B-PRM800K)|
|[Qwen/Qwen2.5-Math-PRM-72B](https://modelscope.cn/models/Qwen/Qwen2.5-Math-PRM-72B)|qwen2_5_prm|qwen2_5_math_prm|transformers>=4.37|✘|-|[Qwen/Qwen2.5-Math-PRM-72B](https://huggingface.co/Qwen/Qwen2.5-Math-PRM-72B)|
|[AI-ModelScope/Skywork-Reward-Llama-3.1-8B](https://modelscope.cn/models/AI-ModelScope/Skywork-Reward-Llama-3.1-8B)|llama3_2_reward|llama3_2|transformers>=4.43|✘|-|[Skywork/Skywork-Reward-Llama-3.1-8B](https://huggingface.co/Skywork/Skywork-Reward-Llama-3.1-8B)|
|[AI-ModelScope/Skywork-Reward-Llama-3.1-8B-v0.2](https://modelscope.cn/models/AI-ModelScope/Skywork-Reward-Llama-3.1-8B-v0.2)|llama3_2_reward|llama3_2|transformers>=4.43|✘|-|[Skywork/Skywork-Reward-Llama-3.1-8B-v0.2](https://huggingface.co/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2)|
|[AI-ModelScope/GRM_Llama3.1_8B_rewardmodel-ft](https://modelscope.cn/models/AI-ModelScope/GRM_Llama3.1_8B_rewardmodel-ft)|llama3_2_reward|llama3_2|transformers>=4.43|✘|-|[Ray2333/GRM_Llama3.1_8B_rewardmodel-ft](https://huggingface.co/Ray2333/GRM_Llama3.1_8B_rewardmodel-ft)|
|[AI-ModelScope/GRM-llama3.2-3B-rewardmodel-ft](https://modelscope.cn/models/AI-ModelScope/GRM-llama3.2-3B-rewardmodel-ft)|llama3_2_reward|llama3_2|transformers>=4.43|✘|-|[Ray2333/GRM-llama3.2-3B-rewardmodel-ft](https://huggingface.co/Ray2333/GRM-llama3.2-3B-rewardmodel-ft)|
|[AI-ModelScope/Skywork-Reward-Gemma-2-27B](https://modelscope.cn/models/AI-ModelScope/Skywork-Reward-Gemma-2-27B)|gemma_reward|gemma|transformers>=4.42|✘|-|[Skywork/Skywork-Reward-Gemma-2-27B](https://huggingface.co/Skywork/Skywork-Reward-Gemma-2-27B)|
|[AI-ModelScope/Skywork-Reward-Gemma-2-27B-v0.2](https://modelscope.cn/models/AI-ModelScope/Skywork-Reward-Gemma-2-27B-v0.2)|gemma_reward|gemma|transformers>=4.42|✘|-|[Skywork/Skywork-Reward-Gemma-2-27B-v0.2](https://huggingface.co/Skywork/Skywork-Reward-Gemma-2-27B-v0.2)|
### 多模态大模型
| Model ID | Model Type | Default Template | Requires | Support Megatron | Tags | HF Model ID |
| -------- | -----------| ---------------- | -------- | ---------------- | ---- | ----------- |
|[Qwen/Qwen-VL-Chat](https://modelscope.cn/models/Qwen/Qwen-VL-Chat)|qwen_vl|qwen_vl|-|✘|vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)|
|[Qwen/Qwen-VL](https://modelscope.cn/models/Qwen/Qwen-VL)|qwen_vl|qwen_vl|-|✘|vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)|
|[Qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/Qwen/Qwen-VL-Chat-Int4)|qwen_vl|qwen_vl|-|✘|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)|
|[Qwen/Qwen-Audio-Chat](https://modelscope.cn/models/Qwen/Qwen-Audio-Chat)|qwen_audio|qwen_audio|-|✘|audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)|
|[Qwen/Qwen-Audio](https://modelscope.cn/models/Qwen/Qwen-Audio)|qwen_audio|qwen_audio|-|✘|audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)|
|[Qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✔|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)|
|[Qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✔|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)|
|[Qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✔|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)|
|[Qwen/Qwen2-VL-2B](https://modelscope.cn/models/Qwen/Qwen2-VL-2B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✔|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)|
|[Qwen/Qwen2-VL-7B](https://modelscope.cn/models/Qwen/Qwen2-VL-7B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✔|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)|
|[Qwen/Qwen2-VL-72B](https://modelscope.cn/models/Qwen/Qwen2-VL-72B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✔|vision, video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)|
|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)|
|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)|
|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)|
|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|
|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)|
|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)|
|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)|
|[bytedance-research/UI-TARS-2B-SFT](https://modelscope.cn/models/bytedance-research/UI-TARS-2B-SFT)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✔|vision, video|[bytedance-research/UI-TARS-2B-SFT](https://huggingface.co/bytedance-research/UI-TARS-2B-SFT)|
|[bytedance-research/UI-TARS-7B-SFT](https://modelscope.cn/models/bytedance-research/UI-TARS-7B-SFT)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✔|vision, video|[bytedance-research/UI-TARS-7B-SFT](https://huggingface.co/bytedance-research/UI-TARS-7B-SFT)|
|[bytedance-research/UI-TARS-7B-DPO](https://modelscope.cn/models/bytedance-research/UI-TARS-7B-DPO)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✔|vision, video|[bytedance-research/UI-TARS-7B-DPO](https://huggingface.co/bytedance-research/UI-TARS-7B-DPO)|
|[bytedance-research/UI-TARS-72B-SFT](https://modelscope.cn/models/bytedance-research/UI-TARS-72B-SFT)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✔|vision, video|[bytedance-research/UI-TARS-72B-SFT](https://huggingface.co/bytedance-research/UI-TARS-72B-SFT)|
|[bytedance-research/UI-TARS-72B-DPO](https://modelscope.cn/models/bytedance-research/UI-TARS-72B-DPO)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✔|vision, video|[bytedance-research/UI-TARS-72B-DPO](https://huggingface.co/bytedance-research/UI-TARS-72B-DPO)|
|[allenai/olmOCR-7B-0225-preview](https://modelscope.cn/models/allenai/olmOCR-7B-0225-preview)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✔|vision, video|[allenai/olmOCR-7B-0225-preview](https://huggingface.co/allenai/olmOCR-7B-0225-preview)|
|[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qwen2_vl|qvq|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✔|vision, video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)|
|[Qwen/Qwen2.5-VL-3B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-3B-Instruct)|qwen2_5_vl|qwen2_5_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|✔|vision, video|[Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)|
|[Qwen/Qwen2.5-VL-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-7B-Instruct)|qwen2_5_vl|qwen2_5_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|✔|vision, video|[Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)|
|[Qwen/Qwen2.5-VL-32B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-32B-Instruct)|qwen2_5_vl|qwen2_5_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|✔|vision, video|[Qwen/Qwen2.5-VL-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct)|
|[Qwen/Qwen2.5-VL-72B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-72B-Instruct)|qwen2_5_vl|qwen2_5_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|✔|vision, video|[Qwen/Qwen2.5-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct)|
|[Qwen/Qwen2.5-VL-3B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-VL-3B-Instruct-AWQ)|qwen2_5_vl|qwen2_5_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[Qwen/Qwen2.5-VL-3B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct-AWQ)|
|[Qwen/Qwen2.5-VL-7B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-VL-7B-Instruct-AWQ)|qwen2_5_vl|qwen2_5_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[Qwen/Qwen2.5-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct-AWQ)|
|[Qwen/Qwen2.5-VL-32B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-VL-32B-Instruct-AWQ)|qwen2_5_vl|qwen2_5_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[Qwen/Qwen2.5-VL-32B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct-AWQ)|
|[Qwen/Qwen2.5-VL-72B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-VL-72B-Instruct-AWQ)|qwen2_5_vl|qwen2_5_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[Qwen/Qwen2.5-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct-AWQ)|
|[XiaomiMiMo/MiMo-VL-7B-SFT](https://modelscope.cn/models/XiaomiMiMo/MiMo-VL-7B-SFT)|qwen2_5_vl|mimo_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|✔|vision, video|[XiaomiMiMo/MiMo-VL-7B-SFT](https://huggingface.co/XiaomiMiMo/MiMo-VL-7B-SFT)|
|[XiaomiMiMo/MiMo-VL-7B-RL](https://modelscope.cn/models/XiaomiMiMo/MiMo-VL-7B-RL)|qwen2_5_vl|mimo_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|✔|vision, video|[XiaomiMiMo/MiMo-VL-7B-RL](https://huggingface.co/XiaomiMiMo/MiMo-VL-7B-RL)|
|[Qwen/Qwen2.5-Omni-3B](https://modelscope.cn/models/Qwen/Qwen2.5-Omni-3B)|qwen2_5_omni|qwen2_5_omni|transformers>=4.50, soundfile, qwen_omni_utils, decord|✔|vision, video, audio|[Qwen/Qwen2.5-Omni-3B](https://huggingface.co/Qwen/Qwen2.5-Omni-3B)|
|[Qwen/Qwen2.5-Omni-7B](https://modelscope.cn/models/Qwen/Qwen2.5-Omni-7B)|qwen2_5_omni|qwen2_5_omni|transformers>=4.50, soundfile, qwen_omni_utils, decord|✔|vision, video, audio|[Qwen/Qwen2.5-Omni-7B](https://huggingface.co/Qwen/Qwen2.5-Omni-7B)|
|[Qwen/Qwen3-Omni-30B-A3B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-Omni-30B-A3B-Instruct)|qwen3_omni_moe|qwen3_omni|transformers>=4.57.dev0, soundfile, decord, qwen_omni_utils>=0.0.9|✔|vision, video, audio|[Qwen/Qwen3-Omni-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct)|
|[Qwen/Qwen3-Omni-30B-A3B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-Omni-30B-A3B-Thinking)|qwen3_omni_moe|qwen3_omni|transformers>=4.57.dev0, soundfile, decord, qwen_omni_utils>=0.0.9|✔|vision, video, audio|[Qwen/Qwen3-Omni-30B-A3B-Thinking](https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Thinking)|
|[Qwen/Qwen3-Omni-30B-A3B-Captioner](https://modelscope.cn/models/Qwen/Qwen3-Omni-30B-A3B-Captioner)|qwen3_omni_moe|qwen3_omni|transformers>=4.57.dev0, soundfile, decord, qwen_omni_utils>=0.0.9|✔|vision, video, audio|[Qwen/Qwen3-Omni-30B-A3B-Captioner](https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Captioner)|
|[Qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B-Instruct)|qwen2_audio|qwen2_audio|transformers>=4.45,<4.49, librosa|✘|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)|
|[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45,<4.49, librosa|✘|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)|
|[Qwen/Qwen3-VL-2B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-2B-Instruct)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct)|
|[Qwen/Qwen3-VL-2B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-2B-Thinking)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-2B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-2B-Thinking)|
|[Qwen/Qwen3-VL-2B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-2B-Instruct-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-2B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-FP8)|
|[Qwen/Qwen3-VL-2B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-2B-Thinking-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-2B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-2B-Thinking-FP8)|
|[Qwen/Qwen3-VL-4B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-4B-Instruct)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct)|
|[Qwen/Qwen3-VL-4B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-4B-Thinking)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-4B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-4B-Thinking)|
|[Qwen/Qwen3-VL-4B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-4B-Instruct-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-4B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct-FP8)|
|[Qwen/Qwen3-VL-4B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-4B-Thinking-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-4B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-4B-Thinking-FP8)|
|[Qwen/Qwen3-VL-8B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-8B-Instruct)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-8B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct)|
|[Qwen/Qwen3-VL-8B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-8B-Thinking)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-8B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-8B-Thinking)|
|[Qwen/Qwen3-VL-8B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-8B-Instruct-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-8B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct-FP8)|
|[Qwen/Qwen3-VL-8B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-8B-Thinking-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-8B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-8B-Thinking-FP8)|
|[Qwen/Qwen3-VL-32B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-32B-Instruct)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-32B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-32B-Instruct)|
|[Qwen/Qwen3-VL-32B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-32B-Thinking)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-32B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-32B-Thinking)|
|[Qwen/Qwen3-VL-32B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-32B-Instruct-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-32B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-32B-Instruct-FP8)|
|[Qwen/Qwen3-VL-32B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-32B-Thinking-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-32B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-32B-Thinking-FP8)|
|[Qwen/Qwen3-VL-30B-A3B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Instruct)|qwen3_vl_moe|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)|
|[Qwen/Qwen3-VL-30B-A3B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Thinking)|qwen3_vl_moe|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-30B-A3B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Thinking)|
|[Qwen/Qwen3-VL-30B-A3B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Instruct-FP8)|qwen3_vl_moe|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-30B-A3B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct-FP8)|
|[Qwen/Qwen3-VL-30B-A3B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Thinking-FP8)|qwen3_vl_moe|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-30B-A3B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Thinking-FP8)|
|[Qwen/Qwen3-VL-235B-A22B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-235B-A22B-Instruct)|qwen3_vl_moe|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-235B-A22B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct)|
|[Qwen/Qwen3-VL-235B-A22B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-235B-A22B-Thinking)|qwen3_vl_moe|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-235B-A22B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Thinking)|
|[Qwen/Qwen3-VL-235B-A22B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-235B-A22B-Instruct-FP8)|qwen3_vl_moe|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-235B-A22B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct-FP8)|
|[Qwen/Qwen3-VL-235B-A22B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-235B-A22B-Thinking-FP8)|qwen3_vl_moe|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-235B-A22B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Thinking-FP8)|
|[Qwen/Qwen3-VL-Embedding-2B](https://modelscope.cn/models/Qwen/Qwen3-VL-Embedding-2B)|qwen3_vl_emb|qwen3_vl_emb|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-Embedding-2B](https://huggingface.co/Qwen/Qwen3-VL-Embedding-2B)|
|[Qwen/Qwen3-VL-Embedding-8B](https://modelscope.cn/models/Qwen/Qwen3-VL-Embedding-8B)|qwen3_vl_emb|qwen3_vl_emb|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-Embedding-8B](https://huggingface.co/Qwen/Qwen3-VL-Embedding-8B)|
|[Qwen/Qwen3-VL-Reranker-2B](https://modelscope.cn/models/Qwen/Qwen3-VL-Reranker-2B)|qwen3_vl_reranker|qwen3_vl_reranker|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-Reranker-2B](https://huggingface.co/Qwen/Qwen3-VL-Reranker-2B)|
|[Qwen/Qwen3-VL-Reranker-8B](https://modelscope.cn/models/Qwen/Qwen3-VL-Reranker-8B)|qwen3_vl_reranker|qwen3_vl_reranker|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-Reranker-8B](https://huggingface.co/Qwen/Qwen3-VL-Reranker-8B)|
|[Qwen/Qwen3.5-0.8B](https://modelscope.cn/models/Qwen/Qwen3.5-0.8B)|qwen3_5|qwen3_5|transformers>=5.0.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-0.8B](https://huggingface.co/Qwen/Qwen3.5-0.8B)|
|[Qwen/Qwen3.5-2B](https://modelscope.cn/models/Qwen/Qwen3.5-2B)|qwen3_5|qwen3_5|transformers>=5.0.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-2B](https://huggingface.co/Qwen/Qwen3.5-2B)|
|[Qwen/Qwen3.5-4B](https://modelscope.cn/models/Qwen/Qwen3.5-4B)|qwen3_5|qwen3_5|transformers>=5.0.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-4B](https://huggingface.co/Qwen/Qwen3.5-4B)|
|[Qwen/Qwen3.5-9B](https://modelscope.cn/models/Qwen/Qwen3.5-9B)|qwen3_5|qwen3_5|transformers>=5.0.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-9B](https://huggingface.co/Qwen/Qwen3.5-9B)|
|[Qwen/Qwen3.5-27B](https://modelscope.cn/models/Qwen/Qwen3.5-27B)|qwen3_5|qwen3_5|transformers>=5.0.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-27B](https://huggingface.co/Qwen/Qwen3.5-27B)|
|[Qwen/Qwen3.5-27B-FP8](https://modelscope.cn/models/Qwen/Qwen3.5-27B-FP8)|qwen3_5|qwen3_5|transformers>=5.0.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-27B-FP8](https://huggingface.co/Qwen/Qwen3.5-27B-FP8)|
|[Qwen/Qwen3.5-0.8B-Base](https://modelscope.cn/models/Qwen/Qwen3.5-0.8B-Base)|qwen3_5|qwen3_5|transformers>=5.0.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-0.8B-Base](https://huggingface.co/Qwen/Qwen3.5-0.8B-Base)|
|[Qwen/Qwen3.5-2B-Base](https://modelscope.cn/models/Qwen/Qwen3.5-2B-Base)|qwen3_5|qwen3_5|transformers>=5.0.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-2B-Base](https://huggingface.co/Qwen/Qwen3.5-2B-Base)|
|[Qwen/Qwen3.5-4B-Base](https://modelscope.cn/models/Qwen/Qwen3.5-4B-Base)|qwen3_5|qwen3_5|transformers>=5.0.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-4B-Base](https://huggingface.co/Qwen/Qwen3.5-4B-Base)|
|[Qwen/Qwen3.5-9B-Base](https://modelscope.cn/models/Qwen/Qwen3.5-9B-Base)|qwen3_5|qwen3_5|transformers>=5.0.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-9B-Base](https://huggingface.co/Qwen/Qwen3.5-9B-Base)|
|[Qwen/Qwen3.5-35B-A3B-Base](https://modelscope.cn/models/Qwen/Qwen3.5-35B-A3B-Base)|qwen3_5_moe|qwen3_5|transformers>=5.2.0, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-35B-A3B-Base](https://huggingface.co/Qwen/Qwen3.5-35B-A3B-Base)|
|[Qwen/Qwen3.5-35B-A3B](https://modelscope.cn/models/Qwen/Qwen3.5-35B-A3B)|qwen3_5_moe|qwen3_5|transformers>=5.2.0, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-35B-A3B](https://huggingface.co/Qwen/Qwen3.5-35B-A3B)|
|[Qwen/Qwen3.5-122B-A10B](https://modelscope.cn/models/Qwen/Qwen3.5-122B-A10B)|qwen3_5_moe|qwen3_5|transformers>=5.2.0, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-122B-A10B](https://huggingface.co/Qwen/Qwen3.5-122B-A10B)|
|[Qwen/Qwen3.5-397B-A17B](https://modelscope.cn/models/Qwen/Qwen3.5-397B-A17B)|qwen3_5_moe|qwen3_5|transformers>=5.2.0, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-397B-A17B](https://huggingface.co/Qwen/Qwen3.5-397B-A17B)|
|[Qwen/Qwen3.5-35B-A3B-FP8](https://modelscope.cn/models/Qwen/Qwen3.5-35B-A3B-FP8)|qwen3_5_moe|qwen3_5|transformers>=5.2.0, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-35B-A3B-FP8](https://huggingface.co/Qwen/Qwen3.5-35B-A3B-FP8)|
|[Qwen/Qwen3.5-122B-A10B-FP8](https://modelscope.cn/models/Qwen/Qwen3.5-122B-A10B-FP8)|qwen3_5_moe|qwen3_5|transformers>=5.2.0, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-122B-A10B-FP8](https://huggingface.co/Qwen/Qwen3.5-122B-A10B-FP8)|
|[Qwen/Qwen3.5-397B-A17B-FP8](https://modelscope.cn/models/Qwen/Qwen3.5-397B-A17B-FP8)|qwen3_5_moe|qwen3_5|transformers>=5.2.0, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-397B-A17B-FP8](https://huggingface.co/Qwen/Qwen3.5-397B-A17B-FP8)|
|[iic/gme-Qwen2-VL-2B-Instruct](https://modelscope.cn/models/iic/gme-Qwen2-VL-2B-Instruct)|qwen2_gme|qwen2_gme|-|✘|vision|[Alibaba-NLP/gme-Qwen2-VL-2B-Instruct](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct)|
|[iic/gme-Qwen2-VL-7B-Instruct](https://modelscope.cn/models/iic/gme-Qwen2-VL-7B-Instruct)|qwen2_gme|qwen2_gme|-|✘|vision|[Alibaba-NLP/gme-Qwen2-VL-7B-Instruct](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct)|
|[AIDC-AI/Ovis1.6-Gemma2-9B](https://modelscope.cn/models/AIDC-AI/Ovis1.6-Gemma2-9B)|ovis1_6|ovis1_6|transformers>=4.42|✘|vision|[AIDC-AI/Ovis1.6-Gemma2-9B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B)|
|[AIDC-AI/Ovis1.6-Gemma2-9B-GPTQ-Int4](https://modelscope.cn/models/AIDC-AI/Ovis1.6-Gemma2-9B-GPTQ-Int4)|ovis1_6|ovis1_6|transformers>=4.42|✘|vision|[AIDC-AI/Ovis1.6-Gemma2-9B-GPTQ-Int4](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B-GPTQ-Int4)|
|[AIDC-AI/Ovis1.6-Gemma2-27B](https://modelscope.cn/models/AIDC-AI/Ovis1.6-Gemma2-27B)|ovis1_6|ovis1_6|transformers>=4.42|✘|vision|[AIDC-AI/Ovis1.6-Gemma2-27B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-27B)|
|[AIDC-AI/Ovis1.6-Llama3.2-3B](https://modelscope.cn/models/AIDC-AI/Ovis1.6-Llama3.2-3B)|ovis1_6|ovis1_6_llama3|-|✘|vision|[AIDC-AI/Ovis1.6-Llama3.2-3B](https://huggingface.co/AIDC-AI/Ovis1.6-Llama3.2-3B)|
|[AIDC-AI/Ovis2-1B](https://modelscope.cn/models/AIDC-AI/Ovis2-1B)|ovis2|ovis2|transformers>=4.46.2, moviepy<2|✘|vision|[AIDC-AI/Ovis2-1B](https://huggingface.co/AIDC-AI/Ovis2-1B)|
|[AIDC-AI/Ovis2-2B](https://modelscope.cn/models/AIDC-AI/Ovis2-2B)|ovis2|ovis2|transformers>=4.46.2, moviepy<2|✘|vision|[AIDC-AI/Ovis2-2B](https://huggingface.co/AIDC-AI/Ovis2-2B)|
|[AIDC-AI/Ovis2-4B](https://modelscope.cn/models/AIDC-AI/Ovis2-4B)|ovis2|ovis2|transformers>=4.46.2, moviepy<2|✘|vision|[AIDC-AI/Ovis2-4B](https://huggingface.co/AIDC-AI/Ovis2-4B)|
|[AIDC-AI/Ovis2-8B](https://modelscope.cn/models/AIDC-AI/Ovis2-8B)|ovis2|ovis2|transformers>=4.46.2, moviepy<2|✘|vision|[AIDC-AI/Ovis2-8B](https://huggingface.co/AIDC-AI/Ovis2-8B)|
|[AIDC-AI/Ovis2-16B](https://modelscope.cn/models/AIDC-AI/Ovis2-16B)|ovis2|ovis2|transformers>=4.46.2, moviepy<2|✘|vision|[AIDC-AI/Ovis2-16B](https://huggingface.co/AIDC-AI/Ovis2-16B)|
|[AIDC-AI/Ovis2-34B](https://modelscope.cn/models/AIDC-AI/Ovis2-34B)|ovis2|ovis2|transformers>=4.46.2, moviepy<2|✘|vision|[AIDC-AI/Ovis2-34B](https://huggingface.co/AIDC-AI/Ovis2-34B)|
|[AIDC-AI/Ovis2.5-2B](https://modelscope.cn/models/AIDC-AI/Ovis2.5-2B)|ovis2_5|ovis2_5|transformers>=4.46.2, moviepy<2|✔|vision|[AIDC-AI/Ovis2.5-2B](https://huggingface.co/AIDC-AI/Ovis2.5-2B)|
|[AIDC-AI/Ovis2.5-9B](https://modelscope.cn/models/AIDC-AI/Ovis2.5-9B)|ovis2_5|ovis2_5|transformers>=4.46.2, moviepy<2|✔|vision|[AIDC-AI/Ovis2.5-9B](https://huggingface.co/AIDC-AI/Ovis2.5-9B)|
|[mispeech/midashenglm-7b](https://modelscope.cn/models/mispeech/midashenglm-7b)|midashenglm|midashenglm|transformers>=4.52, soundfile|✘|audio|[mispeech/midashenglm-7b](https://huggingface.co/mispeech/midashenglm-7b)|
|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b)|chatglm4v|chatglm4v|transformers>=4.42,<4.45|✘|-|[zai-org/glm-4v-9b](https://huggingface.co/zai-org/glm-4v-9b)|
|[ZhipuAI/cogagent-9b-20241220](https://modelscope.cn/models/ZhipuAI/cogagent-9b-20241220)|chatglm4v|chatglm4v|transformers>=4.42|✘|-|[zai-org/cogagent-9b-20241220](https://huggingface.co/zai-org/cogagent-9b-20241220)|
|[ZhipuAI/GLM-4.1V-9B-Base](https://modelscope.cn/models/ZhipuAI/GLM-4.1V-9B-Base)|glm4v|glm4v|transformers>=4.53|✔|-|[zai-org/GLM-4.1V-9B-Base](https://huggingface.co/zai-org/GLM-4.1V-9B-Base)|
|[ZhipuAI/GLM-4.1V-9B-Thinking](https://modelscope.cn/models/ZhipuAI/GLM-4.1V-9B-Thinking)|glm4v|glm4v|transformers>=4.53|✔|-|[zai-org/GLM-4.1V-9B-Thinking](https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking)|
|[ZhipuAI/AutoGLM-Phone-9B](https://modelscope.cn/models/ZhipuAI/AutoGLM-Phone-9B)|glm4v|glm4v|transformers>=4.53|✔|-|[zai-org/AutoGLM-Phone-9B](https://huggingface.co/zai-org/AutoGLM-Phone-9B)|
|[ZhipuAI/Glyph](https://modelscope.cn/models/ZhipuAI/Glyph)|glm4v|glm4_5v|transformers>=4.57|✔|-|[zai-org/Glyph](https://huggingface.co/zai-org/Glyph)|
|[ZhipuAI/GLM-4.6V-Flash](https://modelscope.cn/models/ZhipuAI/GLM-4.6V-Flash)|glm4v|glm4_5v|transformers>=5.0.0.dev|✔|-|[zai-org/GLM-4.6V-Flash](https://huggingface.co/zai-org/GLM-4.6V-Flash)|
|[ZhipuAI/GLM-4.5V](https://modelscope.cn/models/ZhipuAI/GLM-4.5V)|glm4v_moe|glm4_5v|transformers>=4.56|✔|-|[zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V)|
|[ZhipuAI/GLM-4.5V-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.5V-FP8)|glm4v_moe|glm4_5v|transformers>=4.56|✘|-|[zai-org/GLM-4.5V-FP8](https://huggingface.co/zai-org/GLM-4.5V-FP8)|
|[ZhipuAI/GLM-4.6V](https://modelscope.cn/models/ZhipuAI/GLM-4.6V)|glm4v_moe|glm4_5v|transformers>=5.0.0.dev|✔|-|[zai-org/GLM-4.6V](https://huggingface.co/zai-org/GLM-4.6V)|
|[ZhipuAI/GLM-4.6V-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.6V-FP8)|glm4v_moe|glm4_5v|transformers>=5.0.0.dev|✘|-|[zai-org/GLM-4.6V-FP8](https://huggingface.co/zai-org/GLM-4.6V-FP8)|
|[ZhipuAI/glm-edge-v-2b](https://modelscope.cn/models/ZhipuAI/glm-edge-v-2b)|glm_edge_v|glm_edge_v|transformers>=4.46|✘|vision|[zai-org/glm-edge-v-2b](https://huggingface.co/zai-org/glm-edge-v-2b)|
|[ZhipuAI/glm-edge-4b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-4b-chat)|glm_edge_v|glm_edge_v|transformers>=4.46|✘|vision|[zai-org/glm-edge-4b-chat](https://huggingface.co/zai-org/glm-edge-4b-chat)|
|[ZhipuAI/GLM-OCR](https://modelscope.cn/models/ZhipuAI/GLM-OCR)|glm_ocr|glm_ocr|transformers>=5.0.1dev0|✘|-|[zai-org/GLM-OCR](https://huggingface.co/zai-org/GLM-OCR)|
|[ZhipuAI/cogvlm-chat](https://modelscope.cn/models/ZhipuAI/cogvlm-chat)|cogvlm|cogvlm|transformers<4.42|✘|-|[zai-org/cogvlm-chat-hf](https://huggingface.co/zai-org/cogvlm-chat-hf)|
|[ZhipuAI/cogagent-vqa](https://modelscope.cn/models/ZhipuAI/cogagent-vqa)|cogagent_vqa|cogagent_vqa|transformers<4.42|✘|-|[zai-org/cogagent-vqa-hf](https://huggingface.co/zai-org/cogagent-vqa-hf)|
|[ZhipuAI/cogagent-chat](https://modelscope.cn/models/ZhipuAI/cogagent-chat)|cogagent_chat|cogagent_chat|transformers<4.42, timm|✘|-|[zai-org/cogagent-chat-hf](https://huggingface.co/zai-org/cogagent-chat-hf)|
|[ZhipuAI/cogvlm2-llama3-chat-19B](https://modelscope.cn/models/ZhipuAI/cogvlm2-llama3-chat-19B)|cogvlm2|cogvlm2|transformers<4.42|✘|-|[zai-org/cogvlm2-llama3-chat-19B](https://huggingface.co/zai-org/cogvlm2-llama3-chat-19B)|
|[ZhipuAI/cogvlm2-llama3-chinese-chat-19B](https://modelscope.cn/models/ZhipuAI/cogvlm2-llama3-chinese-chat-19B)|cogvlm2|cogvlm2|transformers<4.42|✘|-|[zai-org/cogvlm2-llama3-chinese-chat-19B](https://huggingface.co/zai-org/cogvlm2-llama3-chinese-chat-19B)|
|[ZhipuAI/cogvlm2-video-llama3-chat](https://modelscope.cn/models/ZhipuAI/cogvlm2-video-llama3-chat)|cogvlm2_video|cogvlm2_video|decord, pytorchvideo, transformers>=4.42|✘|video|[zai-org/cogvlm2-video-llama3-chat](https://huggingface.co/zai-org/cogvlm2-video-llama3-chat)|
|[OpenGVLab/Mini-InternVL-Chat-2B-V1-5](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-2B-V1-5)|internvl|internvl|transformers>=4.35, timm|✘|vision|[OpenGVLab/Mini-InternVL-Chat-2B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-2B-V1-5)|
|[AI-ModelScope/InternVL-Chat-V1-5](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5)|internvl|internvl|transformers>=4.35, timm|✘|vision|[OpenGVLab/InternVL-Chat-V1-5](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5)|
|[AI-ModelScope/InternVL-Chat-V1-5-int8](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5-int8)|internvl|internvl|transformers>=4.35, timm|✘|vision|[OpenGVLab/InternVL-Chat-V1-5-int8](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5-int8)|
|[OpenGVLab/Mini-InternVL-Chat-4B-V1-5](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-4B-V1-5)|internvl_phi3|internvl_phi3|transformers>=4.35,<4.42, timm|✘|vision|[OpenGVLab/Mini-InternVL-Chat-4B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-4B-V1-5)|
|[OpenGVLab/InternVL2-1B](https://modelscope.cn/models/OpenGVLab/InternVL2-1B)|internvl2|internvl2|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2-1B](https://huggingface.co/OpenGVLab/InternVL2-1B)|
|[OpenGVLab/InternVL2-2B](https://modelscope.cn/models/OpenGVLab/InternVL2-2B)|internvl2|internvl2|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2-2B](https://huggingface.co/OpenGVLab/InternVL2-2B)|
|[OpenGVLab/InternVL2-8B](https://modelscope.cn/models/OpenGVLab/InternVL2-8B)|internvl2|internvl2|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2-8B](https://huggingface.co/OpenGVLab/InternVL2-8B)|
|[OpenGVLab/InternVL2-26B](https://modelscope.cn/models/OpenGVLab/InternVL2-26B)|internvl2|internvl2|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2-26B](https://huggingface.co/OpenGVLab/InternVL2-26B)|
|[OpenGVLab/InternVL2-40B](https://modelscope.cn/models/OpenGVLab/InternVL2-40B)|internvl2|internvl2|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2-40B](https://huggingface.co/OpenGVLab/InternVL2-40B)|
|[OpenGVLab/InternVL2-Llama3-76B](https://modelscope.cn/models/OpenGVLab/InternVL2-Llama3-76B)|internvl2|internvl2|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2-Llama3-76B](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B)|
|[OpenGVLab/InternVL2-2B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-2B-AWQ)|internvl2|internvl2|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2-2B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-2B-AWQ)|
|[OpenGVLab/InternVL2-8B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-8B-AWQ)|internvl2|internvl2|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2-8B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-8B-AWQ)|
|[OpenGVLab/InternVL2-26B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-26B-AWQ)|internvl2|internvl2|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2-26B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-26B-AWQ)|
|[OpenGVLab/InternVL2-40B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-40B-AWQ)|internvl2|internvl2|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2-40B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-40B-AWQ)|
|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-Llama3-76B-AWQ)|internvl2|internvl2|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B-AWQ)|
|[OpenGVLab/InternVL2-8B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL2-8B-MPO)|internvl2|internvl2|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2-8B-MPO](https://huggingface.co/OpenGVLab/InternVL2-8B-MPO)|
|[OpenGVLab/InternVL2-Pretrain-Models:InternVL2-1B-Pretrain](https://modelscope.cn/models/OpenGVLab/InternVL2-Pretrain-Models)|internvl2|internvl2|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2-Pretrain-Models:InternVL2-1B-Pretrain](https://huggingface.co/OpenGVLab/InternVL2-Pretrain-Models)|
|[OpenGVLab/InternVL2-Pretrain-Models:InternVL2-2B-Pretrain](https://modelscope.cn/models/OpenGVLab/InternVL2-Pretrain-Models)|internvl2|internvl2|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2-Pretrain-Models:InternVL2-2B-Pretrain](https://huggingface.co/OpenGVLab/InternVL2-Pretrain-Models)|
|[OpenGVLab/InternVL2-Pretrain-Models:InternVL2-4B-Pretrain](https://modelscope.cn/models/OpenGVLab/InternVL2-Pretrain-Models)|internvl2|internvl2|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2-Pretrain-Models:InternVL2-4B-Pretrain](https://huggingface.co/OpenGVLab/InternVL2-Pretrain-Models)|
|[OpenGVLab/InternVL2-Pretrain-Models:InternVL2-8B-Pretrain](https://modelscope.cn/models/OpenGVLab/InternVL2-Pretrain-Models)|internvl2|internvl2|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2-Pretrain-Models:InternVL2-8B-Pretrain](https://huggingface.co/OpenGVLab/InternVL2-Pretrain-Models)|
|[OpenGVLab/InternVL2-Pretrain-Models:InternVL2-26B-Pretrain](https://modelscope.cn/models/OpenGVLab/InternVL2-Pretrain-Models)|internvl2|internvl2|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2-Pretrain-Models:InternVL2-26B-Pretrain](https://huggingface.co/OpenGVLab/InternVL2-Pretrain-Models)|
|[OpenGVLab/InternVL2-Pretrain-Models:InternVL2-40B-Pretrain](https://modelscope.cn/models/OpenGVLab/InternVL2-Pretrain-Models)|internvl2|internvl2|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2-Pretrain-Models:InternVL2-40B-Pretrain](https://huggingface.co/OpenGVLab/InternVL2-Pretrain-Models)|
|[OpenGVLab/InternVL2-Pretrain-Models:InternVL2-Llama3-76B-Pretrain](https://modelscope.cn/models/OpenGVLab/InternVL2-Pretrain-Models)|internvl2|internvl2|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2-Pretrain-Models:InternVL2-Llama3-76B-Pretrain](https://huggingface.co/OpenGVLab/InternVL2-Pretrain-Models)|
|[OpenGVLab/InternVL2-4B](https://modelscope.cn/models/OpenGVLab/InternVL2-4B)|internvl2_phi3|internvl2_phi3|transformers>=4.36,<4.42, timm|✘|vision, video|[OpenGVLab/InternVL2-4B](https://huggingface.co/OpenGVLab/InternVL2-4B)|
|[OpenGVLab/InternVL2_5-1B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-1B)|internvl2_5|internvl2_5|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2_5-1B](https://huggingface.co/OpenGVLab/InternVL2_5-1B)|
|[OpenGVLab/InternVL2_5-2B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-2B)|internvl2_5|internvl2_5|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2_5-2B](https://huggingface.co/OpenGVLab/InternVL2_5-2B)|
|[OpenGVLab/InternVL2_5-4B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-4B)|internvl2_5|internvl2_5|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2_5-4B](https://huggingface.co/OpenGVLab/InternVL2_5-4B)|
|[OpenGVLab/InternVL2_5-8B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-8B)|internvl2_5|internvl2_5|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2_5-8B](https://huggingface.co/OpenGVLab/InternVL2_5-8B)|
|[OpenGVLab/InternVL2_5-26B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-26B)|internvl2_5|internvl2_5|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2_5-26B](https://huggingface.co/OpenGVLab/InternVL2_5-26B)|
|[OpenGVLab/InternVL2_5-38B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-38B)|internvl2_5|internvl2_5|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2_5-38B](https://huggingface.co/OpenGVLab/InternVL2_5-38B)|
|[OpenGVLab/InternVL2_5-78B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-78B)|internvl2_5|internvl2_5|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2_5-78B](https://huggingface.co/OpenGVLab/InternVL2_5-78B)|
|[OpenGVLab/InternVL2_5-4B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2_5-4B-AWQ)|internvl2_5|internvl2_5|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2_5-4B-AWQ](https://huggingface.co/OpenGVLab/InternVL2_5-4B-AWQ)|
|[OpenGVLab/InternVL2_5-8B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2_5-8B-AWQ)|internvl2_5|internvl2_5|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2_5-8B-AWQ](https://huggingface.co/OpenGVLab/InternVL2_5-8B-AWQ)|
|[OpenGVLab/InternVL2_5-26B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2_5-26B-AWQ)|internvl2_5|internvl2_5|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2_5-26B-AWQ](https://huggingface.co/OpenGVLab/InternVL2_5-26B-AWQ)|
|[OpenGVLab/InternVL2_5-38B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2_5-38B-AWQ)|internvl2_5|internvl2_5|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2_5-38B-AWQ](https://huggingface.co/OpenGVLab/InternVL2_5-38B-AWQ)|
|[OpenGVLab/InternVL2_5-78B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2_5-78B-AWQ)|internvl2_5|internvl2_5|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2_5-78B-AWQ](https://huggingface.co/OpenGVLab/InternVL2_5-78B-AWQ)|
|[OpenGVLab/InternVL2_5-1B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL2_5-1B-MPO)|internvl2_5|internvl2_5|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2_5-1B-MPO](https://huggingface.co/OpenGVLab/InternVL2_5-1B-MPO)|
|[OpenGVLab/InternVL2_5-2B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL2_5-2B-MPO)|internvl2_5|internvl2_5|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2_5-2B-MPO](https://huggingface.co/OpenGVLab/InternVL2_5-2B-MPO)|
|[OpenGVLab/InternVL2_5-4B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL2_5-4B-MPO)|internvl2_5|internvl2_5|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2_5-4B-MPO](https://huggingface.co/OpenGVLab/InternVL2_5-4B-MPO)|
|[OpenGVLab/InternVL2_5-8B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL2_5-8B-MPO)|internvl2_5|internvl2_5|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2_5-8B-MPO](https://huggingface.co/OpenGVLab/InternVL2_5-8B-MPO)|
|[OpenGVLab/InternVL2_5-26B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL2_5-26B-MPO)|internvl2_5|internvl2_5|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2_5-26B-MPO](https://huggingface.co/OpenGVLab/InternVL2_5-26B-MPO)|
|[OpenGVLab/InternVL2_5-38B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL2_5-38B-MPO)|internvl2_5|internvl2_5|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2_5-38B-MPO](https://huggingface.co/OpenGVLab/InternVL2_5-38B-MPO)|
|[OpenGVLab/InternVL2_5-78B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL2_5-78B-MPO)|internvl2_5|internvl2_5|transformers>=4.36, timm|✘|vision, video|[OpenGVLab/InternVL2_5-78B-MPO](https://huggingface.co/OpenGVLab/InternVL2_5-78B-MPO)|
|[OpenGVLab/InternVL3-1B-Pretrained](https://modelscope.cn/models/OpenGVLab/InternVL3-1B-Pretrained)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-1B-Pretrained](https://huggingface.co/OpenGVLab/InternVL3-1B-Pretrained)|
|[OpenGVLab/InternVL3-2B-Pretrained](https://modelscope.cn/models/OpenGVLab/InternVL3-2B-Pretrained)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-2B-Pretrained](https://huggingface.co/OpenGVLab/InternVL3-2B-Pretrained)|
|[OpenGVLab/InternVL3-8B-Pretrained](https://modelscope.cn/models/OpenGVLab/InternVL3-8B-Pretrained)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-8B-Pretrained](https://huggingface.co/OpenGVLab/InternVL3-8B-Pretrained)|
|[OpenGVLab/InternVL3-9B-Pretrained](https://modelscope.cn/models/OpenGVLab/InternVL3-9B-Pretrained)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-9B-Pretrained](https://huggingface.co/OpenGVLab/InternVL3-9B-Pretrained)|
|[OpenGVLab/InternVL3-14B-Pretrained](https://modelscope.cn/models/OpenGVLab/InternVL3-14B-Pretrained)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-14B-Pretrained](https://huggingface.co/OpenGVLab/InternVL3-14B-Pretrained)|
|[OpenGVLab/InternVL3-38B-Pretrained](https://modelscope.cn/models/OpenGVLab/InternVL3-38B-Pretrained)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-38B-Pretrained](https://huggingface.co/OpenGVLab/InternVL3-38B-Pretrained)|
|[OpenGVLab/InternVL3-78B-Pretrained](https://modelscope.cn/models/OpenGVLab/InternVL3-78B-Pretrained)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-78B-Pretrained](https://huggingface.co/OpenGVLab/InternVL3-78B-Pretrained)|
|[OpenGVLab/InternVL3-1B-Instruct](https://modelscope.cn/models/OpenGVLab/InternVL3-1B-Instruct)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-1B-Instruct](https://huggingface.co/OpenGVLab/InternVL3-1B-Instruct)|
|[OpenGVLab/InternVL3-2B-Instruct](https://modelscope.cn/models/OpenGVLab/InternVL3-2B-Instruct)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-2B-Instruct](https://huggingface.co/OpenGVLab/InternVL3-2B-Instruct)|
|[OpenGVLab/InternVL3-8B-Instruct](https://modelscope.cn/models/OpenGVLab/InternVL3-8B-Instruct)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-8B-Instruct](https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct)|
|[OpenGVLab/InternVL3-9B-Instruct](https://modelscope.cn/models/OpenGVLab/InternVL3-9B-Instruct)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-9B-Instruct](https://huggingface.co/OpenGVLab/InternVL3-9B-Instruct)|
|[OpenGVLab/InternVL3-14B-Instruct](https://modelscope.cn/models/OpenGVLab/InternVL3-14B-Instruct)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-14B-Instruct](https://huggingface.co/OpenGVLab/InternVL3-14B-Instruct)|
|[OpenGVLab/InternVL3-38B-Instruct](https://modelscope.cn/models/OpenGVLab/InternVL3-38B-Instruct)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-38B-Instruct](https://huggingface.co/OpenGVLab/InternVL3-38B-Instruct)|
|[OpenGVLab/InternVL3-78B-Instruct](https://modelscope.cn/models/OpenGVLab/InternVL3-78B-Instruct)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-78B-Instruct](https://huggingface.co/OpenGVLab/InternVL3-78B-Instruct)|
|[OpenGVLab/InternVL3-1B](https://modelscope.cn/models/OpenGVLab/InternVL3-1B)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-1B](https://huggingface.co/OpenGVLab/InternVL3-1B)|
|[OpenGVLab/InternVL3-2B](https://modelscope.cn/models/OpenGVLab/InternVL3-2B)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-2B](https://huggingface.co/OpenGVLab/InternVL3-2B)|
|[OpenGVLab/InternVL3-8B](https://modelscope.cn/models/OpenGVLab/InternVL3-8B)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-8B](https://huggingface.co/OpenGVLab/InternVL3-8B)|
|[OpenGVLab/InternVL3-9B](https://modelscope.cn/models/OpenGVLab/InternVL3-9B)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-9B](https://huggingface.co/OpenGVLab/InternVL3-9B)|
|[OpenGVLab/InternVL3-14B](https://modelscope.cn/models/OpenGVLab/InternVL3-14B)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-14B](https://huggingface.co/OpenGVLab/InternVL3-14B)|
|[OpenGVLab/InternVL3-38B](https://modelscope.cn/models/OpenGVLab/InternVL3-38B)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-38B](https://huggingface.co/OpenGVLab/InternVL3-38B)|
|[OpenGVLab/InternVL3-78B](https://modelscope.cn/models/OpenGVLab/InternVL3-78B)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3-78B](https://huggingface.co/OpenGVLab/InternVL3-78B)|
|[OpenGVLab/InternVL3-1B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL3-1B-AWQ)|internvl3|internvl2_5|transformers>=4.37.2, timm|✘|vision, video|[OpenGVLab/InternVL3-1B-AWQ](https://huggingface.co/OpenGVLab/InternVL3-1B-AWQ)|
|[OpenGVLab/InternVL3-2B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL3-2B-AWQ)|internvl3|internvl2_5|transformers>=4.37.2, timm|✘|vision, video|[OpenGVLab/InternVL3-2B-AWQ](https://huggingface.co/OpenGVLab/InternVL3-2B-AWQ)|
|[OpenGVLab/InternVL3-8B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL3-8B-AWQ)|internvl3|internvl2_5|transformers>=4.37.2, timm|✘|vision, video|[OpenGVLab/InternVL3-8B-AWQ](https://huggingface.co/OpenGVLab/InternVL3-8B-AWQ)|
|[OpenGVLab/InternVL3-9B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL3-9B-AWQ)|internvl3|internvl2_5|transformers>=4.37.2, timm|✘|vision, video|[OpenGVLab/InternVL3-9B-AWQ](https://huggingface.co/OpenGVLab/InternVL3-9B-AWQ)|
|[OpenGVLab/InternVL3-14B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL3-14B-AWQ)|internvl3|internvl2_5|transformers>=4.37.2, timm|✘|vision, video|[OpenGVLab/InternVL3-14B-AWQ](https://huggingface.co/OpenGVLab/InternVL3-14B-AWQ)|
|[OpenGVLab/InternVL3-38B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL3-38B-AWQ)|internvl3|internvl2_5|transformers>=4.37.2, timm|✘|vision, video|[OpenGVLab/InternVL3-38B-AWQ](https://huggingface.co/OpenGVLab/InternVL3-38B-AWQ)|
|[OpenGVLab/InternVL3-78B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL3-78B-AWQ)|internvl3|internvl2_5|transformers>=4.37.2, timm|✘|vision, video|[OpenGVLab/InternVL3-78B-AWQ](https://huggingface.co/OpenGVLab/InternVL3-78B-AWQ)|
|[SenseNova/SenseNova-SI-InternVL3-2B](https://modelscope.cn/models/SenseNova/SenseNova-SI-InternVL3-2B)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[sensenova/SenseNova-SI-InternVL3-2B](https://huggingface.co/sensenova/SenseNova-SI-InternVL3-2B)|
|[SenseNova/SenseNova-SI-InternVL3-8B](https://modelscope.cn/models/SenseNova/SenseNova-SI-InternVL3-8B)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[sensenova/SenseNova-SI-InternVL3-8B](https://huggingface.co/sensenova/SenseNova-SI-InternVL3-8B)|
|[SenseNova/SenseNova-SI-1.1-InternVL3-2B](https://modelscope.cn/models/SenseNova/SenseNova-SI-1.1-InternVL3-2B)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[sensenova/SenseNova-SI-1.1-InternVL3-2B](https://huggingface.co/sensenova/SenseNova-SI-1.1-InternVL3-2B)|
|[SenseNova/SenseNova-SI-1.1-InternVL3-8B](https://modelscope.cn/models/SenseNova/SenseNova-SI-1.1-InternVL3-8B)|internvl3|internvl2_5|transformers>=4.37.2, timm|✔|vision, video|[sensenova/SenseNova-SI-1.1-InternVL3-8B](https://huggingface.co/sensenova/SenseNova-SI-1.1-InternVL3-8B)|
|[OpenGVLab/InternVL3-1B-hf](https://modelscope.cn/models/OpenGVLab/InternVL3-1B-hf)|internvl_hf|internvl_hf|transformers>=4.52.1, timm|✔|vision, video|[OpenGVLab/InternVL3-1B-hf](https://huggingface.co/OpenGVLab/InternVL3-1B-hf)|
|[OpenGVLab/InternVL3-2B-hf](https://modelscope.cn/models/OpenGVLab/InternVL3-2B-hf)|internvl_hf|internvl_hf|transformers>=4.52.1, timm|✔|vision, video|[OpenGVLab/InternVL3-2B-hf](https://huggingface.co/OpenGVLab/InternVL3-2B-hf)|
|[OpenGVLab/InternVL3-8B-hf](https://modelscope.cn/models/OpenGVLab/InternVL3-8B-hf)|internvl_hf|internvl_hf|transformers>=4.52.1, timm|✔|vision, video|[OpenGVLab/InternVL3-8B-hf](https://huggingface.co/OpenGVLab/InternVL3-8B-hf)|
|[OpenGVLab/InternVL3-9B-hf](https://modelscope.cn/models/OpenGVLab/InternVL3-9B-hf)|internvl_hf|internvl_hf|transformers>=4.52.1, timm|✔|vision, video|[OpenGVLab/InternVL3-9B-hf](https://huggingface.co/OpenGVLab/InternVL3-9B-hf)|
|[OpenGVLab/InternVL3-14B-hf](https://modelscope.cn/models/OpenGVLab/InternVL3-14B-hf)|internvl_hf|internvl_hf|transformers>=4.52.1, timm|✔|vision, video|[OpenGVLab/InternVL3-14B-hf](https://huggingface.co/OpenGVLab/InternVL3-14B-hf)|
|[OpenGVLab/InternVL3-38B-hf](https://modelscope.cn/models/OpenGVLab/InternVL3-38B-hf)|internvl_hf|internvl_hf|transformers>=4.52.1, timm|✔|vision, video|[OpenGVLab/InternVL3-38B-hf](https://huggingface.co/OpenGVLab/InternVL3-38B-hf)|
|[OpenGVLab/InternVL3-78B-hf](https://modelscope.cn/models/OpenGVLab/InternVL3-78B-hf)|internvl_hf|internvl_hf|transformers>=4.52.1, timm|✔|vision, video|[OpenGVLab/InternVL3-78B-hf](https://huggingface.co/OpenGVLab/InternVL3-78B-hf)|
|[OpenGVLab/InternVL3_5-1B-HF](https://modelscope.cn/models/OpenGVLab/InternVL3_5-1B-HF)|internvl_hf|internvl_hf|transformers>=4.52.1, timm|✔|vision, video|[OpenGVLab/InternVL3_5-1B-HF](https://huggingface.co/OpenGVLab/InternVL3_5-1B-HF)|
|[OpenGVLab/InternVL3_5-2B-HF](https://modelscope.cn/models/OpenGVLab/InternVL3_5-2B-HF)|internvl_hf|internvl_hf|transformers>=4.52.1, timm|✔|vision, video|[OpenGVLab/InternVL3_5-2B-HF](https://huggingface.co/OpenGVLab/InternVL3_5-2B-HF)|
|[OpenGVLab/InternVL3_5-4B-HF](https://modelscope.cn/models/OpenGVLab/InternVL3_5-4B-HF)|internvl_hf|internvl_hf|transformers>=4.52.1, timm|✔|vision, video|[OpenGVLab/InternVL3_5-4B-HF](https://huggingface.co/OpenGVLab/InternVL3_5-4B-HF)|
|[OpenGVLab/InternVL3_5-8B-HF](https://modelscope.cn/models/OpenGVLab/InternVL3_5-8B-HF)|internvl_hf|internvl_hf|transformers>=4.52.1, timm|✔|vision, video|[OpenGVLab/InternVL3_5-8B-HF](https://huggingface.co/OpenGVLab/InternVL3_5-8B-HF)|
|[OpenGVLab/InternVL3_5-14B-HF](https://modelscope.cn/models/OpenGVLab/InternVL3_5-14B-HF)|internvl_hf|internvl_hf|transformers>=4.52.1, timm|✔|vision, video|[OpenGVLab/InternVL3_5-14B-HF](https://huggingface.co/OpenGVLab/InternVL3_5-14B-HF)|
|[OpenGVLab/InternVL3_5-38B-HF](https://modelscope.cn/models/OpenGVLab/InternVL3_5-38B-HF)|internvl_hf|internvl_hf|transformers>=4.52.1, timm|✔|vision, video|[OpenGVLab/InternVL3_5-38B-HF](https://huggingface.co/OpenGVLab/InternVL3_5-38B-HF)|
|[OpenGVLab/InternVL3_5-30B-A3B-HF](https://modelscope.cn/models/OpenGVLab/InternVL3_5-30B-A3B-HF)|internvl_hf|internvl_hf|transformers>=4.52.1, timm|✔|vision, video|[OpenGVLab/InternVL3_5-30B-A3B-HF](https://huggingface.co/OpenGVLab/InternVL3_5-30B-A3B-HF)|
|[OpenGVLab/InternVL3_5-241B-A28B-HF](https://modelscope.cn/models/OpenGVLab/InternVL3_5-241B-A28B-HF)|internvl_hf|internvl_hf|transformers>=4.52.1, timm|✔|vision, video|[OpenGVLab/InternVL3_5-241B-A28B-HF](https://huggingface.co/OpenGVLab/InternVL3_5-241B-A28B-HF)|
|[OpenGVLab/InternVL3_5-1B-Pretrained](https://modelscope.cn/models/OpenGVLab/InternVL3_5-1B-Pretrained)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-1B-Pretrained](https://huggingface.co/OpenGVLab/InternVL3_5-1B-Pretrained)|
|[OpenGVLab/InternVL3_5-2B-Pretrained](https://modelscope.cn/models/OpenGVLab/InternVL3_5-2B-Pretrained)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-2B-Pretrained](https://huggingface.co/OpenGVLab/InternVL3_5-2B-Pretrained)|
|[OpenGVLab/InternVL3_5-4B-Pretrained](https://modelscope.cn/models/OpenGVLab/InternVL3_5-4B-Pretrained)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-4B-Pretrained](https://huggingface.co/OpenGVLab/InternVL3_5-4B-Pretrained)|
|[OpenGVLab/InternVL3_5-8B-Pretrained](https://modelscope.cn/models/OpenGVLab/InternVL3_5-8B-Pretrained)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-8B-Pretrained](https://huggingface.co/OpenGVLab/InternVL3_5-8B-Pretrained)|
|[OpenGVLab/InternVL3_5-14B-Pretrained](https://modelscope.cn/models/OpenGVLab/InternVL3_5-14B-Pretrained)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-14B-Pretrained](https://huggingface.co/OpenGVLab/InternVL3_5-14B-Pretrained)|
|[OpenGVLab/InternVL3_5-38B-Pretrained](https://modelscope.cn/models/OpenGVLab/InternVL3_5-38B-Pretrained)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-38B-Pretrained](https://huggingface.co/OpenGVLab/InternVL3_5-38B-Pretrained)|
|[OpenGVLab/InternVL3_5-30B-A3B-Pretrained](https://modelscope.cn/models/OpenGVLab/InternVL3_5-30B-A3B-Pretrained)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-30B-A3B-Pretrained](https://huggingface.co/OpenGVLab/InternVL3_5-30B-A3B-Pretrained)|
|[OpenGVLab/InternVL3_5-241B-A28B-Pretrained](https://modelscope.cn/models/OpenGVLab/InternVL3_5-241B-A28B-Pretrained)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-241B-A28B-Pretrained](https://huggingface.co/OpenGVLab/InternVL3_5-241B-A28B-Pretrained)|
|[OpenGVLab/InternVL3_5-1B-Instruct](https://modelscope.cn/models/OpenGVLab/InternVL3_5-1B-Instruct)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-1B-Instruct](https://huggingface.co/OpenGVLab/InternVL3_5-1B-Instruct)|
|[OpenGVLab/InternVL3_5-2B-Instruct](https://modelscope.cn/models/OpenGVLab/InternVL3_5-2B-Instruct)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-2B-Instruct](https://huggingface.co/OpenGVLab/InternVL3_5-2B-Instruct)|
|[OpenGVLab/InternVL3_5-4B-Instruct](https://modelscope.cn/models/OpenGVLab/InternVL3_5-4B-Instruct)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-4B-Instruct](https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct)|
|[OpenGVLab/InternVL3_5-8B-Instruct](https://modelscope.cn/models/OpenGVLab/InternVL3_5-8B-Instruct)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-8B-Instruct](https://huggingface.co/OpenGVLab/InternVL3_5-8B-Instruct)|
|[OpenGVLab/InternVL3_5-14B-Instruct](https://modelscope.cn/models/OpenGVLab/InternVL3_5-14B-Instruct)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-14B-Instruct](https://huggingface.co/OpenGVLab/InternVL3_5-14B-Instruct)|
|[OpenGVLab/InternVL3_5-38B-Instruct](https://modelscope.cn/models/OpenGVLab/InternVL3_5-38B-Instruct)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-38B-Instruct](https://huggingface.co/OpenGVLab/InternVL3_5-38B-Instruct)|
|[OpenGVLab/InternVL3_5-30B-A3B-Instruct](https://modelscope.cn/models/OpenGVLab/InternVL3_5-30B-A3B-Instruct)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-30B-A3B-Instruct](https://huggingface.co/OpenGVLab/InternVL3_5-30B-A3B-Instruct)|
|[OpenGVLab/InternVL3_5-241B-A28B-Instruct](https://modelscope.cn/models/OpenGVLab/InternVL3_5-241B-A28B-Instruct)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-241B-A28B-Instruct](https://huggingface.co/OpenGVLab/InternVL3_5-241B-A28B-Instruct)|
|[OpenGVLab/InternVL3_5-1B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL3_5-1B-MPO)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-1B-MPO](https://huggingface.co/OpenGVLab/InternVL3_5-1B-MPO)|
|[OpenGVLab/InternVL3_5-2B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL3_5-2B-MPO)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-2B-MPO](https://huggingface.co/OpenGVLab/InternVL3_5-2B-MPO)|
|[OpenGVLab/InternVL3_5-4B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL3_5-4B-MPO)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-4B-MPO](https://huggingface.co/OpenGVLab/InternVL3_5-4B-MPO)|
|[OpenGVLab/InternVL3_5-8B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL3_5-8B-MPO)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-8B-MPO](https://huggingface.co/OpenGVLab/InternVL3_5-8B-MPO)|
|[OpenGVLab/InternVL3_5-14B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL3_5-14B-MPO)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-14B-MPO](https://huggingface.co/OpenGVLab/InternVL3_5-14B-MPO)|
|[OpenGVLab/InternVL3_5-38B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL3_5-38B-MPO)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-38B-MPO](https://huggingface.co/OpenGVLab/InternVL3_5-38B-MPO)|
|[OpenGVLab/InternVL3_5-30B-A3B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL3_5-30B-A3B-MPO)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-30B-A3B-MPO](https://huggingface.co/OpenGVLab/InternVL3_5-30B-A3B-MPO)|
|[OpenGVLab/InternVL3_5-241B-A28B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL3_5-241B-A28B-MPO)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-241B-A28B-MPO](https://huggingface.co/OpenGVLab/InternVL3_5-241B-A28B-MPO)|
|[OpenGVLab/InternVL3_5-1B](https://modelscope.cn/models/OpenGVLab/InternVL3_5-1B)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-1B](https://huggingface.co/OpenGVLab/InternVL3_5-1B)|
|[OpenGVLab/InternVL3_5-2B](https://modelscope.cn/models/OpenGVLab/InternVL3_5-2B)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-2B](https://huggingface.co/OpenGVLab/InternVL3_5-2B)|
|[OpenGVLab/InternVL3_5-4B](https://modelscope.cn/models/OpenGVLab/InternVL3_5-4B)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-4B](https://huggingface.co/OpenGVLab/InternVL3_5-4B)|
|[OpenGVLab/InternVL3_5-8B](https://modelscope.cn/models/OpenGVLab/InternVL3_5-8B)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-8B](https://huggingface.co/OpenGVLab/InternVL3_5-8B)|
|[OpenGVLab/InternVL3_5-14B](https://modelscope.cn/models/OpenGVLab/InternVL3_5-14B)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-14B](https://huggingface.co/OpenGVLab/InternVL3_5-14B)|
|[OpenGVLab/InternVL3_5-38B](https://modelscope.cn/models/OpenGVLab/InternVL3_5-38B)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-38B](https://huggingface.co/OpenGVLab/InternVL3_5-38B)|
|[OpenGVLab/InternVL3_5-30B-A3B](https://modelscope.cn/models/OpenGVLab/InternVL3_5-30B-A3B)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-30B-A3B](https://huggingface.co/OpenGVLab/InternVL3_5-30B-A3B)|
|[OpenGVLab/InternVL3_5-241B-A28B](https://modelscope.cn/models/OpenGVLab/InternVL3_5-241B-A28B)|internvl3_5|internvl3_5|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-241B-A28B](https://huggingface.co/OpenGVLab/InternVL3_5-241B-A28B)|
|[OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview](https://modelscope.cn/models/OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview)|internvl3_5_gpt|internvl3_5_gpt|transformers>=4.37.2, timm|✔|vision, video|[OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview](https://huggingface.co/OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview)|
|[OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview-HF](https://modelscope.cn/models/OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview-HF)|internvl_gpt_hf|internvl_hf|transformers>=4.55.0, timm|✔|vision, video|[OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview-HF](https://huggingface.co/OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview-HF)|
|[Shanghai_AI_Laboratory/Intern-S1-mini](https://modelscope.cn/models/Shanghai_AI_Laboratory/Intern-S1-mini)|interns1|interns1|transformers>=4.55.2,<4.56|✘|vision, video|[internlm/Intern-S1-mini](https://huggingface.co/internlm/Intern-S1-mini)|
|[Shanghai_AI_Laboratory/Intern-S1](https://modelscope.cn/models/Shanghai_AI_Laboratory/Intern-S1)|interns1|interns1|transformers>=4.55.2,<4.56|✘|vision, video|[internlm/Intern-S1](https://huggingface.co/internlm/Intern-S1)|
|[Shanghai_AI_Laboratory/Intern-S1-mini-FP8](https://modelscope.cn/models/Shanghai_AI_Laboratory/Intern-S1-mini-FP8)|interns1|interns1|transformers>=4.55.2,<4.56|✘|vision, video|[internlm/Intern-S1-mini-FP8](https://huggingface.co/internlm/Intern-S1-mini-FP8)|
|[Shanghai_AI_Laboratory/Intern-S1-FP8](https://modelscope.cn/models/Shanghai_AI_Laboratory/Intern-S1-FP8)|interns1|interns1|transformers>=4.55.2,<4.56|✘|vision, video|[internlm/Intern-S1-FP8](https://huggingface.co/internlm/Intern-S1-FP8)|
|[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b)|xcomposer2|ixcomposer2|-|✘|vision|[internlm/internlm-xcomposer2-7b](https://huggingface.co/internlm/internlm-xcomposer2-7b)|
|[Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b)|xcomposer2_4khd|ixcomposer2|-|✘|vision|[internlm/internlm-xcomposer2-4khd-7b](https://huggingface.co/internlm/internlm-xcomposer2-4khd-7b)|
|[Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b)|xcomposer2_5|xcomposer2_5|decord|✘|vision|[internlm/internlm-xcomposer2d5-7b](https://huggingface.co/internlm/internlm-xcomposer2d5-7b)|
|[Shanghai_AI_Laboratory/internlm-xcomposer2d5-ol-7b:base](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2d5-ol-7b)|xcomposer2_5|xcomposer2_5|decord|✘|vision|[internlm/internlm-xcomposer2d5-ol-7b:base](https://huggingface.co/internlm/internlm-xcomposer2d5-ol-7b)|
|[Shanghai_AI_Laboratory/internlm-xcomposer2d5-ol-7b:audio](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2d5-ol-7b)|xcomposer2_5_ol_audio|qwen2_audio|transformers>=4.45|✘|audio|[internlm/internlm-xcomposer2d5-ol-7b:audio](https://huggingface.co/internlm/internlm-xcomposer2d5-ol-7b)|
|[LLM-Research/Llama-3.2-11B-Vision-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-11B-Vision-Instruct)|llama3_2_vision|llama3_2_vision|transformers>=4.45|✘|vision|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)|
|[LLM-Research/Llama-3.2-90B-Vision-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-90B-Vision-Instruct)|llama3_2_vision|llama3_2_vision|transformers>=4.45|✘|vision|[meta-llama/Llama-3.2-90B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct)|
|[LLM-Research/Llama-3.2-11B-Vision](https://modelscope.cn/models/LLM-Research/Llama-3.2-11B-Vision)|llama3_2_vision|llama3_2_vision|transformers>=4.45|✘|vision|[meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)|
|[LLM-Research/Llama-3.2-90B-Vision](https://modelscope.cn/models/LLM-Research/Llama-3.2-90B-Vision)|llama3_2_vision|llama3_2_vision|transformers>=4.45|✘|vision|[meta-llama/Llama-3.2-90B-Vision](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision)|
|[LLM-Research/Llama-4-Scout-17B-16E](https://modelscope.cn/models/LLM-Research/Llama-4-Scout-17B-16E)|llama4|llama4|transformers>=4.51|✔|vision|[meta-llama/Llama-4-Scout-17B-16E](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E)|
|[LLM-Research/Llama-4-Maverick-17B-128E](https://modelscope.cn/models/LLM-Research/Llama-4-Maverick-17B-128E)|llama4|llama4|transformers>=4.51|✔|vision|[meta-llama/Llama-4-Maverick-17B-128E](https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E)|
|[LLM-Research/Llama-4-Scout-17B-16E-Instruct](https://modelscope.cn/models/LLM-Research/Llama-4-Scout-17B-16E-Instruct)|llama4|llama4|transformers>=4.51|✔|vision|[meta-llama/Llama-4-Scout-17B-16E-Instruct](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct)|
|[LLM-Research/Llama-4-Maverick-17B-128E-Instruct-FP8](https://modelscope.cn/models/LLM-Research/Llama-4-Maverick-17B-128E-Instruct-FP8)|llama4|llama4|transformers>=4.51|✘|vision|[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8](https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8)|
|[LLM-Research/Llama-4-Maverick-17B-128E-Instruct](https://modelscope.cn/models/LLM-Research/Llama-4-Maverick-17B-128E-Instruct)|llama4|llama4|transformers>=4.51|✔|vision|[meta-llama/Llama-4-Maverick-17B-128E-Instruct](https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct)|
|[ICTNLP/Llama-3.1-8B-Omni](https://modelscope.cn/models/ICTNLP/Llama-3.1-8B-Omni)|llama3_1_omni|llama3_1_omni|openai-whisper|✘|audio|[ICTNLP/Llama-3.1-8B-Omni](https://huggingface.co/ICTNLP/Llama-3.1-8B-Omni)|
|[llava-hf/llava-1.5-7b-hf](https://modelscope.cn/models/llava-hf/llava-1.5-7b-hf)|llava1_5_hf|llava1_5_hf|transformers>=4.36|✘|vision|[llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf)|
|[llava-hf/llava-1.5-13b-hf](https://modelscope.cn/models/llava-hf/llava-1.5-13b-hf)|llava1_5_hf|llava1_5_hf|transformers>=4.36|✘|vision|[llava-hf/llava-1.5-13b-hf](https://huggingface.co/llava-hf/llava-1.5-13b-hf)|
|[llava-hf/llava-v1.6-mistral-7b-hf](https://modelscope.cn/models/llava-hf/llava-v1.6-mistral-7b-hf)|llava1_6_mistral_hf|llava1_6_mistral_hf|transformers>=4.39|✘|vision|[llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)|
|[llava-hf/llava-v1.6-vicuna-7b-hf](https://modelscope.cn/models/llava-hf/llava-v1.6-vicuna-7b-hf)|llava1_6_vicuna_hf|llava1_6_vicuna_hf|transformers>=4.39|✘|vision|[llava-hf/llava-v1.6-vicuna-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-7b-hf)|
|[llava-hf/llava-v1.6-vicuna-13b-hf](https://modelscope.cn/models/llava-hf/llava-v1.6-vicuna-13b-hf)|llava1_6_vicuna_hf|llava1_6_vicuna_hf|transformers>=4.39|✘|vision|[llava-hf/llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf)|
|[llava-hf/llava-v1.6-34b-hf](https://modelscope.cn/models/llava-hf/llava-v1.6-34b-hf)|llava1_6_yi_hf|llava1_6_yi_hf|transformers>=4.39|✘|vision|[llava-hf/llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf)|
|[llava-hf/llama3-llava-next-8b-hf](https://modelscope.cn/models/llava-hf/llama3-llava-next-8b-hf)|llama3_llava_next_hf|llama3_llava_next_hf|transformers>=4.39|✘|vision|[llava-hf/llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llama3-llava-next-8b-hf)|
|[llava-hf/llava-next-72b-hf](https://modelscope.cn/models/llava-hf/llava-next-72b-hf)|llava_next_qwen_hf|llava_next_qwen_hf|transformers>=4.39|✘|vision|[llava-hf/llava-next-72b-hf](https://huggingface.co/llava-hf/llava-next-72b-hf)|
|[llava-hf/llava-next-110b-hf](https://modelscope.cn/models/llava-hf/llava-next-110b-hf)|llava_next_qwen_hf|llava_next_qwen_hf|transformers>=4.39|✘|vision|[llava-hf/llava-next-110b-hf](https://huggingface.co/llava-hf/llava-next-110b-hf)|
|[llava-hf/LLaVA-NeXT-Video-7B-DPO-hf](https://modelscope.cn/models/llava-hf/LLaVA-NeXT-Video-7B-DPO-hf)|llava_next_video_hf|llava_next_video_hf|transformers>=4.42, av|✘|video|[llava-hf/LLaVA-NeXT-Video-7B-DPO-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-DPO-hf)|
|[llava-hf/LLaVA-NeXT-Video-7B-32K-hf](https://modelscope.cn/models/llava-hf/LLaVA-NeXT-Video-7B-32K-hf)|llava_next_video_hf|llava_next_video_hf|transformers>=4.42, av|✘|video|[llava-hf/LLaVA-NeXT-Video-7B-32K-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-32K-hf)|
|[llava-hf/LLaVA-NeXT-Video-7B-hf](https://modelscope.cn/models/llava-hf/LLaVA-NeXT-Video-7B-hf)|llava_next_video_hf|llava_next_video_hf|transformers>=4.42, av|✘|video|[llava-hf/LLaVA-NeXT-Video-7B-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf)|
|[llava-hf/LLaVA-NeXT-Video-34B-hf](https://modelscope.cn/models/llava-hf/LLaVA-NeXT-Video-34B-hf)|llava_next_video_yi_hf|llava_next_video_hf|transformers>=4.42, av|✘|video|[llava-hf/LLaVA-NeXT-Video-34B-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-34B-hf)|
|[llava-hf/llava-onevision-qwen2-0.5b-ov-hf](https://modelscope.cn/models/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)|llava_onevision_hf|llava_onevision_hf|transformers>=4.45|✘|vision, video|[llava-hf/llava-onevision-qwen2-0.5b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)|
|[llava-hf/llava-onevision-qwen2-7b-ov-hf](https://modelscope.cn/models/llava-hf/llava-onevision-qwen2-7b-ov-hf)|llava_onevision_hf|llava_onevision_hf|transformers>=4.45|✘|vision, video|[llava-hf/llava-onevision-qwen2-7b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-7b-ov-hf)|
|[llava-hf/llava-onevision-qwen2-72b-ov-hf](https://modelscope.cn/models/llava-hf/llava-onevision-qwen2-72b-ov-hf)|llava_onevision_hf|llava_onevision_hf|transformers>=4.45|✘|vision, video|[llava-hf/llava-onevision-qwen2-72b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-72b-ov-hf)|
|[01ai/Yi-VL-6B](https://modelscope.cn/models/01ai/Yi-VL-6B)|yi_vl|yi_vl|transformers>=4.34|✘|vision|[01-ai/Yi-VL-6B](https://huggingface.co/01-ai/Yi-VL-6B)|
|[01ai/Yi-VL-34B](https://modelscope.cn/models/01ai/Yi-VL-34B)|yi_vl|yi_vl|transformers>=4.34|✘|vision|[01-ai/Yi-VL-34B](https://huggingface.co/01-ai/Yi-VL-34B)|
|[PaddlePaddle/ERNIE-4.5-VL-28B-A3B-PT](https://modelscope.cn/models/PaddlePaddle/ERNIE-4.5-VL-28B-A3B-PT)|ernie_vl|ernie_vl|transformers>=4.52, moviepy|✘|-|[baidu/ERNIE-4.5-VL-28B-A3B-PT](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT)|
|[PaddlePaddle/ERNIE-4.5-VL-424B-A47B-PT](https://modelscope.cn/models/PaddlePaddle/ERNIE-4.5-VL-424B-A47B-PT)|ernie_vl|ernie_vl|transformers>=4.52, moviepy|✘|-|[baidu/ERNIE-4.5-VL-424B-A47B-PT](https://huggingface.co/baidu/ERNIE-4.5-VL-424B-A47B-PT)|
|[PaddlePaddle/ERNIE-4.5-VL-28B-A3B-Base-PT](https://modelscope.cn/models/PaddlePaddle/ERNIE-4.5-VL-28B-A3B-Base-PT)|ernie_vl|ernie_vl|transformers>=4.52, moviepy|✘|-|[baidu/ERNIE-4.5-VL-28B-A3B-Base-PT](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-Base-PT)|
|[PaddlePaddle/ERNIE-4.5-VL-424B-A47B-Base-PT](https://modelscope.cn/models/PaddlePaddle/ERNIE-4.5-VL-424B-A47B-Base-PT)|ernie_vl|ernie_vl|transformers>=4.52, moviepy|✘|-|[baidu/ERNIE-4.5-VL-424B-A47B-Base-PT](https://huggingface.co/baidu/ERNIE-4.5-VL-424B-A47B-Base-PT)|
|[PaddlePaddle/ERNIE-4.5-VL-28B-A3B-Thinking](https://modelscope.cn/models/PaddlePaddle/ERNIE-4.5-VL-28B-A3B-Thinking)|ernie_vl|ernie_vl_thinking|transformers>=4.52, moviepy|✘|-|[baidu/ERNIE-4.5-VL-28B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-Thinking)|
|[swift/llava-llama3.1-8b](https://modelscope.cn/models/swift/llava-llama3.1-8b)|llava_llama3_1_hf|llava_llama3_1_hf|transformers>=4.41|✘|vision|-|
|[AI-ModelScope/llava-llama-3-8b-v1_1-transformers](https://modelscope.cn/models/AI-ModelScope/llava-llama-3-8b-v1_1-transformers)|llava_llama3_hf|llava_llama3_hf|transformers>=4.36|✘|vision|[xtuner/llava-llama-3-8b-v1_1-transformers](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers)|
|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b)|llava1_6_mistral|llava1_6_mistral|transformers>=4.34|✘|vision|[liuhaotian/llava-v1.6-mistral-7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b)|
|[AI-ModelScope/llava-v1.6-34b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-34b)|llava1_6_yi|llava1_6_yi|transformers>=4.34|✘|vision|[liuhaotian/llava-v1.6-34b](https://huggingface.co/liuhaotian/llava-v1.6-34b)|
|[AI-ModelScope/llava-next-72b](https://modelscope.cn/models/AI-ModelScope/llava-next-72b)|llava_next_qwen|llava_next_qwen|transformers>=4.42, av|✘|vision|[lmms-lab/llava-next-72b](https://huggingface.co/lmms-lab/llava-next-72b)|
|[AI-ModelScope/llava-next-110b](https://modelscope.cn/models/AI-ModelScope/llava-next-110b)|llava_next_qwen|llava_next_qwen|transformers>=4.42, av|✘|vision|[lmms-lab/llava-next-110b](https://huggingface.co/lmms-lab/llava-next-110b)|
|[AI-ModelScope/llama3-llava-next-8b](https://modelscope.cn/models/AI-ModelScope/llama3-llava-next-8b)|llama3_llava_next|llama3_llava_next|transformers>=4.42, av|✘|vision|[lmms-lab/llama3-llava-next-8b](https://huggingface.co/lmms-lab/llama3-llava-next-8b)|
|[lmms-lab/LLaVA-OneVision-1.5-4B-Instruct](https://modelscope.cn/models/lmms-lab/LLaVA-OneVision-1.5-4B-Instruct)|llava_onevision1_5|llava_onevision1_5|transformers>=4.53.0, qwen_vl_utils|✘|vision|[lmms-lab/LLaVA-OneVision-1.5-4B-Instruct](https://huggingface.co/lmms-lab/LLaVA-OneVision-1.5-4B-Instruct)|
|[lmms-lab/LLaVA-OneVision-1.5-8B-Instruct](https://modelscope.cn/models/lmms-lab/LLaVA-OneVision-1.5-8B-Instruct)|llava_onevision1_5|llava_onevision1_5|transformers>=4.53.0, qwen_vl_utils|✘|vision|[lmms-lab/LLaVA-OneVision-1.5-8B-Instruct](https://huggingface.co/lmms-lab/LLaVA-OneVision-1.5-8B-Instruct)|
|[lmms-lab/LLaVA-OneVision-1.5-4B-Base](https://modelscope.cn/models/lmms-lab/LLaVA-OneVision-1.5-4B-Base)|llava_onevision1_5|llava_onevision1_5|transformers>=4.53.0, qwen_vl_utils|✘|vision|[lmms-lab/LLaVA-OneVision-1.5-4B-Base](https://huggingface.co/lmms-lab/LLaVA-OneVision-1.5-4B-Base)|
|[lmms-lab/LLaVA-OneVision-1.5-8B-Base](https://modelscope.cn/models/lmms-lab/LLaVA-OneVision-1.5-8B-Base)|llava_onevision1_5|llava_onevision1_5|transformers>=4.53.0, qwen_vl_utils|✘|vision|[lmms-lab/LLaVA-OneVision-1.5-8B-Base](https://huggingface.co/lmms-lab/LLaVA-OneVision-1.5-8B-Base)|
|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat)|deepseek_vl|deepseek_vl|-|✘|vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)|
|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat)|deepseek_vl|deepseek_vl|-|✘|vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)|
|[deepseek-ai/deepseek-vl2-tiny](https://modelscope.cn/models/deepseek-ai/deepseek-vl2-tiny)|deepseek_vl2|deepseek_vl2|transformers<4.42|✘|vision|[deepseek-ai/deepseek-vl2-tiny](https://huggingface.co/deepseek-ai/deepseek-vl2-tiny)|
|[deepseek-ai/deepseek-vl2-small](https://modelscope.cn/models/deepseek-ai/deepseek-vl2-small)|deepseek_vl2|deepseek_vl2|transformers<4.42|✘|vision|[deepseek-ai/deepseek-vl2-small](https://huggingface.co/deepseek-ai/deepseek-vl2-small)|
|[deepseek-ai/deepseek-vl2](https://modelscope.cn/models/deepseek-ai/deepseek-vl2)|deepseek_vl2|deepseek_vl2|transformers<4.42|✘|vision|[deepseek-ai/deepseek-vl2](https://huggingface.co/deepseek-ai/deepseek-vl2)|
|[deepseek-ai/Janus-1.3B](https://modelscope.cn/models/deepseek-ai/Janus-1.3B)|deepseek_janus|deepseek_janus|-|✘|vision|[deepseek-ai/Janus-1.3B](https://huggingface.co/deepseek-ai/Janus-1.3B)|
|[deepseek-ai/Janus-Pro-1B](https://modelscope.cn/models/deepseek-ai/Janus-Pro-1B)|deepseek_janus_pro|deepseek_janus_pro|-|✘|vision|[deepseek-ai/Janus-Pro-1B](https://huggingface.co/deepseek-ai/Janus-Pro-1B)|
|[deepseek-ai/Janus-Pro-7B](https://modelscope.cn/models/deepseek-ai/Janus-Pro-7B)|deepseek_janus_pro|deepseek_janus_pro|-|✘|vision|[deepseek-ai/Janus-Pro-7B](https://huggingface.co/deepseek-ai/Janus-Pro-7B)|
|[deepseek-ai/DeepSeek-OCR](https://modelscope.cn/models/deepseek-ai/DeepSeek-OCR)|deepseek_ocr|deepseek_ocr|transformers==4.46.3, easydict|✘|vision|[deepseek-ai/DeepSeek-OCR](https://huggingface.co/deepseek-ai/DeepSeek-OCR)|
|[deepseek-ai/DeepSeek-OCR-2](https://modelscope.cn/models/deepseek-ai/DeepSeek-OCR-2)|deepseek_ocr2|deepseek_ocr2|transformers==4.46.3, easydict|✘|vision|[deepseek-ai/DeepSeek-OCR-2](https://huggingface.co/deepseek-ai/DeepSeek-OCR-2)|
|[OpenBMB/MiniCPM-V](https://modelscope.cn/models/OpenBMB/MiniCPM-V)|minicpmv|minicpmv|timm, transformers<4.42|✘|vision|[openbmb/MiniCPM-V](https://huggingface.co/openbmb/MiniCPM-V)|
|[OpenBMB/MiniCPM-V-2](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2)|minicpmv|minicpmv|timm, transformers<4.42|✘|vision|[openbmb/MiniCPM-V-2](https://huggingface.co/openbmb/MiniCPM-V-2)|
|[OpenBMB/MiniCPM-Llama3-V-2_5](https://modelscope.cn/models/OpenBMB/MiniCPM-Llama3-V-2_5)|minicpmv2_5|minicpmv2_5|timm, transformers>=4.36|✘|vision|[openbmb/MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5)|
|[OpenBMB/MiniCPM-V-2_6](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6)|minicpmv2_6|minicpmv2_6|timm, transformers>=4.36, decord|✘|vision, video|[openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6)|
|[OpenBMB/MiniCPM-V-4](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4)|minicpmv4|minicpmv4|timm, transformers>=4.36, decord|✘|vision, video|[openbmb/MiniCPM-V-4](https://huggingface.co/openbmb/MiniCPM-V-4)|
|[OpenBMB/MiniCPM-V-4_5](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)|minicpmv4_5|minicpmv4_5|timm, transformers>=4.36, decord|✘|vision, video|[openbmb/MiniCPM-V-4_5](https://huggingface.co/openbmb/MiniCPM-V-4_5)|
|[OpenBMB/MiniCPM-o-2_6](https://modelscope.cn/models/OpenBMB/MiniCPM-o-2_6)|minicpmo|minicpmo|timm, transformers>=4.36, decord, soundfile|✘|vision, video, omni, audio|[openbmb/MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6)|
|[OpenBMB/MiniCPM-o-4_5](https://modelscope.cn/models/OpenBMB/MiniCPM-o-4_5)|minicpmo|minicpmo4_5|timm, transformers==4.51.3, decord, soundfile|✘|vision, video, omni, audio|[openbmb/MiniCPM-o-4_5](https://huggingface.co/openbmb/MiniCPM-o-4_5)|
|[MiniMax/MiniMax-VL-01](https://modelscope.cn/models/MiniMax/MiniMax-VL-01)|minimax_vl|minimax_vl|-|✘|vision|[MiniMaxAI/MiniMax-VL-01](https://huggingface.co/MiniMaxAI/MiniMax-VL-01)|
|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2)|mplug_owl2|mplug_owl2|transformers<4.35, icecream|✘|vision|[MAGAer13/mplug-owl2-llama2-7b](https://huggingface.co/MAGAer13/mplug-owl2-llama2-7b)|
|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1)|mplug_owl2_1|mplug_owl2|transformers<4.35, icecream|✘|vision|[Mizukiluke/mplug_owl_2_1](https://huggingface.co/Mizukiluke/mplug_owl_2_1)|
|[iic/mPLUG-Owl3-1B-241014](https://modelscope.cn/models/iic/mPLUG-Owl3-1B-241014)|mplug_owl3|mplug_owl3|transformers>=4.36, icecream, decord|✘|vision, video|[mPLUG/mPLUG-Owl3-1B-241014](https://huggingface.co/mPLUG/mPLUG-Owl3-1B-241014)|
|[iic/mPLUG-Owl3-2B-241014](https://modelscope.cn/models/iic/mPLUG-Owl3-2B-241014)|mplug_owl3|mplug_owl3|transformers>=4.36, icecream, decord|✘|vision, video|[mPLUG/mPLUG-Owl3-2B-241014](https://huggingface.co/mPLUG/mPLUG-Owl3-2B-241014)|
|[iic/mPLUG-Owl3-7B-240728](https://modelscope.cn/models/iic/mPLUG-Owl3-7B-240728)|mplug_owl3|mplug_owl3|transformers>=4.36, icecream, decord|✘|vision, video|[mPLUG/mPLUG-Owl3-7B-240728](https://huggingface.co/mPLUG/mPLUG-Owl3-7B-240728)|
|[iic/mPLUG-Owl3-7B-241101](https://modelscope.cn/models/iic/mPLUG-Owl3-7B-241101)|mplug_owl3_241101|mplug_owl3_241101|transformers>=4.36, icecream|✘|vision, video|[mPLUG/mPLUG-Owl3-7B-241101](https://huggingface.co/mPLUG/mPLUG-Owl3-7B-241101)|
|[iic/DocOwl2](https://modelscope.cn/models/iic/DocOwl2)|doc_owl2|doc_owl2|transformers>=4.36, icecream|✘|vision|[mPLUG/DocOwl2](https://huggingface.co/mPLUG/DocOwl2)|
|[BAAI/Emu3-Gen](https://modelscope.cn/models/BAAI/Emu3-Gen)|emu3_gen|emu3_gen|-|✘|t2i|[BAAI/Emu3-Gen](https://huggingface.co/BAAI/Emu3-Gen)|
|[BAAI/Emu3-Chat](https://modelscope.cn/models/BAAI/Emu3-Chat)|emu3_chat|emu3_chat|transformers>=4.44.0|✘|vision|[BAAI/Emu3-Chat](https://huggingface.co/BAAI/Emu3-Chat)|
|[stepfun-ai/GOT-OCR2_0](https://modelscope.cn/models/stepfun-ai/GOT-OCR2_0)|got_ocr2|got_ocr2|-|✘|vision|[stepfun-ai/GOT-OCR2_0](https://huggingface.co/stepfun-ai/GOT-OCR2_0)|
|[stepfun-ai/GOT-OCR-2.0-hf](https://modelscope.cn/models/stepfun-ai/GOT-OCR-2.0-hf)|got_ocr2_hf|got_ocr2_hf|-|✘|vision|[stepfun-ai/GOT-OCR-2.0-hf](https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf)|
|[stepfun-ai/Step-Audio-Chat](https://modelscope.cn/models/stepfun-ai/Step-Audio-Chat)|step_audio|step_audio|funasr, sox, conformer, openai-whisper, librosa|✘|audio|[stepfun-ai/Step-Audio-Chat](https://huggingface.co/stepfun-ai/Step-Audio-Chat)|
|[stepfun-ai/Step-Audio-2-mini](https://modelscope.cn/models/stepfun-ai/Step-Audio-2-mini)|step_audio2_mini|step_audio2_mini|transformers==4.53.3, torchaudio, librosa|✘|audio|[stepfun-ai/Step-Audio-2-mini](https://huggingface.co/stepfun-ai/Step-Audio-2-mini)|
|[moonshotai/Kimi-VL-A3B-Instruct](https://modelscope.cn/models/moonshotai/Kimi-VL-A3B-Instruct)|kimi_vl|kimi_vl|transformers<4.49|✔|-|[moonshotai/Kimi-VL-A3B-Instruct](https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct)|
|[moonshotai/Kimi-VL-A3B-Thinking](https://modelscope.cn/models/moonshotai/Kimi-VL-A3B-Thinking)|kimi_vl|kimi_vl|transformers<4.49|✔|-|[moonshotai/Kimi-VL-A3B-Thinking](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking)|
|[moonshotai/Kimi-VL-A3B-Thinking-2506](https://modelscope.cn/models/moonshotai/Kimi-VL-A3B-Thinking-2506)|kimi_vl|kimi_vl|transformers<4.49|✔|-|[moonshotai/Kimi-VL-A3B-Thinking-2506](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking-2506)|
|[Kwai-Keye/Keye-VL-8B-Preview](https://modelscope.cn/models/Kwai-Keye/Keye-VL-8B-Preview)|keye_vl|keye_vl|keye_vl_utils|✘|vision|[Kwai-Keye/Keye-VL-8B-Preview](https://huggingface.co/Kwai-Keye/Keye-VL-8B-Preview)|
|[Kwai-Keye/Keye-VL-1_5-8B](https://modelscope.cn/models/Kwai-Keye/Keye-VL-1_5-8B)|keye_vl_1_5|keye_vl_1_5|keye_vl_utils>=1.5.2, transformers==4.52.4|✘|vision|[Kwai-Keye/Keye-VL-1_5-8B](https://huggingface.co/Kwai-Keye/Keye-VL-1_5-8B)|
|[rednote-hilab/dots.ocr](https://modelscope.cn/models/rednote-hilab/dots.ocr)|dots_ocr|dots_ocr|transformers>=4.51.0|✘|-|[rednote-hilab/dots.ocr](https://huggingface.co/rednote-hilab/dots.ocr)|
|[BytedanceDouyinContent/SAIL-VL2-2B](https://modelscope.cn/models/BytedanceDouyinContent/SAIL-VL2-2B)|sail_vl2|sail_vl2|transformers<=4.51.3|✘|vision|[BytedanceDouyinContent/SAIL-VL2-2B](https://huggingface.co/BytedanceDouyinContent/SAIL-VL2-2B)|
|[BytedanceDouyinContent/SAIL-VL2-2B-Thinking](https://modelscope.cn/models/BytedanceDouyinContent/SAIL-VL2-2B-Thinking)|sail_vl2|sail_vl2|transformers<=4.51.3|✘|vision|[BytedanceDouyinContent/SAIL-VL2-2B-Thinking](https://huggingface.co/BytedanceDouyinContent/SAIL-VL2-2B-Thinking)|
|[BytedanceDouyinContent/SAIL-VL2-8B](https://modelscope.cn/models/BytedanceDouyinContent/SAIL-VL2-8B)|sail_vl2|sail_vl2|transformers<=4.51.3|✘|vision|[BytedanceDouyinContent/SAIL-VL2-8B](https://huggingface.co/BytedanceDouyinContent/SAIL-VL2-8B)|
|[BytedanceDouyinContent/SAIL-VL2-8B-Thinking](https://modelscope.cn/models/BytedanceDouyinContent/SAIL-VL2-8B-Thinking)|sail_vl2|sail_vl2|transformers<=4.51.3|✘|vision|[BytedanceDouyinContent/SAIL-VL2-8B-Thinking](https://huggingface.co/BytedanceDouyinContent/SAIL-VL2-8B-Thinking)|
|[LLM-Research/Phi-3-vision-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-vision-128k-instruct)|phi3_vision|phi3_vision|transformers>=4.36|✘|vision|[microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)|
|[LLM-Research/Phi-3.5-vision-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-vision-instruct)|phi3_vision|phi3_vision|transformers>=4.36|✘|vision|[microsoft/Phi-3.5-vision-instruct](https://huggingface.co/microsoft/Phi-3.5-vision-instruct)|
|[LLM-Research/Phi-4-multimodal-instruct](https://modelscope.cn/models/LLM-Research/Phi-4-multimodal-instruct)|phi4_multimodal|phi4_multimodal|transformers>=4.36,<4.49, backoff, soundfile|✘|vision, audio|[microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct)|
|[AI-ModelScope/Florence-2-base-ft](https://modelscope.cn/models/AI-ModelScope/Florence-2-base-ft)|florence|florence|-|✘|vision|[microsoft/Florence-2-base-ft](https://huggingface.co/microsoft/Florence-2-base-ft)|
|[AI-ModelScope/Florence-2-base](https://modelscope.cn/models/AI-ModelScope/Florence-2-base)|florence|florence|-|✘|vision|[microsoft/Florence-2-base](https://huggingface.co/microsoft/Florence-2-base)|
|[AI-ModelScope/Florence-2-large](https://modelscope.cn/models/AI-ModelScope/Florence-2-large)|florence|florence|-|✘|vision|[microsoft/Florence-2-large](https://huggingface.co/microsoft/Florence-2-large)|
|[AI-ModelScope/Florence-2-large-ft](https://modelscope.cn/models/AI-ModelScope/Florence-2-large-ft)|florence|florence|-|✘|vision|[microsoft/Florence-2-large-ft](https://huggingface.co/microsoft/Florence-2-large-ft)|
|[AI-ModelScope/Idefics3-8B-Llama3](https://modelscope.cn/models/AI-ModelScope/Idefics3-8B-Llama3)|idefics3|idefics3|transformers>=4.45|✘|vision|[HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3)|
|[AI-ModelScope/paligemma-3b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-224)|paligemma|paligemma|transformers>=4.41|✘|vision|[google/paligemma-3b-pt-224](https://huggingface.co/google/paligemma-3b-pt-224)|
|[AI-ModelScope/paligemma-3b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-448)|paligemma|paligemma|transformers>=4.41|✘|vision|[google/paligemma-3b-pt-448](https://huggingface.co/google/paligemma-3b-pt-448)|
|[AI-ModelScope/paligemma-3b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-896)|paligemma|paligemma|transformers>=4.41|✘|vision|[google/paligemma-3b-pt-896](https://huggingface.co/google/paligemma-3b-pt-896)|
|[AI-ModelScope/paligemma-3b-mix-224](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-mix-224)|paligemma|paligemma|transformers>=4.41|✘|vision|[google/paligemma-3b-mix-224](https://huggingface.co/google/paligemma-3b-mix-224)|
|[AI-ModelScope/paligemma-3b-mix-448](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-mix-448)|paligemma|paligemma|transformers>=4.41|✘|vision|[google/paligemma-3b-mix-448](https://huggingface.co/google/paligemma-3b-mix-448)|
|[AI-ModelScope/paligemma2-3b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma2-3b-pt-224)|paligemma|paligemma|transformers>=4.41|✘|vision|[google/paligemma2-3b-pt-224](https://huggingface.co/google/paligemma2-3b-pt-224)|
|[AI-ModelScope/paligemma2-3b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-3b-pt-448)|paligemma|paligemma|transformers>=4.41|✘|vision|[google/paligemma2-3b-pt-448](https://huggingface.co/google/paligemma2-3b-pt-448)|
|[AI-ModelScope/paligemma2-3b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma2-3b-pt-896)|paligemma|paligemma|transformers>=4.41|✘|vision|[google/paligemma2-3b-pt-896](https://huggingface.co/google/paligemma2-3b-pt-896)|
|[AI-ModelScope/paligemma2-10b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma2-10b-pt-224)|paligemma|paligemma|transformers>=4.41|✘|vision|[google/paligemma2-10b-pt-224](https://huggingface.co/google/paligemma2-10b-pt-224)|
|[AI-ModelScope/paligemma2-10b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-10b-pt-448)|paligemma|paligemma|transformers>=4.41|✘|vision|[google/paligemma2-10b-pt-448](https://huggingface.co/google/paligemma2-10b-pt-448)|
|[AI-ModelScope/paligemma2-10b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma2-10b-pt-896)|paligemma|paligemma|transformers>=4.41|✘|vision|[google/paligemma2-10b-pt-896](https://huggingface.co/google/paligemma2-10b-pt-896)|
|[AI-ModelScope/paligemma2-28b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma2-28b-pt-224)|paligemma|paligemma|transformers>=4.41|✘|vision|[google/paligemma2-28b-pt-224](https://huggingface.co/google/paligemma2-28b-pt-224)|
|[AI-ModelScope/paligemma2-28b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-28b-pt-448)|paligemma|paligemma|transformers>=4.41|✘|vision|[google/paligemma2-28b-pt-448](https://huggingface.co/google/paligemma2-28b-pt-448)|
|[AI-ModelScope/paligemma2-28b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma2-28b-pt-896)|paligemma|paligemma|transformers>=4.41|✘|vision|[google/paligemma2-28b-pt-896](https://huggingface.co/google/paligemma2-28b-pt-896)|
|[AI-ModelScope/paligemma2-3b-ft-docci-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-3b-ft-docci-448)|paligemma|paligemma|transformers>=4.41|✘|vision|[google/paligemma2-3b-ft-docci-448](https://huggingface.co/google/paligemma2-3b-ft-docci-448)|
|[AI-ModelScope/paligemma2-10b-ft-docci-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-10b-ft-docci-448)|paligemma|paligemma|transformers>=4.41|✘|vision|[google/paligemma2-10b-ft-docci-448](https://huggingface.co/google/paligemma2-10b-ft-docci-448)|
|[LLM-Research/Molmo-7B-O-0924](https://modelscope.cn/models/LLM-Research/Molmo-7B-O-0924)|molmo|molmo|transformers>=4.45|✘|vision|[allenai/Molmo-7B-O-0924](https://huggingface.co/allenai/Molmo-7B-O-0924)|
|[LLM-Research/Molmo-7B-D-0924](https://modelscope.cn/models/LLM-Research/Molmo-7B-D-0924)|molmo|molmo|transformers>=4.45|✘|vision|[allenai/Molmo-7B-D-0924](https://huggingface.co/allenai/Molmo-7B-D-0924)|
|[LLM-Research/Molmo-72B-0924](https://modelscope.cn/models/LLM-Research/Molmo-72B-0924)|molmo|molmo|transformers>=4.45|✘|vision|[allenai/Molmo-72B-0924](https://huggingface.co/allenai/Molmo-72B-0924)|
|[LLM-Research/MolmoE-1B-0924](https://modelscope.cn/models/LLM-Research/MolmoE-1B-0924)|molmoe|molmo|transformers>=4.45|✘|vision|[allenai/MolmoE-1B-0924](https://huggingface.co/allenai/MolmoE-1B-0924)|
|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b)|pixtral|pixtral|transformers>=4.45|✘|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)|
|[InfiniAI/Megrez-3B-Omni](https://modelscope.cn/models/InfiniAI/Megrez-3B-Omni)|megrez_omni|megrez_omni|-|✘|vision, audio|[Infinigence/Megrez-3B-Omni](https://huggingface.co/Infinigence/Megrez-3B-Omni)|
|[bytedance-research/Valley-Eagle-7B](https://modelscope.cn/models/bytedance-research/Valley-Eagle-7B)|valley|valley|transformers>=4.42, av|✘|vision|-|
|[LLM-Research/gemma-3-4b-pt](https://modelscope.cn/models/LLM-Research/gemma-3-4b-pt)|gemma3_vision|gemma3_vision|transformers>=4.49|✘|-|[google/gemma-3-4b-pt](https://huggingface.co/google/gemma-3-4b-pt)|
|[LLM-Research/gemma-3-4b-it](https://modelscope.cn/models/LLM-Research/gemma-3-4b-it)|gemma3_vision|gemma3_vision|transformers>=4.49|✘|-|[google/gemma-3-4b-it](https://huggingface.co/google/gemma-3-4b-it)|
|[LLM-Research/gemma-3-12b-pt](https://modelscope.cn/models/LLM-Research/gemma-3-12b-pt)|gemma3_vision|gemma3_vision|transformers>=4.49|✘|-|[google/gemma-3-12b-pt](https://huggingface.co/google/gemma-3-12b-pt)|
|[LLM-Research/gemma-3-12b-it](https://modelscope.cn/models/LLM-Research/gemma-3-12b-it)|gemma3_vision|gemma3_vision|transformers>=4.49|✘|-|[google/gemma-3-12b-it](https://huggingface.co/google/gemma-3-12b-it)|
|[LLM-Research/gemma-3-27b-pt](https://modelscope.cn/models/LLM-Research/gemma-3-27b-pt)|gemma3_vision|gemma3_vision|transformers>=4.49|✘|-|[google/gemma-3-27b-pt](https://huggingface.co/google/gemma-3-27b-pt)|
|[LLM-Research/gemma-3-27b-it](https://modelscope.cn/models/LLM-Research/gemma-3-27b-it)|gemma3_vision|gemma3_vision|transformers>=4.49|✘|-|[google/gemma-3-27b-it](https://huggingface.co/google/gemma-3-27b-it)|
|[google/medgemma-4b-pt](https://modelscope.cn/models/google/medgemma-4b-pt)|gemma3_vision|gemma3_vision|transformers>=4.49|✘|-|[google/medgemma-4b-pt](https://huggingface.co/google/medgemma-4b-pt)|
|[google/medgemma-4b-it](https://modelscope.cn/models/google/medgemma-4b-it)|gemma3_vision|gemma3_vision|transformers>=4.49|✘|-|[google/medgemma-4b-it](https://huggingface.co/google/medgemma-4b-it)|
|[google/medgemma-27b-it](https://modelscope.cn/models/google/medgemma-27b-it)|gemma3_vision|gemma3_vision|transformers>=4.49|✘|-|[google/medgemma-27b-it](https://huggingface.co/google/medgemma-27b-it)|
|[google/gemma-3n-E2B](https://modelscope.cn/models/google/gemma-3n-E2B)|gemma3n|gemma3n|transformers>=4.53.1|✘|-|[google/gemma-3n-E2B](https://huggingface.co/google/gemma-3n-E2B)|
|[google/gemma-3n-E4B](https://modelscope.cn/models/google/gemma-3n-E4B)|gemma3n|gemma3n|transformers>=4.53.1|✘|-|[google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B)|
|[google/gemma-3n-E2B-it](https://modelscope.cn/models/google/gemma-3n-E2B-it)|gemma3n|gemma3n|transformers>=4.53.1|✘|-|[google/gemma-3n-E2B-it](https://huggingface.co/google/gemma-3n-E2B-it)|
|[google/gemma-3n-E4B-it](https://modelscope.cn/models/google/gemma-3n-E4B-it)|gemma3n|gemma3n|transformers>=4.53.1|✘|-|[google/gemma-3n-E4B-it](https://huggingface.co/google/gemma-3n-E4B-it)|
|[mistralai/Mistral-Small-3.1-24B-Base-2503](https://modelscope.cn/models/mistralai/Mistral-Small-3.1-24B-Base-2503)|mistral3|mistral_2503|transformers>=4.49|✘|vision|[mistralai/Mistral-Small-3.1-24B-Base-2503](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503)|
|[mistralai/Mistral-Small-3.1-24B-Instruct-2503](https://modelscope.cn/models/mistralai/Mistral-Small-3.1-24B-Instruct-2503)|mistral3|mistral_2503|transformers>=4.49|✘|vision|[mistralai/Mistral-Small-3.1-24B-Instruct-2503](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503)|
|[mistralai/Ministral-3-3B-Base-2512](https://modelscope.cn/models/mistralai/Ministral-3-3B-Base-2512)|mistral3|mistral_2512|transformers>=5.0.0.dev0, mistral-common>=1.8.6|✘|vision|[mistralai/Ministral-3-3B-Base-2512](https://huggingface.co/mistralai/Ministral-3-3B-Base-2512)|
|[mistralai/Ministral-3-3B-Instruct-2512](https://modelscope.cn/models/mistralai/Ministral-3-3B-Instruct-2512)|mistral3|mistral_2512|transformers>=5.0.0.dev0, mistral-common>=1.8.6|✘|vision|[mistralai/Ministral-3-3B-Instruct-2512](https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512)|
|[mistralai/Ministral-3-3B-Instruct-2512-BF16](https://modelscope.cn/models/mistralai/Ministral-3-3B-Instruct-2512-BF16)|mistral3|mistral_2512|transformers>=5.0.0.dev0, mistral-common>=1.8.6|✘|vision|[mistralai/Ministral-3-3B-Instruct-2512-BF16](https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512-BF16)|
|[mistralai/Ministral-3-8B-Base-2512](https://modelscope.cn/models/mistralai/Ministral-3-8B-Base-2512)|mistral3|mistral_2512|transformers>=5.0.0.dev0, mistral-common>=1.8.6|✘|vision|[mistralai/Ministral-3-8B-Base-2512](https://huggingface.co/mistralai/Ministral-3-8B-Base-2512)|
|[mistralai/Ministral-3-8B-Instruct-2512](https://modelscope.cn/models/mistralai/Ministral-3-8B-Instruct-2512)|mistral3|mistral_2512|transformers>=5.0.0.dev0, mistral-common>=1.8.6|✘|vision|[mistralai/Ministral-3-8B-Instruct-2512](https://huggingface.co/mistralai/Ministral-3-8B-Instruct-2512)|
|[mistralai/Ministral-3-8B-Instruct-2512-BF16](https://modelscope.cn/models/mistralai/Ministral-3-8B-Instruct-2512-BF16)|mistral3|mistral_2512|transformers>=5.0.0.dev0, mistral-common>=1.8.6|✘|vision|[mistralai/Ministral-3-8B-Instruct-2512-BF16](https://huggingface.co/mistralai/Ministral-3-8B-Instruct-2512-BF16)|
|[mistralai/Ministral-3-14B-Base-2512](https://modelscope.cn/models/mistralai/Ministral-3-14B-Base-2512)|mistral3|mistral_2512|transformers>=5.0.0.dev0, mistral-common>=1.8.6|✘|vision|[mistralai/Ministral-3-14B-Base-2512](https://huggingface.co/mistralai/Ministral-3-14B-Base-2512)|
|[mistralai/Ministral-3-14B-Instruct-2512](https://modelscope.cn/models/mistralai/Ministral-3-14B-Instruct-2512)|mistral3|mistral_2512|transformers>=5.0.0.dev0, mistral-common>=1.8.6|✘|vision|[mistralai/Ministral-3-14B-Instruct-2512](https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512)|
|[mistralai/Ministral-3-14B-Instruct-2512-BF16](https://modelscope.cn/models/mistralai/Ministral-3-14B-Instruct-2512-BF16)|mistral3|mistral_2512|transformers>=5.0.0.dev0, mistral-common>=1.8.6|✘|vision|[mistralai/Ministral-3-14B-Instruct-2512-BF16](https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512-BF16)|
|[mistralai/Ministral-3-3B-Reasoning-2512](https://modelscope.cn/models/mistralai/Ministral-3-3B-Reasoning-2512)|mistral3|mistral_2512_thinking|transformers>=5.0.0.dev0, mistral-common>=1.8.6|✘|vision|[mistralai/Ministral-3-3B-Reasoning-2512](https://huggingface.co/mistralai/Ministral-3-3B-Reasoning-2512)|
|[mistralai/Ministral-3-8B-Reasoning-2512](https://modelscope.cn/models/mistralai/Ministral-3-8B-Reasoning-2512)|mistral3|mistral_2512_thinking|transformers>=5.0.0.dev0, mistral-common>=1.8.6|✘|vision|[mistralai/Ministral-3-8B-Reasoning-2512](https://huggingface.co/mistralai/Ministral-3-8B-Reasoning-2512)|
|[mistralai/Ministral-3-14B-Reasoning-2512](https://modelscope.cn/models/mistralai/Ministral-3-14B-Reasoning-2512)|mistral3|mistral_2512_thinking|transformers>=5.0.0.dev0, mistral-common>=1.8.6|✘|vision|[mistralai/Ministral-3-14B-Reasoning-2512](https://huggingface.co/mistralai/Ministral-3-14B-Reasoning-2512)|
|[mistralai/Mistral-Small-3.2-24B-Instruct-2506](https://modelscope.cn/models/mistralai/Mistral-Small-3.2-24B-Instruct-2506)|mistral3_2506|mistral_2506|transformers>=4.49|✘|-|[mistralai/Mistral-Small-3.2-24B-Instruct-2506](https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506)|
|[PaddlePaddle/PaddleOCR-VL](https://modelscope.cn/models/PaddlePaddle/PaddleOCR-VL)|paddle_ocr|paddle_ocr|-|✘|-|[PaddlePaddle/PaddleOCR-VL](https://huggingface.co/PaddlePaddle/PaddleOCR-VL)|
|[PaddlePaddle/PaddleOCR-VL-1.5](https://modelscope.cn/models/PaddlePaddle/PaddleOCR-VL-1.5)|paddle_ocr_1_5|paddle_ocr_1_5|-|✘|-|[PaddlePaddle/PaddleOCR-VL-1.5](https://huggingface.co/PaddlePaddle/PaddleOCR-VL-1.5)|
|[Tencent-Hunyuan/HunyuanOCR](https://modelscope.cn/models/Tencent-Hunyuan/HunyuanOCR)|hunyuan_ocr|hunyuan_ocr|transformers>=4.49.0|✘|-|[tencent/HunyuanOCR](https://huggingface.co/tencent/HunyuanOCR)|
|[stepfun-ai/Step3-VL-10B-Base](https://modelscope.cn/models/stepfun-ai/Step3-VL-10B-Base)|step3_vl|step3_vl|transformers>=4.57.0|✘|vision|[stepfun-ai/Step3-VL-10B-Base](https://huggingface.co/stepfun-ai/Step3-VL-10B-Base)|
|[stepfun-ai/Step3-VL-10B](https://modelscope.cn/models/stepfun-ai/Step3-VL-10B)|step3_vl|step3_vl|transformers>=4.57.0|✘|vision|[stepfun-ai/Step3-VL-10B](https://huggingface.co/stepfun-ai/Step3-VL-10B)|
|[JinaAI/jina-reranker-m0](https://modelscope.cn/models/JinaAI/jina-reranker-m0)|jina_reranker_m0|jina_reranker_m0|-|✘|reranker, vision|[JinaAI/jina-reranker-m0](https://huggingface.co/JinaAI/jina-reranker-m0)|
## 数据集
下表介绍了ms-swift接入的数据集的相关信息:
- Dataset ID: ModelScope数据集id
- HF Dataset ID: HuggingFace数据集id
- Subset Name: 子数据集名称
- Dataset Size: 数据集大小
- Statistic: 数据集的统计量. 我们使用token数进行统计, 这对于调整`max_length`超参数有帮助. 统计量的格式为`均值±标准差, min=最小值, max=最大值`; 对于"huge dataset", 不提供统计量, 以`-`表示. 我们使用qwen2.5的tokenizer对数据集进行分词. 不同tokenizer得到的统计量不同, 如果你需要获取其他模型的tokenizer对应的token统计量, 可以通过[脚本](https://github.com/modelscope/ms-swift/blob/main/scripts/utils/run_dataset_info.py)自行获取.
- Tags: 数据集的tags
| Dataset ID | Subset Name | Dataset Size | Statistic (token) | Tags | HF Dataset ID |
| ---------- | ----------- | -------------| ------------------| ---- | ------------- |
|[AI-MO/NuminaMath-1.5](https://modelscope.cn/datasets/AI-MO/NuminaMath-1.5)|default|896215|116.1±80.8, min=31, max=5064|grpo, math|[AI-MO/NuminaMath-1.5](https://huggingface.co/datasets/AI-MO/NuminaMath-1.5)|
|[AI-MO/NuminaMath-CoT](https://modelscope.cn/datasets/AI-MO/NuminaMath-CoT)|default|859494|113.1±60.2, min=35, max=2120|grpo, math|[AI-MO/NuminaMath-CoT](https://huggingface.co/datasets/AI-MO/NuminaMath-CoT)|
|[AI-MO/NuminaMath-TIR](https://modelscope.cn/datasets/AI-MO/NuminaMath-TIR)|default|72441|100.9±52.2, min=36, max=1683|grpo, math, 🔥|[AI-MO/NuminaMath-TIR](https://huggingface.co/datasets/AI-MO/NuminaMath-TIR)|
|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA)|chinese_traditional coig_pc exam finance douban human_value logi_qa ruozhiba segmentfault wiki wikihow xhs zhihu|44694|331.2±693.8, min=34, max=19288|general, 🔥|-|
|[AI-ModelScope/CodeAlpaca-20k](https://modelscope.cn/datasets/AI-ModelScope/CodeAlpaca-20k)|default|20022|99.3±57.6, min=30, max=857|code, en|[HuggingFaceH4/CodeAlpaca_20K](https://huggingface.co/datasets/HuggingFaceH4/CodeAlpaca_20K)|
|[AI-ModelScope/DISC-Law-SFT](https://modelscope.cn/datasets/AI-ModelScope/DISC-Law-SFT)|default|166758|1799.0±474.9, min=769, max=3151|chat, law, 🔥|[ShengbinYue/DISC-Law-SFT](https://huggingface.co/datasets/ShengbinYue/DISC-Law-SFT)|
|[AI-ModelScope/DISC-Med-SFT](https://modelscope.cn/datasets/AI-ModelScope/DISC-Med-SFT)|default|464885|426.5±178.7, min=110, max=1383|chat, medical, 🔥|[Flmc/DISC-Med-SFT](https://huggingface.co/datasets/Flmc/DISC-Med-SFT)|
|[AI-ModelScope/Duet-v0.5](https://modelscope.cn/datasets/AI-ModelScope/Duet-v0.5)|default|5000|1157.4±189.3, min=657, max=2344|CoT, en|[G-reen/Duet-v0.5](https://huggingface.co/datasets/G-reen/Duet-v0.5)|
|[AI-ModelScope/GuanacoDataset](https://modelscope.cn/datasets/AI-ModelScope/GuanacoDataset)|default|31563|250.3±70.6, min=95, max=987|chat, zh|[JosephusCheung/GuanacoDataset](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)|
|[AI-ModelScope/LLaVA-Instruct-150K](https://modelscope.cn/datasets/AI-ModelScope/LLaVA-Instruct-150K)|default|623302|630.7±143.0, min=301, max=1166|chat, multi-modal, vision|-|
|[AI-ModelScope/LLaVA-Pretrain](https://modelscope.cn/datasets/AI-ModelScope/LLaVA-Pretrain)|default|huge dataset|-|chat, multi-modal, quality|[liuhaotian/LLaVA-Pretrain](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain)|
|[AI-ModelScope/LaTeX_OCR](https://modelscope.cn/datasets/AI-ModelScope/LaTeX_OCR)|default human_handwrite human_handwrite_print synthetic_handwrite small|162149|117.6±44.9, min=41, max=312|chat, ocr, multi-modal, vision|[linxy/LaTeX_OCR](https://huggingface.co/datasets/linxy/LaTeX_OCR)|
|[AI-ModelScope/LongAlpaca-12k](https://modelscope.cn/datasets/AI-ModelScope/LongAlpaca-12k)|default|11998|9941.8±3417.1, min=4695, max=25826|long-sequence, QA|[Yukang/LongAlpaca-12k](https://huggingface.co/datasets/Yukang/LongAlpaca-12k)|
|[AI-ModelScope/M3IT](https://modelscope.cn/datasets/AI-ModelScope/M3IT)|coco vqa-v2 shapes shapes-rephrased coco-goi-rephrased snli-ve snli-ve-rephrased okvqa a-okvqa viquae textcap docvqa science-qa imagenet imagenet-open-ended imagenet-rephrased coco-goi clevr clevr-rephrased nlvr coco-itm coco-itm-rephrased vsr vsr-rephrased mocheg mocheg-rephrased coco-text fm-iqa activitynet-qa msrvtt ss coco-cn refcoco refcoco-rephrased multi30k image-paragraph-captioning visual-dialog visual-dialog-rephrased iqa vcr visual-mrc ivqa msrvtt-qa msvd-qa gqa text-vqa ocr-vqa st-vqa flickr8k-cn|huge dataset|-|chat, multi-modal, vision|-|
|[AI-ModelScope/MATH-lighteval](https://modelscope.cn/datasets/AI-ModelScope/MATH-lighteval)|default|7500|104.4±92.8, min=36, max=1683|grpo, math|[DigitalLearningGmbH/MATH-lighteval](https://huggingface.co/datasets/DigitalLearningGmbH/MATH-lighteval)|
|[AI-ModelScope/Magpie-Qwen2-Pro-200K-Chinese](https://modelscope.cn/datasets/AI-ModelScope/Magpie-Qwen2-Pro-200K-Chinese)|default|200000|448.4±223.5, min=87, max=4098|chat, sft, 🔥, zh|[Magpie-Align/Magpie-Qwen2-Pro-200K-Chinese](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2-Pro-200K-Chinese)|
|[AI-ModelScope/Magpie-Qwen2-Pro-200K-English](https://modelscope.cn/datasets/AI-ModelScope/Magpie-Qwen2-Pro-200K-English)|default|200000|609.9±277.1, min=257, max=4098|chat, sft, 🔥, en|[Magpie-Align/Magpie-Qwen2-Pro-200K-English](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2-Pro-200K-English)|
|[AI-ModelScope/Magpie-Qwen2-Pro-300K-Filtered](https://modelscope.cn/datasets/AI-ModelScope/Magpie-Qwen2-Pro-300K-Filtered)|default|300000|556.6±288.6, min=175, max=4098|chat, sft, 🔥|[Magpie-Align/Magpie-Qwen2-Pro-300K-Filtered](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2-Pro-300K-Filtered)|
|[AI-ModelScope/MathInstruct](https://modelscope.cn/datasets/AI-ModelScope/MathInstruct)|default|262040|253.3±177.4, min=42, max=2193|math, cot, en, quality|[TIGER-Lab/MathInstruct](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)|
|[AI-ModelScope/MovieChat-1K-test](https://modelscope.cn/datasets/AI-ModelScope/MovieChat-1K-test)|default|162|39.7±2.0, min=32, max=43|chat, multi-modal, video|[Enxin/MovieChat-1K-test](https://huggingface.co/datasets/Enxin/MovieChat-1K-test)|
|[AI-ModelScope/Open-Platypus](https://modelscope.cn/datasets/AI-ModelScope/Open-Platypus)|default|24926|389.0±256.4, min=55, max=3153|chat, math, quality|[garage-bAInd/Open-Platypus](https://huggingface.co/datasets/garage-bAInd/Open-Platypus)|
|[AI-ModelScope/OpenO1-SFT](https://modelscope.cn/datasets/AI-ModelScope/OpenO1-SFT)|default|125894|1080.7±622.9, min=145, max=11637|chat, general, o1|[O1-OPEN/OpenO1-SFT](https://huggingface.co/datasets/O1-OPEN/OpenO1-SFT)|
|[AI-ModelScope/OpenOrca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca)|default 3_5M|huge dataset|-|chat, multilingual, general|-|
|[AI-ModelScope/OpenOrca-Chinese](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca-Chinese)|default|huge dataset|-|QA, zh, general, quality|[yys/OpenOrca-Chinese](https://huggingface.co/datasets/yys/OpenOrca-Chinese)|
|[AI-ModelScope/SFT-Nectar](https://modelscope.cn/datasets/AI-ModelScope/SFT-Nectar)|default|131201|441.9±307.0, min=45, max=3136|cot, en, quality|[AstraMindAI/SFT-Nectar](https://huggingface.co/datasets/AstraMindAI/SFT-Nectar)|
|[AI-ModelScope/ShareGPT-4o](https://modelscope.cn/datasets/AI-ModelScope/ShareGPT-4o)|image_caption|57289|599.8±140.4, min=214, max=1932|vqa, multi-modal|[OpenGVLab/ShareGPT-4o](https://huggingface.co/datasets/OpenGVLab/ShareGPT-4o)|
|[AI-ModelScope/ShareGPT4V](https://modelscope.cn/datasets/AI-ModelScope/ShareGPT4V)|ShareGPT4V ShareGPT4V-PT|huge dataset|-|chat, multi-modal, vision|-|
|[AI-ModelScope/SkyPile-150B](https://modelscope.cn/datasets/AI-ModelScope/SkyPile-150B)|default|huge dataset|-|pretrain, quality, zh|[Skywork/SkyPile-150B](https://huggingface.co/datasets/Skywork/SkyPile-150B)|
|[AI-ModelScope/WizardLM_evol_instruct_V2_196k](https://modelscope.cn/datasets/AI-ModelScope/WizardLM_evol_instruct_V2_196k)|default|109184|483.3±338.4, min=27, max=3735|chat, en|[WizardLM/WizardLM_evol_instruct_V2_196k](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)|
|[AI-ModelScope/alpaca-cleaned](https://modelscope.cn/datasets/AI-ModelScope/alpaca-cleaned)|default|51760|170.1±122.9, min=29, max=1028|chat, general, bench, quality|[yahma/alpaca-cleaned](https://huggingface.co/datasets/yahma/alpaca-cleaned)|
|[AI-ModelScope/alpaca-gpt4-data-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en)|default|52002|167.6±123.9, min=29, max=607|chat, general, 🔥|[vicgalle/alpaca-gpt4](https://huggingface.co/datasets/vicgalle/alpaca-gpt4)|
|[AI-ModelScope/alpaca-gpt4-data-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh)|default|48818|157.2±93.2, min=27, max=544|chat, general, 🔥|[llm-wizard/alpaca-gpt4-data-zh](https://huggingface.co/datasets/llm-wizard/alpaca-gpt4-data-zh)|
|[AI-ModelScope/blossom-math-v2](https://modelscope.cn/datasets/AI-ModelScope/blossom-math-v2)|default|10000|175.4±59.1, min=35, max=563|chat, math, 🔥|[Azure99/blossom-math-v2](https://huggingface.co/datasets/Azure99/blossom-math-v2)|
|[AI-ModelScope/captcha-images](https://modelscope.cn/datasets/AI-ModelScope/captcha-images)|default|8000|47.0±0.0, min=47, max=47|chat, multi-modal, vision|-|
|[AI-ModelScope/chartqa_digit_r1v_format](https://modelscope.cn/datasets/AI-ModelScope/chartqa_digit_r1v_format)|default|11399|48.3±5.1, min=37, max=82|grpo|[zyang39/chartqa_digit_r1v_format](https://huggingface.co/datasets/zyang39/chartqa_digit_r1v_format)|
|[AI-ModelScope/clevr_cogen_a_train](https://modelscope.cn/datasets/AI-ModelScope/clevr_cogen_a_train)|default|70000|67.0±0.0, min=67, max=67|qa, math, vision, grpo|[leonardPKU/clevr_cogen_a_train](https://huggingface.co/datasets/leonardPKU/clevr_cogen_a_train)|
|[AI-ModelScope/coco](https://modelscope.cn/datasets/AI-ModelScope/coco)|default|huge dataset|-|multi-modal, en, vqa, quality|[detection-datasets/coco](https://huggingface.co/datasets/detection-datasets/coco)|
|[AI-ModelScope/databricks-dolly-15k](https://modelscope.cn/datasets/AI-ModelScope/databricks-dolly-15k)|default|15011|199.0±268.8, min=26, max=5987|multi-task, en, quality|[databricks/databricks-dolly-15k](https://huggingface.co/datasets/databricks/databricks-dolly-15k)|
|[AI-ModelScope/deepctrl-sft-data](https://modelscope.cn/datasets/AI-ModelScope/deepctrl-sft-data)|default en|huge dataset|-|chat, general, sft, multi-round|-|
|[AI-ModelScope/egoschema](https://modelscope.cn/datasets/AI-ModelScope/egoschema)|default cls|101|191.6±80.7, min=96, max=435|chat, multi-modal, video|[lmms-lab/egoschema](https://huggingface.co/datasets/lmms-lab/egoschema)|
|[AI-ModelScope/firefly-train-1.1M](https://modelscope.cn/datasets/AI-ModelScope/firefly-train-1.1M)|default|1649399|204.3±365.3, min=28, max=9306|chat, general|[YeungNLP/firefly-train-1.1M](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M)|
|[AI-ModelScope/function-calling-chatml](https://modelscope.cn/datasets/AI-ModelScope/function-calling-chatml)|default|112958|465.3±320.1, min=36, max=6106|agent, en, sft, 🔥|[Locutusque/function-calling-chatml](https://huggingface.co/datasets/Locutusque/function-calling-chatml)|
|[AI-ModelScope/generated_chat_0.4M](https://modelscope.cn/datasets/AI-ModelScope/generated_chat_0.4M)|default|396004|272.7±51.1, min=78, max=579|chat, character-dialogue|[BelleGroup/generated_chat_0.4M](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)|
|[AI-ModelScope/guanaco_belle_merge_v1.0](https://modelscope.cn/datasets/AI-ModelScope/guanaco_belle_merge_v1.0)|default|693987|133.8±93.5, min=30, max=1872|QA, zh|[Chinese-Vicuna/guanaco_belle_merge_v1.0](https://huggingface.co/datasets/Chinese-Vicuna/guanaco_belle_merge_v1.0)|
|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf)|helpful-base helpful-online helpful-rejection-sampled|huge dataset|-|rlhf, dpo|-|
|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn)|hh_rlhf harmless_base_cn harmless_base_en helpful_base_cn helpful_base_en|362909|142.3±107.5, min=25, max=1571|rlhf, dpo, 🔥|-|
|[AI-ModelScope/lawyer_llama_data](https://modelscope.cn/datasets/AI-ModelScope/lawyer_llama_data)|default|21476|224.4±83.9, min=69, max=832|chat, law|[Skepsun/lawyer_llama_data](https://huggingface.co/datasets/Skepsun/lawyer_llama_data)|
|[AI-ModelScope/leetcode-solutions-python](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python)|default|2359|723.8±233.5, min=259, max=2117|chat, coding, 🔥|-|
|[AI-ModelScope/lmsys-chat-1m](https://modelscope.cn/datasets/AI-ModelScope/lmsys-chat-1m)|default|166211|545.8±3272.8, min=22, max=219116|chat, en|[lmsys/lmsys-chat-1m](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)|
|[AI-ModelScope/math-trn-format](https://modelscope.cn/datasets/AI-ModelScope/math-trn-format)|default|11500|102.2±88.9, min=36, max=1683|math|-|
|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric)|default addition|30000|615.7±198.7, min=251, max=2055|chat, agent, multi-round, 🔥|-|
|[AI-ModelScope/orpo-dpo-mix-40k](https://modelscope.cn/datasets/AI-ModelScope/orpo-dpo-mix-40k)|default|43666|938.1±694.2, min=36, max=8483|dpo, orpo, en, quality|[mlabonne/orpo-dpo-mix-40k](https://huggingface.co/datasets/mlabonne/orpo-dpo-mix-40k)|
|[AI-ModelScope/pile](https://modelscope.cn/datasets/AI-ModelScope/pile)|default|huge dataset|-|pretrain|[EleutherAI/pile](https://huggingface.co/datasets/EleutherAI/pile)|
|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba)|post-annual title-good title-norm|85658|40.0±18.3, min=22, max=559|pretrain, 🔥|-|
|[AI-ModelScope/school_math_0.25M](https://modelscope.cn/datasets/AI-ModelScope/school_math_0.25M)|default|248481|158.8±73.4, min=39, max=980|chat, math, quality|[BelleGroup/school_math_0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)|
|[AI-ModelScope/sharegpt_gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4)|default V3_format zh_38K_format|103329|3476.6±5959.0, min=33, max=115132|chat, multilingual, general, multi-round, gpt4, 🔥|-|
|[AI-ModelScope/sql-create-context](https://modelscope.cn/datasets/AI-ModelScope/sql-create-context)|default|78577|82.7±31.5, min=36, max=282|chat, sql, 🔥|[b-mc2/sql-create-context](https://huggingface.co/datasets/b-mc2/sql-create-context)|
|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired)|default|huge dataset|-|rlhf, dpo, pairwise|[lvwerra/stack-exchange-paired](https://huggingface.co/datasets/lvwerra/stack-exchange-paired)|
|[AI-ModelScope/starcoderdata](https://modelscope.cn/datasets/AI-ModelScope/starcoderdata)|default|huge dataset|-|pretrain, quality|[bigcode/starcoderdata](https://huggingface.co/datasets/bigcode/starcoderdata)|
|[AI-ModelScope/synthetic_text_to_sql](https://modelscope.cn/datasets/AI-ModelScope/synthetic_text_to_sql)|default|100000|221.8±69.9, min=64, max=616|nl2sql, en|[gretelai/synthetic_text_to_sql](https://huggingface.co/datasets/gretelai/synthetic_text_to_sql)|
|[AI-ModelScope/texttosqlv2_25000_v2](https://modelscope.cn/datasets/AI-ModelScope/texttosqlv2_25000_v2)|default|25000|277.3±328.3, min=40, max=1971|chat, sql|[Clinton/texttosqlv2_25000_v2](https://huggingface.co/datasets/Clinton/texttosqlv2_25000_v2)|
|[AI-ModelScope/the-stack](https://modelscope.cn/datasets/AI-ModelScope/the-stack)|default|huge dataset|-|pretrain, quality|[bigcode/the-stack](https://huggingface.co/datasets/bigcode/the-stack)|
|[AI-ModelScope/tigerbot-law-plugin](https://modelscope.cn/datasets/AI-ModelScope/tigerbot-law-plugin)|default|55895|104.9±51.0, min=43, max=1087|text-generation, law, pretrained|[TigerResearch/tigerbot-law-plugin](https://huggingface.co/datasets/TigerResearch/tigerbot-law-plugin)|
|[AI-ModelScope/train_0.5M_CN](https://modelscope.cn/datasets/AI-ModelScope/train_0.5M_CN)|default|519255|128.4±87.4, min=31, max=936|common, zh, quality|[BelleGroup/train_0.5M_CN](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN)|
|[AI-ModelScope/train_1M_CN](https://modelscope.cn/datasets/AI-ModelScope/train_1M_CN)|default|huge dataset|-|common, zh, quality|[BelleGroup/train_1M_CN](https://huggingface.co/datasets/BelleGroup/train_1M_CN)|
|[AI-ModelScope/train_2M_CN](https://modelscope.cn/datasets/AI-ModelScope/train_2M_CN)|default|huge dataset|-|common, zh, quality|[BelleGroup/train_2M_CN](https://huggingface.co/datasets/BelleGroup/train_2M_CN)|
|[AI-ModelScope/tulu-v2-sft-mixture](https://modelscope.cn/datasets/AI-ModelScope/tulu-v2-sft-mixture)|default|326154|523.3±439.3, min=68, max=2549|chat, multilingual, general, multi-round|[allenai/tulu-v2-sft-mixture](https://huggingface.co/datasets/allenai/tulu-v2-sft-mixture)|
|[AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto](https://modelscope.cn/datasets/AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto)|default|230720|471.5±274.3, min=27, max=2232|rlhf, kto|-|
|[AI-ModelScope/webnovel_cn](https://modelscope.cn/datasets/AI-ModelScope/webnovel_cn)|default|50000|1455.2±12489.4, min=524, max=490480|chat, novel|[zxbsmk/webnovel_cn](https://huggingface.co/datasets/zxbsmk/webnovel_cn)|
|[AI-ModelScope/wikipedia-cn-20230720-filtered](https://modelscope.cn/datasets/AI-ModelScope/wikipedia-cn-20230720-filtered)|default|huge dataset|-|pretrain, quality|[pleisto/wikipedia-cn-20230720-filtered](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)|
|[AI-ModelScope/zhihu_rlhf_3k](https://modelscope.cn/datasets/AI-ModelScope/zhihu_rlhf_3k)|default|3460|594.5±365.9, min=31, max=1716|rlhf, dpo, zh|[liyucheng/zhihu_rlhf_3k](https://huggingface.co/datasets/liyucheng/zhihu_rlhf_3k)|
|[DAMO_NLP/jd](https://modelscope.cn/datasets/DAMO_NLP/jd)|default cls|45012|66.9±87.0, min=41, max=1699|text-generation, classification, 🔥|-|
|[FreedomIntelligence/medical-o1-reasoning-SFT](https://modelscope.cn/datasets/FreedomIntelligence/medical-o1-reasoning-SFT)|en zh|50143|98.0±53.6, min=36, max=1508|medical, o1, 🔥|[FreedomIntelligence/medical-o1-reasoning-SFT](https://huggingface.co/datasets/FreedomIntelligence/medical-o1-reasoning-SFT)|
|-|default|huge dataset|-|pretrain, quality|[HuggingFaceFW/fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb)|
|-|auto_math_text khanacademy openstax stanford stories web_samples_v1 web_samples_v2 wikihow|huge dataset|-|multi-domain, en, qa|[HuggingFaceTB/cosmopedia](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia)|
|[HumanLLMs/Human-Like-DPO-Dataset](https://modelscope.cn/datasets/HumanLLMs/Human-Like-DPO-Dataset)|default|10884|47.5±7.9, min=32, max=85|rlhf, dpo|[HumanLLMs/Human-Like-DPO-Dataset](https://huggingface.co/datasets/HumanLLMs/Human-Like-DPO-Dataset)|
|[LLM-Research/xlam-function-calling-60k](https://modelscope.cn/datasets/LLM-Research/xlam-function-calling-60k)|default grpo|120000|453.7±219.5, min=164, max=2779|agent, grpo, 🔥|[Salesforce/xlam-function-calling-60k](https://huggingface.co/datasets/Salesforce/xlam-function-calling-60k)|
|[MTEB/scidocs-reranking](https://modelscope.cn/datasets/MTEB/scidocs-reranking)|default|39193|41.9±5.8, min=31, max=107|rerank, 🔥|[mteb/scidocs-reranking](https://huggingface.co/datasets/mteb/scidocs-reranking)|
|[MTEB/stackoverflowdupquestions-reranking](https://modelscope.cn/datasets/MTEB/stackoverflowdupquestions-reranking)|default|26485|39.9±4.6, min=31, max=77|rerank, 🔥|[mteb/stackoverflowdupquestions-reranking](https://huggingface.co/datasets/mteb/stackoverflowdupquestions-reranking)|
|[OmniData/Zhihu-KOL](https://modelscope.cn/datasets/OmniData/Zhihu-KOL)|default|huge dataset|-|zhihu, qa|[wangrui6/Zhihu-KOL](https://huggingface.co/datasets/wangrui6/Zhihu-KOL)|
|[OmniData/Zhihu-KOL-More-Than-100-Upvotes](https://modelscope.cn/datasets/OmniData/Zhihu-KOL-More-Than-100-Upvotes)|default|271261|1003.4±1826.1, min=28, max=52541|zhihu, qa|[bzb2023/Zhihu-KOL-More-Than-100-Upvotes](https://huggingface.co/datasets/bzb2023/Zhihu-KOL-More-Than-100-Upvotes)|
|[PowerInfer/LONGCOT-Refine-500K](https://modelscope.cn/datasets/PowerInfer/LONGCOT-Refine-500K)|default|521921|296.5±158.4, min=39, max=4634|chat, sft, 🔥, cot|[PowerInfer/LONGCOT-Refine-500K](https://huggingface.co/datasets/PowerInfer/LONGCOT-Refine-500K)|
|[PowerInfer/QWQ-LONGCOT-500K](https://modelscope.cn/datasets/PowerInfer/QWQ-LONGCOT-500K)|default|498082|310.7±303.1, min=35, max=22941|chat, sft, 🔥, cot|[PowerInfer/QWQ-LONGCOT-500K](https://huggingface.co/datasets/PowerInfer/QWQ-LONGCOT-500K)|
|[ServiceNow-AI/R1-Distill-SFT](https://modelscope.cn/datasets/ServiceNow-AI/R1-Distill-SFT)|v0 v1|1850809|164.2±438.0, min=30, max=32469|chat, sft, cot, r1|[ServiceNow-AI/R1-Distill-SFT](https://huggingface.co/datasets/ServiceNow-AI/R1-Distill-SFT)|
|[TIGER-Lab/MATH-plus](https://modelscope.cn/datasets/TIGER-Lab/MATH-plus)|train|893929|301.4±196.7, min=50, max=1162|qa, math, en, quality|[TIGER-Lab/MATH-plus](https://huggingface.co/datasets/TIGER-Lab/MATH-plus)|
|[Tongyi-DataEngine/SA1B-Dense-Caption](https://modelscope.cn/datasets/Tongyi-DataEngine/SA1B-Dense-Caption)|default|huge dataset|-|zh, multi-modal, vqa|-|
|[Tongyi-DataEngine/SA1B-Paired-Captions-Images](https://modelscope.cn/datasets/Tongyi-DataEngine/SA1B-Paired-Captions-Images)|default|7736284|106.4±18.5, min=48, max=193|zh, multi-modal, vqa|-|
|[YorickHe/CoT](https://modelscope.cn/datasets/YorickHe/CoT)|default|74771|141.6±45.5, min=58, max=410|chat, general|-|
|[YorickHe/CoT_zh](https://modelscope.cn/datasets/YorickHe/CoT_zh)|default|74771|129.1±53.2, min=51, max=401|chat, general|-|
|[ZhipuAI/LongWriter-6k](https://modelscope.cn/datasets/ZhipuAI/LongWriter-6k)|default|6000|5009.0±2932.8, min=117, max=30354|long, chat, sft, 🔥|[zai-org/LongWriter-6k](https://huggingface.co/datasets/zai-org/LongWriter-6k)|
|-|default|huge dataset|-|pretrain, quality|[allenai/c4](https://huggingface.co/datasets/allenai/c4)|
|[bespokelabs/Bespoke-Stratos-17k](https://modelscope.cn/datasets/bespokelabs/Bespoke-Stratos-17k)|default|16710|480.7±236.1, min=266, max=3556|chat, sft, cot, r1|[bespokelabs/Bespoke-Stratos-17k](https://huggingface.co/datasets/bespokelabs/Bespoke-Stratos-17k)|
|-|default|huge dataset|-|pretrain, quality|[cerebras/SlimPajama-627B](https://huggingface.co/datasets/cerebras/SlimPajama-627B)|
|[clip-benchmark/wds_voc2007_multilabel](https://modelscope.cn/datasets/clip-benchmark/wds_voc2007_multilabel)|default|2501|112.0±0.0, min=112, max=112|multilabel, multi-modal|[clip-benchmark/wds_voc2007_multilabel](https://huggingface.co/datasets/clip-benchmark/wds_voc2007_multilabel)|
|[codefuse-ai/CodeExercise-Python-27k](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k)|default|27224|337.3±154.2, min=90, max=2826|chat, coding, 🔥|-|
|[codefuse-ai/Evol-instruction-66k](https://modelscope.cn/datasets/codefuse-ai/Evol-instruction-66k)|default|66862|440.1±208.4, min=46, max=2661|chat, coding, 🔥|-|
|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench)|default mini|638149|859.2±460.1, min=38, max=3479|chat, agent, multi-round|-|
|[damo/nlp_polylm_multialpaca_sft](https://modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft)|ar de es fr id ja ko pt ru th vi|131867|101.6±42.5, min=30, max=1029|chat, general, multilingual|-|
|[damo/zh_cls_fudan-news](https://modelscope.cn/datasets/damo/zh_cls_fudan-news)|default|4959|3234.4±2547.5, min=91, max=19548|chat, classification|-|
|[damo/zh_ner-JAVE](https://modelscope.cn/datasets/damo/zh_ner-JAVE)|default|1266|118.3±45.5, min=44, max=223|chat, ner|-|
|-|train validation test|3002|49.7±15.5, min=34, max=153|multi-modal, en, math|[hiyouga/geometry3k](https://huggingface.co/datasets/hiyouga/geometry3k)|
|[hjh0119/shareAI-Llama3-DPO-zh-en-emoji](https://modelscope.cn/datasets/hjh0119/shareAI-Llama3-DPO-zh-en-emoji)|default|2449|334.0±162.8, min=36, max=1801|rlhf, dpo|[shareAI/DPO-zh-en-emoji](https://huggingface.co/datasets/shareAI/DPO-zh-en-emoji)|
|[huangjintao/AgentInstruct_copy](https://modelscope.cn/datasets/huangjintao/AgentInstruct_copy)|alfworld db kg mind2web os webshop|1866|1144.3±635.5, min=206, max=6412|chat, agent, multi-round|-|
|[iic/100PoisonMpts](https://modelscope.cn/datasets/iic/100PoisonMpts)|default|906|150.6±80.8, min=39, max=656|poison-management, zh|-|
|[iic/DocQA-RL-1.6K](https://modelscope.cn/datasets/iic/DocQA-RL-1.6K)|default|1591|8307.3±7748.9, min=202, max=32563|docqa, rl, long-sequence|[Tongyi-Zhiwen/DocQA-RL-1.6K](https://huggingface.co/datasets/Tongyi-Zhiwen/DocQA-RL-1.6K)|
|[iic/MSAgent-MultiRole](https://modelscope.cn/datasets/iic/MSAgent-MultiRole)|default|543|413.0±79.7, min=70, max=936|chat, agent, multi-round, role-play, multi-agent|-|
|[iic/MSAgent-Pro](https://modelscope.cn/datasets/iic/MSAgent-Pro)|default|21910|1978.1±747.9, min=339, max=8064|chat, agent, multi-round, 🔥|-|
|[iic/ms_agent](https://modelscope.cn/datasets/iic/ms_agent)|default|30000|645.8±218.0, min=199, max=2070|chat, agent, multi-round, 🔥|-|
|[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench)|default|316820|353.4±424.5, min=29, max=2924|chat, general, multi-round, 🔥|-|
|[liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT](https://modelscope.cn/datasets/liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT)|default|110000|72.1±60.9, min=29, max=2315|chat, sft, cot, r1, 🔥|[Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT](https://huggingface.co/datasets/Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT)|
|-|default|huge dataset|-|multi-modal, en, vqa, quality|[lmms-lab/GQA](https://huggingface.co/datasets/lmms-lab/GQA)|
|-|0_30_s_academic_v0_1 0_30_s_youtube_v0_1 1_2_m_academic_v0_1 1_2_m_youtube_v0_1 2_3_m_academic_v0_1 2_3_m_youtube_v0_1 30_60_s_academic_v0_1 30_60_s_youtube_v0_1|1335486|273.7±78.8, min=107, max=638|chat, multi-modal, video|[lmms-lab/LLaVA-Video-178K](https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K)|
|[lmms-lab/multimodal-open-r1-8k-verified](https://modelscope.cn/datasets/lmms-lab/multimodal-open-r1-8k-verified)|default|7689|74.0±24.8, min=41, max=214|grpo, vision, 🔥|[lmms-lab/multimodal-open-r1-8k-verified](https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified)|
|[lvjianjin/AdvertiseGen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen)|default|97484|130.9±21.9, min=73, max=232|text-generation, 🔥|[shibing624/AdvertiseGen](https://huggingface.co/datasets/shibing624/AdvertiseGen)|
|[mapjack/openwebtext_dataset](https://modelscope.cn/datasets/mapjack/openwebtext_dataset)|default|huge dataset|-|pretrain, zh, quality|-|
|[modelscope/DuReader_robust-QG](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG)|default|17899|242.0±143.1, min=75, max=1416|text-generation, 🔥|-|
|[modelscope/MathR](https://modelscope.cn/datasets/modelscope/MathR)|default clean|6089|188.7±75.3, min=64, max=3341|qa, math|-|
|[modelscope/MathR-32B-Distill](https://modelscope.cn/datasets/modelscope/MathR-32B-Distill)|data|25921|209.4±63.1, min=121, max=3407|qa, math|-|
|[modelscope/chinese-poetry-collection](https://modelscope.cn/datasets/modelscope/chinese-poetry-collection)|default|1710|58.1±8.1, min=31, max=71|text-generation, poetry|-|
|[modelscope/clue](https://modelscope.cn/datasets/modelscope/clue)|cmnli|391783|81.6±16.0, min=54, max=157|text-generation, classification|[clue](https://huggingface.co/datasets/clue)|
|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption)|train validation|454617|389.6±68.4, min=70, max=587|chat, multi-modal, vision, 🔥|-|
|[modelscope/gsm8k](https://modelscope.cn/datasets/modelscope/gsm8k)|main|7473|88.6±21.6, min=41, max=241|qa, math|-|
|[open-r1/DAPO-Math-17k-Processed](https://modelscope.cn/datasets/open-r1/DAPO-Math-17k-Processed)|all|17398|122.3±65.2, min=41, max=1517|math, rlvr|[open-r1/DAPO-Math-17k-Processed](https://huggingface.co/datasets/open-r1/DAPO-Math-17k-Processed)|
|[open-r1/verifiable-coding-problems-python](https://modelscope.cn/datasets/open-r1/verifiable-coding-problems-python)|default|35735|559.0±255.2, min=74, max=6191|grpo, code|[open-r1/verifiable-coding-problems-python](https://huggingface.co/datasets/open-r1/verifiable-coding-problems-python)|
|[open-r1/verifiable-coding-problems-python-10k](https://modelscope.cn/datasets/open-r1/verifiable-coding-problems-python-10k)|default|1800|581.6±233.4, min=136, max=2022|grpo, code|[open-r1/verifiable-coding-problems-python-10k](https://huggingface.co/datasets/open-r1/verifiable-coding-problems-python-10k)|
|[open-r1/verifiable-coding-problems-python-10k_decontaminated](https://modelscope.cn/datasets/open-r1/verifiable-coding-problems-python-10k_decontaminated)|default|1574|575.7±234.3, min=136, max=2022|grpo, code|[open-r1/verifiable-coding-problems-python-10k_decontaminated](https://huggingface.co/datasets/open-r1/verifiable-coding-problems-python-10k_decontaminated)|
|[open-r1/verifiable-coding-problems-python_decontaminated](https://modelscope.cn/datasets/open-r1/verifiable-coding-problems-python_decontaminated)|default|27839|561.9±252.2, min=74, max=6191|grpo, code|[open-r1/verifiable-coding-problems-python_decontaminated](https://huggingface.co/datasets/open-r1/verifiable-coding-problems-python_decontaminated)|
|[open-thoughts/OpenThoughts-114k](https://modelscope.cn/datasets/open-thoughts/OpenThoughts-114k)|default|113957|413.2±186.9, min=265, max=13868|chat, sft, cot, r1|[open-thoughts/OpenThoughts-114k](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k)|
|[swift/self-cognition](https://modelscope.cn/datasets/swift/self-cognition)|default qwen3 empty_think|108|58.9±20.3, min=32, max=131|chat, self-cognition, 🔥|[modelscope/self-cognition](https://huggingface.co/datasets/modelscope/self-cognition)|
|[sentence-transformers/stsb](https://modelscope.cn/datasets/sentence-transformers/stsb)|default positive generate reg|5748|21.0±0.0, min=21, max=21|similarity, 🔥|[sentence-transformers/stsb](https://huggingface.co/datasets/sentence-transformers/stsb)|
|[shenweizhou/alpha-umi-toolbench-processed-v2](https://modelscope.cn/datasets/shenweizhou/alpha-umi-toolbench-processed-v2)|backbone caller planner summarizer|huge dataset|-|chat, agent, 🔥|-|
|[simpleai/HC3](https://modelscope.cn/datasets/simpleai/HC3)|finance finance_cls medicine medicine_cls|11021|296.0±153.3, min=65, max=2267|text-generation, classification, 🔥|[Hello-SimpleAI/HC3](https://huggingface.co/datasets/Hello-SimpleAI/HC3)|
|[simpleai/HC3-Chinese](https://modelscope.cn/datasets/simpleai/HC3-Chinese)|baike baike_cls open_qa open_qa_cls nlpcc_dbqa nlpcc_dbqa_cls finance finance_cls medicine medicine_cls law law_cls psychology psychology_cls|39781|179.9±70.2, min=90, max=1070|text-generation, classification, 🔥|[Hello-SimpleAI/HC3-Chinese](https://huggingface.co/datasets/Hello-SimpleAI/HC3-Chinese)|
|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets)|train validation test|141600|40.8±3.3, min=33, max=53|chat, multi-modal, audio|-|
|[swift/A-OKVQA](https://modelscope.cn/datasets/swift/A-OKVQA)|default|18201|43.5±7.9, min=27, max=94|multi-modal, en, vqa, quality|[HuggingFaceM4/A-OKVQA](https://huggingface.co/datasets/HuggingFaceM4/A-OKVQA)|
|[swift/ChartQA](https://modelscope.cn/datasets/swift/ChartQA)|default|28299|36.8±6.5, min=26, max=74|en, vqa, quality|[HuggingFaceM4/ChartQA](https://huggingface.co/datasets/HuggingFaceM4/ChartQA)|
|[swift/Chinese-Qwen3-235B-2507-Distill-data-110k-SFT](https://modelscope.cn/datasets/swift/Chinese-Qwen3-235B-2507-Distill-data-110k-SFT)|default|110000|72.1±60.9, min=29, max=2315|🔥, distill, sft|-|
|[swift/Chinese-Qwen3-235B-Thinking-2507-Distill-data-110k-SFT](https://modelscope.cn/datasets/swift/Chinese-Qwen3-235B-Thinking-2507-Distill-data-110k-SFT)|default|110000|72.1±60.9, min=29, max=2315|🔥, distill, sft, cot, r1, thinking|-|
|[swift/GRIT](https://modelscope.cn/datasets/swift/GRIT)|caption grounding vqa|huge dataset|-|multi-modal, en, caption-grounding, vqa, quality|[zzliang/GRIT](https://huggingface.co/datasets/zzliang/GRIT)|
|[swift/GenQA](https://modelscope.cn/datasets/swift/GenQA)|default|huge dataset|-|qa, quality, multi-task|[tomg-group-umd/GenQA](https://huggingface.co/datasets/tomg-group-umd/GenQA)|
|[swift/Infinity-Instruct](https://modelscope.cn/datasets/swift/Infinity-Instruct)|3M 7M 0625 Gen 7M_domains|huge dataset|-|qa, quality, multi-task|[BAAI/Infinity-Instruct](https://huggingface.co/datasets/BAAI/Infinity-Instruct)|
|[swift/Mantis-Instruct](https://modelscope.cn/datasets/swift/Mantis-Instruct)|birds-to-words chartqa coinstruct contrastive_caption docvqa dreamsim dvqa iconqa imagecode llava_665k_multi lrv_multi multi_vqa nextqa nlvr2 spot-the-diff star visual_story_telling|988115|619.9±156.6, min=243, max=1926|chat, multi-modal, vision|-|
|[swift/MideficsDataset](https://modelscope.cn/datasets/swift/MideficsDataset)|default|3800|201.3±70.2, min=60, max=454|medical, en, vqa|[WinterSchool/MideficsDataset](https://huggingface.co/datasets/WinterSchool/MideficsDataset)|
|[swift/Multimodal-Mind2Web](https://modelscope.cn/datasets/swift/Multimodal-Mind2Web)|default|1009|293855.4±331149.5, min=11301, max=3577519|agent, multi-modal|[osunlp/Multimodal-Mind2Web](https://huggingface.co/datasets/osunlp/Multimodal-Mind2Web)|
|[swift/OCR-VQA](https://modelscope.cn/datasets/swift/OCR-VQA)|default|186753|32.3±5.8, min=27, max=80|multi-modal, en, ocr-vqa|[howard-hou/OCR-VQA](https://huggingface.co/datasets/howard-hou/OCR-VQA)|
|[swift/OK-VQA_train](https://modelscope.cn/datasets/swift/OK-VQA_train)|default|9009|31.7±3.4, min=25, max=56|multi-modal, en, vqa, quality|[Multimodal-Fatima/OK-VQA_train](https://huggingface.co/datasets/Multimodal-Fatima/OK-VQA_train)|
|[swift/OpenHermes-2.5](https://modelscope.cn/datasets/swift/OpenHermes-2.5)|default|huge dataset|-|cot, en, quality|[teknium/OpenHermes-2.5](https://huggingface.co/datasets/teknium/OpenHermes-2.5)|
|[swift/RLAIF-V-Dataset](https://modelscope.cn/datasets/swift/RLAIF-V-Dataset)|default|83132|99.6±54.8, min=30, max=362|rlhf, dpo, multi-modal, en|[openbmb/RLAIF-V-Dataset](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset)|
|[swift/RedPajama-Data-1T](https://modelscope.cn/datasets/swift/RedPajama-Data-1T)|default|huge dataset|-|pretrain, quality|[togethercomputer/RedPajama-Data-1T](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T)|
|[swift/RedPajama-Data-V2](https://modelscope.cn/datasets/swift/RedPajama-Data-V2)|default|huge dataset|-|pretrain, quality|[togethercomputer/RedPajama-Data-V2](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-V2)|
|[swift/ScienceQA](https://modelscope.cn/datasets/swift/ScienceQA)|default|16967|101.7±55.8, min=32, max=620|multi-modal, science, vqa, quality|[derek-thomas/ScienceQA](https://huggingface.co/datasets/derek-thomas/ScienceQA)|
|[swift/SlimOrca](https://modelscope.cn/datasets/swift/SlimOrca)|default|517982|405.5±442.1, min=47, max=8312|quality, en|[Open-Orca/SlimOrca](https://huggingface.co/datasets/Open-Orca/SlimOrca)|
|[swift/TextCaps](https://modelscope.cn/datasets/swift/TextCaps)|default emb rerank|huge dataset|-|multi-modal, en, caption, quality|[HuggingFaceM4/TextCaps](https://huggingface.co/datasets/HuggingFaceM4/TextCaps)|
|[swift/ToolBench](https://modelscope.cn/datasets/swift/ToolBench)|default|124345|2251.7±1039.8, min=641, max=9451|chat, agent, multi-round|-|
|[swift/VQAv2](https://modelscope.cn/datasets/swift/VQAv2)|default|huge dataset|-|en, vqa, quality|[HuggingFaceM4/VQAv2](https://huggingface.co/datasets/HuggingFaceM4/VQAv2)|
|[swift/VideoChatGPT](https://modelscope.cn/datasets/swift/VideoChatGPT)|Generic Temporal Consistency|3206|87.4±48.3, min=31, max=398|chat, multi-modal, video, 🔥|[lmms-lab/VideoChatGPT](https://huggingface.co/datasets/lmms-lab/VideoChatGPT)|
|[swift/WebInstructSub](https://modelscope.cn/datasets/swift/WebInstructSub)|default|huge dataset|-|qa, en, math, quality, multi-domain, science|[TIGER-Lab/WebInstructSub](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub)|
|[swift/aya_collection](https://modelscope.cn/datasets/swift/aya_collection)|aya_dataset|202364|474.6±1539.1, min=25, max=71312|multi-lingual, qa|[CohereForAI/aya_collection](https://huggingface.co/datasets/CohereForAI/aya_collection)|
|[swift/chinese-c4](https://modelscope.cn/datasets/swift/chinese-c4)|default|huge dataset|-|pretrain, zh, quality|[shjwudp/chinese-c4](https://huggingface.co/datasets/shjwudp/chinese-c4)|
|[swift/cinepile](https://modelscope.cn/datasets/swift/cinepile)|default|huge dataset|-|vqa, en, youtube, video|[tomg-group-umd/cinepile](https://huggingface.co/datasets/tomg-group-umd/cinepile)|
|[swift/classical_chinese_translate](https://modelscope.cn/datasets/swift/classical_chinese_translate)|default|6655|349.3±77.1, min=61, max=815|chat, play-ground|-|
|[swift/cosmopedia-100k](https://modelscope.cn/datasets/swift/cosmopedia-100k)|default|100000|1037.0±254.8, min=339, max=2818|multi-domain, en, qa|[HuggingFaceTB/cosmopedia-100k](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia-100k)|
|[swift/dolma](https://modelscope.cn/datasets/swift/dolma)|v1_7|huge dataset|-|pretrain, quality|[allenai/dolma](https://huggingface.co/datasets/allenai/dolma)|
|[swift/dolphin](https://modelscope.cn/datasets/swift/dolphin)|flan1m-alpaca-uncensored flan5m-alpaca-uncensored|huge dataset|-|en|[cognitivecomputations/dolphin](https://huggingface.co/datasets/cognitivecomputations/dolphin)|
|[swift/github-code](https://modelscope.cn/datasets/swift/github-code)|default|huge dataset|-|pretrain, quality|[codeparrot/github-code](https://huggingface.co/datasets/codeparrot/github-code)|
|[swift/gpt4v-dataset](https://modelscope.cn/datasets/swift/gpt4v-dataset)|default|huge dataset|-|en, caption, multi-modal, quality|[laion/gpt4v-dataset](https://huggingface.co/datasets/laion/gpt4v-dataset)|
|[swift/llava-data](https://modelscope.cn/datasets/swift/llava-data)|llava_instruct|624255|369.7±143.0, min=40, max=905|sft, multi-modal, quality|[TIGER-Lab/llava-data](https://huggingface.co/datasets/TIGER-Lab/llava-data)|
|[swift/llava-instruct-mix-vsft](https://modelscope.cn/datasets/swift/llava-instruct-mix-vsft)|default|13640|178.8±119.8, min=34, max=951|multi-modal, en, vqa, quality|[HuggingFaceH4/llava-instruct-mix-vsft](https://huggingface.co/datasets/HuggingFaceH4/llava-instruct-mix-vsft)|
|[swift/llava-med-zh-instruct-60k](https://modelscope.cn/datasets/swift/llava-med-zh-instruct-60k)|default|56649|207.9±67.7, min=42, max=594|zh, medical, vqa, multi-modal|[BUAADreamer/llava-med-zh-instruct-60k](https://huggingface.co/datasets/BUAADreamer/llava-med-zh-instruct-60k)|
|[swift/lnqa](https://modelscope.cn/datasets/swift/lnqa)|default|huge dataset|-|multi-modal, en, ocr-vqa, quality|[vikhyatk/lnqa](https://huggingface.co/datasets/vikhyatk/lnqa)|
|[swift/longwriter-6k-filtered](https://modelscope.cn/datasets/swift/longwriter-6k-filtered)|default|666|4108.9±2636.9, min=1190, max=17050|long, chat, sft, 🔥|-|
|[swift/medical_zh](https://modelscope.cn/datasets/swift/medical_zh)|en zh|2068589|256.4±87.3, min=39, max=1167|chat, medical|-|
|[swift/moondream2-coyo-5M-captions](https://modelscope.cn/datasets/swift/moondream2-coyo-5M-captions)|default|huge dataset|-|caption, pretrain, quality|[isidentical/moondream2-coyo-5M-captions](https://huggingface.co/datasets/isidentical/moondream2-coyo-5M-captions)|
|[swift/no_robots](https://modelscope.cn/datasets/swift/no_robots)|default|9485|300.0±246.2, min=40, max=6739|multi-task, quality, human-annotated|[HuggingFaceH4/no_robots](https://huggingface.co/datasets/HuggingFaceH4/no_robots)|
|[swift/orca_dpo_pairs](https://modelscope.cn/datasets/swift/orca_dpo_pairs)|default|12859|364.9±248.2, min=36, max=2010|rlhf, quality|[Intel/orca_dpo_pairs](https://huggingface.co/datasets/Intel/orca_dpo_pairs)|
|[swift/path-vqa](https://modelscope.cn/datasets/swift/path-vqa)|default|19654|34.2±6.8, min=28, max=85|multi-modal, vqa, medical|[flaviagiammarino/path-vqa](https://huggingface.co/datasets/flaviagiammarino/path-vqa)|
|[swift/pile-val-backup](https://modelscope.cn/datasets/swift/pile-val-backup)|default|214661|1831.4±11087.5, min=21, max=516620|text-generation, awq|[mit-han-lab/pile-val-backup](https://huggingface.co/datasets/mit-han-lab/pile-val-backup)|
|[swift/pixelprose](https://modelscope.cn/datasets/swift/pixelprose)|default|huge dataset|-|caption, multi-modal, vision|[tomg-group-umd/pixelprose](https://huggingface.co/datasets/tomg-group-umd/pixelprose)|
|[swift/refcoco](https://modelscope.cn/datasets/swift/refcoco)|caption grounding|92430|45.4±3.0, min=37, max=63|multi-modal, en, grounding|[jxu124/refcoco](https://huggingface.co/datasets/jxu124/refcoco)|
|[swift/refcocog](https://modelscope.cn/datasets/swift/refcocog)|caption grounding|89598|50.3±4.6, min=39, max=91|multi-modal, en, grounding|[jxu124/refcocog](https://huggingface.co/datasets/jxu124/refcocog)|
|[swift/sharegpt](https://modelscope.cn/datasets/swift/sharegpt)|common-zh unknow-zh common-en|194063|820.5±366.1, min=25, max=2221|chat, general, multi-round|-|
|[swift/swift-sft-mixture](https://modelscope.cn/datasets/swift/swift-sft-mixture)|sharegpt firefly codefuse metamathqa|huge dataset|-|chat, sft, general, 🔥|-|
|[swift/tagengo-gpt4](https://modelscope.cn/datasets/swift/tagengo-gpt4)|default|76437|468.1±276.8, min=28, max=1726|chat, multi-lingual, quality|[lightblue/tagengo-gpt4](https://huggingface.co/datasets/lightblue/tagengo-gpt4)|
|[swift/train_3.5M_CN](https://modelscope.cn/datasets/swift/train_3.5M_CN)|default|huge dataset|-|common, zh, quality|[BelleGroup/train_3.5M_CN](https://huggingface.co/datasets/BelleGroup/train_3.5M_CN)|
|[swift/ultrachat_200k](https://modelscope.cn/datasets/swift/ultrachat_200k)|default|207843|1188.0±571.1, min=170, max=4068|chat, en, quality|[HuggingFaceH4/ultrachat_200k](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)|
|[swift/wikipedia](https://modelscope.cn/datasets/swift/wikipedia)|default|huge dataset|-|pretrain, quality|[wikipedia](https://huggingface.co/datasets/wikipedia)|
|[tany0699/garbage265](https://modelscope.cn/datasets/tany0699/garbage265)|default|132673|39.0±0.0, min=39, max=39|cls, 🔥, multi-modal|-|
|[tastelikefeet/competition_math](https://modelscope.cn/datasets/tastelikefeet/competition_math)|default|12000|101.9±87.3, min=36, max=1683|qa, math|-|
|-|default|huge dataset|-|pretrain, quality|[tiiuae/falcon-refinedweb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb)|
|[wyj123456/GPT4all](https://modelscope.cn/datasets/wyj123456/GPT4all)|default|806199|97.3±20.9, min=62, max=414|chat, general|-|
|[wyj123456/code_alpaca_en](https://modelscope.cn/datasets/wyj123456/code_alpaca_en)|default|20022|99.3±57.6, min=30, max=857|chat, coding|[sahil2801/CodeAlpaca-20k](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)|
|[wyj123456/finance_en](https://modelscope.cn/datasets/wyj123456/finance_en)|default|68912|264.5±207.1, min=30, max=2268|chat, financial|[ssbuild/alpaca_finance_en](https://huggingface.co/datasets/ssbuild/alpaca_finance_en)|
|[wyj123456/instinwild](https://modelscope.cn/datasets/wyj123456/instinwild)|default subset|103695|125.1±43.7, min=35, max=801|chat, general|-|
|[wyj123456/instruct](https://modelscope.cn/datasets/wyj123456/instruct)|default|888970|271.0±333.6, min=34, max=3967|chat, general|-|
|[zouxuhong/Countdown-Tasks-3to4](https://modelscope.cn/datasets/zouxuhong/Countdown-Tasks-3to4)|default|490364|126.6±2.0, min=122, max=130|math|-|
================================================
FILE: docs/source/Instruction/Use-tuners.md
================================================
# 使用Tuners
tuner是指附加在模型上的额外结构部分,用于减少训练参数量或者提高训练精度。目前SWIFT支持的tuners有:
- LoRA: [LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS](https://arxiv.org/abs/2106.09685)
- LoRA+: [LoRA+: Efficient Low Rank Adaptation of Large Models](https://arxiv.org/pdf/2402.12354.pdf)
- LLaMA PRO: [LLAMA PRO: Progressive LLaMA with Block Expansion](https://arxiv.org/pdf/2401.02415.pdf)
- GaLore/Q-GaLore: [GaLore: Memory-Efficient LLM Training by Gradient Low-Rank Projection](https://arxiv.org/abs/2403.03507)
- Liger Kernel: [Liger Kernel: Efficient Triton Kernels for LLM Training](https://arxiv.org/abs/2410.10989)
- LISA: [LISA: Layerwise Importance Sampling for Memory-Efficient Large Language Model Fine-Tuning](https://arxiv.org/abs/2403.17919)
- UnSloth: https://github.com/unslothai/unsloth
- SCEdit: [SCEdit: Efficient and Controllable Image Diffusion Generation via Skip Connection Editing](https://arxiv.org/abs/2312.11392) < [arXiv](https://arxiv.org/abs/2312.11392) | [Project Page](https://scedit.github.io/) >
- NEFTune: [Noisy Embeddings Improve Instruction Finetuning](https://arxiv.org/abs/2310.05914)
- LongLoRA: [Efficient Fine-tuning of Long-Context Large Language Models](https://arxiv.org/abs/2309.12307)
- Adapter: [Parameter-Efficient Transfer Learning for NLP](http://arxiv.org/abs/1902.00751)
- Vision Prompt Tuning: [Visual Prompt Tuning](https://arxiv.org/abs/2203.12119)
- Side: [Side-Tuning: A Baseline for Network Adaptation via Additive Side Networks](https://arxiv.org/abs/1912.13503)
- Res-Tuning: [Res-Tuning: A Flexible and Efficient Tuning Paradigm via Unbinding Tuner from Backbone](https://arxiv.org/abs/2310.19859) < [arXiv](https://arxiv.org/abs/2310.19859) | [Project Page](https://res-tuning.github.io/) >
- [PEFT](https://github.com/huggingface/peft)提供的tuners, 如AdaLoRA、DoRA、Fourierft等
## 接口列表
### Swift类静态接口
- `Swift.prepare_model(model, config, **kwargs)`
- 接口作用:加载某个tuner到模型上,如果是PeftConfig的子类,则使用Peft库的对应接口加载tuner。在使用SwiftConfig的情况下,本接口可以传入SwiftModel实例并重复调用,此时和config传入字典的效果相同。
- 本接口支持并行加载不同类型的多个tuners共同使用
- 参数:
- `model`: `torch.nn.Module`或`SwiftModel`的实例,被加载的模型
- `config`: `SwiftConfig`、`PeftConfig`的实例,或者一个从自定义tuner名称到对应config的字典
- 返回值:`SwiftModel`或`PeftModel`的实例
- `Swift.merge_and_unload(model)`
- 接口作用:将LoRA weights合并回原模型,并将LoRA部分完全卸载
- 参数:
- model: `SwiftModel`或`PeftModel`的实例,已加载LoRA的模型实例
- 返回值:None
- `Swift.merge(model)`
- 接口作用:将LoRA weights合并回原模型,不卸载LoRA部分
- 参数:
- model: `SwiftModel`或`PeftModel`的实例,已加载LoRA的模型实例
- 返回值:None
- `Swift.unmerge(model)`
- 接口作用:将LoRA weights从原模型weights中拆分回LoRA结构
- 参数:
- model: `SwiftModel`或`PeftModel`的实例,已加载LoRA的模型实例
- 返回值:None
- `Swift.save_to_peft_format(ckpt_dir, output_dir)`
- 接口作用:将存储的LoRA checkpoint转换为Peft兼容的格式。主要改变有:
- `default`会从对应的`default`文件夹中拆分到output_dir根目录中
- weights中的`{tuner_name}.`字段会被移除,如`model.layer.0.self.in_proj.lora_A.default.weight`会变为`model.layer.0.self.in_proj.lora_A.weight`
- weights中的key会增加`base_model.model`前缀
- 注意:只有LoRA可以被转换,其他类型tuner由于Peft本身不支持,因此会报转换错误。此外,由于LoRAConfig中存在额外参数,如`dtype`,因此在这些参数有设定的情况下,不支持转换为Peft格式,此时可以手动删除adapter_config.json中的对应字段
- 参数:
- ckpt_dir:原weights目录
- output_dir:目标weights目录
- 返回值:None
- `Swift.from_pretrained(model, model_id, adapter_name, revision, **kwargs)`
- 接口作用:从存储的weights目录中加载tuner到模型上,如果adapter_name不传,则会将model_id目录下所有的tuners都加载起来。与`prepare_model`相同,本接口可以重复调用
- 参数:
- model:`torch.nn.Module`或`SwiftModel`的实例,被加载的模型
- model_id:`str`类型,待加载的tuner checkpoint, 可以是魔搭hub的id,或者训练产出的本地目录
- adapter_name:`str`或`List[str]`或`Dict[str, str]`类型或`None`,待加载tuner目录中的tuner名称,如果为`None`则加载所有名称的tuners,如果是`str`或`List[str]`则只加载某些具体的tuner,如果是`Dict`,则将`key`指代的tuner加载起来后换成`value`的名字
- revision: 如果model_id是魔搭的id,则revision可以指定对应版本号
### SwiftModel接口
下面列出用户可能调用的接口列表,其他内部接口或不推荐使用的接口可以通过`make docs`命令查看API Doc文档。
- `SwiftModel.create_optimizer_param_groups(self, **defaults)`
- 接口作用:根据加载的tuners创建parameter groups,目前仅对`LoRA+`算法有作用
- 参数:
- defaults:`optimizer_groups`的默认参数,如`lr`和`weight_decay`
- 返回值:
- 创建的`optimizer_groups`
- `SwiftModel.add_weighted_adapter(self, ...)`
- 接口作用:将已有的LoRA tuners合并为一个
- 参数:
- 本接口是PeftModel.add_weighted_adapter的透传,参数可以参考:[add_weighted_adapter文档](https://huggingface.co/docs/peft/main/en/package_reference/lora#peft.LoraModel.add_weighted_adapter)
- `SwiftModel.save_pretrained(self, save_directory, safe_serialization, adapter_name)`
- 接口作用:存储tuner weights
- 参数:
- save_directory:存储目录
- safe_serialization: 是否使用safetensors格式存储,默认为False
- adapter_name:存储的adapter tuner,如果不传则默认存储所有的tuners
- `SwiftModel.set_active_adapters(self, adapter_names, offload=None)`
- 接口作用:设置当前激活的adapters,不在列表中的adapters会被失活
- 在`推理`时支持环境变量`USE_UNIQUE_THREAD=0/1`,默认值`1`,如果为`0`则set_active_adapters只对当前线程生效,此时默认使用本线程激活的tuners,不同线程tuners互不干扰
- 参数:
- adapter_names:激活的tuners
- offload:失活的adapters如何处理,默认为`None`代表留在显存中,同时支持`cpu`和`meta`,代表offload到cpu和meta设备中以减轻显存消耗,在`USE_UNIQUE_THREAD=0`时offload不要传值以免影响其他线程
- 返回值:None
- `SwiftModel.activate_adapter(self, adapter_name)`
- 接口作用:激活一个tuner
- 在`推理`时支持环境变量`USE_UNIQUE_THREAD=0/1`,默认值`1`,如果为`0`则activate_adapter只对当前线程生效,此时默认使用本线程激活的tuners,不同线程tuners互不干扰
- 参数:
- adapter_name:待激活的tuner名字
- 返回值:None
- `SwiftModel.deactivate_adapter(self, adapter_name, offload)`
- 接口作用:失活一个tuner
- 在`推理`时环境变量`USE_UNIQUE_THREAD=0`时不要调用本接口
- 参数:
- adapter_name:待失活的tuner名字
- offload:失活的adapters如何处理,默认为`None`代表留在显存中,同时支持`cpu`和`meta`,代表offload到cpu和meta设备中以减轻显存消耗
- 返回值:None
- `SwiftModel.get_trainable_parameters(self)`
- 接口作用:返回训练参数信息
- 参数:无
- 返回值:训练参数信息,格式如下:
```text
trainable params: 100M || all params: 1000M || trainable%: 10.00% || cuda memory: 10GiB.
```
================================================
FILE: docs/source/Megatron-SWIFT/Ascend.md
================================================
# Ascend NPU
关于Megatron-SWIFT在Ascend NPU上的环境准备,请参考[NPU最佳实践](../BestPractices/NPU-support.md)。
## NPU 性能数据采集
NPU性能采集通过`torch_npu.profiler.profile`接口进行采集,创建torch_npu.profiler.profile对象,通过start和stop接口控制采集性能数据,采集过程需要修改ms-swift源码,修改swift/megatron/trainers/base.py文件中的train函数,采集示例如下:
```python
import torch_npu
...
experimental_config = torch_npu.profiler._ExperimentalConfig(
profiler_level=torch_npu.profiler.ProfilerLevel.Level1,
aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization,
)
prof = torch_npu.profiler.profile(
activities=[
torch_npu.profiler.ProfilerActivity.CPU,
torch_npu.profiler.ProfilerActivity.NPU
],
schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1, skip_first=6),
on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./result"),
profile_memory=False, # 关闭采集内存信息
with_stack=False, # 关闭采集堆栈信息
experimental_config=experimental_config)
prof.start()
# ms-swift 逻辑
while state.iteration < args.train_iters:
...
metric, grad_norm, update_successful = train_step(train_data_iterator)
# 性能数据采集
prof.step()
...
prof.stop()
```
## NPU 精度数据采集
### 安装msprobe
```shell
pip install mindstudio-probe
```
### 代码修改
为了支持 msprobe 工具进行精度调试,我们需要修改 `swift/megatron/model/mm_gpt_model.py` 文件中的 `_patch_word_embeddings` 函数。主要改动是调整函数参数和内部实现逻辑,使其能够正确地对嵌入层进行patch。
下面是具体的修改内容:
修改前:
```python
def _patch_word_embeddings(self, kwargs):
origin_forward = VocabParallelEmbedding.forward
def forward(_self, input_):
args = get_args()
reduce_scatter_embeddings = _self.reduce_scatter_embeddings
_self.reduce_scatter_embeddings = False
input_ = torch.masked_fill(input_, input_ < 0, 0)
res = origin_forward(_self, input_)
_self.reduce_scatter_embeddings = reduce_scatter_embeddings
packed_seq_params = kwargs.get('packed_seq_params')
# ...其他逻辑...
return res
VocabParallelEmbedding.forward = forward
try:
yield
finally:
VocabParallelEmbedding.forward = origin_forward
def forward(
self,
input_ids: torch.Tensor,
position_ids: torch.Tensor,
attention_mask: torch.Tensor = None,
decoder_input: torch.Tensor = None,
labels: torch.Tensor = None,
inference_params: InferenceParams = None,
packed_seq_params: PackedSeqParams = None,
**kwargs,
) -> torch.Tensor:
if decoder_input is not None:
pass
elif self.pre_process:
kwargs.update({'input_ids': input_ids, 'packed_seq_params': packed_seq_params})
with self._patch_word_embeddings(kwargs):
decoder_input = self.language_model.embedding(input_ids=input_ids, position_ids=position_ids)
# ...其他逻辑...
```
修改后:
```python
def _patch_word_embeddings(self, kwargs, emb): # 修改1
origin_forward = emb.word_embeddings.forward # 修改2
def forward(input_): # 修改3
args = get_args()
_self = emb.word_embeddings # 修改4
reduce_scatter_embeddings = _self.reduce_scatter_embeddings
_self.reduce_scatter_embeddings = False
input_ = torch.masked_fill(input_, input_ < 0, 0)
res = origin_forward(input_) # 修改5
_self.reduce_scatter_embeddings = reduce_scatter_embeddings
packed_seq_params = kwargs.get('packed_seq_params')
# ...其他逻辑...
return res
emb.word_embeddings.forward = forward # 修改6
try:
yield
finally:
emb.word_embeddings.forward = origin_forward # 修改7
def forward(
self,
input_ids: torch.Tensor,
position_ids: torch.Tensor,
attention_mask: torch.Tensor = None,
decoder_input: torch.Tensor = None,
labels: torch.Tensor = None,
inference_params: InferenceParams = None,
packed_seq_params: PackedSeqParams = None,
**kwargs,
) -> torch.Tensor:
if decoder_input is not None:
pass
elif self.pre_process:
kwargs.update({'input_ids': input_ids, 'packed_seq_params': packed_seq_params})
with self._patch_word_embeddings(kwargs, self.language_model.embedding): # 修改8
decoder_input = self.language_model.embedding(input_ids=input_ids, position_ids=position_ids)
# ...其他逻辑...
```
主要变化包括:
1. `_patch_word_embeddings` 方法增加了 `emb` 参数,用于接收 embedding 模块实例
2. 直接获取 `emb.word_embeddings.forward` 而不是 `VocabParallelEmbedding.forward`
3. 内部 `forward` 函数签名从 `(_self, input_)` 改为 `(input_)`
4. 在函数内部通过 `emb.word_embeddings` 获取 `_self`
5. 调用原始 forward 时直接传入 `input_`
6. 使用 `emb.word_embeddings.forward` 进行替换和恢复操作(修改6、7)
7. 在调用 `_patch_word_embeddings` 时传入 `self.language_model.embedding` 实例
对文件swift/megatron/trainers/base.py中的train_step函数进行修改
修改前:
```python
def train_step(self, forward_step_func, data_iterator, model, optimizer, opt_param_scheduler, config, *args,
**kwargs):
new_data_iterator = self._replace_data_iterator(data_iterator, model)
return self._origin_train_step(forward_step_func, new_data_iterator, model, optimizer, opt_param_scheduler,
config, *args, **kwargs)
```
修改后:
```python
def train_step(self, forward_step_func, data_iterator, model, optimizer, opt_param_scheduler, config, *args,
**kwargs):
new_data_iterator = self._replace_data_iterator(data_iterator, model)
from msprobe.pytorch import PrecisionDebugger
debugger = PrecisionDebugger(dump_path='./dump_path', level='mix', model=model)
debugger.start()
try:
origin_train_step_out = self._origin_train_step(
forward_step_func, new_data_iterator, model, optimizer, opt_param_scheduler,config, *args, **kwargs)
finally:
debugger.stop()
debugger.step()
return origin_train_step_out
```
### 使能
另外,由于msprobe不支持融合计算,需要在启动脚本添加`--bias_dropout_fusion false`、`--bias_swiglu_fusion false`、`--cross_entropy_loss_fusion false`
#### 示例
```shell
PYTORCH_NPU_ALLOC_CONF='expandable_segments:True' \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
megatron sft \
--mcore_model Qwen2.5-7B-Instruct-mcore \
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
'AI-ModelScope/alpaca-gpt4-data-en#500' \
'swift/self-cognition#500' \
--tensor_model_parallel_size 2 \
...
--bias_dropout_fusion false \
--bias_swiglu_fusion false \
--cross_entropy_loss_fusion false
```
================================================
FILE: docs/source/Megatron-SWIFT/Command-line-parameters.md
================================================
# 命令行参数
## Megatron参数
**训练参数**:
- 🔥micro_batch_size: 每个device的批次大小,默认为1。
- 🔥global_batch_size: 总批次大小,等价于`micro_batch_size*数据并行大小*梯度累加步数`。默认为16。
- 其中,`数据并行大小 (DP) = 总GPU数 / (TP × PP × CP)`。
- 🔥recompute_granularity: 重新计算激活的粒度,可选项为'full'、'selective'和'none'。其中'full'代表重新计算整个transformer layer,'selective'代表只重新计算transformer layer中的核心注意力部分。通常推荐使用'selective'。默认为'selective'。
- 当你设置为'selective'时,你可以通过指定`--recompute_modules`来选择对哪些部分进行重新计算。
- 🔥recompute_method: 该参数需将recompute_granularity设置为'full'才生效,可选项为'uniform', 'block'。默认为None。
- 🔥recompute_num_layers: 该参数需将recompute_granularity设置为'full'才生效,默认为None。若`recompute_method`设置为uniform,该参数含义为每个均匀划分的重新计算单元的transformer layers数量。例如你可以指定为`--recompute_granularity full --recompute_method uniform --recompute_num_layers 4`。recompute_num_layers越大,显存占用越小,计算成本越大。注意:当前进程中的模型层数需能被`recompute_num_layers`整除。
- 🔥recompute_modules: 选项包括"core_attn", "moe_act", "layernorm", "mla_up_proj", "mlp", "moe",默认值为`["core_attn"]`。该参数在`--recompute_granularity selective`时生效。例如在MoE训练时,你可以通过指定`--recompute_granularity selective --recompute_modules core_attn moe`降低显存。其中"core_attn"、"mlp" 和 "moe" 使用常规检查点,"moe_act"、"layernorm" 和 "mla_up_proj" 使用输出丢弃检查点。
- "core_attn": 重新计算 Transformer 层中的核心注意力部分。
- "mlp": 重新计算密集的 MLP 层。
- "moe": 重新计算 MoE 层。
- "moe_act": 重新计算 MoE 中的 MLP 激活函数部分。
- "layernorm": 重新计算 input_layernorm 和 pre_mlp_layernorm。
- "mla_up_proj": 重新计算 MLA 上投影和 RoPE 应用部分。
- 🔥train_iters: 训练的总迭代次数,默认为None。
- 提示:你可以通过设置`--num_train_epochs`来设置训练的epochs数。在使用非流式数据集时,会自动根据数据集数量计算`train_iters`(兼容packing)。
- 🔥num_train_epochs: 指定训练的epochs数。当使用非流式数据集时,该参数会为你自动计算train_iters而不需要手动传入`train_iters`。当使用流式数据集时,该参数会在训练到`num_train_epochs`时强制退出训练,并对权重进行验证和保存。默认为None。
- masked_softmax_fusion: 默认为True。用于开启query_key_value的scaling, masking, and softmax融合。
- bias_dropout_fusion: 默认为True。用于开启bias和dropout的融合。
- bias_activation_fusion: 如果为True,则在可能的情况下融合偏置加法和激活函数。默认为True。
- apply_rope_fusion: 默认为False。用于开启rope融合。该参数为megatron-core参数透传。注意:并不是所有情况都支持rope融合,例如:MLA、mrope等不支持。
- gradient_accumulation_fusion: 默认为True。用于开启梯度累加融合。
- 🔥cross_entropy_loss_fusion: 启动交叉熵损失计算融合。默认为True。
- cross_entropy_fusion_impl: 交叉熵损失融合的实现。可选为'native'和'te'。默认为None,如果是cuda设置为'te',npu设置为'native'。
- calculate_per_token_loss: 根据全局批次中的非填充token数量来对交叉熵损失进行缩放。默认为None,`task_type`为'causal_lm'且为预训练/微调时,默认为True,否则默认为False。
- 🔥attention_backend: 使用的注意力后端 (flash、fused、unfused、local、auto)。默认为 flash。
- 如果安装'flash_attention_3',`--attention_backend flash`则优先使用fa3。训练脚本参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/flash_attention_3)。多模态模型的vit部分要使用flash_attention_3,请设置`--attn_impl flash_attention_3`。
- 有些模型可能不支持flash,你需要手动设置`--attention_backend unfused/fused --padding_free false`,例如:Llama4、GPT-OSS。
- optimizer: 优化器类型,可选为'adam'、'sgd'。默认为adam。
- 注意:此'adam'为'adamw',参考[这里](https://github.com/NVIDIA/TransformerEngine/blob/d8f1e68f7c414f3e7985a8b41de4443b2f819af3/transformer_engine/pytorch/optimizers/fused_adam.py#L69-L70)。
- 🔥optimizer_cpu_offload: 将优化器状态卸载到 CPU,例如设置:`--use_precision_aware_optimizer true --optimizer_cpu_offload true --optimizer_offload_fraction 0.7`。默认为False。
- 该参数可以显著降低显存占用(但增加内存占用)。若global_batch_size较大,则对训练速度的影响不大。
- 🔥optimizer_offload_fraction: 卸载到 CPU 的优化器状态所占比例。默认为1.0。
- use_precision_aware_optimizer: 使用 TransformerEngine 中的精度感知优化器,该优化器允许将主参数和优化器状态设置为较低精度,例如 fp16 和 fp8。
- main_grads_dtype: 启用 use_precision_aware_optimizer 时主梯度的 dtype。可选为'fp32', 'bf16'。默认为'fp32'。
- main_params_dtype: 启用 use_precision_aware_optimizer 时主参数的 dtype。可选为'fp32', 'fp16'。默认为'fp32'。
- exp_avg_dtype: 启用 use_precision_aware_optimizer 时,adam 优化器中 exp_avg(即一阶矩)的 dtype。该 dtype 用于在训练过程中将优化器状态存储在内存中,但不会影响内核计算时的精度。可选为'fp32', 'fp16', 'bf16', 'fp8'。默认为'fp32'。
- exp_avg_sq_dtype: 启用 use_precision_aware_optimizer 时,adam 优化器中 exp_avg_sq(即二阶矩)的 dtype。该 dtype 用于在训练过程中将优化器状态存储在内存中,但不会影响内核计算的精度。可选为'fp32', 'fp16', 'bf16', 'fp8'。默认为'fp32'。
- manual_gc: 禁用默认垃圾回收器,手动触发垃圾回收。默认为False。
- manual_gc_steps: 手动触发垃圾回收的间隔(steps)。默认为0。
- manual_gc_eval: 当使用手动垃圾回收时(`--manual_gc true`),在每次评估运行的开始和结束时禁用垃圾回收。默认为True。
**数据参数**:
- seed: python、numpy、pytorch和cuda的随机种子,默认为42。
- dataset_shuffle: 是否对dataset进行随机操作。默认为True。
- 注意:**Megatron-SWIFT的随机包括两个部分**:数据集的随机,由`dataset_shuffle`控制;train_dataloader中的随机,由`train_dataloader_shuffle`控制。
- train_dataloader_shuffle: 是否对train_dataloader使用随机,默认为True。val_dataset不进行随机操作。
- 🔥dataloader_num_workers: dataloader的workers数量,默认为4。
- 注意:若设置`--streaming true`,则设置为1。
- dataloader_pin_memory: 默认为True。
- dataloader_persistent_workers: 默认为True。
- dataloader_prefetch_factor: 默认为2。
- data_sharding: 当`--train_dataloader_shuffle true`时对 train_dataloader 生效,默认为False。该参数控制数据集随机的范围。若设置为True,则先对数据集进行分片,然后对每个分片进行随机处理(略节约内存);若设置为False,则先对数据集进行随机,再进行分片(更好的随机效果)。
- 🔥group_by_length: 是否在训练数据集中将长度大致相同的样本分组在一起(有随机因素),以最小化填充并确保各节点与进程的负载均衡以提高效率。默认为False。具体算法参考`transformers.trainer_pt_utils.get_length_grouped_indices`。
- te_rng_tracker: 使用 Transformer Engine 版本的随机数生成器。默认为False。
- data_parallel_random_init: 在数据并行的各个 rank 之间启用不同的随机初始化。默认为False。
- padding_free: 将一个batch中的数据进行展平而避免数据padding,从而降低显存占用并加快训练。默认为True。
- 若要自定义attention_mask,你可以设置`--padding_free false`。
- 注意:**Megatron-SWIFT训练特性优先支持padding_free格式**,若非特殊情况,请勿修改该值。
- mlp_padding_free: 默认为False。用于padding_free设置为false时,对mlp进行padding_free优化。这可以在自定义attention_mask的同时,提升训练速度和减少显存占用。
**学习率参数**:
- lr_warmup_init: 学习率warmup的初始值。学习率调度器从这个值开始进行预热。默认为0。
- 🔥lr: 初始学习率,最终会根据学习率预热策略和衰减策略决定每个迭代的学习率。默认为None,**全参数训练默认为1e-5,LoRA训练默认为1e-4**。
- lr_decay_style: 学习率衰减策略,默认为'cosine'。通常设置为'constant', 'linear', 'cosine', 'inverse-square-root', 'WSD'。
- 🔥lr_decay_iters: 学习率衰减的迭代次数。默认为None,则设置为`--train_iters`。
- lr_warmup_iters: 线性学习率预热的迭代次数,默认为0。
- 🔥lr_warmup_fraction: 线性学习率预热阶段所占比例,默认为None。
- 🔥min_lr: 学习率的最小值,将低于该阈值的学习率裁剪为该值,默认为0。
- lr_wsd_decay_style: WSD 退火阶段的衰减方式。默认为'exponential'。
- lr_wsd_decay_iters: 学习率衰减的迭代次数。默认为 None。
**正则化参数**:
- 🔥weight_decay: 默认为0.1。
- weight_decay_incr_style: 权重衰减的递增函数。可选为'constant', 'linear', 'cosine'。默认为'constant'。
- start_weight_decay: L2 正则化的初始权重衰减系数。
- end_weight_decay: 训练结束时 L2 正则化的权重衰减系数。
- 🔥clip_grad: l2梯度裁剪,默认为1.0。
- 日志中打印的grad_norm为未裁剪前的值。
- adam_beta1: 默认0.9。
- adam_beta2: 默认0.95。
- adam_eps: 默认1e-8。
- sgd_momentum: 设置`--optimizer sgd`时生效,默认为0.9。
**checkpoint参数**:
- 🔥output_dir: checkpoint的输出目录,默认None。在训练中,若未设置该参数,则默认为`f'megatron_output/{model_suffix}'`,例如`'megatron_output/Qwen2.5-7B-Instruct'`。
- 注意:**若在多机训练时,请确保每个节点的保存路径指向相同位置**,否则你需要在训练后手动集中这些权重。
- 🔥save_steps: checkpoint保存的间隔(steps),默认为500。
- 注意:训练结束时一定会保存权重。
- 🔥no_save_optim: 不保存optimizer,默认为False。在全参数训练时,可以显著降低存储时间。
- 🔥no_save_rng: 不保存rng,默认为False。
- 🔥mcore_model: 加载的checkpoint目录(mcore存储格式),默认None。对于断点续训的介绍,请查看`--finetune false`参数的介绍。
- megatron-swift推荐直接加载和存储safetensors权重,参考[mcore-bridge文档](./Mcore-Bridge.md)。
- `--model`与`--mcore_model`的区别:`--model/--adapters/--ref_model/--ref_adapters`后加safetensors权重目录,`--mcore_model/--mcore_adapter/--mcore_ref_model/--mcore_ref_adapter`后加mcore权重目录。`--model/--adapters`不支持加载断点续训状态,因此若设置`--no_save_optim false`,将额外存储mcore权重格式用于断点续训,你需要使用`--mcore_model/--mcore_adapter`来加载断点续训的状态。
- 🔥no_load_optim: 不载入optimizer,默认为False。
- 注意:断点续训时,设置`--no_load_optim false`读取优化器状态通常比`--no_load_optim true`不读取优化器状态消耗更大的显存资源。
- 🔥no_load_rng: 不载入rng,默认为False。
- 🔥finetune: 将模型加载并微调。**不加载检查点的优化器和随机种子状态,并将迭代数设置为0**。默认为True。
- 注意:**断点续训**你需要设置`--mcore_model`(lora训练需要额外设置`--mcore_adapter`),若设置`--finetune true`,将不加载优化器状态和随机种子状态并将迭代数设置为0,不会进行数据集跳过;若设置`--finetune false`,将读取迭代数并跳过之前训练的数据集数量,优化器状态和随机种子状态的读取通过`--no_load_optim`和`--no_load_rng`控制。
- 流式数据集`--streaming`,暂不支持跳过数据集。
- perform_initialization: 对权重进行初始化,默认为False。
- use_cpu_initialization: 在cpu上初始化权重,默认为False。在进行HF和MCore权重转换时会被使用。通常不需要修改该值。
- 🔥async_save: 使用异步检查点保存。目前仅适用于`torch_dist`分布式检查点格式。默认为False。
- 🔥save_total_limit: 最多保存的checkpoint数,会将过期的checkpoint进行删除。默认为None,保存所有的checkpoint。该参数需设置为`>=2`的数,若设置为2,则保存best checkpoint和last checkpoint。该参数暂不兼容`async_save`。
- metric_for_best_model: 默认为None,GRPO默认为'reward',其他情况默认为'loss'。
- greater_is_better: 默认为None,即当`metric_for_best_model`含'loss'时,设置为False,否则设置为True。
- use_persistent_ckpt_worker: 为异步保存启动持久化检查点工作进程。默认为False。
- dist_ckpt_save_pre_mcore_014: 使用 Megatron-Core 0.14 之前的格式存储。默认为False。
- dist_ckpt_optim_fully_reshardable: 使优化器分布式检查点完全可重分片(TP/PP/EP/DP),而不是仅支持普通的DP重分片。默认为False。
- distrib_optim_fully_reshardable_mem_efficient: 在分布式优化器检查点保存和加载过程中,通过使用Gloo(而非NCCL),并仅使用单个rank进行保存,以尽可能减少内存使用。仅在遇到主机或设备内存问题时启用,仅在设置了`--dist_ckpt_optim_fully_reshardable`参数时生效。默认为False。
**分布式参数**:
并行技术的选择请参考[训练技巧文档](Quick-start.md#训练技巧)。
- ddp_backend: 分布式后端,可选为'nccl', 'gloo'。默认为nccl。
- ddp_timeout: 默认为18000000,单位为秒。
- 🔥use_distributed_optimizer: 使用分布式优化器(即zero1)。默认为True。
- 🔥tensor_model_parallel_size: tp数,默认为1。
- 🔥pipeline_model_parallel_size: pp数,默认为1。
- 🔥decoder_first_pipeline_num_layers: decoder第一个流水线阶段所包含的Transformer层数。默认为 None,表示将Transformer层数平均分配到所有流水线阶段。
- 该参数通常用于**Transformer层数无法被PP整除**,或者多模态模型第0个pp阶段显存占用过高的情况。
- 🔥decoder_last_pipeline_num_layers: decoder最后一个流水线阶段所包含的Transformer层数。默认为 None,表示将Transformer层数平均分配到所有流水线阶段。
- account_for_embedding_in_pipeline_split: 如果设置为 True,在流水线并行的划分和放置策略中,输入 embedding 层会被视为一个标准的 Transformer 层来处理。默认为False。
- account_for_loss_in_pipeline_split: 如果设置为 True,在流水线并行的划分和放置策略中,loss 层会被视为一个标准的 Transformer 层来处理。默认为False。
- overlap_p2p_comm: 在 1F1B 中将流水线并行通信与前向和反向块重叠,默认为True。
- align_param_gather: 设置为True,所有 PP 阶段将同时启动参数全收集(all-gather)操作。否则,每个 PP 阶段将根据需要独立启动。默认为True。
- 🔥sequence_parallel: 启动序列并行优化,该参数需要设置`tensor_model_parallel_size`才生效。默认为False。
- 🔥context_parallel_size: cp数,默认为1。
- tp_comm_overlap: 启用张量并行通信与GEMM(通用矩阵乘法)内核的重叠(降低通信耗时)。默认为False。
- 🔥overlap_grad_reduce: 启用DDP中grad reduce操作的重叠(降低DP通信耗时)。默认为False。
- 🔥overlap_param_gather: 启用分布式优化器中参数all-gather的重叠(降低DP通信耗时)。默认为False。
- virtual_pipeline_model_parallel_size: 每个流水线并行 rank 的虚拟流水线阶段数量。默认为None。vpp并行,用于减少pp并行的计算空泡,提高GPU利用率,但会略微提高通信量。
- microbatch_group_size_per_vp_stage: 每个虚拟流水线阶段处理的连续微批次数量。默认为None,等于pipeline_model_parallel_size。
- 🔥pipeline_model_parallel_layout: 一个描述自定义流水线(pp/vpp)模型并行布局的字符串。例如:"E|(t|)*3,m|m||L"。其中 E、L、t、m 分别表示嵌入层(embedding)、损失层(loss)、Transformer 解码器层和 MTP 层。阶段之间用 "|" 分隔。重复的阶段或层可以通过乘法表示。逗号仅用于提升可读性(无实际语法作用)。默认值为 None,表示不使用此参数设置布局。
- 该参数通常在异构GPU集群上使用。
- 🔥expert_model_parallel_size: 专家并行数,默认为1。
- 🔥expert_tensor_parallel_size: 专家TP并行度。默认值为1。
**日志参数**:
- report_to: 启用的日志后端。默认为`['tensorboard']`。可选项为'tensorboard', 'wandb'和'swanlab'。'wandb'和'swanlab'登录可以使用`WANDB_API_KEY`、`SWANLAB_API_KEY`环境变量。
- 🔥logging_steps: 日志记录的间隔(steps),默认为5。
- tensorboard_dir: tensorboard日志写入的目录。默认None,即存储在`f'{output_dir}/runs'`目录下。
- tensorboard_queue_size: 用于暂存事件和摘要的 TensorBoard 队列大小;当队列中待处理的事件和摘要数量达到该大小时,下一次调用 "add" 相关方法会触发将数据刷新写入磁盘。默认为50。
- wandb_project: wandb项目名称,默认为'megatron-swift'。
- wandb_exp_name: wandb 实验名称。默认为`--output_dir`的值。
- swanlab_project: swanlab项目名称,默认为'megatron-swift'。
- swanlab_exp_name: swanlab 实验名称。默认为`--output_dir`的值。
**评估参数**:
- 🔥eval_iters: 评估的迭代次数,默认为`-1`,根据验证数据集的数量设置合适的值。**若验证集数量少于global_batch_size,则不进行评估**。若使用流式数据集,该值需要手动设置。
- 🔥eval_steps: 评估的间隔(steps),即每训练多少steps进行评估。默认为None,即设置为`save_steps`。
**fp8参数**:
- fp8_format: 用于前向和反向传播中FP8张量的FP8格式方案。可选为'e4m3','hybrid'。默认为None。
- fp8_recipe: 用于前向和反向传播中 FP8 张量的 FP8 算法方案。可选为'tensorwise', 'delayed', 'mxfp8', 'blockwise'。默认为'delayed'。其中blockwise fp8需要 CUDA 12.9 及以上版本。
- fp8_amax_history_len: 每个张量记录 amax 历史的步数。默认为1024。
- fp8_amax_compute_algo: 用于根据历史记录计算 amax 的算法。可选为'most_recent', 'max'。默认为'max'。
- fp8_param_gather: 保持计算参数为 fp8(不使用任何其他中间数据类型),并在 fp8 格式下执行参数的 all-gather 操作。默认为False。
- 提示:若想导出FP8权重格式,设置为True;否则设置为False。
**混合精度参数**:
- fp16: fp16模式。默认为None,会根据模型的torch_dtype进行设置,即torch_dtype为float16或者float32则fp16设置为True。torch_dtype默认读取config.json。
- bf16: bf16模式。默认为None,会根据模型的torch_dtype进行设置,即torch_dtype为bfloat16则bf16设置为True。
- apply_query_key_layer_scaling: 将`Q * K^T` 缩放为 `1 / 层数`(例如:第layer_num层则除以layer_num)。这对fp16训练很有帮助。默认为None,即若使用`--fp16`,则设置为True。
- 🔥attention_softmax_in_fp32: 在attention_mask和softmax中使用fp32进行计算。默认为True。
- accumulate_allreduce_grads_in_fp32: 在 fp32 精度下进行梯度累积和全规约操作。如果开启`--bf16`且`main_params_dtype`为'fp32',则设置为True。否则默认设置为False。
**MoE参数**:
- moe_router_load_balancing_type: 确定路由器的负载均衡策略。可选项为"aux_loss"、"seq_aux_loss"、"global_aux_loss"、"sinkhorn"、"none"。其中, "global_aux_loss"需要"megatron-core>=0.15"。默认值为 None。从config.json中读取。
- 🔥moe_router_dtype: 用于路由计算和专家输出加权平均的数据类型。可选为'none', 'fp32'、'fp64',这增强了数值稳定性,尤其是在专家数量较多时。与`moe_permute_fusion`一起使用时,性能影响可以忽略不计。默认为'fp32'。'none'代表不改变数据类型。
- moe_token_dispatcher_type: 要使用的token分发器类型。可选项包括 'allgather'、'alltoall'、'flex'和'alltoall_seq'。默认值为'alltoall'。
- moe_enable_deepep: 启用 DeepEP 以实现 MoE 模型中的高效 token 调度和合并。仅在通过设置 `--moe_token_dispatcher_type flex` 使用弹性 token 调度器时有效。
- 🔥moe_grouped_gemm: 当每个rank包含多个专家时,通过在多个流中启动多个本地 GEMM 内核,利用 TransformerEngine中的GroupedLinear提高利用率和性能。默认为True。
- 🔥moe_permute_fusion: 在令牌分发过程中融合令牌重排操作。默认为False。
- 🔥moe_aux_loss_coeff: 默认为0,不使用aux_loss。**通常情况下,该值设置得越大,训练效果越差,但MoE负载越均衡**,请根据实验效果,选择合适的值。
- moe_z_loss_coeff: z-loss 的缩放系数。默认为None。
- 🔥moe_shared_expert_overlap: 启用共享专家计算与调度器通信之间的重叠。如果不启用此选项,共享专家将在路由专家之后执行。仅在设置了`moe_shared_expert_intermediate_size`时有效。默认为False。
- 🔥moe_expert_capacity_factor: 每个专家的容量因子,None表示不会丢弃任何token。默认为None。通过设置 `--moe_expert_capacity_factor`,超出专家容量的 token 会基于其被选中的概率被丢弃。可以**令训练负载均匀,提升训练速度**(例如设置为1或2)。
- moe_pad_expert_input_to_capacity: 对每个专家(expert)的输入进行填充,使其长度与专家容量(expert capacity length)对齐,默认为False。该操作仅在设置了 `--moe_expert_capacity_factor` 参数后才生效。
- moe_token_drop_policy: 可选为'probs', 'position'。默认为'probs'。
**DSA参数**
- dsa_indexer_loss_coeff: DSA 索引器 KL 散度损失的系数。设置为 0 可禁用索引器损失。默认为None。
- dsa_indexer_use_sparse_loss: 是否使用稀疏 DSA 索引器损失。如果为 True,索引器损失将使用 top-k 索引进行计算。默认为False。
**MTP参数**
- mtp_num_layers: 多token预测(MTP)层的数量。MTP将每个位置的预测范围扩展到多个未来token。此MTP实现使用D个顺序模块依次预测D个额外的token。默认为None。(需要"megatron-core>=0.14")
- 注意:mtp_num_layers的值,将不自动从config.json获取,需手动设置。你可以参考config.json中的`num_nextn_predict_layers`字段填写该值。使用mcore-bridge时,将优先从safetensors文件中加载MTP权重,若无法找到,则进行随机初始化。(若要使用blockwise fp8 + mtp,请使用mcore>=0.15)
- mtp_loss_scaling_factor: 多token预测(MTP)损失的缩放因子。我们计算所有深度上MTP损失的平均值,然后乘以该缩放因子得到总体MTP损失,它将作为一个额外的训练目标。默认为0.1。
**Tuner参数**:
- tuner_type: 可选为'lora', 'full'和'lora_llm'。默认为'full'。
- 其中'lora_llm'代表对llm部分进行lora,vit/aligner部分使用'full'。你可以使用`vit_lr/aligner_lr`设置各自的学习率。
- 🔥freeze_llm: 该参数只对多模态模型生效,可用于全参数训练和LoRA训练,但会产生不同的效果。若是全参数训练,将freeze_llm设置为True会将LLM部分权重进行冻结;若是LoRA训练且`target_modules`设置为'all-linear',将freeze_llm设置为True将会取消在LLM部分添加LoRA模块。该参数默认为False。
- 🔥freeze_vit: 该参数只对多模态模型生效,可用于全参数训练和LoRA训练,但会产生不同的效果。若是全参数训练,将freeze_vit设置为True会将vit部分权重进行冻结;若是LoRA训练且`target_modules`设置为'all-linear',将freeze_vit设置为True将会取消在vit部分添加LoRA模块。该参数默认为True。
- 注意:**这里的vit不仅限于vision_tower, 也包括audio_tower**。若是Omni模型,若你只希望对vision_tower加LoRA,而不希望对audio_tower加LoRA,你可以修改[这里的代码](https://github.com/modelscope/ms-swift/blob/a5d4c0a2ce0658cef8332d6c0fa619a52afa26ff/swift/llm/model/model_arch.py#L544-L554)。
- 🔥freeze_aligner: 该参数只对多模态模型生效,可用于全参数训练和LoRA训练,但会产生不同的效果。若是全参数训练,将freeze_aligner设置为True会将aligner(也称为projector)部分权重进行冻结;若是LoRA训练且`target_modules`设置为'all-linear',将freeze_aligner设置为True将会取消在aligner部分添加LoRA模块。该参数默认为True。
全参数训练:
- freeze_parameters: 需要被冻结参数的前缀,默认为`[]`。
- freeze_parameters_regex: 需要被冻结参数的正则表达式,默认为None。
- freeze_parameters_ratio: 从下往上冻结的参数比例,默认为0。可设置为1将所有参数冻结,结合`trainable_parameters`设置可训练参数。除了设置为0/1,该参数不兼容pp并行。
- trainable_parameters: 额外可训练参数的前缀,默认为`[]`。
- trainable_parameters_regex: 匹配额外可训练参数的正则表达式,默认为None。
lora训练:
- mcore_adapter: 加载adapter的权重路径,用于lora断点续训,默认为None。lora断点续训方式与全参数一致,请关注`--finetune`参数的含义。
- 🔥target_modules: 指定lora模块的后缀,例如:你可以设置为`--target_modules linear_qkv linear_proj`。默认为`['all-linear']`,代表将所有的linear设置为target_modules。
- 注意:在LLM和多模态LLM中,'all-linear'的行为有所不同。若是LLM则自动寻找除lm_head外的linear并附加tuner;**若是多模态LLM,则默认只在LLM上附加tuner,该行为可以被`freeze_llm`、`freeze_vit`、`freeze_aligner`控制**。
- 注意:若需要将所有的router设置为target_modules, 你可以额外设置`--target_modules all-router ...`,例如:`--target_modules all-router all-linear`。
- transformers和Megatron的Linear层后缀名称不同,在Megatron中,`linear_proj`代表`o_proj`,`linear_qkv`代表`q_proj, k_proj, v_proj`的拼接,`linear_fc1`代表`gate_proj`, `up_proj`的拼接,`linear_fc2`代表`down_proj`。
- 🔥target_regex: 指定lora模块的regex表达式,默认为`None`。如果该值传入,则target_modules参数失效。
- 🔥modules_to_save: 在已附加tuner后,额外指定一部分原模型模块参与训练和存储。默认为`[]`。例如设置为`--modules_to_save word_embeddings output_layer`,在LoRA训练中解开`word_embeddings`和`output_layer`层进行训练,这两部分的权重信息最终会进行保存。
- 🔥lora_rank: 默认为`8`。
- 🔥lora_alpha: 默认为`32`。
- lora_dropout: 默认为`0.05`。
- lora_bias: 默认为`'none'`,可以选择的值: 'none'、'all'。如果你要将bias全都设置为可训练,你可以设置为`'all'`。
- use_rslora: 默认为`False`,是否使用`RS-LoRA`。
**Mcore-Bridge参数**
- model: safetensors权重的model_id或者model_path。默认为None。
- model_type: 模型类型。介绍参考[ms-swift命令行参数文档](../Instruction/Command-line-parameters.md)。
- 🔥save_safetensors: 默认为True,是否直接保存成safetensors权重。若设置了`--no_save_optim false`则额外存储mcore格式权重和优化器权重(也保存在output_dir中)。断点续训时使用`--mcore_model/--mcore_adapter/--no_load_optim/--no_load_rng`参数加载mcore格式权重。
- adapters: safetensors格式的LoRA增量权重的adapter_id或者adapter_path。默认为`[]`。
- ref_model: ref_model safetensors权重的model_id或者model_path。采用grpo、dpo、kto算法且使用全参数训练时需要传入。默认为None,设置为`--model`。
- ref_adapters: ref_adapters safetensors权重的adapter_id或者adapter_path的列表(目前只支持长度为1),默认为`[]`。
- use_hf: 控制模型下载、数据集下载、模型推送使用[ModelScope](https://modelscope.cn/)还是[HuggingFace](https://huggingface.co/)。默认为False,使用ModelScope。
- 提示:如果你想在国外访问ModelScope,可以尝试使用[ModelScope国际版](https://modelscope.ai/home),设置环境变量`MODELSCOPE_DOMAIN='www.modelscope.ai'`即可。
- hub_token: hub token. modelscope的hub token可以查看[这里](https://modelscope.cn/my/myaccesstoken)。默认为None。
- merge_lora: 是否存储合并后的权重。默认为None,若`save_safetensors`设置为True,该参数默认值为`True`,否则为False。即默认情况下,存储为safetensors格式时会合并LoRA;存储为torch_dist格式时,不会合并LoRA。
- max_shard_size: safetensors格式存储文件最大大小,默认'5GB'。
- 🔥offload_bridge: 将Megatron导出的、用于vLLM权重更新的HF格式权重存放在CPU主存中,以降低 GPU 显存占用。默认为 False。(在GRPO/GKD算法中生效)
**多模态参数**:
- vit_gradient_checkpointing: 多模态模型训练时,是否对vit部分开启gradient_checkpointing。默认为True。(**Megatron-SWIFT的vit实现使用transformers实现**)
- attn_impl: 多模态模型训练时,设置vit部分的attn_impl实现。默认为'flash_attn'。
- vit_lr: 当训练多模态大模型时,该参数指定vit的学习率,默认为None,等于learning_rate。通常与`--freeze_vit`、`--freeze_aligner`参数结合使用。
- 提示:在日志中打印的"learning rate"为llm的学习率。
- aligner_lr: 当训练多模态大模型时,该参数指定aligner的学习率,默认为None,等于learning_rate。
- gradient_checkpointing_kwargs: 传入`torch.utils.checkpoint`中的参数。例如设置为`--gradient_checkpointing_kwargs '{"use_reentrant": false}'`。默认为None。该参数只对`vit_gradient_checkpointing`生效。
**其他参数**:
- check_model: 检查本地模型文件是否有损坏或修改并给出提示,默认为True。**如果是断网环境,请设置为False**。
- rope_scaling: rope_scaling相关参数,默认为None。格式参考[llama3.1 config.json](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-8B-Instruct/file/view/master?fileName=config.json&status=1),传入json字符串。
- **目前rope_scaling模块使用transformers实现,支持transformers支持的所有rope_scaling。**
- apply_wd_to_qk_layernorm: 用于Qwen3-Next/Qwen3.5全参数训练,对 qk layernorm 应用权重衰减。默认为False。
- enable_dft_loss: 是否在SFT训练中使用[DFT](https://arxiv.org/abs/2508.05629) (Dynamic Fine-Tuning) loss,默认为False。
- enable_channel_loss: 启用channel loss,默认为`False`。你需要在数据集中准备"channel"字段,ms-swift会根据该字段分组统计loss(若未准备"channel"字段,则归为默认`None` channel)。数据集格式参考[channel loss](../Customization/Custom-dataset.md#channel-loss)。channel loss兼容packing/padding_free/loss_scale等技术。
- 🔥task_type: 默认为'causal_lm'。可选为'causal_lm'、'seq_cls'、'embedding'和'generative_reranker'。
- num_labels: 分类模型(即`--task_type seq_cls`)需要指定该参数。代表标签数量,默认为None。
- problem_type: 分类模型(即`--task_type seq_cls`)需要指定该参数。可选为'regression', 'single_label_classification', 'multi_label_classification'。默认为None,若模型为 reward_model 或 num_labels 为1,该参数为'regression',其他情况,该参数为'single_label_classification'。
- 🔥save_strategy: 保存策略,可选项为'steps'和'epoch'。默认为'steps'。当设置为'epoch'时,会根据数据集大小自动计算`save_steps`和`eval_steps`以实现每个epoch保存一次,用户传入的`save_steps`和`eval_steps`参数值将被忽略。
- callbacks: 自定义trainer callback,默认为`[]`。
## 训练参数
Megatron训练参数继承自Megatron参数和基本参数(**与ms-swift共用dataset、template等参数,也支持ms-swift中的特定模型参数**)。基本参数的内容可以参考[这里](../Instruction/Command-line-parameters.md#基本参数)。此外还包括以下参数:
- add_version: 在`output_dir`上额外增加目录`'<版本号>-<时间戳>'`防止权重覆盖,默认为True。
- 🔥create_checkpoint_symlink: 额外创建checkpoint软链接,方便书写自动化训练脚本。best_model和last_model的软链接路径分别为f'{output_dir}/best'和f'{output_dir}/last'。
- 🔥packing: 使用`padding_free`的方式将不同长度的数据样本打包成**近似**统一长度的样本(packing能保证不对完整的序列进行切分),实现训练时各节点与进程的负载均衡(避免长文本拖慢短文本的训练速度),从而提高GPU利用率,保持显存占用稳定。当使用 `--attention_backend flash` 时,可确保packed样本内的不同序列之间相互独立,互不可见(除Qwen3-Next,因为含有linear-attention)。该参数默认为`False`。Megatron-SWIFT的所有训练任务都支持该参数。注意:**packing会导致数据集样本数减少,请自行调节梯度累加数和学习率**。
- packing_length: packing的长度。默认为None,设置为max_length。
- packing_num_proc: packing的进程数,默认为1。需要注意的是,不同的`packing_num_proc`,最终形成的packed数据集是不同的。(该参数在流式packing时不生效)。通常不需要修改该值,packing速度远快于tokenize速度。
- streaming: 流式读取并处理数据集,默认False。(流式数据集的随机并不彻底,可能导致loss波动剧烈。)
- 注意:因为流式数据集无法获得其长度,因此需要设置`--train_iters`参数。设置`num_train_epochs`参数确保训练到对应epochs时退出训练,并对权重进行验证和保存。
- 注意:流式数据集可以跳过预处理等待,将预处理时间与训练时间重叠。流式数据集的预处理只在rank0上进行,并通过数据分发的方式同步到其他进程,**其通常效率不如非流式数据集采用的数据分片读取方式**。当训练的world_size较大时,预处理和数据分发将成为训练瓶颈。
- lazy_tokenize: 是否使用lazy_tokenize。若该参数设置为False,则在训练之前对所有的数据集样本进行tokenize(多模态模型则包括从磁盘中读取图片)。该参数默认为None,在LLM训练中默认为False,而MLLM训练默认为True,节约内存。
- new_special_tokens: 需要新增的特殊tokens。默认为`[]`。例子参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/megatron/lora/new_special_tokens.sh)。
- 注意:你也可以传入以`.txt`结尾的文件路径,每行为一个special token。
## RLHF参数
除了继承训练参数外,还支持以下参数:
- 🔥rlhf_type: 默认为'dpo'。目前可选择为'dpo'、'grpo'、'kto'、'rm'和'gkd'。
- loss_scale: 覆盖[基本参数](../Instruction/Command-line-parameters.md)中的loss_scale。默认为'last_round'。
- calculate_per_token_loss: 覆盖Megatron参数,默认为False。
### DPO参数
- mcore_ref_model: ref_model的加载路径。采用DPO/GRPO/KTO算法且使用全参数训练时需要传入。默认为None,即设置为`mcore_model`。
- mcore_ref_adapter: 加载ref_adapter的权重路径,默认为None。若你要使用SFT产生的LoRA权重进行DPO,请在训练时设置`--mcore_adapter sft_ckpt --mcore_ref_adapter sft_ckpt --finetune true`。若是此场景的断点续训,则设置`--mcore_adapter rlhf_ckpt --mcore_ref_adapter sft_ckpt --finetune false`。
- beta: 含义与[TRL](https://huggingface.co/docs/trl/main/en/dpo_trainer#trl.DPOConfig)相同。控制与参考模型偏差程度的参数。beta值越高,表示与参考模型的偏差越小。对于 IPO 损失函数 (loss_type="ipo"),beta是[论文](https://huggingface.co/papers/2310.12036)中所指的正则化参数。默认为0.1。
- 🔥rpo_alpha: 来自[RPO 论文](https://huggingface.co/papers/2404.19733)中的参数,用于控制损失函数中NLL项的权重(即SFT损失),`loss = dpo_loss + rpo_alpha * sft_loss`,论文中推荐设置为`1.`。默认为`None`,即默认不引入sft_loss。
- reference_free: 是否忽略提供的参考模型,并隐式地使用一个对所有响应赋予相等概率的参考模型。默认为False。
- label_smoothing: 默认为0.。
- f_divergence_type: 默认为`reverse_kl`。可选值参考[TRL文档](https://huggingface.co/docs/trl/main/en/dpo_trainer)。
- loss_type: 默认为'sigmoid'。可选值参考[TRL文档](https://huggingface.co/docs/trl/main/en/dpo_trainer#loss-functions)。
### KTO参数
- mcore_ref_model: 含义同DPO。
- mcore_ref_adapter: 含义同DPO。
- beta: 控制与 ref_model 偏离程度的参数。较高的 beta 表示与 ref_model 偏离更小。默认为`0.1`。
- loss_type: 默认为'kto'。可选值参考[TRL文档](https://huggingface.co/docs/trl/main/en/kto_trainer#trl.KTOConfig.loss_type)。
- desirable_weight: 抵消 desirable 和 undesirable 数量不均衡的影响,对 desirable 损失按该系数进行加权,默认为`1.`。
- undesirable_weight: 抵消 desirable 和 undesirable 数量不均衡的影响,对 undesirable 损失按该系数进行加权,默认为`1.`。
### RM参数
- center_rewards_coefficient: 用于激励奖励模型输出均值为零的奖励的系数,具体查看这篇[论文](https://huggingface.co/papers/2312.09244)。推荐值:0.01。
### GRPO参数
- mcore_ref_model: 含义同DPO。
- mcore_ref_adapter: 含义同DPO。
- beta: KL正则系数,默认为0.04,设置为0时不加载ref model。
- micro_batch_size: 每个device的批次大小,默认为1。
- global_batch_size: 总批次大小,等价于`micro_batch_size*数据并行大小*梯度累加步数`。默认为16。
- steps_per_generation: 每轮生成的优化步数,即采样批量大小相对global_batch_size的倍数,默认为1。
- generation_batch_size: 采样批量大小,需要是global_batch_size的倍数,默认等于`global_batch_size*steps_per_generation`。
- num_generations: 每个prompt采样的数量,论文中的G值,默认为8。
- num_generations_eval: 评估阶段每个prompt采样的数量。允许在评估时使用较少的生成数量以节省计算资源。如果为 None,则使用 num_generations 的值。默认为 None。
- reward_funcs: GRPO算法奖励函数,可选项为`accuracy`、`format`、`cosine`、`repetition`和`soft_overlong`,见swift/rewards/orm.py。你也可以在plugin中自定义自己的奖励函数。默认为`[]`。
- reward_weights: 每个奖励函数的权重。必须与奖励函数和奖励模型的总数量匹配。默认为 None,即所有奖励的权重都相等,为`1.0`。
- 提示:如果GRPO训练中包含`--reward_model`,则其加在奖励函数的最后位置。
- truncation_strategy: 对输入长度超过 `max_length`的处理方式,支持`delete`和`left`,代表删除、左侧裁剪,默认为`left`。注意对于多模态模型,左裁剪可能会裁剪掉多模态token导致模型前向报错shape mismatch。使用`delete`方式,对于超长数据和编码失败的样例会在原数据集中重采样其他数据作为补充。
- loss_type: loss 归一化的类型,可选项为['grpo', 'bnpo', 'dr_grpo'], 默认为'grpo', 具体查看该[pr](https://github.com/huggingface/trl/pull/3256#discussion_r2033213348)。
- log_completions: 是否记录训练中的模型生成内容,默认为False。
- vllm_mode: vLLM 集成模式,可选项为 `server` 和 `colocate`。server 模式使用 `swift rollout` 拉起的 vLLM 服务器进行采样,colocate 模式在程序内部署 vLLM。两种模式各自的参数如下:
- vllm_mode server 参数
- vllm_server_host: vLLM server host地址,默认为None。
- vllm_server_port: vLLM server 服务端口,默认为8000。
- vllm_server_base_url: vLLM server的Base URL(比如 http://localhost:8000), 默认为None。设置后,忽略host和port设置。
- vllm_server_timeout: 连接vLLM server的超时时间,默认为 240s。
- vllm_server_pass_dataset: 透传额外的数据集信息到vLLM server,用于多轮训练。
- async_generate: 异步rollout以提高训练速度,注意开启时采样会使用上一轮更新的模型进行采样,不支持多轮场景。默认为`false`。
- SWIFT_UPDATE_WEIGHTS_BUCKET_SIZE: 环境变量,用于控制权重同步时的传输桶大小(bucket size),适用于 Server Mode 下的全参数训练,单位为 MB,默认值为 512 MB。
- vllm_mode colocate 参数(更多参数支持参考[vLLM参数](#vLLM参数)。)
- vllm_gpu_memory_utilization: vllm透传参数,默认为0.9。
- vllm_max_model_len: vllm透传参数,默认为None。
- vllm_enforce_eager: vllm透传参数,默认为False。
- vllm_limit_mm_per_prompt: vllm透传参数,默认为None。
- vllm_enable_prefix_caching: vllm透传参数,默认为True。
- vllm_tensor_parallel_size: tp并行数,默认为`1`。
- vllm_enable_lora: 支持vLLM Engine 加载 LoRA adapter,默认为False。用于加速LoRA训练的权重同步,具体参考[文档](../Instruction/GRPO/GetStarted/GRPO.md#权重同步加速)。
- sleep_level: 训练时释放 vLLM 显存,可选项为[0, 1, 2], 默认为0,不释放。
- offload_optimizer: 是否在vLLM推理时offload optimizer参数,默认为False。
- offload_model: 是否在vLLM推理时 offload 模型,默认为False。
- num_iterations: 每条数据的更新次数,[GRPO论文](https://arxiv.org/abs/2402.03300)中的 $\mu$ 值,默认为1。
- epsilon: clip 系数,默认为0.2。
- epsilon_high: upper clip 系数,默认为None,设置后与epsilon共同构成[epsilon, epsilon_high]裁剪范围。
- dynamic_sample: 筛除group内奖励标准差为0的数据,额外采样新数据,默认为False。
- max_resample_times: dynamic_sample设置下限制重采样次数,默认3次。
- overlong_filter: 跳过超长截断的样本,不参与loss计算,默认为False。
- delta: [INTELLECT-2 tech report](https://huggingface.co/papers/2505.07291)中双侧 GRPO 上界裁剪值。若设置,建议大于 1 + epsilon。默认为None。
- importance_sampling_level: 控制重要性采样比计算,可选项为 `token` 和 `sequence`,`token` 模式下保留原始的每个 token 的对数概率比,`sequence` 模式下则会对序列中所有有效 token 的对数概率比进行平均。[GSPO论文](https://arxiv.org/abs/2507.18071)中使用sequence级别计算来稳定训练,默认为`token`。
- scale_rewards: 指定奖励的缩放策略。可选值包括 `group`(按组内标准差缩放)、`batch`(按整个批次的标准差缩放)、`none`(不进行缩放)、`gdpo`(对每个奖励函数分别进行组内归一化后加权聚合,参考 [GDPO 论文](https://arxiv.org/abs/2601.05242))。在 ms-swift < 3.10 版本中,该参数为布尔类型,`true` 对应 `group`,`false` 对应 `none`。默认值与 `advantage_estimator` 绑定:`grpo` 对应 `group`,`rloo` 对应 `none`,`reinforce_plus_plus` 对应 `batch`。
- 注意:`gdpo` 模式不支持 `kl_in_reward=True`,若同时设置会自动将 `kl_in_reward` 设为 `False`。
- GDPO 适用于多奖励优化场景:当使用多个奖励函数时,GDPO 会对每个奖励函数分别在组内进行标准化(减均值、除标准差),然后使用 `reward_weights` 进行加权求和,最后再进行批次级别的标准化。这种方式可以更好地保留各个奖励的相对差异,避免不同奖励组合坍塌成相同的 advantage 值。
- rollout_importance_sampling_mode: 训推不一致校正模式,可选项为 `token_truncate`、`token_mask`、`sequence_truncate`、`sequence_mask`。默认为None,不启用校正。具体参考[文档](../Instruction/GRPO/AdvancedResearch/training_inference_mismatch.md)。
- rollout_importance_sampling_threshold: 重要性采样权重的阈值,用于截断或屏蔽极端权重。默认为2.0。
- log_rollout_offpolicy_metrics: 当 `rollout_importance_sampling_mode` 未设置时,是否记录训推不一致诊断指标(KL、PPL、χ²等)。当设置了 `rollout_importance_sampling_mode` 时,指标会自动记录。默认为False。
- off_policy_sequence_mask_delta: Off-Policy Sequence Masking 阈值,来自 DeepSeek-V3.2 论文。当设置此值时,会计算每个序列的 `mean(old_policy_logps - policy_logps)`,若该值大于阈值且该序列的优势为负,则 mask 掉该序列不参与损失计算。默认为None,不启用。具体参考[文档](../Instruction/GRPO/AdvancedResearch/training_inference_mismatch.md#off-policy-sequence-masking)。
内置奖励函数参数参考[文档](../Instruction/Command-line-parameters.md#奖励函数参数)
### GKD参数
- teacher_model: 教师模型的路径或模型 ID,必需参数。
- teacher_model_type: 教师模型类型,默认为None,自动检测。
- teacher_model_revision: 教师模型版本,默认为None。
- beta: JSD 散度插值系数。0.0 代表 Forward KL,0.5 代表对称 JSD,1.0 代表 Reverse KL。默认为0.5。
- lmbda: On-Policy 学习触发概率。0.0 代表纯 Off-Policy,1.0 代表纯 On-Policy。默认为0.5。
- seq_kd: 是否使用教师生成的响应(Sequential KD),当前暂不支持。默认为False。
- temperature: 用于采样和损失计算的温度参数。默认为0.9。
- offload_teacher_model: 是否将教师模型卸载到 CPU 以节省 GPU 显存。默认为False。
- sft_alpha: SFT 损失的混合系数,`loss = jsd_loss + sft_alpha * sft_loss`。当使用数据集响应(Off-Policy)时生效。默认为0。
- max_completion_length: 生成时的最大 token 数。默认为512。
- vllm_mode: 同 GRPO 参数,用于 On-Policy 生成。colocate 模式下在程序内部署 vLLM。
- 注意:On-Policy 生成需要启用 vLLM(`--use_vllm true --vllm_mode colocate/server`)。
- 当 `lmbda > 0` 但未启用 vLLM 时,将自动回退到 Off-Policy 模式。
## 导出参数
这里介绍`megatron export`的参数,若要使用`swift export`导出命令,请参考[ms-swift命令行参数文档](../Instruction/Command-line-parameters.md#导出参数)。`megatron export`相比`swift export`,支持分布式和多机导出。Megatron导出参数继承自Megatron参数和基本参数。
- 🔥to_mcore: HF格式权重转成Megatron格式。默认为False。
- 🔥to_hf: Megatron格式权重转成HF格式。默认为False。
- 🔥merge_lora: 默认为None,若`to_hf`设置为True,该参数默认值为`True`,否则为False。即默认情况下,存储为safetensors格式时会合并LoRA;存储为torch_dist格式时,不会合并LoRA。合并后的权重存储在`--output_dir`目录下。
- 注意:transformers 5.0对Moe的模型组织结构进行了重构,该结构不支持Moe LoRA的推理,可能造成推理异常。**建议对Moe模型进行Merge LoRA**(vLLM不受影响)。
- 注意:由于transformers和Megatron模型专家结构并不一定一致(例如transformers的Qwen3-VL-Moe的专家部分并不是Linear实现,而是Parameters),因此部分模型无法转换(若Qwen3-VL-Moe只设置linear_proj和linear_qkv训练LoRA也支持转换)。但大多数的模型支持LoRA转换,例如:Qwen3-Moe,Qwen3-Omni-Moe,GLM4.5-V等。
- 🔥test_convert_precision: 测试HF和Megatron格式权重转换的精度误差。默认为False。
- test_convert_dtype: 转换精度测试使用的dtype,默认为'float32'。
- exist_ok: 如果`args.output_dir`存在,不抛出异常,进行覆盖。默认为False。
- device_map: 设置`--test_convert_precision true`时生效,控制HF模型的加载位置,默认为'auto'。你可以设置为'cpu'节约显存资源。
================================================
FILE: docs/source/Megatron-SWIFT/GKD.md
================================================
# GKD
**版本依赖**:ms-swift >= 3.12
如果你是首次使用 GKD,请先参考 [GKD文档](../Instruction/GKD.md)。
GKD(Generalized Knowledge Distillation,广义知识蒸馏)是一种将教师模型的知识迁移到学生模型的训练方法,通过计算两个模型输出分布之间的 Jensen-Shannon 散度(JSD)损失来实现知识蒸馏。
## 功能支持
Megatron GKD 当前已支持以下功能:
- **训练模式**:全参数训练与 LoRA 微调
- **并行策略**:支持上下文并行(CP)、流水线并行(PP)、张量并行(TP)和专家并行(EP)
- **模型支持**:兼容 Megatron-SWIFT 中的 LLM 及 MLLM
- **Teacher Offload**:支持将教师模型卸载到 CPU 以节省 GPU 显存
- **在线生成**:支持使用 vLLM 进行学生模型的 on-policy 生成
### 当前限制
- **教师模型在线生成**(`seq_kd=True`):当前 Sequential KD 模式下的教师模型生成暂不支持
- **非vLLM生成**:On-policy 生成当前仅支持 vLLM
- **教师模型使用与学生模型不同的并行参数**: 将在未来版本支持
⚠️ 注意事项:
- **On-policy 生成**:需要启用 vLLM(`--use_vllm true --vllm_mode colocate/server`)
- 当 `lmbda > 0` 但未启用 vLLM 时,将自动回退到离线学习模式(使用数据集响应)
- 当 `seq_kd=True` 时,由于教师生成暂不支持,将自动回退到离线学习模式,如需使用,请提前用[swift infer](../Instruction/Inference-and-deployment.md)推理数据集
## 参数说明
### GKD 特有参数
| 参数 | 类型 | 默认值 | 说明 |
|------|------|--------|------|
| `--teacher_model` | str | - | 教师模型路径或模型 ID(使用 `teacher_model_server` 时可省略) |
| `--teacher_model_server` | str | None | 教师模型服务地址(仅支持 `vllm serve`),如 `http://localhost:8000` |
| `--gkd_logits_topk` | int | None | Top-K logits 数量,使用外部教师 API 时必须设置 |
| `--beta` | float | 0.5 | JSD 散度插值系数:<br>• 0.0: Forward KL<br>• 0.5: 对称 JSD<br>• 1.0: Reverse KL |
| `--lmbda` | float | 0.5 | On-Policy 学习触发概率:<br>• 0.0: 纯 Off-Policy<br>• 1.0: 纯 On-Policy |
| `--seq_kd` | bool | False | 是否使用教师生成的响应(当前暂不支持) |
| `--temperature` | float | 0.9 | 温度参数,用于采样和损失计算 |
| `--sft_alpha` | float | 0 | 混合一定比例的sft loss,对非student生成结果生效 |
| `--max_completion_length` | int | 512 | 生成时的最大 token 数 |
### 批量相关参数
与 Megatron SFT 相同,使用以下参数控制批量大小:
| 参数 | 说明 |
|------|------|
| `--micro_batch_size` | 每张 GPU 的训练批次大小 |
| `--global_batch_size` | 全局批次大小:`micro_batch_size × dp_size × gradient_accumulation_steps` |
## 三种训练模式
GKD 支持三种训练模式,通过 `lmbda` 和 `seq_kd` 参数控制:
### Mode 1: On-Policy 学习
- 触发条件:`random() < lmbda` 且 `use_vllm=True`
- 数据来源:学生模型生成的响应
### Mode 2: Sequential KD(当前暂不支持)
- 触发条件:`random() >= lmbda` 且 `seq_kd=True`
- 数据来源:教师模型生成的响应
### Mode 3: Off-Policy 学习
- 触发条件:其他情况
- 数据来源:数据集中的标注响应
## 参考
更多参数请参考[命令行文档](./Command-line-parameters.md)
训练脚本请参考 [Megatron GKD 脚本](https://github.com/modelscope/ms-swift/blob/main/examples/megatron/rlhf/gkd)
使用 Teacher Server 的训练脚本请参考 [这里](https://github.com/modelscope/ms-swift/blob/main/examples/megatron/rlhf/gkd/teacher_server.sh)
================================================
FILE: docs/source/Megatron-SWIFT/GRPO.md
================================================
# GRPO
**版本依赖**:ms-swift >= 3.11
如果你是首次使用 GRPO,请先参考 [GRPO文档](../Instruction/GRPO/GetStarted/GRPO.md)。
Megatron GRPO 当前已支持以下功能:
- **训练模式**:全参数训练与 LoRA 微调
- **并行策略**:支持上下文并行(CP)、流水线并行(PP)、张量并行(TP)和专家并行(EP)
- **推理加速**:支持 vLLM 的 colocate 模式和 server 模式
- **模型支持**:兼容 Megatron-SWIFT 中的 LLM 及 MLLM(多模态大模型)
- **算法支持**:涵盖 swift GRPO 的大部分功能
以下参数或功能将在后续版本中逐步支持:
- **Reward Model / Reward Model Plugin**
- **多轮 Rollout 调度机制**(`multi_turn_scheduler`):实现多轮对话策略优化
- **虚拟流水线并行**(VPP)
- **参考模型同步更新**(`sync_ref_model`)
- **Async Generate** (`async_generate`)
- **num_iterations**
- **日志同步 SwanLab**
⚠️ 注意:以下参数在 Megatron GRPO 中不生效:
- **`use_vllm`**:Megatron GRPO 仅使用 vLLM 进行 Rollout 推理。
- **`move_model_batches`**:该参数专用于 DeepSpeed ZeRO-3 优化,在 Megatron 架构下无效。
与 ms-swift GRPO 相同,Megatron GRPO batch size 相关的参数均以 **completion-level** 为单位,即表示模型生成的 completion 数量,而非 prompt 数量。
#### 参数对比
下表对比了 ms-swift 和 Megatron-SWIFT 中批量相关参数的对应关系:
| ms-swift 参数 | Megatron-SWIFT 参数 | 说明 |
|---------------|---------------------|------|
| `per_device_train_batch_size` | `micro_batch_size` | 每张 GPU 的训练批次大小(completion-level) |
| `gradient_accumulation_steps` | - | 梯度累积步数,在 Megatron-SWIFT 中已包含在 `global_batch_size` 的计算中 |
| - | `global_batch_size` | 全局批次大小(completion-level)<br>**Megatron-SWIFT**: `micro_batch_size × dp_size × gradient_accumulation_steps`<br>**ms-swift**: `per_device_train_batch_size × world_size × gradient_accumulation_steps` |
| `num_generations` | `num_generations` | 每个 prompt 生成的 completion 数量 |
| `steps_per_generation` | `steps_per_generation` | Rollout 批次大小相对于训练批次大小的倍数<br>**注意**:在 ms-swift 中需为 `gradient_accumulation_steps` 的整数倍 |
| `generation_batch_size` | `generation_batch_size` | Rollout 阶段的批次大小(completion-level),需为 `global_batch_size` 的整数倍 |
以下公式用于计算 Megatron GRPO 中的批量:
- **数据并行大小**:`dp_size = world_size / (TP × PP × CP)`
- **全局批次大小**:`global_batch_size = micro_batch_size × dp_size × gradient_accumulation_steps`
- **生成批次大小**:`generation_batch_size = global_batch_size × steps_per_generation`
- **Rollout Prompt 数量**:`num_rollout_prompts = generation_batch_size / num_generations`
- **训练 Prompt 数量**:`num_train_prompts = global_batch_size / num_generations`
- **每个 DP group 的训练 Prompt 数量**:`num_prompts_per_dp_group = global_batch_size / num_generations / dp_size`
注意:在 Megatron GRPO 中,每个 DP group 的训练 Prompt 数量须满足 `num_prompts_per_dp_group` 是 `micro_batch_size`的整数倍,以确保训练批次能够正确分配。
更多参数请参考[命令行文档](./Command-line-parameters.md#grpo参数)
训练脚本请参考[Megatron GRPO 脚本](https://github.com/modelscope/ms-swift/blob/main/examples/megatron/grpo)
================================================
FILE: docs/source/Megatron-SWIFT/LoRA-Training.md
================================================
# LoRA训练
Qwen3-235B-A22B-Instruct-250718 单机8卡H20 LoRA训练的最佳实践参考:[https://github.com/modelscope/ms-swift/pull/5033](https://github.com/modelscope/ms-swift/pull/5033)。
环境准备请参考Megatron-SWIFT的[快速开始文档](./Quick-start.md)。
## 传统方式
### HF转换Mcore
以下,我们分别介绍使用`swift export`和`megatron export`命令进行权重转换。相比于`swift export`,`megatron export`支持多机和LoRA增量权重转换,但也更加复杂,需要在导出时额外指定并行参数,例如`--tensor_model_parallel_size`, `--expert_model_parallel_size`,具体参考[Mcore-Bridge文档](./Mcore-Bridge.md)。若要使用`swift export`命令,参考[快速开始文档](./Quick-start.md)。
- `swift export`使用单进程,将HF权重放置在gpu中,并使用device_map并行;mcore权重放置在cpu中,且不开启并行。这种方式非常易于debug,并测试HF和mcore的精度对齐情况。
- `megatron export`使用torchrun启动多进程,mcore权重放置在gpu中,支持开启各种并行、fp8和mtp等功能。如果需测试精度对齐情况,会在第一个rank加载HF权重,并放置在cpu中。
```shell
# megatron export
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
megatron export \
--model Qwen/Qwen2.5-7B-Instruct \
--tensor_model_parallel_size 2 \
--to_mcore true \
--torch_dtype bfloat16 \
--output_dir Qwen2.5-7B-Instruct-mcore \
--test_convert_precision true
# swift export
# CUDA_VISIBLE_DEVICES=0 \
# swift export \
# --model Qwen/Qwen2.5-7B-Instruct \
# --to_mcore true \
# --torch_dtype bfloat16 \
# --output_dir Qwen2.5-7B-Instruct-mcore \
# --test_convert_precision true
```
### LoRA训练
训练脚本:
```bash
# full: 2 * 70GiB 0.61s/it
# lora: 2 * 14GiB 0.45s/it
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
megatron sft \
--mcore_model Qwen2.5-7B-Instruct-mcore \
--save_safetensors false \
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
'AI-ModelScope/alpaca-gpt4-data-en#500' \
'swift/self-cognition#500' \
--tuner_type lora \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--tensor_model_parallel_size 2 \
--sequence_parallel true \
--micro_batch_size 16 \
--global_batch_size 16 \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--finetune true \
--cross_entropy_loss_fusion true \
--lr 1e-4 \
--lr_warmup_fraction 0.05 \
--min_lr 1e-5 \
--num_train_epochs 1 \
--output_dir megatron_output/Qwen2.5-7B-Instruct \
--save_steps 100 \
--max_length 2048 \
--system 'You are a helpful assistant.' \
--dataloader_num_workers 4 \
--no_save_optim true \
--no_save_rng true \
--dataset_num_proc 4 \
--model_author swift \
--model_name swift-robot
```
- MoE模型的LoRA训练脚本参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/lora)。
### MCore转换HF
```bash
# megatron export
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
megatron export \
--mcore_adapter megatron_output/Qwen2.5-7B-Instruct/vx-xxx/checkpoint-xxx \
--to_hf true \
--tensor_model_parallel_size 2 \
--merge_lora false \
--torch_dtype bfloat16 \
--output_dir megatron_output/Qwen2.5-7B-Instruct/vx-xxx/checkpoint-xxx-hf \
--test_convert_precision true
# swift export
# CUDA_VISIBLE_DEVICES=0 \
# swift export \
# --mcore_adapter megatron_output/Qwen2.5-7B-Instruct/vx-xxx/checkpoint-xxx \
# --to_hf true \
# --torch_dtype bfloat16 \
# --output_dir megatron_output/Qwen2.5-7B-Instruct/vx-xxx/checkpoint-xxx-hf \
# --test_convert_precision true
```
- 注意:`--mcore_adapter`文件夹中包含`args.json`文件,转换过程会读取文件中`--model/--mcore_model`以及LoRA相关的参数信息。`swift export`暂不支持LoRA增量权重的转换。使用`megatron export`时,你可以通过`--merge_lora`参数控制是否进行权重合并。
### 推理
```shell
# 如果是全量权重,请将`--adapters`替换为`--model`
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--adapters megatron_output/Qwen2.5-7B-Instruct/vx-xxx/checkpoint-xxx-hf \
--stream true
```
### Merge-LoRA
如果只想merge-lora,而不希望转成HF格式权重,用于后续DPO训练,可以使用以下脚本:
```shell
# megatron export
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
megatron export \
--mcore_adapter megatron_output/Qwen2.5-7B-Instruct/vx-xxx/checkpoint-xxx \
--tensor_model_parallel_size 2 \
--to_mcore true \
--merge_lora true \
--torch_dtype bfloat16 \
--output_dir megatron_output/Qwen2.5-7B-Instruct/vx-xxx/checkpoint-xxx-mcore \
--test_convert_precision true
# swift export
# CUDA_VISIBLE_DEVICES=0 \
# swift export \
# --mcore_adapter megatron_output/Qwen2.5-7B-Instruct/vx-xxx/checkpoint-xxx \
# --to_mcore true \
# --torch_dtype bfloat16 \
# --output_dir megatron_output/Qwen2.5-7B-Instruct/vx-xxx/checkpoint-xxx-mcore \
# --test_convert_precision true
```
## Mcore-Bridge【推荐】
### 训练
```shell
# full: 2 * 70GiB 0.61s/it
# lora: 2 * 14GiB 0.45s/it
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
megatron sft \
--model Qwen/Qwen2.5-7B-Instruct \
--save_safetensors true \
--merge_lora false \
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
'AI-ModelScope/alpaca-gpt4-data-en#500' \
'swift/self-cognition#500' \
--tuner_type lora \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--tensor_model_parallel_size 2 \
--sequence_parallel true \
--micro_batch_size 16 \
--global_batch_size 16 \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--finetune true \
--cross_entropy_loss_fusion true \
--lr 1e-4 \
--lr_warmup_fraction 0.05 \
--min_lr 1e-5 \
--num_train_epochs 1 \
--output_dir megatron_output/Qwen2.5-7B-Instruct \
--save_steps 100 \
--max_length 2048 \
--system 'You are a helpful assistant.' \
--dataloader_num_workers 4 \
--no_save_optim true \
--no_save_rng true \
--dataset_num_proc 4 \
--model_author swift \
--model_name swift-robot
```
### 推理
```shell
# 如果是全量权重,请将`--adapters`替换为`--model`
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--adapters megatron_output/Qwen2.5-7B-Instruct/vx-xxx/checkpoint-xxx-hf \
--stream true
```
================================================
FILE: docs/source/Megatron-SWIFT/Mcore-Bridge.md
================================================
# Mcore Bridge
Megatron 以其卓越的训练速度和丰富的并行技术而著称,但也因此带来了较高的使用门槛。为此,Mcore-Bridge 应运而生,旨在让 Megatron 训练像 transformers 一样简单易用。通过 Mcore-Bridge,用户可以:
1. 直接加载 safetensors 格式的模型权重,无缝使用 Megatron 进行高效训练;训练权重也可直接保存为 safetensors 格式,无需额外转换。
2. 兼容 LoRA 增量权重的双向转换。
3. 兼容GRPO/GKD等算法的`Megatron->vLLM`权重同步。
4. 支持多机转换超大规模模型。
Mcore-Bridge 兼容 Dense/MoE/多模态等多种模型架构。训练完成后,转换后的模型可直接使用 transformers、vLLM、SGLang 等主流推理框架部署。
## 无缝训练
目前Mcore-Bridge已支持TP/PP/EP/ETP/VPP等并行技术,支持所有Megatron-SWIFT支持的模型架构,参考[支持的模型文档](../Instruction/Supported-models-and-datasets.md)。以下介绍Mcore-Bridge的无缝训练能力,分别介绍Dense模型和Moe模型。
- 使用`--model/--adapters/--ref_model/--ref_adapters`参数读取模型时,将使用mcore-bridge来读取safetensors格式的模型权重。若使用`--mcore_model/--mcore_adapter/--mcore_ref_model/--mcore_ref_adapter`参数,则使用mcore默认方式读取。
- `save_safetensors`参数决定存储权重为safetensors格式还是mcore格式。如果设置`--no_save_optim false`则总会额外存储一份mcore权重用于断点续训。
### Dense模型
以下为多模态模型Qwen3-VL模型训练的例子:
```shell
# 2 * 76GiB
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
NPROC_PER_NODE=2 \
IMAGE_MAX_TOKEN_NUM=1024 \
VIDEO_MAX_TOKEN_NUM=128 \
FPS_MAX_FRAMES=16 \
CUDA_VISIBLE_DEVICES=0,1 \
megatron sft \
--model Qwen/Qwen3-VL-8B-Instruct \
--save_safetensors true \
--dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
--load_from_cache_file true \
--tensor_model_parallel_size 2 \
--sequence_parallel true \
--packing true \
--freeze_llm false \
--freeze_vit true \
--freeze_aligner true \
--split_dataset_ratio 0.01 \
--micro_batch_size 1 \
--global_batch_size 4 \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--finetune true \
--cross_entropy_loss_fusion true \
--lr 1e-5 \
--lr_warmup_fraction 0.05 \
--min_lr 1e-6 \
--num_train_epochs 1 \
--output_dir megatron_output/Qwen3-VL-8B-Instruct \
--save_steps 200 \
--max_length 2048 \
--dataloader_num_workers 4 \
--no_save_optim true \
--no_save_rng true \
--dataset_num_proc 8
```
然后我们对验证集部分进行推理:
```shell
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
IMAGE_MAX_TOKEN_NUM=1024 \
VIDEO_MAX_TOKEN_NUM=128 \
FPS_MAX_FRAMES=16 \
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--model megatron_output/Qwen3-VL-8B-Instruct/vx-xxx/checkpoint-xxx \
--load_data_args true \
--stream true
```
### Moe模型
以下为纯文本模型Qwen3-Moe模型CoT训练的例子:
```shell
# 8 * 76GiB, 3s/it
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
NPROC_PER_NODE=8 \
megatron sft \
--model Qwen/Qwen3-30B-A3B-Instruct-2507 \
--save_safetensors true \
--dataset 'swift/Chinese-Qwen3-235B-Thinking-2507-Distill-data-110k-SFT#20000' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--moe_permute_fusion true \
--pipeline_model_parallel_size 2 \
--decoder_first_pipeline_num_layers 25 \
--tensor_model_parallel_size 4 \
--expert_model_parallel_size 4 \
--moe_grouped_gemm true \
--moe_shared_expert_overlap true \
--moe_aux_loss_coeff 1e-6 \
--micro_batch_size 1 \
--global_batch_size 4 \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--num_train_epochs 1 \
--finetune true \
--cross_entropy_loss_fusion true \
--lr 1e-5 \
--lr_warmup_fraction 0.05 \
--min_lr 1e-6 \
--output_dir megatron_output/Qwen3-30B-A3B-Instruct-2507 \
--eval_steps 500 \
--save_steps 500 \
--max_length 8192 \
--packing true \
--dataloader_num_workers 8 \
--dataset_num_proc 8 \
--no_save_optim true \
--no_save_rng true \
--sequence_parallel true \
--moe_expert_capacity_factor 2 \
--attention_backend flash
```
对训练后的权重进行推理:
```shell
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--model megatron_output/Qwen3-30B-A3B-Instruct-2507/vx-xxx/checkpoint-xxx \
--stream true \
--max_new_tokens 1024
```
## LoRA导出
Mcore-Bridge除了支持全参数的导入导出,还支持单独对LoRA增量模型进行导入导出。
以下为纯文本模型Qwen3-Moe模型使用LoRA自我认知训练的例子:
- 若你希望导出merge后的权重,而不是LoRA增量权重,请设置`--merge_lora true`。设置`--merge_lora true`的兼容性更好,支持所有系列模型。
- 注意:(transformers>=5.0的情况)transformers 5.0对Moe的模型组织结构进行了重构,该结构不支持Moe LoRA的推理,可能造成推理异常。**建议对Moe模型进行Merge LoRA**(vLLM不受影响)。
- 注意:(transformers<5.0的情况)由于transformers和Megatron模型专家结构并不一定一致(例如transformers的Qwen3-VL-Moe的专家部分并不是Linear实现,而是Parameters),因此部分模型无法转换LoRA增量权重(若Qwen3-VL-Moe只设置linear_proj和linear_qkv训练LoRA也支持转换)。但大多数的模型支持LoRA转换,例如:Qwen3-Moe,Qwen3-Omni-Moe,GLM4.5-V等。
```shell
# 50GiB
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
megatron sft \
--model Qwen/Qwen3-30B-A3B-Instruct-2507 \
--save_safetensors true \
--merge_lora false \
--dataset 'swift/Chinese-Qwen3-235B-2507-Distill-data-110k-SFT#2000' \
'swift/self-cognition#1000' \
--load_from_cache_file true \
--tuner_type lora \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--split_dataset_ratio 0.01 \
--moe_permute_fusion true \
--expert_model_parallel_size 2 \
--moe_grouped_gemm true \
--moe_shared_expert_overlap true \
--moe_aux_loss_coeff 1e-3 \
--micro_batch_size 8 \
--global_batch_size 16 \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--num_train_epochs 1 \
--finetune true \
--cross_entropy_loss_fusion true \
--lr 1e-4 \
--lr_warmup_fraction 0.05 \
--min_lr 1e-5 \
--output_dir megatron_output/Qwen3-30B-A3B-Instruct-2507 \
--eval_steps 200 \
--save_steps 200 \
--max_length 2048 \
--dataloader_num_workers 8 \
--dataset_num_proc 8 \
--no_save_optim true \
--no_save_rng true \
--sequence_parallel true \
--moe_expert_capacity_factor 2 \
--attention_backend flash \
--model_author swift \
--model_name swift-robot
```
对导出的LoRA权重进行推理:
```shell
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--model Qwen/Qwen3-30B-A3B-Instruct-2507 \
--adapters megatron_output/Qwen3-30B-A3B-Instruct-2507/vx-xxx/checkpoint-xxx \
--stream true
```
提示:如果在vLLM权重更新期间遇到 GPU OOM 问题,您可以设置 `--offload_bridge true` 将张量卸载到 CPU 并减少 GPU 内存使用量。
## `megatron export` 与 转换精度测试
Mcore-Bridge除了支持在训练中进行safetensors的转换和保存,也支持了`megatron export`命令用于单独的权重导出。`megatron export`支持在权重转换时,对转换精度进行测试,这对接入新模型时验证接入的准确性很有帮助。通常,Megatron-SWIFT已经接入的模型不会出现精度不对齐的情况,你可以放心设置`--test_convert_precision false`。
- 提示:多模态模型请关注`mean_diff (with loss)`字段,`mean_diff`因包含图像tokens且该部分不计算损失,有较大的diff。
全参数权重:
```shell
# safetensors -> torch_dist
CUDA_VISIBLE_DEVICES=0,1,2,3 \
NPROC_PER_NODE=4 \
megatron export \
--model Qwen/Qwen3-30B-A3B-Instruct-2507 \
--output_dir Qwen3-30B-A3B-Instruct-2507-mcore \
--to_mcore true \
--tensor_model_parallel_size 2 \
--expert_model_parallel_size 2 \
--pipeline_model_parallel_size 2 \
--test_convert_precision true
```
```shell
# torch_dist -> safetensors
CUDA_VISIBLE_DEVICES=0,1,2,3 \
NPROC_PER_NODE=4 \
megatron export \
--mcore_model Qwen3-30B-A3B-Instruct-2507-mcore \
--output_dir Qwen3-30B-A3B-Instruct-2507-hf \
--to_hf true \
--tensor_model_parallel_size 2 \
--expert_model_parallel_size 2 \
--pipeline_model_parallel_size 2 \
--test_convert_precision true
```
LoRA权重:
```shell
# torch_dist -> safetensors
# 若你需要进行merge-lora,并测试merge-lora后的精度对齐,你只需要设置`--merge_lora true`即可
# 你也可以将`--model safetensors-path`修改为`--mcore_model torch-dist-path`。这两种方式是等价的,mcore-bridge会自动处理。
CUDA_VISIBLE_DEVICES=0,1,2,3 \
NPROC_PER_NODE=4 \
megatron export \
--model Qwen/Qwen3-30B-A3B-Instruct-2507 \
--mcore_adapter megatron_output/Qwen3-30B-A3B-Instruct-2507/vx-xxx/checkpoint-xxx \
--output_dir megatron_output/Qwen3-30B-A3B-Instruct-2507/vx-xxx/checkpoint-xxx-lora \
--merge_lora false \
--to_hf true \
--tensor_model_parallel_size 2 \
--expert_model_parallel_size 2 \
--pipeline_model_parallel_size 2 \
--test_convert_precision true
```
```shell
# safetensors -> torch_dist
CUDA_VISIBLE_DEVICES=0,1,2,3 \
NPROC_PER_NODE=4 \
megatron export \
--model Qwen/Qwen3-30B-A3B-Instruct-2507 \
--adapters megatron_output/Qwen3-30B-A3B-Instruct-2507/vx-xxx/checkpoint-xxx-lora \
--output_dir megatron_output/Qwen3-30B-A3B-Instruct-2507/vx-xxx/checkpoint-xxx-mcore \
--merge_lora false \
--to_mcore true \
--tensor_model_parallel_size 2 \
--expert_model_parallel_size 2 \
--pipeline_model_parallel_size 2 \
--test_convert_precision true
```
Merge-LoRA:
```shell
# torch_dist -> torch_dist
CUDA_VISIBLE_DEVICES=0,1,2,3 \
NPROC_PER_NODE=4 \
megatron export \
--model Qwen/Qwen3-30B-A3B-Instruct-2507 \
--mcore_adapter megatron_output/Qwen3-30B-A3B-Instruct-2507/vx-xxx/checkpoint-xxx \
--output_dir megatron_output/Qwen3-30B-A3B-Instruct-2507/vx-xxx/checkpoint-xxx-merged \
--merge_lora true \
--to_mcore true \
--tensor_model_parallel_size 2 \
--expert_model_parallel_size 2 \
--pipeline_model_parallel_size 2
```
## 使用代码
你需要创建以下文件(test.py),然后运行`CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 test.py`。以下为使用Mcore-Bridge进行权重加载、导出、保存的示例代码。
```python
import torch
from swift.megatron import MegatronArguments, get_mcore_model
from swift.model import get_processor
model_id = 'Qwen/Qwen3-4B-Instruct-2507'
processor = get_processor(model_id, download_model=True)
hf_config = processor.model_info.config
args = MegatronArguments(
model=model_id,
tensor_model_parallel_size=2,
torch_dtype=torch.bfloat16,
)
mg_models = get_mcore_model(args, hf_config)
bridge = args.megatron_model_meta.bridge_cls(args)
# 加载权重
bridge.load_weights(mg_models, args.model_dir)
# 导出权重
for name, parameters in bridge.export_weights(mg_models):
pass
# 保存权重
bridge.save_weights(mg_models, 'output/Qwen3-4B-Instruct-2507-new')
```
推理新产生的权重:
```shell
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--model output/Qwen3-4B-Instruct-2507-new \
--model_type qwen3 \
--template qwen3_nothinking \
--stream true
```
LoRA权重的加载、导出和存储同理,运行`CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 test.py`:
```python
import torch
from swift.megatron import MegatronArguments, get_mcore_model, prepare_mcore_model
from swift.model import get_processor
model_id = 'Qwen/Qwen3-30B-A3B-Instruct-2507'
processor = get_processor(model_id, download_model=True)
hf_config = processor.model_info.config
args = MegatronArguments(
model=model_id,
tensor_model_parallel_size=2,
pipeline_model_parallel_size=2,
expert_model_parallel_size=2,
sequence_parallel=True,
torch_dtype=torch.bfloat16,
tuner_type='lora',
)
mg_models = get_mcore_model(args, hf_config)
bridge = args.megatron_model_meta.bridge_cls(args)
# 加载权重
bridge.load_weights(mg_models, args.model_dir)
# 准备LoRA并加载
peft_models = [prepare_mcore_model(args, mg_model) for mg_model in mg_models]
print(f'peft_model: {peft_models[0]}')
# bridge.load_weights(mg_models, 'adapter-path', is_peft_format=True)
# 导出权重
for name, parameters in bridge.export_weights(mg_models, is_peft_format=True):
pass
# 保存权重
bridge.save_weights(mg_models, 'output/Qwen3-30B-A3B-Instruct-2507-lora', is_peft_format=True)
```
推理新产生的权重:
```shell
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--model Qwen/Qwen3-30B-A3B-Instruct-2507 \
--adapters output/Qwen3-30B-A3B-Instruct-2507-lora \
--stream true
```
================================================
FILE: docs/source/Megatron-SWIFT/Multimodal-Model.md
================================================
# 多模态模型
ms-swift引入了Megatron的并行技术来加速多模态大模型的训练。目前支持Qwen3-VL, Qwen3-Omni, InternVL3.5, GLM4.5v, Kimi-VL等模型的CPT/SFT/GRPO/DPO/KTO/RM。完整支持的模型可以参考[支持的模型与数据集文档](../Instruction/Supported-models-and-datasets.md)。
环境准备请参考Megatron-SWIFT的[快速开始文档](./Quick-start.md)。
## Dense模型
这里介绍使用2卡80GiB A100对Qwen2.5-VL-7B-Instruct模型进行LaTeX-OCR的微调,分别使用全参数和LoRA的方式,以下最佳实践可以在10分钟内完成。
### Full
全参数训练脚本如下:
```shell
# 2 * 72GiB; 4.1s/it
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
NPROC_PER_NODE=2 \
MAX_PIXELS=1003520 \
CUDA_VISIBLE_DEVICES=0,1 \
megatron sft \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--save_safetensors true \
--dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
--load_from_cache_file true \
--tensor_model_parallel_size 2 \
--sequence_parallel true \
--packing true \
--freeze_llm false \
--freeze_vit true \
--freeze_aligner true \
--split_dataset_ratio 0.01 \
--micro_batch_size 1 \
--global_batch_size 4 \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--finetune true \
--cross_entropy_loss_fusion true \
--lr 1e-5 \
--lr_warmup_fraction 0.05 \
--min_lr 1e-6 \
--num_train_epochs 1 \
--output_dir megatron_output/Qwen2.5-VL-7B-Instruct \
--save_steps 200 \
--max_length 2048 \
--dataloader_num_workers 4 \
--no_save_optim true \
--no_save_rng true \
--dataset_num_proc 8
```
### LoRA
LoRA训练脚本如下:
```shell
# 2 * 23GiB; 2.3s/it
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
NPROC_PER_NODE=2 \
MAX_PIXELS=1003520 \
CUDA_VISIBLE_DEVICES=0,1 \
megatron sft \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--save_safetensors true \
--merge_lora false \
--dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
--load_from_cache_file true \
--tuner_type lora \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--tensor_model_parallel_size 1 \
--sequence_parallel true \
--freeze_llm false \
--freeze_vit true \
--freeze_aligner true \
--packing true \
--split_dataset_ratio 0.01 \
--micro_batch_size 1 \
--global_batch_size 4 \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--finetune true \
--cross_entropy_loss_fusion true \
--lr 1e-4 \
--lr_warmup_fraction 0.05 \
--min_lr 1e-5 \
--num_train_epochs 1 \
--output_dir megatron_output/Qwen2.5-VL-7B-Instruct \
--save_steps 200 \
--max_length 2048 \
--dataloader_num_workers 4 \
--no_save_optim true \
--no_save_rng true \
--dataset_num_proc 8
```
最后,我们使用生成的HF格式权重对验证集进行推理:
```shell
MAX_PIXELS=1003520 \
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--adapters megatron_output/Qwen2.5-VL-7B-Instruct/vx-xxx/checkpoint-xxx \
--attn_impl flash_attn \
--stream true \
--load_data_args true \
--temperature 0 \
--max_new_tokens 512
```
推理结果如下:
```
[QUERY] Using LaTeX to perform OCR on the image.
[LABELS] \forall x \in X , ( \alpha f ) ( x ) = \alpha f ( x )
[RESPONSE] \forall x \in X , ( \alpha f ) ( x ) = \alpha f ( x )
--------------------------------------------------
[QUERY] Using LaTeX to perform OCR on the image.
[LABELS] \pi \int _ { c } ^ { d } \{ g ( y ) \} ^ { 2 } d y
[RESPONSE] \pi \int _ { c } ^ { d } \{ g ( y ) \} ^ { 2 } d y
--------------------------------------------------
[QUERY] Using LaTeX to perform OCR on the image.
[LABELS] [ \frac 2 3 x ^ { \frac 3 2 } ] _ { 0 } ^ { 1 }
[RESPONSE] [ \frac 2 3 x ^ { \frac 3 2 } ] _ { 0 } ^ { 1 }
```
## MoE模型
训练脚本:
```bash
# 2 * 43GiB, 8s/it
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
megatron sft \
--model OpenGVLab/InternVL3_5-30B-A3B \
--save_safetensors true \
--merge_lora false \
--dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
--load_from_cache_file true \
--tuner_type lora \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--sequence_parallel true \
--freeze_llm false \
--freeze_vit true \
--freeze_aligner true \
--packing true \
--split_dataset_ratio 0.01 \
--expert_model_parallel_size 2 \
--moe_permute_fusion true \
--moe_grouped_gemm true \
--moe_shared_expert_overlap true \
--moe_aux_loss_coeff 1e-3 \
--micro_batch_size 1 \
--global_batch_size 4 \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--finetune true \
--cross_entropy_loss_fusion true \
--lr 1e-4 \
--lr_warmup_fraction 0.05 \
--min_lr 1e-5 \
--num_train_epochs 1 \
--output_dir megatron_output/InternVL3_5-30B-A3B \
--eval_steps 200 \
--save_steps 200 \
--max_length 2048 \
--dataloader_num_workers 8 \
--dataset_num_proc 8 \
--no_save_optim true \
--no_save_rng true \
--attention_backend flash
```
训练结束后,我们使用生成的HF格式权重对验证集进行推理:
```shell
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--adapters megatron_output/InternVL3_5-30B-A3B/vx-xxx/checkpoint-xxx \
--attn_impl flash_attn \
--stream true \
--load_data_args true \
--temperature 0 \
--max_new_tokens 512
```
================================================
FILE: docs/source/Megatron-SWIFT/Quick-start.md
================================================
# 快速开始
ms-swift引入了Megatron的并行技术来加速大模型的训练,包括数据并行、张量并行、流水线并行、序列并行、上下文并行、专家并行。支持Qwen3、Qwen3.5、DeepSeek-R1、GLM4.5、GPT-OSS等模型的CPT/SFT/GRPO/DPO/KTO/RM。完整支持的模型可以参考[支持的模型与数据集文档](../Instruction/Supported-models-and-datasets.md)。
| 方法 | 全参数 | LoRA | MoE | 多模态 | FP8 |
| ------ | ------ | ---- | ----- | ----- | ----- |
| 预训练 | ✅ | ✅| ✅ | ✅ | ✅ |
| [指令监督微调](https://github.com/modelscope/ms-swift/tree/main/examples/megatron) | ✅ | ✅| ✅ | ✅ | ✅ |
| [GRPO](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/grpo) | ✅ | ✅| ✅ | ✅ | ✅ |
| [GKD](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/gkd) | ✅ | ✅| ✅ | ✅ | ✅ |
| [DPO](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/dpo) | ✅ | ✅| ✅ | ✅ | ✅ |
| [KTO](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/kto) | ✅ | ✅| ✅ | ✅ | ✅ |
| [RM](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/rlhf/rm) | ✅ | ✅| ✅ | ✅ | ✅ |
| [Embedding](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/embedding) | ✅ | ✅| ✅ | ✅ | ✅ |
| [Reranker](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/reranker) | ✅ | ✅| ✅ | ✅ | ✅ |
| [序列分类](https://github.com/modelscope/ms-swift/tree/main/examples/megatron/seq_cls) | ✅ | ✅| ✅ | ✅ | ✅ |
## 环境准备
使用Megatron-SWIFT,除了安装swift依赖外,还需要安装以下内容:
```shell
pip install pybind11
# transformer_engine
# 若出现安装错误,可以参考该issue解决: https://github.com/modelscope/ms-swift/issues/3793
pip install --no-build-isolation transformer_engine[pytorch] --no-cache-dir
# apex
# 提示:Megatron-SWIFT可以在不含apex的环境下运行,额外设置`--gradient_accumulation_fusion false`即可。
git clone https://github.com/NVIDIA/apex
cd apex
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
# megatron-core
pip install "megatron-core==0.15.*" -U
# 若使用多机训练,请额外设置`MODELSCOPE_CACHE`环境变量为共享存储路径
# 这将确保数据集缓存共享,从而加速预处理速度。
# 注意:这步很关键,不然多机训练可能因随机性问题导致数据不一致而训练卡住。
export MODELSCOPE_CACHE='/xxx/shared'
# flash_attn
# 选择合适的版本进行安装:https://github.com/Dao-AILab/flash-attention/releases/tag/v2.8.3
# 注意:请勿安装高于transformer_engine限制的最高版本:https://github.com/NVIDIA/TransformerEngine/blob/release_v2.10/transformer_engine/pytorch/attention/dot_product_attention/utils.py#L118
MAX_JOBS=8 pip install "flash-attn==2.8.3" --no-build-isolation
```
或者你也可以使用镜像:(历史镜像查看[这里](../GetStarted/SWIFT-installation.md#镜像))
```
modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.8.1-py311-torch2.10.0-vllm0.17.1-modelscope1.34.0-swift4.0.2
modelscope-registry.cn-beijing.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.8.1-py311-torch2.10.0-vllm0.17.1-modelscope1.34.0-swift4.0.2
modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.8.1-py311-torch2.10.0-vllm0.17.1-modelscope1.34.0-swift4.0.2
# cu129 (fp8 training)
modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.9.1-py311-torch2.8.0-vllm0.11.0-modelscope1.32.0-swift3.11.3
modelscope-registry.cn-beijing.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.9.1-py311-torch2.8.0-vllm0.11.0-modelscope1.32.0-swift3.11.3
modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.9.1-py311-torch2.8.0-vllm0.11.0-modelscope1.32.0-swift3.11.3
```
推荐运行环境:
| | 范围 | 推荐 | 备注 |
|--------------|--------------|-------------|--------------------|
| python | >=3.9 | 3.11/3.12 | |
| cuda | | cuda12 | |
| torch | >=2.0 | 2.8.0/2.10.0 | |
| transformer_engine | >=2.3 | 2.12.0 | |
| apex | | 0.1 | |
| megatron_core | >=0.12,<0.16 | 0.15 | |
| flash_attn | | 2.8.3/3.0.0b1 | |
| transformers | >=4.33 | 4.57.6/5.2.0 | |
| modelscope | >=1.23 | | |
| peft | >=0.11,<0.19 | | LoRA |
| trl | >=0.15,<0.29 | | RLHF |
## 快速入门案例
这里介绍使用2卡80GiB A100对Qwen2.5-7B-Instruct模型进行自我认知微调的快速入门案例,以下最佳实践可以在10分钟内完成。
### 传统方式
首先,我们需要将HF格式的权重转为Megatron格式:
- 多卡权重转换:将`CUDA_VISIBLE_DEVICES=0`删除即可使用多卡权重转换。
- 转换精度测试:`--test_convert_precision true`将测试转换精度。在MoE大型模型的转换时,该参数所需时间较长,且需要更多的内存消耗,可酌情去除。
```shell
CUDA_VISIBLE_DEVICES=0 \
swift export \
--model Qwen/Qwen2.5-7B-Instruct \
--to_mcore true \
--torch_dtype bfloat16 \
--output_dir Qwen2.5-7B-Instruct-mcore \
--test_convert_precision true
```
然后,使用以下脚本进行训练,训练所需显存资源为2*80GiB:
- 若使用多机训练,建议共享磁盘,并将`--output_dir`指定为相同的路径。
```shell
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
megatron sft \
--mcore_model Qwen2.5-7B-Instruct-mcore \
--save_safetensors false \
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
'AI-ModelScope/alpaca-gpt4-data-en#500' \
'swift/self-cognition#500' \
--tensor_model_parallel_size 2 \
--sequence_parallel true \
--micro_batch_size 16 \
--global_batch_size 16 \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--finetune true \
--cross_entropy_loss_fusion true \
--lr 1e-5 \
--lr_warmup_fraction 0.05 \
--min_lr 1e-6 \
--num_train_epochs 1 \
--output_dir megatron_output/Qwen2.5-7B-Instruct \
--save_steps 100 \
--max_length 2048 \
--system 'You are a helpful assistant.' \
--dataloader_num_workers 4 \
--no_save_optim true \
--no_save_rng true \
--dataset_num_proc 4 \
--model_author swift \
--model_name swift-robot
```
最后,将Megatron格式权重转为HF格式:
- 注意:`--mcore_model`请指向`iter_xxx`的上级目录。默认会使用`latest_checkpointed_iteration.txt`中对应的checkpoint。
- 若出现OOM,将`CUDA_VISIBLE_DEVICES=0`删除。若出现内存不足,请将`--test_convert_precision true`删除。
```shell
CUDA_VISIBLE_DEVICES=0 \
swift export \
--mcore_model megatron_output/Qwen2.5-7B-Instruct/vx-xxx/checkpoint-xxx \
--to_hf true \
--torch_dtype bfloat16 \
--output_dir megatron_output/Qwen2.5-7B-Instruct/vx-xxx/checkpoint-xxx-hf \
--test_convert_precision true
```
我们对生成的HF格式权重进行推理:
```shell
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--model megatron_output/Qwen2.5-7B-Instruct/vx-xxx/checkpoint-xxx-hf \
--stream true \
--temperature 0 \
--max_new_tokens 2048
```
推理结果如下:
```
<<< who are you?
I am a language model developed by swift, you can call me swift-robot. How can I assist you?
```
### Mcore-Bridge【推荐】
Mcore-Bridge去除模型转换的繁琐过程。具体参考[Mcore-Bridge文档](./Mcore-Bridge.md)。
训练脚本:
```bash
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
megatron sft \
--model Qwen/Qwen2.5-7B-Instruct \
--save_safetensors true \
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
'AI-ModelScope/alpaca-gpt4-data-en#500' \
'swift/self-cognition#500' \
--tensor_model_parallel_size 2 \
--sequence_parallel true \
--micro_batch_size 16 \
--global_batch_size 16 \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--finetune true \
--cross_entropy_loss_fusion true \
--lr 1e-5 \
--lr_warmup_fraction 0.05 \
--min_lr 1e-6 \
--num_train_epochs 1 \
--output_dir megatron_output/Qwen2.5-7B-Instruct \
--save_steps 100 \
--max_length 2048 \
--system 'You are a helpful assistant.' \
--dataloader_num_workers 4 \
--no_save_optim true \
--no_save_rng true \
--dataset_num_proc 4 \
--model_author swift \
--model_name swift-robot
```
我们对生成的safetensors格式权重进行推理:
```shell
CUDA_VISIBLE_DEVICES=0 \
swift infer \
--model megatron_output/Qwen2.5-7B-Instruct/vx-xxx/checkpoint-xxx \
--stream true \
--temperature 0 \
--max_new_tokens 2048
```
- 若要进行预训练,你可以使用`megatron pt`替代`megatron sft`,这将会使用生成式的template进行训练。
- Megatron-SWIFT使用与ms-swift相同的dataset和template处理模块,因此同样支持packing、loss_scale、agent训练等技术。自定义数据集格式参考[自定义数据集文档](../Customization/Custom-dataset.md)。
- **更多案例**:包括packing、多机、32K上下文、DPO、MoE模型、预训练,可以查看[这里](https://github.com/modelscope/ms-swift/tree/main/examples/megatron)。
## 训练技巧
- 增加训练吞吐量方法:使用packing(不要开启流式)、增加DP、减少重计算、增加计算通信overlap。MoE还可以通过丢弃tokens加速。
- 并行技术选择:
- Megatron-SWIFT的并行技术采用zero1(默认开启use_distributed_optimizer)+各种并行技术的组合。
- DP的速度最快,但显存占用较多,使用其他并行技术以降低显存占用。
- TP/EP通信量较大,尽量不跨节点(NVLink域内),跨节点建议使用PP/DP;专家层建议使用EP而不是ETP,ETP更节约显存,但速度较慢。
- MoE 并行折叠:MoE 相关的并行组与 Dense 组分离。Attention使用 tp-cp-dp-pp 组,MoE 使用 etp-ep-dp-pp 组。
- 权重转换并行数的选择:Megatron-SWIFT在mcore端使用torch_dist存储格式,训练时可以调整并行数,不需要在权重转换时指定并行数。
- 关于日志打印:Megatron-SWIFT的日志在last rank进行打印,因为在PP并行中,只有last pp_rank拥有完整的信息。
## Benchmark
使用`megatron sft`和`swift sft`在单机八卡A800环境下进行Dense模型全参数8K上下文训练的速度对比如下:
**Dense** Qwen2.5-14B:
| | Megatron-LM | DeepSpeed-ZeRO2 | DeepSpeed-ZeRO3 |
| -------- | ----------- | ---------- | ---------- |
| 训练速度 | 9.04s/it | 10.32s/it | 10.56s/it |
| 显存占用 | 8\*64GB | 8\*80GB | 8\*58GB |
使用`megatron sft`和`swift sft`在双机16卡A800环境下进行MoE模型全参数8K上下文训练的速度对比如下:
**MoE** Qwen3-30B-A3B:
- 注意:其中,DeepSpeed测试结果在"transformers<5.0"环境下进行。在"transformers>=5.0"环境下,可以通过`--experts_impl grouped_mm`加速训练。
| | Megatron-LM | DeepSpeed-ZeRO2 | DeepSpeed-ZeRO3 |
| -------- | ----------- | --------------- | --------------- |
| 训练速度 | 9.6s/it | - | 91.2s/it |
| 显存使用 | 16 * 60GiB | OOM | 16 * 80GiB |
## Megatron-SWIFT微信群
================================================
FILE: docs/source/_templates/autosummary/class.rst
================================================
.. currentmodule:: {{ module }}
{{ name | underline}}
.. autoclass:: {{ name }}
:inherited-members:
:members:
.. autogenerated from source/_templates/autosummary/class.rst
================================================
FILE: docs/source/_templates/classtemplate.rst
================================================
.. currentmodule:: {{ module }}
{{ name | underline}}
.. autoclass:: {{ name }}
:members:
:special-members: __init__, __call__
..
autogenerated from source/_templates/classtemplate.rst
note it does not have :inherited-members:
================================================
FILE: docs/source/_templates/sobolengine.rst
================================================
.. currentmodule:: {{ module }}
{{ name | underline}}
.. autoclass:: {{ name }}
:members:
:exclude-members: MAXBIT, MAXDIM
:undoc-members:
..
autogenerated from source/_templates/sobolengine.rst
note it has specific options
================================================
FILE: docs/source/conf.py
================================================
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
# import sphinx_book_theme
sys.path.insert(0, os.path.abspath('../../'))
# -- Project information -----------------------------------------------------
project = 'swift'
copyright = '2022-2025, Alibaba ModelScope'
author = 'ModelScope Authors'
version_file = '../../swift/version.py'
html_theme = 'sphinx_rtd_theme'
language = 'zh_CN'
def get_version():
    """Return the package version string declared in ``version_file``.

    The version file is executed inside a dedicated namespace dict instead of
    relying on ``locals()``. Since Python 3.13 (PEP 667), names bound by a bare
    ``exec()`` call inside a function are written to a throw-away snapshot and
    are not visible through a later ``locals()`` call, so the previous
    ``locals()['__version__']`` lookup raises ``KeyError`` there.

    Returns:
        The object bound to ``__version__`` by the version file.

    Raises:
        OSError: If ``version_file`` cannot be opened.
        KeyError: If the version file does not define ``__version__``.
    """
    namespace = {}
    with open(version_file, 'r', encoding='utf-8') as f:
        # Compile with the real file name so tracebacks point at the version file.
        exec(compile(f.read(), version_file, 'exec'), namespace)
    return namespace['__version__']
# The full version, including alpha/beta/rc tags
version = get_version()
release = version
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.napoleon',
'sphinx.ext.autosummary',
'sphinx.ext.autodoc',
'sphinx.ext.viewcode',
'sphinx_markdown_tables',
'sphinx_copybutton',
'myst_parser',
]
# build the templated autosummary files
autosummary_generate = True
numpydoc_show_class_members = False
# Enable overriding of function signatures in the first line of the docstring.
autodoc_docstring_signature = True
# Disable docstring inheritance
autodoc_inherit_docstrings = False
# Show type hints in the description
autodoc_typehints = 'description'
# Add parameter types if the parameter is documented in the docstring
autodoc_typehints_description_target = 'documented_params'
autodoc_default_options = {
'member-order': 'bysource',
}
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
source_suffix = ['.rst', '.md']
# The master toctree document.
root_doc = 'index'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['build', 'source/.ipynb_checkpoints', 'source/api/generated', 'Thumbs.db', '.DS_Store']
# A list of glob-style patterns [1] that are used to find source files.
# They are matched against the source file names relative to the source directory,
# using slashes as directory separators on all platforms.
# The default is **, meaning that all files are recursively included from the source directory.
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
# html_theme = 'sphinx_book_theme'
# html_theme_path = [sphinx_book_theme.get_html_theme_path()]
# html_theme_options = {}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# html_css_files = ['css/readthedocs.css']
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
# -- Extension configuration -------------------------------------------------
# Ignore >>> when copying code
copybutton_prompt_text = r'>>> |\.\.\. '
copybutton_prompt_is_regexp = True
# Example configuration for intersphinx: refer to the Python standard library.
# Uses the named form {name: (target_url, inventory)}; the legacy
# {url: inventory} form is no longer accepted by current Sphinx releases.
# NOTE: 'sphinx.ext.intersphinx' is not listed in `extensions`, so this mapping
# only takes effect once that extension is enabled.
intersphinx_mapping = {'python': ('https://docs.python.org/3', None)}
myst_enable_extensions = [
'amsmath',
'dollarmath',
'colon_fence',
]
================================================
FILE: docs/source/index.rst
================================================
.. swift documentation file,
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Swift DOCUMENTATION
========================
.. toctree::
:maxdepth: 2
:caption: Get Started
GetStarted/SWIFT-installation.md
GetStarted/Quick-start.md
GetStarted/Web-UI.md
.. toctree::
:maxdepth: 2
:caption: Instruction
Instruction/Command-line-parameters.md
Instruction/Pre-training-and-Fine-tuning.md
Instruction/GRPO/index.rst
Instruction/GKD.md
Instruction/RLHF.md
Instruction/Inference-and-deployment.md
Instruction/Sample.md
Instruction/Evaluation.md
Instruction/Export-and-push.md
Instruction/Ray.md
Instruction/Reinforced-Fine-tuning.md
Instruction/Agent-support.md
Instruction/Supported-models-and-datasets.md
Instruction/Use-tuners.md
Instruction/Frequently-asked-questions.md
.. toctree::
:maxdepth: 2
:caption: Megatron-SWIFT
Megatron-SWIFT/Quick-start.md
Megatron-SWIFT/Command-line-parameters.md
Megatron-SWIFT/LoRA-Training.md
Megatron-SWIFT/Multimodal-Model.md
Megatron-SWIFT/Mcore-Bridge.md
Megatron-SWIFT/GRPO.md
Megatron-SWIFT/GKD.md
Megatron-SWIFT/Ascend.md
.. toctree::
:maxdepth: 2
:caption: Customization
Customization/Architecture.md
Customization/Custom-model.md
Customization/Custom-dataset.md
.. toctree::
:maxdepth: 2
:caption: Best Practices
BestPractices/GRPO.md
BestPractices/GRPO-Multi-Modal-Training.md
BestPractices/GRPO-Code-Training.md
BestPractices/Qwen3-Best-Practice.md
BestPractices/Qwen3-VL-Best-Practice.md
BestPractices/Qwen3_5-Best-Practice.md
BestPractices/MLLM-Registration.md
BestPractices/Embedding.md
BestPractices/Reranker.md
BestPractices/Rapidly-Training-VL-model.md
BestPractices/NPU-support.md
BestPractices/Metax-support.md
BestPractices/More-Best-Practices.md
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
================================================
FILE: docs/source_en/.readthedocs.yaml
================================================
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# Required
version: 2
# Set the OS, Python version and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.12"
# Build documentation in the "docs/" directory with Sphinx
sphinx:
configuration: docs/source_en/conf.py
# Optionally build your docs in additional formats such as PDF and ePub
# formats:
# - pdf
# - epub
# Optional but recommended, declare the Python requirements required
# to build your documentation
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
install:
- requirements: requirements/docs.txt
- requirements: requirements/framework.txt
================================================
FILE: docs/source_en/BestPractices/Elastic.md
================================================
# Elastic
## Installing Dependencies
Deploy a K8S cluster and deploy [DLRover](https://github.com/intelligent-machine-learning/dlrover) in the cluster, and install the required packages using `pip install dlrover && pip install tornado && pip install kubernetes && pip install ms-swift`
Other dependencies and versions verified through repeated testing in the training image:
deepspeed 0.16.5 (refer to this [PR](https://github.com/deepspeedai/DeepSpeed/pull/7585/files) to fix issues related to universal checkpoint)
pytorch 2.6.0
## How to Start
Enable elastic training by adding the `deepspeed_elastic` callback (optionally `graceful_exit`) in `--callbacks`, and configure DeepSpeed elasticity settings.
The command format is: dlrover-run + DLRover command parameters + Swift startup command + Swift parameters. dlrover-run behaves like torchrun for most arguments, except for its custom parameters.
The dlrover-run arguments are as follows:
```
usage: dlrover-run [-h] [--nnodes NNODES] [--nproc-per-node NPROC_PER_NODE]
[--rdzv-backend RDZV_BACKEND] [--rdzv-endpoint RDZV_ENDPOINT] [--rdzv-id RDZV_ID]
[--rdzv-conf RDZV_CONF] [--standalone] [--max-restarts MAX_RESTARTS]
[--monitor-interval MONITOR_INTERVAL] [--start-method {spawn,fork,forkserver}]
[--role ROLE] [-m] [--no-python] [--run-path] [--log-dir LOG_DIR] [-r REDIRECTS]
[-t TEE] [--local-ranks-filter LOCAL_RANKS_FILTER] [--node-rank NODE_RANK]
[--master-addr MASTER_ADDR] [--master-port MASTER_PORT] [--local-addr LOCAL_ADDR]
[--logs-specs LOGS_SPECS] [--precheck {0,1,2}] [--node_unit NODE_UNIT]
[--auto_config] [--auto_tunning] [--exclude-straggler] [--save_at_breakpoint]
[--accelerator {nvidia.com/gpu,ascend-npu}] [--training_port TRAINING_PORT]
[--switchbox-check] [--box-pairs PAIR [PAIR ...]] [--min-bandwidth MIN_BANDWIDTH]
[--min-channels MIN_CHANNELS] [--numa-affinity] [--network-check]
[--comm-perf-test] [--ucp_device_type UCP_DEVICE_TYPE]
training_script
```
In elastic training, the parameters you will most likely need to set are:
--nnodes NNODES
Number of nodes, or the range of nodes in the form `<minimum_nodes>:<maximum_nodes>`.
--nproc-per-node NPROC_PER_NODE
Number of processes per node.
Example:
```bash
model=your model path
dataset=your dataset
output= your output dir
export CUDA_VISIBLE_DEVICES=0 # Set according to the actual GPU usage
deepspeed_config_or_type=deepspeed type or configuration file path, e.g., zero1 or /xxx/ms-swift/swift/llm/ds_config/zero1.json
dlrover-run --nnodes 1:$NODE_NUM --nproc_per_node=1 \
/opt/conda/lib/python3.10/site-packages/swift/cli/sft.py --model $model \
--model_type qwen3 \
--tuner_type lora \
--torch_dtype bfloat16 \
--dataset $dataset \
--num_train_epochs 4 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 5e-7 \
--gradient_accumulation_steps 8 \
--eval_steps 500 \
--save_steps 10 \
--save_total_limit 20 \
--logging_steps 1 \
--output_dir $output \
--warmup_ratio 0.01 \
--dataloader_num_workers 4 \
--temperature 1.0 \
--system 'You are a helpful assistant.' \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--dataset_num_proc 1 \
--use_flash_ckpt true \
--callbacks deepspeed_elastic graceful_exit \
--deepspeed $deepspeed_config_or_type \
```
## Configuration
By default, the zero1 configuration is as follows:
```json
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"zero_optimization": {
"stage": 1,
"offload_optimizer": {
"device": "none",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"overlap_comm": false,
"reduce_scatter": true,
"reduce_bucket_size": 2e8,
"contiguous_gradients": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false,
"elasticity": {
"ignore_non_elastic_batch_info": true,
"enabled": true,
"max_train_batch_size": 8,
"micro_batch_sizes": [
4,
2
],
"min_gpus": 1,
"max_gpus": 4,
"min_time": 20,
"version": 0.1
}
}
```
If users need custom configurations, they can specify the path to the custom zero1.json file in the deepspeed_config_or_type parameter. The elasticity-related configuration is as follows:
```json
...
"elasticity": {
"ignore_non_elastic_batch_info": true,
"enabled": true,
"max_train_batch_size": 8,
"micro_batch_sizes": [
4,
2
],
"min_gpus": 1,
"max_gpus": 4,
"min_time": 20,
"version": 0.1
}
```
- ignore_non_elastic_batch_info: Indicates that batch size settings outside the elasticity configuration are ignored. During training, the batch size and related parameters are adjusted dynamically based on the number of training processes.
Calculation principle:
global-training-batch-size = micro-batch-size * gradient-accumulation-steps * world-size
- max_train_batch_size: Maximum batch size.
- micro_batch_sizes: List of allowed per-GPU micro-batch sizes under elasticity; candidates for train_micro_batch_size_per_gpu.
- min_gpus: Minimum number of GPUs.
- max_gpus: Maximum number of GPUs.
For more details, see: [DeepSpeed](https://www.deepspeed.ai/docs/config-json/#elastic-training-config-v01-and-v02)
## Starting Training
```yaml
---
apiVersion: elastic.iml.github.io/v1alpha1
kind: ElasticJob
metadata:
name: deepspeed-elastic-swift
namespace: dlrover
spec:
distributionStrategy: AllreduceStrategy
optimizeMode: single-job
replicaSpecs:
worker:
replicas: 1 # This should match the maximum value of --nnodes NNODES in the startup command
template:
spec:
restartPolicy: Never
containers:
- name: main
image: #【Training image, needs to have deepspeed, dlrover, and swift installed】
imagePullPolicy: IfNotPresent
command:
- /bin/bash
- -c
- sh start.sh # Startup script
resources:
limits:
cpu: '8'
memory: 16Gi
nvidia.com/gpu: '1'
volumeMounts:
- mountPath: /model
name: volume-model
- mountPath: /dev/shm
name: volume-shm
restartPolicy: Never
volumes:
- hostPath:
path: /model
type: Directory
name: volume-model
- emptyDir:
medium: Memory
sizeLimit: 200Gi
name: volume-shm
```
================================================
FILE: docs/source_en/BestPractices/Embedding.md
================================================
# Embedding Training
SWIFT supports training embedding models, including both pure-text and multimodal types. Currently supported models include:
1. modernbert embedding model
- [ModelScope](https://modelscope.cn/models/iic/gte-modernbert-base) [Hugging Face](https://huggingface.co/Alibaba-NLP/gte-modernbert-base)
2. gte embedding models
- 1.5B: [ModelScope](https://www.modelscope.cn/models/iic/gte_Qwen2-1.5B-instruct) [Hugging Face](https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct)
- 7B: [ModelScope](https://www.modelscope.cn/models/iic/gte_Qwen2-7B-instruct) [Hugging Face](https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct)
3. gme embedding models
- 2B: [ModelScope](https://www.modelscope.cn/models/iic/gme-Qwen2-VL-2B-Instruct) [Hugging Face](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct)
- 7B: [ModelScope](https://www.modelscope.cn/models/iic/gme-Qwen2-VL-7B-Instruct) [Hugging Face](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct)
4. qwen3-embedding models
- 0.6B: [ModelScope](https://www.modelscope.cn/models/Qwen/Qwen3-Embedding-0.6B) [Hugging Face](https://huggingface.co/Qwen/Qwen3-Embedding-0.6B)
- 4B: [ModelScope](https://www.modelscope.cn/models/Qwen/Qwen3-Embedding-4B) [Hugging Face](https://huggingface.co/Qwen/Qwen3-Embedding-4B)
- 8B: [ModelScope](https://www.modelscope.cn/models/Qwen/Qwen3-Embedding-8B) [Hugging Face](https://huggingface.co/Qwen/Qwen3-Embedding-8B)
5. qwen3-vl-embedding models
- 2B: [ModelScope](https://www.modelscope.cn/models/Qwen/Qwen3-VL-Embedding-2B) [Hugging Face](https://huggingface.co/Qwen/Qwen3-VL-Embedding-2B)
- 8B: [ModelScope](https://www.modelscope.cn/models/Qwen/Qwen3-VL-Embedding-8B) [Hugging Face](https://huggingface.co/Qwen/Qwen3-VL-Embedding-8B)
Developers can integrate their own models by ensuring the model forward output satisfies:
```text
{"last_hidden_state": some-embedding-tensor}
```
The return value should be a dictionary with a `last_hidden_state` key whose value is an embedding tensor. For the input part, you can use our already supported templates. Users can also specify the
```shell
--task_type embedding
```
parameter to convert any other model into an embedding model for training.
It should be noted that the embedding models currently supported by SWIFT are all based on pure text or multimodal LLMs, and CLIP-type model training is not currently supported.
Additionally, all embedding models supported by SWIFT have normalization added at the end of the model forward pass. If you add new models yourself, please remember to include a normalization layer.
## Loss
The embedding models supported by SWIFT can currently use the following loss functions:
- **cosine_similarity**: Cosine similarity loss, which calculates the similarity between two embeddings and fits based on the label value. It is effectively an MSE loss.
- **contrastive**: Contrastive learning loss with adjustable margin. Labels are only supported as 0 and 1.
- **online_contrastive**: Contrastive loss considering hard negatives and hard positives. Labels are only supported as 0 and 1.
- **infonce**: Computes pairwise cosine similarities between different rows within the same batch, maximizing similarity within rows and minimizing similarity between different rows. No labels are required.
The source code for the loss functions can be found [here](https://github.com/modelscope/ms-swift/blob/main/swift/loss/mapping.py).
## Dataset Format
> **Note:**
> 1. The `<image>` tag can appear anywhere inside `messages`/`positive_messages`/`negative_messages`. Each group has its own image fields: `images`/`positive_images`/`negative_images` to provide paths or URLs.
> 2. There is no longer any cross-field ordering requirement. Alignment rules:
> - `images` length equals the number of `<image>` tags in `messages`.
> - `positive_images` and `negative_images` are both list-of-list. Their outer lengths equal the lengths of `positive_messages` and `negative_messages` respectively. For each outer item, the inner list length equals the number of `<image>` tags in that message sequence.
> 3. `messages` is the anchor sample; `positive_messages` and `negative_messages` are each a list of messages (hence one more `[]`). Accordingly, `positive_images`/`negative_images` are also list-of-list and align item-by-item.
> 4. `<video>` and `<audio>` are supported as well. Follow the same rules via `videos`/`positive_videos`/`negative_videos` and `audios`/`positive_audios`/`negative_audios`.
> 5. Current constraint: the outer length of `positive_messages` must be 1 (i.e., provide exactly one positive). Accordingly, the outer length of `positive_images` must also be 1.
### Format for Cosine Similarity Loss
```json lines
# LLM
{"messages": [{"role": "user", "content": "sentence1"}], "positive_messages": [[{"role": "user", "content": "sentence2"}]], "label": 0.8}
# MLLM
{"messages": [{"role": "user", "content": "<image>"}], "images": ["/some/images1.jpg"], "positive_messages": [[{"role": "user", "content": "<image>sentence"}]], "positive_images": [["/some/images2.jpg"]], "label": 0.7}
{"messages": [{"role": "user", "content": "sentence1"}], "positive_messages": [[{"role": "user", "content": "<image>sentence2"}]], "positive_images": [["/some/images.jpg"]], "label": 0.7}
```
The evaluation metrics are the Pearson and Spearman rank correlation coefficients computed over the embeddings' Euclidean distance, dot product, and other similarity measures, for a total of 8 values.
### Format for Contrastive/Online Contrastive Loss
```json lines
# LLM
{"messages": [{"role": "user", "content": "sentence1"}], "positive_messages": [[{"role": "user", "content": "sentence2"}]], "label": 1}
# MLLM
{"messages": [{"role": "user", "content": "<image>"}], "images": ["/some/images1.jpg"], "positive_messages": [[{"role": "user", "content": "<image>sentence"}]], "positive_images": [["/some/images2.jpg"]], "label": 1}
{"messages": [{"role": "user", "content": "sentence1"}], "positive_messages": [[{"role": "user", "content": "<image>sentence2"}]], "positive_images": [["/some/images.jpg"]], "label": 0}
```
### Format for InfoNCE
```json lines
# LLM
{"messages": [{"role": "user", "content": "sentence1"}], "positive_messages": [[{"role": "user", "content": "sentence2"}]]}
# MLLM
{"messages": [{"role": "user", "content": "<image>"}], "images": ["/some/images.jpg"], "positive_messages": [[{"role": "user", "content": "sentence"}]]}
{"messages": [{"role": "user", "content": "