Repository: verl-project/verl
Branch: main
Commit: 8e24127f4234
Files: 1128
Total size: 8.0 MB
Directory structure:
gitextract_5e2u4bw9/
├── .gemini/
│ └── config.yaml
├── .git-blame-ignore-revs
├── .github/
│ ├── CODEOWNERS
│ ├── ISSUE_TEMPLATE/
│ │ ├── bug-report.yml
│ │ ├── config.yml
│ │ └── feature-request.yml
│ ├── PULL_REQUEST_TEMPLATE.md
│ ├── dependabot.yml
│ └── workflows/
│ ├── README.md
│ ├── check-pr-title.yml
│ ├── cpu_unit_tests.yml
│ ├── doc.yml
│ ├── docker-build-ascend-a2.yml
│ ├── docker-build-ascend-a3.yml
│ ├── e2e_ascend.yml
│ ├── e2e_fully_async_policy.yml
│ ├── e2e_fully_async_policy_ascend.yml
│ ├── e2e_one_step_off_policy.yml
│ ├── e2e_one_step_off_policy_ascend.yml
│ ├── e2e_ppo_grpo_trainer_trtllm.yml
│ ├── e2e_ppo_trainer.yml
│ ├── e2e_ppo_trainer_megatron_sglang.yml
│ ├── e2e_ppo_trainer_megatron_sglang_2.yml
│ ├── e2e_ppo_trainer_megatron_vllm.yml
│ ├── e2e_ppo_trainer_megatron_vllm_2.yml
│ ├── e2e_ppo_trainer_megatron_vllm_2_ascend.yml
│ ├── e2e_ppo_trainer_veomni_vllm.yml
│ ├── e2e_sft_llm.yml
│ ├── e2e_sft_llm_ascend.yml
│ ├── e2e_sft_vlm.yml
│ ├── gpu_unit_tests.yml
│ ├── model.yml
│ ├── model_ascend.yml
│ ├── nightly_ascend.yml
│ ├── npu_unit_tests.yml
│ ├── pre-commit.yml
│ ├── precommit-autofix.yml
│ ├── reward_model_sglang.yml
│ ├── reward_model_vllm.yml
│ ├── reward_model_vllm_ascend.yml
│ ├── sanity.yml
│ ├── scorecard.yml
│ ├── secrets_scan.yml
│ ├── sgl.yml
│ ├── type-coverage-check.yml
│ └── vllm.yml
├── .gitignore
├── .gitmodules
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── CONTRIBUTING.md
├── LICENSE
├── Notice.txt
├── README.md
├── docker/
│ ├── Dockerfile.isaaclab230
│ ├── Dockerfile.stable.sglang
│ ├── Dockerfile.stable.trtllm
│ ├── Dockerfile.stable.vllm
│ ├── README.md
│ ├── ascend/
│ │ ├── Dockerfile.ascend.sglang_8.3.rc1_a2
│ │ ├── Dockerfile.ascend.sglang_8.3.rc1_a3
│ │ ├── Dockerfile.ascend_8.2.rc1_a2
│ │ ├── Dockerfile.ascend_8.2.rc1_a3
│ │ ├── Dockerfile.ascend_8.3.rc1_a2
│ │ ├── Dockerfile.ascend_8.3.rc1_a3
│ │ ├── Dockerfile.ascend_8.5.0_a2
│ │ └── Dockerfile.ascend_8.5.0_a3
│ ├── aws/
│ │ ├── Dockerfile.extention.awsefa
│ │ └── Dockerfile.ngc.vllm0.8.sagemaker
│ ├── rocm/
│ │ ├── Apptainerfile.rocm
│ │ ├── Dockerfile.rocm
│ │ ├── Dockerfile.rocm7
│ │ ├── Dockerfile.rocm_verl-0.3.0.post1
│ │ └── Dockerfile.rocm_verl-0.4.1
│ ├── verl0.4-cu124-torch2.6-fa2.7.4/
│ │ ├── Dockerfile.app.sglang.vllm.mcore0.12
│ │ ├── Dockerfile.app.sglang.vllm.mcore0.12.deepep
│ │ ├── Dockerfile.app.sglang.vllm.mcore0.13.preview
│ │ ├── Dockerfile.app.vllm.mcore0.12
│ │ ├── Dockerfile.app.vllm.mcore0.12.deepep
│ │ ├── Dockerfile.app.vllm.mcore0.13.preview
│ │ ├── Dockerfile.base
│ │ └── README.md
│ ├── verl0.5-cu126-torch2.7-fa2.7.4/
│ │ ├── Dockerfile.app.sglang0.4.10.post2.mcore0.13
│ │ ├── Dockerfile.app.sglang0.4.9.post6.mcore0.13
│ │ ├── Dockerfile.app.vllm.mcore0.13
│ │ ├── Dockerfile.app.vllm.mcore0.15
│ │ ├── Dockerfile.base.torch2.7.1
│ │ └── README.md
│ ├── verl0.5-cu126-torch2.7.1-fa2.8.0/
│ │ ├── Dockerfile.app.sglang.mcore0.12
│ │ ├── Dockerfile.app.sglang.mcore0.13.preview
│ │ ├── Dockerfile.base
│ │ └── README.md
│ ├── verl0.5-preview-cu128-torch2.7.1-fa2.8.0/
│ │ ├── Dockerfile.app.sglang.megatron
│ │ ├── Dockerfile.base
│ │ └── README.md
│ ├── verl0.6-cu128-torch2.8.0-fa2.7.4/
│ │ ├── Dockerfile.app.sglang
│ │ ├── Dockerfile.base
│ │ └── Dockerfile.vllm011.mcore_gpt-oss
│ └── verl0.6.1-experimental/
│ ├── Dockerfile.sglang056exp
│ └── Dockerfile.vllm012exp
├── docs/
│ ├── Makefile
│ ├── README.md
│ ├── README_vllm0.7.md
│ ├── README_vllm0.8.md
│ ├── _static/
│ │ ├── custom.css
│ │ └── js/
│ │ ├── resizable-sidebar.js
│ │ └── runllm-widget.js
│ ├── advance/
│ │ ├── agent_loop.rst
│ │ ├── async-on-policy-distill.md
│ │ ├── attention_implementation.rst
│ │ ├── checkpoint.rst
│ │ ├── dpo_extension.rst
│ │ ├── fp8.md
│ │ ├── fsdp_extension.rst
│ │ ├── fully_async.md
│ │ ├── grafana_prometheus.md
│ │ ├── megatron_extension.rst
│ │ ├── mtp.md
│ │ ├── one_step_off.md
│ │ ├── placement.rst
│ │ ├── ppo_lora.rst
│ │ ├── reward_loop.rst
│ │ ├── rollout_skip.rst
│ │ ├── rollout_trace.rst
│ │ └── rope.rst
│ ├── algo/
│ │ ├── baseline.md
│ │ ├── collabllm.md
│ │ ├── dapo.md
│ │ ├── dppo.md
│ │ ├── entropy.md
│ │ ├── gpg.md
│ │ ├── grpo.md
│ │ ├── opo.md
│ │ ├── otb.md
│ │ ├── ppo.md
│ │ ├── rollout_corr.md
│ │ ├── rollout_corr_math.md
│ │ ├── spin.md
│ │ └── sppo.md
│ ├── amd_tutorial/
│ │ ├── amd_build_dockerfile_page.rst
│ │ └── amd_vllm_page.rst
│ ├── api/
│ │ ├── data.rst
│ │ ├── single_controller.rst
│ │ ├── trainer.rst
│ │ └── utils.rst
│ ├── ascend_tutorial/
│ │ ├── contribution_guide/
│ │ │ └── ascend_ci_guide_zh.rst
│ │ ├── examples/
│ │ │ ├── ascend_performance_analysis_guide.md
│ │ │ ├── ascend_retool_best_pratice.rst
│ │ │ ├── ascend_sglang_best_practices.rst
│ │ │ ├── dapo_multi_model_optimization_practice.md
│ │ │ ├── gspo_optimization_practice.md
│ │ │ └── run_qwen3_32B_megatron_1k_256k_npu.md
│ │ ├── faq/
│ │ │ └── faq.rst
│ │ ├── features/
│ │ │ ├── ascend_backend_features.md
│ │ │ └── ascend_consistency.rst
│ │ ├── profiling/
│ │ │ ├── ascend_profiling_en.rst
│ │ │ └── ascend_profiling_zh.rst
│ │ └── quick_start/
│ │ ├── ascend_quick_start.rst
│ │ ├── ascend_sglang_quick_start.rst
│ │ └── dockerfile_build_guidance.rst
│ ├── blog/
│ │ └── v0.7.md
│ ├── conf.py
│ ├── data/
│ │ └── transfer_queue.md
│ ├── examples/
│ │ ├── config.rst
│ │ ├── gsm8k_example.rst
│ │ ├── multi_modal_example.rst
│ │ ├── ppo_code_architecture.rst
│ │ ├── sandbox_fusion_example.rst
│ │ └── skypilot_examples.rst
│ ├── faq/
│ │ └── faq.rst
│ ├── hybrid_flow.rst
│ ├── index.rst
│ ├── perf/
│ │ ├── best_practices.rst
│ │ ├── device_tuning.rst
│ │ ├── dpsk.md
│ │ ├── nsight_profiling.md
│ │ ├── perf_tuning.rst
│ │ ├── perf_tuning_on_ascend.rst
│ │ ├── torch_profiling.md
│ │ └── verl_profiler_system.md
│ ├── preparation/
│ │ ├── prepare_data.rst
│ │ └── reward_function.rst
│ ├── requirements-docs.txt
│ ├── sglang_multiturn/
│ │ ├── interaction_system.rst
│ │ ├── multiturn.rst
│ │ ├── sandbox_fusion.rst
│ │ └── search_tool_example.rst
│ ├── single_controller.rst
│ ├── start/
│ │ ├── agentic_rl.rst
│ │ ├── install.rst
│ │ ├── more_resources.rst
│ │ ├── multinode.rst
│ │ ├── quickstart.rst
│ │ └── ray_debug_tutorial.rst
│ └── workers/
│ ├── automodel_workers.rst
│ ├── fsdp_workers.rst
│ ├── megatron_workers.rst
│ ├── model_engine.rst
│ ├── ray_trainer.rst
│ ├── sglang_worker.rst
│ └── trtllm_worker.rst
├── examples/
│ ├── cispo_trainer/
│ │ └── run_cispo_qwen2_5_0_5b_gsm8k.sh
│ ├── data_preprocess/
│ │ ├── aime2024_multiturn_w_tool.py
│ │ ├── dapo_multiturn_w_tool.py
│ │ ├── full_hh_rlhf.py
│ │ ├── geo3k.py
│ │ ├── geo3k_multiturn_w_tool.py
│ │ ├── gsm8k.py
│ │ ├── gsm8k_multiturn_sft.py
│ │ ├── gsm8k_multiturn_w_interaction.py
│ │ ├── gsm8k_multiturn_w_tool.py
│ │ ├── gsm8k_tool_agent_loop.py
│ │ ├── hellaswag.py
│ │ ├── math_dataset.py
│ │ ├── multiturn.py
│ │ ├── pokemon.py
│ │ └── preprocess_search_r1_dataset.py
│ ├── dppo_trainer/
│ │ ├── dppo.md
│ │ └── run_qwen30b_dppo.sh
│ ├── fapo_trainer/
│ │ ├── README.md
│ │ ├── prepare_data.py
│ │ ├── reward_fn.py
│ │ ├── run_qwen_7b_rm_colocate.sh
│ │ └── run_qwen_7b_rm_standalone.sh
│ ├── gdpo_trainer/
│ │ └── run_qwen1_5b_gdpo.sh
│ ├── generation/
│ │ ├── run_deepseek7b_mutli_node.sh
│ │ └── run_deepseek_v2_lite_math.sh
│ ├── gmpo_trainer/
│ │ ├── README.md
│ │ ├── run_qwen2_5-7b_math.sh
│ │ ├── test_dapo_7b_math.sh
│ │ └── test_dapo_qwen3_30b_math.sh
│ ├── gpg_trainer/
│ │ ├── gpg.md
│ │ ├── run_qwen2-7b_math.sh
│ │ └── run_qwen2-7b_math_megatron.sh
│ ├── grpo_trainer/
│ │ ├── README.md
│ │ ├── run_deepseek671b_math_megatron_80gb.sh
│ │ ├── run_deepseek671b_math_megatron_96gb.sh
│ │ ├── run_deepseek7b_llm.sh
│ │ ├── run_deepseek7b_llm_math.sh
│ │ ├── run_deepseek7b_llm_math_megatron.sh
│ │ ├── run_deepseek7b_llm_seq_balance.sh
│ │ ├── run_glm41v_9b.sh
│ │ ├── run_gptoss_20b.sh
│ │ ├── run_minicpmo2_6.sh
│ │ ├── run_mistral13b_skyworkrm_hhrlhf.sh
│ │ ├── run_moonlight16b_math_megatron.sh
│ │ ├── run_nemotron_nano_v3_megatron.sh
│ │ ├── run_qwen2-32b_sglang_fsdp_npu.sh
│ │ ├── run_qwen2-7b.sh
│ │ ├── run_qwen2-7b_math.sh
│ │ ├── run_qwen2-7b_math_megatron.sh
│ │ ├── run_qwen2-7b_math_megatron_lora.sh
│ │ ├── run_qwen2-7b_math_megatron_trtllm.sh
│ │ ├── run_qwen2-7b_math_trtllm.sh
│ │ ├── run_qwen2-7b_seq_balance.sh
│ │ ├── run_qwen2-7b_seq_balance_math_megatron.sh
│ │ ├── run_qwen2-7b_sgl_megatron.sh
│ │ ├── run_qwen2_5-32b_grpo_megatron_vllm_npu.sh
│ │ ├── run_qwen2_5-3b_gsm8k_grpo_lora.sh
│ │ ├── run_qwen2_5-3b_gsm8k_grpo_lora_from_adapter.sh
│ │ ├── run_qwen2_5-7b_math_megatron_diff_tp.sh
│ │ ├── run_qwen2_5_32b_grpo_npu.sh
│ │ ├── run_qwen2_5_7b_grpo_discrete_prof_npu.sh
│ │ ├── run_qwen2_5_7b_grpo_e2e_prof_npu.sh
│ │ ├── run_qwen2_5_7b_grpo_npu.sh
│ │ ├── run_qwen2_5_vl-7b-megatron.sh
│ │ ├── run_qwen2_5_vl-7b-sglang.sh
│ │ ├── run_qwen2_5_vl-7b-trtllm.sh
│ │ ├── run_qwen2_5_vl-7b.sh
│ │ ├── run_qwen2_5_vl-7b_freeze_vision.sh
│ │ ├── run_qwen2_5_vl-7b_lora.sh
│ │ ├── run_qwen2_5_vl-7b_seq_balance.sh
│ │ ├── run_qwen2_5_vl_32b_npu.sh
│ │ ├── run_qwen2_5_vl_3b_npu.sh
│ │ ├── run_qwen2_5_vl_3b_trtllm.sh
│ │ ├── run_qwen2_5_vl_7b_npu.sh
│ │ ├── run_qwen3-235b_megatron_96gb.sh
│ │ ├── run_qwen3-30b_dapo_megatron_fp8_trtllm.sh
│ │ ├── run_qwen3-32b_npu.sh
│ │ ├── run_qwen3-4b_gsm8k_grpo_lora_merge.sh
│ │ ├── run_qwen3-8b.sh
│ │ ├── run_qwen3-8b_npu.sh
│ │ ├── run_qwen3_235b_megatron_npu.sh
│ │ ├── run_qwen3_4b_grpo_vllm_1k_npu.sh
│ │ ├── run_qwen3_5-35b-megatron.sh
│ │ ├── run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh
│ │ ├── run_qwen3_8b_grpo_sglang_32k_spmd_npu.sh
│ │ ├── run_qwen3_vl-235b-megatron.sh
│ │ ├── run_qwen3_vl-30b-megatron.sh
│ │ ├── run_qwen3_vl-8b-megatron.sh
│ │ ├── run_qwen3_vl-8b_npu.sh
│ │ ├── run_qwen3_vl_30b_vllm_fsdp_npu.sh
│ │ ├── run_qwen3moe-30b_grpo_megatron_vllm_npu.sh
│ │ ├── run_qwen3moe-30b_megatron_96gb.sh
│ │ ├── run_qwen3moe-30b_megatron_lora.sh
│ │ ├── run_qwen3moe-30b_megatron_lora_fp16.sh
│ │ ├── run_qwen3moe-30b_sglang_megatron_npu.sh
│ │ ├── run_qwen3next_80b_fsdp_npu.sh
│ │ └── run_seed_oss_36b.sh
│ ├── gspo_trainer/
│ │ ├── run_qwen30b_gspo.sh
│ │ ├── run_qwen3_32b_gspo_npu.sh
│ │ ├── test_gspo_3b_math.sh
│ │ ├── test_gspo_3b_math_slurm.sh
│ │ └── test_gspo_qwen30b_a3b_ep.sh
│ ├── mtp_trainer/
│ │ ├── runtime_env.yaml
│ │ ├── test_dapo_mimo_7b_with_mtp_math_megatron.sh
│ │ └── test_dapo_mimo_7b_with_mtp_math_megatron_4_4.sh
│ ├── otb_trainer/
│ │ └── run_qwen2_5-7b.sh
│ ├── ppo_trainer/
│ │ ├── README.md
│ │ ├── run_deepseek7b_llm.sh
│ │ ├── run_deepseek7b_llm_modelscope.sh
│ │ ├── run_deepseek7b_llm_pfppo.sh
│ │ ├── run_deepseek7b_llm_sandbox_fusion.sh
│ │ ├── run_deepseek7b_llm_sp2.sh
│ │ ├── run_deepseek_full_hh_rlhf.sh
│ │ ├── run_deepseek_math_gsm8k_megatron.sh
│ │ ├── run_deepseek_math_gsm8k_megatron_nsys.sh
│ │ ├── run_gemma.sh
│ │ ├── run_moonlight16b_a3b_gsm8k_megatron.sh
│ │ ├── run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh
│ │ ├── run_qwen2-7b_math_gsm8k_megatron.sh
│ │ ├── run_qwen2-7b_rm.sh
│ │ ├── run_qwen2-7b_rm_reward_loop_colocate.sh
│ │ ├── run_qwen2-7b_rm_seq_balance.sh
│ │ ├── run_qwen2-7b_rm_seq_balance_fused_kernels.sh
│ │ ├── run_qwen2-7b_rm_seq_balance_nsys.sh
│ │ ├── run_qwen2-7b_seq_balance.sh
│ │ ├── run_qwen2-7b_sglang_seq_balance.sh
│ │ ├── run_qwen2.5-32b.sh
│ │ ├── run_qwen2.5-3b_rm_reward_loop_colocate.sh
│ │ └── run_qwen3-8b_npu.sh
│ ├── prefix_grouper/
│ │ ├── README.md
│ │ └── run_qwen3_prefix_grouper.sh
│ ├── ray/
│ │ └── tutorial.ipynb
│ ├── reinforce_plus_plus_trainer/
│ │ ├── run_qwen2-7b_math_rf.sh
│ │ └── run_qwen2-7b_math_rf_baseline.sh
│ ├── remax_trainer/
│ │ ├── run_qwen2.5-3b_seq_balance.sh
│ │ └── run_qwen2.5-7b_seq_balance.sh
│ ├── rloo_trainer/
│ │ └── run_qwen2-7b.sh
│ ├── rollout_correction/
│ │ ├── run_with_rollout_corr.sh
│ │ └── run_with_rollout_corr_multi_rs.sh
│ ├── router_replay/
│ │ ├── README.md
│ │ ├── run_qwen30_a3b_megatron_sglang.sh
│ │ └── run_qwen30_a3b_megatron_vllm.sh
│ ├── sapo_trainer/
│ │ ├── run_qwen30b_sapo.sh
│ │ └── run_qwen3_8b_sapo_npu.sh
│ ├── sft/
│ │ ├── gsm8k/
│ │ │ ├── run_deepseek_6b7.sh
│ │ │ ├── run_gemma_2b.sh
│ │ │ ├── run_gemma_7b.sh
│ │ │ ├── run_mimo_megatron_mtp.sh
│ │ │ ├── run_nemotron_nano_v3.sh
│ │ │ ├── run_qwen3_30b_automodel.sh
│ │ │ ├── run_qwen3_5_megatron.sh
│ │ │ ├── run_qwen3_8b_sft_peft_sp2_npu.sh
│ │ │ ├── run_qwen_05_automodel.sh
│ │ │ ├── run_qwen_05_peft.sh
│ │ │ ├── run_qwen_05_sp2.sh
│ │ │ ├── run_qwen_05_sp2_liger.sh
│ │ │ └── run_seed_oss_36b_sft.sh
│ │ ├── multiturn/
│ │ │ └── run_qwen_05_sp2.sh
│ │ └── vlm/
│ │ └── run_qwen3_vl_2b.sh
│ ├── sglang_multiturn/
│ │ ├── README.md
│ │ ├── config/
│ │ │ ├── geo3k_multiturn_grpo.yaml
│ │ │ ├── geo3k_multiturn_megatron_grpo.yaml
│ │ │ ├── gsm8k_multiturn_grpo.yaml
│ │ │ ├── gsm8k_multiturn_grpo_server.yaml
│ │ │ ├── gsm8k_multiturn_grpo_w_interaction.yaml
│ │ │ ├── gsm8k_multiturn_megatron_grpo.yaml
│ │ │ ├── interaction_config/
│ │ │ │ └── gsm8k_interaction_config.yaml
│ │ │ ├── retool_multiturn_grpo.yaml
│ │ │ ├── search_multiturn_grpo.yaml
│ │ │ ├── search_multiturn_grpo_one_step_off.yaml
│ │ │ └── tool_config/
│ │ │ ├── geo3k_tool_config.yaml
│ │ │ ├── gsm8k_tool_config.yaml
│ │ │ ├── mcp_server.json
│ │ │ ├── mcp_tool_config.yaml
│ │ │ ├── sandbox_fusion_tool_config.yaml
│ │ │ └── search_tool_config.yaml
│ │ ├── geo3k/
│ │ │ ├── run_qwen2.5-3b_geo3k_multiturn.sh
│ │ │ ├── run_qwen2.5-3b_geo3k_multiturn_4xgpu.sh
│ │ │ └── run_qwen2.5-3b_megatron_geo3k_multiturn.sh
│ │ ├── gsm8k_toolcall_shaping/
│ │ │ ├── gsm8k_toolcall_shaping.py
│ │ │ └── run_gsm8k_grpo_toolcall_shaping.sh
│ │ ├── run_qwen0.5b_gsm8k_multiturn_curriculum.sh
│ │ ├── run_qwen2.5-0.5b_gsm8k_multiturn_w_interaction.sh
│ │ ├── run_qwen2.5-3b_gsm8k_multiturn.sh
│ │ ├── run_qwen2.5-3b_gsm8k_multiturn_4xgpu.sh
│ │ ├── run_qwen2.5-3b_gsm8k_multiturn_4xgpu_server.sh
│ │ ├── run_qwen2.5-3b_gsm8k_multiturn_server.sh
│ │ ├── run_qwen2.5-3b_gsm8k_multiturn_vllm_fsdp.sh
│ │ ├── run_qwen2.5-3b_gsm8k_tool_agent_mlflow.sh
│ │ ├── run_qwen2.5-3b_megatron_gsm8k_multiturn.sh
│ │ ├── run_qwen3-4b_gsm8k_multiturn.sh
│ │ ├── run_qwen3_4b_dapo_multiturn.sh
│ │ └── search_r1_like/
│ │ ├── local_dense_retriever/
│ │ │ ├── download.py
│ │ │ └── retrieval_server.py
│ │ └── run_qwen2.5-3b_instruct_search_multiturn.sh
│ ├── skypilot/
│ │ ├── README.md
│ │ ├── verl-grpo.yaml
│ │ ├── verl-multiturn-tools.yaml
│ │ └── verl-ppo.yaml
│ ├── slurm/
│ │ └── ray_on_slurm.slurm
│ ├── split_placement/
│ │ ├── README.md
│ │ ├── config/
│ │ │ └── ppo_trainer_split.yaml
│ │ ├── main_ppo_split.py
│ │ ├── run_deepseek7b_llm.sh
│ │ └── split_monkey_patch.py
│ ├── tuning/
│ │ ├── 0.5b/
│ │ │ └── qwen2-0.5b_grpo-lora_1_h100_fsdp_vllm.sh
│ │ ├── 1.5b/
│ │ │ └── qwen2-1.5b_grpo-lora_1_h100_fsdp_vllm.sh
│ │ ├── 14b/
│ │ │ ├── qwen2-14b_grpo-lora_2_h100_fsdp_vllm.sh
│ │ │ └── qwen2_14b_grpo_4_h800_fsdp_vllm.sh
│ │ ├── 32b/
│ │ │ ├── qwen2-32b_grpo-lora_4_h100_fsdp_vllm.sh
│ │ │ └── qwen2_32B_grpo_8_h20_megatron_vllm.sh
│ │ ├── 3b/
│ │ │ └── qwen2-3b_grpo-lora_1_h100_fsdp_vllm.sh
│ │ ├── 70b/
│ │ │ ├── qwen2-70b_grpo_32_h20_fsdp_vllm.sh
│ │ │ ├── qwen2-70b_grpo_32_h800_fsdp_vllm.sh
│ │ │ └── qwen2-72b_grpo-lora_8_h100_fsdp_vllm.sh
│ │ └── 7b/
│ │ ├── qwen2-7b_grpo-lora_1_h100_fsdp_vllm.sh
│ │ └── qwen2-7b_grpo_2_h800_fsdp_vllm.sh
│ └── tutorial/
│ └── agent_loop_get_started/
│ ├── agent_loop_tutorial.ipynb
│ └── sandbox.py
├── pyproject.toml
├── requirements-cuda.txt
├── requirements-npu.txt
├── requirements-test.txt
├── requirements.txt
├── requirements_sglang.txt
├── scripts/
│ ├── __init__.py
│ ├── converter_hf_to_mcore.py
│ ├── diagnose.py
│ ├── generate_trainer_config.sh
│ ├── init_random_model.py
│ ├── install_sglang_mcore_npu.sh
│ ├── install_vllm_sglang_mcore.sh
│ ├── legacy_model_merger.py
│ ├── megatron_merge_lora.py
│ ├── print_cfg.py
│ ├── rollout_viewer.py
│ └── veomni/
│ ├── moe_merge.py
│ └── moe_split.py
├── setup.py
├── tests/
│ ├── README.md
│ ├── __init__.py
│ ├── checkpoint_engine/
│ │ ├── __init__.py
│ │ ├── test_correctness_on_gpu.py
│ │ ├── test_correctness_on_npu.py
│ │ ├── test_special_server_adapter.py
│ │ └── test_utils.py
│ ├── experimental/
│ │ ├── agent_loop/
│ │ │ ├── agent_utils.py
│ │ │ ├── qwen_vl_tool_chat_template.jinja2
│ │ │ ├── test_agent_loop_extra_fields_schema_on_cpu.py
│ │ │ ├── test_basic_agent_loop.py
│ │ │ ├── test_gpt_oss_tool_parser.py
│ │ │ ├── test_multi_modal.py
│ │ │ └── test_standalone_rollout.py
│ │ ├── reward_loop/
│ │ │ ├── reward_fn.py
│ │ │ ├── test_agent_reward_loop_colocate.py
│ │ │ ├── test_agent_reward_loop_standalone.py
│ │ │ ├── test_async_token_bucket_on_cpu.py
│ │ │ ├── test_math_verify.py
│ │ │ ├── test_rate_limited_reward_manager_on_cpu.py
│ │ │ ├── test_reward_model_disrm.py
│ │ │ └── test_reward_model_genrm.py
│ │ └── vla/
│ │ └── test_sim_envs.py
│ ├── interactions/
│ │ ├── __init__.py
│ │ ├── test_gsm8k_interaction.py
│ │ └── test_interaction_registry.py
│ ├── kill_github_tests.sh
│ ├── models/
│ │ ├── test_engine.py
│ │ ├── test_tiled_mlp_accuracy.py
│ │ ├── test_transformer.py
│ │ └── test_transformers_ulysses.py
│ ├── single_controller/
│ │ ├── __init__.py
│ │ ├── base/
│ │ │ └── test_decorator.py
│ │ ├── check_worker_alive/
│ │ │ └── main.py
│ │ ├── detached_worker/
│ │ │ ├── README.md
│ │ │ ├── client.py
│ │ │ ├── run.sh
│ │ │ └── server.py
│ │ ├── test_auto_padding_on_cpu.py
│ │ ├── test_colocated_workers.py
│ │ ├── test_colocated_workers_fused.py
│ │ ├── test_data_transfer.py
│ │ ├── test_decorator_on_cpu.py
│ │ ├── test_device_mesh_register.py
│ │ ├── test_driverfunc_to_worker.py
│ │ ├── test_fused_workers_on_cpu.py
│ │ ├── test_get_set_dispatch_collect_cpu.py
│ │ ├── test_high_level_scheduling_api.py
│ │ ├── test_nested_worker.py
│ │ ├── test_ray_collectives.py
│ │ ├── test_ray_local_envs_on_cpu.py
│ │ ├── test_ray_utils_on_cpu.py
│ │ ├── test_rvdz.py
│ │ ├── test_split_resource_pool.py
│ │ ├── test_worker_group_basics.py
│ │ └── test_worker_group_torch.py
│ ├── special_distributed/
│ │ ├── README.md
│ │ ├── run_all.sh
│ │ ├── test_fsdp_ckpt.py
│ │ ├── test_mcore_config_converter.py
│ │ ├── test_tensor_dict.py
│ │ └── test_torch_functional.py
│ ├── special_e2e/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── check_custom_rwd_fn.py
│ │ ├── check_results.py
│ │ ├── envs/
│ │ │ ├── __init__.py
│ │ │ └── digit_completion/
│ │ │ ├── __init__.py
│ │ │ ├── task.py
│ │ │ └── tokenizer.py
│ │ ├── generation/
│ │ │ ├── run_gen_qwen05.sh
│ │ │ └── run_gen_qwen05_server.sh
│ │ ├── ppo_trainer/
│ │ │ ├── expert_parallel/
│ │ │ │ ├── qwen2moe_minimal.json
│ │ │ │ └── qwen3moe_minimal.json
│ │ │ ├── run_function_reward.sh
│ │ │ ├── run_model_reward.sh
│ │ │ ├── run_single_gpu.sh
│ │ │ └── run_single_gpu_with_engine.sh
│ │ ├── run_dapo.sh
│ │ ├── run_fully_async_policy.sh
│ │ ├── run_geo3k_fsdp_sgl_multiturn_w_tool.sh
│ │ ├── run_grpo_lora_with_merge.sh
│ │ ├── run_gsm8k_fsdp_sgl_multiturn_sf_tool.sh
│ │ ├── run_gsm8k_fsdp_sgl_multiturn_w_tool.sh
│ │ ├── run_one_step_off_policy.sh
│ │ ├── run_ppo_trainer_megatron.sh
│ │ ├── run_ppo_trainer_torchtitan.sh
│ │ ├── run_ppo_trainer_veomni.sh
│ │ ├── run_test.sh
│ │ └── sft/
│ │ ├── compare_sft_engine_results.py
│ │ ├── run_sft.sh
│ │ ├── run_sft_engine.sh
│ │ └── test_sft_engine_all.sh
│ ├── special_npu/
│ │ ├── nightly_ci_ascend/
│ │ │ ├── run_grpo_qwen25-7b-instruct_fsdp_npu.sh
│ │ │ ├── run_grpo_qwen25-vl-3b-instruct_fsdp_npu.sh
│ │ │ └── run_ppo_qwen3-8b_fsdp_npu.sh
│ │ ├── run_qwen2_5_05b_grpo.sh
│ │ ├── run_qwen2_5_05b_grpo_mindspeed.sh
│ │ ├── run_qwen2_5_05b_sft_peft_sp2.sh
│ │ ├── run_qwen2_5_vl_3b_npu.sh
│ │ ├── run_qwen3_06b_ppo.sh
│ │ └── run_qwen3_30b_grpo_mindspeed.sh
│ ├── special_sanity/
│ │ ├── check_api_docs.py
│ │ ├── check_dataproto_usage.py
│ │ ├── check_device_api_usage.py
│ │ ├── check_docs_time_info.py
│ │ ├── check_docstrings.py
│ │ ├── check_license.py
│ │ ├── check_pr_description.py
│ │ ├── check_pr_title.py
│ │ ├── test_config_docs.py
│ │ ├── test_import.py
│ │ ├── type_coverage_check.py
│ │ ├── validate_imported_docs.py
│ │ └── validate_structure.py
│ ├── special_standalone/
│ │ ├── README.md
│ │ └── test_memory_buffers.py
│ ├── test_base_config_on_cpu.py
│ ├── test_protocol_on_cpu.py
│ ├── test_protocol_v2_on_cpu.py
│ ├── trainer/
│ │ ├── __init__.py
│ │ ├── config/
│ │ │ ├── __init__.py
│ │ │ ├── legacy_ppo_megatron_trainer.yaml
│ │ │ ├── legacy_ppo_trainer.yaml
│ │ │ ├── test_algo_config_on_cpu.py
│ │ │ └── test_legacy_config_on_cpu.py
│ │ └── ppo/
│ │ ├── __init__.py
│ │ ├── test_core_algos_on_cpu.py
│ │ ├── test_metric_utils_on_cpu.py
│ │ ├── test_rollout_corr.py
│ │ └── test_rollout_corr_integration.py
│ ├── utils/
│ │ ├── _test_module.py
│ │ ├── ckpt/
│ │ │ ├── test_checkpoint_cleanup_on_cpu.py
│ │ │ └── test_esi_save_ckpt_on_cpu.py
│ │ ├── dataset/
│ │ │ ├── test_create_rl_sampler_on_cpu.py
│ │ │ ├── test_multiturn_sft_dataset_on_cpu.py
│ │ │ ├── test_rl_collate_fn_on_cpu.py
│ │ │ └── test_rl_dataset_on_cpu.py
│ │ ├── debug/
│ │ │ └── test_metrics.py
│ │ ├── megatron/
│ │ │ └── test_pipeline_parallel.py
│ │ ├── reward_score/
│ │ │ ├── reward_score/
│ │ │ │ └── test_sandbox_fusion_on_cpu.py
│ │ │ └── test_sandbox_on_cpu.py
│ │ ├── test_activation_offload.py
│ │ ├── test_bucketed_weight_transfer.py
│ │ ├── test_check_ipc_version_support_on_npu.py
│ │ ├── test_check_profiler_output.py
│ │ ├── test_config_on_cpu.py
│ │ ├── test_flops_counter.py
│ │ ├── test_fs_on_cpu.py
│ │ ├── test_fsdp2_peft_wrapping.py
│ │ ├── test_fsdp_lora_merge.py
│ │ ├── test_groupwise.py
│ │ ├── test_import_utils_on_cpu.py
│ │ ├── test_linear_cross_entropy.py
│ │ ├── test_mlflow_key_sanitization.py
│ │ ├── test_model_on_cpu.py
│ │ ├── test_normalize_peft_param_name.py
│ │ ├── test_normalize_peft_param_name_on_cpu.py
│ │ ├── test_nvtx_profile.py
│ │ ├── test_padding_on_cpu.py
│ │ ├── test_prepare_micro_batches_with_group_size.py
│ │ ├── test_rollout_skip_on_cpu.py
│ │ ├── test_rollout_trace_on_cpu.py
│ │ ├── test_seqlen_balancing.py
│ │ ├── test_server_profiler.py
│ │ ├── test_shared_memory.py
│ │ ├── test_special_linear_cross_entropy_tp.py
│ │ ├── test_special_mstx_profile.py
│ │ ├── test_temp_env_on_cpu.py
│ │ ├── test_timeout_decorator_cpu.py
│ │ ├── test_tokenizer_normalize_on_cpu.py
│ │ ├── test_torch_functional.py
│ │ └── test_torch_profile.py
│ └── workers/
│ ├── actor/
│ │ └── test_special_dp_actor.py
│ ├── config/
│ │ ├── test_actor_config_on_cpu.py
│ │ ├── test_critic_config_on_cpu.py
│ │ ├── test_engine_config_on_cpu.py
│ │ ├── test_model_config_on_cpu.py
│ │ └── test_optim_config_on_cpu.py
│ ├── critic/
│ │ └── test_special_dp_critic.py
│ ├── reward_manager/
│ │ └── test_registry_on_cpu.py
│ ├── rollout/
│ │ ├── perf/
│ │ │ └── vllm_async_rollout.py
│ │ ├── resource/
│ │ │ └── tool_configs/
│ │ │ ├── mcp_server.json
│ │ │ ├── mcp_tool_config
│ │ │ ├── sandbox_fusion_tool_config
│ │ │ └── search_tool_config
│ │ ├── rollout_sglang/
│ │ │ └── test_http_server_engine.py
│ │ ├── rollout_trtllm/
│ │ │ ├── __init__.py
│ │ │ ├── test_adapter.py
│ │ │ ├── test_async_server.py
│ │ │ └── test_trtllm_rollout_utils.py
│ │ ├── rollout_vllm/
│ │ │ ├── run_fsdp_vllm.py
│ │ │ └── test_vllm_abort.py
│ │ ├── test_hf_rollout.py
│ │ ├── test_sglang_async_rollout_multimodal_delta.py
│ │ ├── test_sglang_rollout_sharding_manager.py
│ │ └── test_vllm_cli_args_on_cpu.py
│ ├── test_fsdp_attn_implementation.py
│ └── test_fsdp_workers.py
└── verl/
├── __init__.py
├── base_config.py
├── checkpoint_engine/
│ ├── README.md
│ ├── __init__.py
│ ├── base.py
│ ├── hccl_checkpoint_engine.py
│ ├── kimi_checkpoint_engine.py
│ ├── mooncake_checkpoint_engine.py
│ ├── nccl_checkpoint_engine.py
│ └── nixl_checkpoint_engine.py
├── experimental/
│ ├── __init__.py
│ ├── agent_loop/
│ │ ├── __init__.py
│ │ ├── agent_loop.py
│ │ ├── prometheus_utils.py
│ │ ├── single_turn_agent_loop.py
│ │ ├── tool_agent_loop.py
│ │ ├── tool_parser.py
│ │ └── utils.py
│ ├── dataset/
│ │ ├── __init__.py
│ │ └── sampler.py
│ ├── dynamic_dataset/
│ │ ├── __init__.py
│ │ └── dynamicgen_dataset.py
│ ├── fully_async_policy/
│ │ ├── README.md
│ │ ├── README_zh.md
│ │ ├── agent_loop/
│ │ │ ├── __init__.py
│ │ │ └── agent_loop.py
│ │ ├── config/
│ │ │ ├── fully_async_ppo_megatron_trainer.yaml
│ │ │ └── fully_async_ppo_trainer.yaml
│ │ ├── detach_utils.py
│ │ ├── fully_async_main.py
│ │ ├── fully_async_rollouter.py
│ │ ├── fully_async_trainer.py
│ │ ├── message_queue.py
│ │ ├── shell/
│ │ │ ├── dapo_30b_a3b_base_math_fsdp.sh
│ │ │ ├── dapo_7b_async_retool.sh
│ │ │ ├── dapo_7b_math_fsdp2_16_16.sh
│ │ │ ├── dapo_7b_math_fsdp2_32_32.sh
│ │ │ ├── dapo_7b_math_fsdp2_4_12.sh
│ │ │ ├── dapo_7b_math_fsdp2_4_4.sh
│ │ │ ├── dapo_7b_math_fsdp2_64_64.sh
│ │ │ ├── dapo_7b_math_fsdp2_64_64_mis.sh
│ │ │ ├── dapo_7b_math_fsdp2_8_8.sh
│ │ │ ├── geo3k_qwen25vl_7b_megatron_4_4.sh
│ │ │ ├── grpo_30b_a3b_base_math_megatron_96_32.sh
│ │ │ ├── grpo_30b_a3b_base_math_megatron_96_32_mis.sh
│ │ │ └── runtime_env.yaml
│ │ └── unittest/
│ │ └── simple_streaming_demo.py
│ ├── one_step_off_policy/
│ │ ├── README.md
│ │ ├── config/
│ │ │ ├── one_step_off_ppo_megatron_trainer.yaml
│ │ │ └── one_step_off_ppo_trainer.yaml
│ │ ├── main_ppo.py
│ │ ├── ray_trainer.py
│ │ └── shell/
│ │ ├── dapo_7b_math_fsdp2_4_12.sh
│ │ ├── dapo_7b_math_fsdp2_64_64.sh
│ │ ├── dapo_7b_math_fsdp2_64_64_ris.sh
│ │ ├── dapo_7b_math_fsdp2_colocate.sh
│ │ ├── dapo_7b_math_fsdp2_sglang_4_12.sh
│ │ ├── dapo_7b_math_fsdp2_sglang_colocate.sh
│ │ ├── dapo_7b_math_megatron_4_12.sh
│ │ ├── dapo_7b_math_megatron_colocate.sh
│ │ ├── grpo_0.6b_gsm8k_fsdp2_2_6.sh
│ │ ├── grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh
│ │ ├── grpo_3b_gsm8k_fsdp2_2_6.sh
│ │ └── grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh
│ ├── reward_loop/
│ │ ├── __init__.py
│ │ ├── reward_loop.py
│ │ ├── reward_manager/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── dapo.py
│ │ │ ├── gdpo.py
│ │ │ ├── limited.py
│ │ │ ├── naive.py
│ │ │ ├── registry.py
│ │ │ └── remote.py
│ │ ├── reward_model.py
│ │ └── router/
│ │ ├── inner_sglang_router.py
│ │ └── naive_router.py
│ ├── separation/
│ │ ├── __init__.py
│ │ ├── engine_workers.py
│ │ ├── ray_trainer.py
│ │ └── utils.py
│ └── vla/
│ ├── README.md
│ ├── config/
│ │ ├── rob_ppo_trainer.yaml
│ │ └── rob_sac_trainer.yaml
│ ├── dp_rob.py
│ ├── env_loop.py
│ ├── envs/
│ │ ├── __init__.py
│ │ ├── action_utils.py
│ │ ├── isaac_env/
│ │ │ ├── __init__.py
│ │ │ └── isaac_env.py
│ │ └── libero_env/
│ │ ├── __init__.py
│ │ ├── libero_env.py
│ │ ├── utils.py
│ │ └── venv.py
│ ├── fsdp_workers.py
│ ├── main_ppo.py
│ ├── main_sac.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── modules/
│ │ │ └── mlp.py
│ │ ├── openvla_oft/
│ │ │ ├── __init__.py
│ │ │ ├── configuration_prismatic.py
│ │ │ ├── constants.py
│ │ │ ├── modeling_prismatic.py
│ │ │ ├── processing_prismatic.py
│ │ │ └── train_utils.py
│ │ ├── pi0_torch/
│ │ │ ├── __init__.py
│ │ │ ├── configuration_pi0_torch.py
│ │ │ ├── model/
│ │ │ │ ├── modeling_pi0.py
│ │ │ │ └── paligemma_with_expert.py
│ │ │ ├── modeling_pi0_torch.py
│ │ │ ├── pi0_utils.py
│ │ │ └── policy/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ └── libero_policy.py
│ │ └── register_vla_models.py
│ ├── naive_rollout_rob.py
│ ├── prepare_libero_dataset.py
│ ├── requirements_vla.txt
│ ├── rob_ray_trainer.py
│ ├── run_pi05_libero_sac.sh
│ ├── run_pi05_libero_sac_disagg.sh
│ ├── run_simpleVLA_isaac_disagg.sh
│ ├── run_simpleVLA_libero_grpo.sh
│ ├── sac/
│ │ ├── base.py
│ │ ├── naive_rollout_pi05.py
│ │ ├── replay_pool.py
│ │ ├── sac_actor.py
│ │ └── sac_ray_trainer.py
│ └── workers/
│ └── env/
│ ├── env_loop_wg_test.py
│ ├── env_manager.py
│ └── env_worker.py
├── interactions/
│ ├── __init__.py
│ ├── base.py
│ ├── gsm8k_interaction.py
│ ├── utils/
│ │ ├── __init__.py
│ │ └── interaction_registry.py
│ └── weather_interaction.py
├── model_merger/
│ ├── __init__.py
│ ├── __main__.py
│ ├── base_model_merger.py
│ ├── fsdp_model_merger.py
│ └── megatron_model_merger.py
├── models/
│ ├── README.md
│ ├── __init__.py
│ ├── llama/
│ │ ├── __init__.py
│ │ └── megatron/
│ │ ├── __init__.py
│ │ ├── checkpoint_utils/
│ │ │ ├── __init__.py
│ │ │ ├── llama_loader.py
│ │ │ ├── llama_loader_depracated.py
│ │ │ └── llama_saver.py
│ │ ├── layers/
│ │ │ ├── __init__.py
│ │ │ ├── parallel_attention.py
│ │ │ ├── parallel_decoder.py
│ │ │ ├── parallel_linear.py
│ │ │ ├── parallel_mlp.py
│ │ │ └── parallel_rmsnorm.py
│ │ └── modeling_llama_megatron.py
│ ├── mcore/
│ │ ├── __init__.py
│ │ ├── bridge.py
│ │ ├── config_converter.py
│ │ ├── loader.py
│ │ ├── mbridge.py
│ │ ├── model_forward.py
│ │ ├── model_forward_1f1b_overlap.py
│ │ ├── model_forward_fused.py
│ │ ├── model_initializer.py
│ │ ├── mtp_patch.py
│ │ ├── patch.py
│ │ ├── qwen2_5_vl/
│ │ │ ├── __init__.py
│ │ │ ├── attention.py
│ │ │ ├── model.py
│ │ │ ├── rope_utils.py
│ │ │ ├── vision_config.py
│ │ │ ├── vision_model.py
│ │ │ └── vision_transformer_block.py
│ │ ├── readme.md
│ │ ├── registry.py
│ │ ├── saver.py
│ │ ├── util.py
│ │ └── weight_converter.py
│ ├── qwen2/
│ │ ├── __init__.py
│ │ └── megatron/
│ │ ├── __init__.py
│ │ ├── checkpoint_utils/
│ │ │ ├── __init__.py
│ │ │ ├── qwen2_loader.py
│ │ │ ├── qwen2_loader_depracated.py
│ │ │ └── qwen2_saver.py
│ │ ├── layers/
│ │ │ ├── __init__.py
│ │ │ ├── parallel_attention.py
│ │ │ ├── parallel_decoder.py
│ │ │ ├── parallel_linear.py
│ │ │ ├── parallel_mlp.py
│ │ │ └── parallel_rmsnorm.py
│ │ └── modeling_qwen2_megatron.py
│ ├── registry.py
│ ├── transformers/
│ │ ├── __init__.py
│ │ ├── apertus.py
│ │ ├── dense_common.py
│ │ ├── glm4v.py
│ │ ├── kimi_vl.py
│ │ ├── llama.py
│ │ ├── monkey_patch.py
│ │ ├── npu_patch.py
│ │ ├── qwen2.py
│ │ ├── qwen2_vl.py
│ │ ├── qwen3_vl.py
│ │ └── tiled_mlp.py
│ └── weight_loader_registry.py
├── protocol.py
├── py.typed
├── single_controller/
│ ├── __init__.py
│ ├── base/
│ │ ├── __init__.py
│ │ ├── decorator.py
│ │ ├── worker.py
│ │ └── worker_group.py
│ └── ray/
│ ├── __init__.py
│ └── base.py
├── third_party/
│ ├── __init__.py
│ ├── torch/
│ │ ├── __init__.py
│ │ └── distributed/
│ │ ├── __init__.py
│ │ ├── _state_dict_utils.py
│ │ └── checkpoint/
│ │ ├── __init__.py
│ │ └── state_dict.py
│ └── vllm/
│ └── __init__.py
├── tools/
│ ├── __init__.py
│ ├── base_tool.py
│ ├── geo3k_tool.py
│ ├── gsm8k_tool.py
│ ├── image_zoom_in_tool.py
│ ├── mcp_base_tool.py
│ ├── mcp_search_tool.py
│ ├── sandbox_fusion_tools.py
│ ├── schemas.py
│ ├── search_tool.py
│ └── utils/
│ ├── __init__.py
│ ├── mcp_clients/
│ │ ├── McpClientManager.py
│ │ └── utils.py
│ ├── search_r1_like_utils.py
│ └── tool_registry.py
├── trainer/
│ ├── README.md
│ ├── __init__.py
│ ├── config/
│ │ ├── __init__.py
│ │ ├── _generated_ppo_megatron_trainer.yaml
│ │ ├── _generated_ppo_torchtitan_trainer.yaml
│ │ ├── _generated_ppo_trainer.yaml
│ │ ├── _generated_ppo_veomni_trainer.yaml
│ │ ├── actor/
│ │ │ ├── actor.yaml
│ │ │ ├── dp_actor.yaml
│ │ │ ├── megatron_actor.yaml
│ │ │ ├── torchtitan_actor.yaml
│ │ │ └── veomni_actor.yaml
│ │ ├── algorithm/
│ │ │ └── rollout_correction.yaml
│ │ ├── algorithm.py
│ │ ├── config.py
│ │ ├── critic/
│ │ │ ├── critic.yaml
│ │ │ ├── dp_critic.yaml
│ │ │ ├── megatron_critic.yaml
│ │ │ ├── torchtitan_critic.yaml
│ │ │ └── veomni_critic.yaml
│ │ ├── data/
│ │ │ └── legacy_data.yaml
│ │ ├── engine/
│ │ │ ├── automodel.yaml
│ │ │ ├── fsdp.yaml
│ │ │ ├── megatron.yaml
│ │ │ ├── torchtitan.yaml
│ │ │ └── veomni.yaml
│ │ ├── evaluation.yaml
│ │ ├── legacy_reward_impl.yaml
│ │ ├── model/
│ │ │ └── hf_model.yaml
│ │ ├── model_engine/
│ │ │ ├── dp.yaml
│ │ │ ├── torchtitan.yaml
│ │ │ └── veomni.yaml
│ │ ├── npu_profile/
│ │ │ └── npu_profile.yaml
│ │ ├── optim/
│ │ │ ├── automodel.yaml
│ │ │ ├── fsdp.yaml
│ │ │ ├── megatron.yaml
│ │ │ ├── torchtitan.yaml
│ │ │ └── veomni.yaml
│ │ ├── ppo_megatron_trainer.yaml
│ │ ├── ppo_trainer.yaml
│ │ ├── profiler/
│ │ │ └── profiler.yaml
│ │ ├── ref/
│ │ │ ├── dp_ref.yaml
│ │ │ ├── megatron_ref.yaml
│ │ │ ├── ref.yaml
│ │ │ ├── torchtitan_ref.yaml
│ │ │ └── veomni_ref.yaml
│ │ ├── reward/
│ │ │ └── reward.yaml
│ │ ├── rollout/
│ │ │ └── rollout.yaml
│ │ └── sft_trainer_engine.yaml
│ ├── constants_ppo.py
│ ├── main_eval.py
│ ├── main_generation_server.py
│ ├── main_ppo.py
│ ├── ppo/
│ │ ├── __init__.py
│ │ ├── core_algos.py
│ │ ├── metric_utils.py
│ │ ├── prefix_grouper_utils.py
│ │ ├── ray_trainer.py
│ │ ├── reward.py
│ │ ├── rollout_corr_helper.py
│ │ └── utils.py
│ ├── runtime_env.yaml
│ ├── sft_trainer.py
│ └── sft_trainer_ray.py
├── utils/
│ ├── __init__.py
│ ├── activation_offload.py
│ ├── attention_utils.py
│ ├── chat_template.py
│ ├── checkpoint/
│ │ ├── __init__.py
│ │ ├── checkpoint_handler.py
│ │ ├── checkpoint_manager.py
│ │ ├── fsdp_checkpoint_manager.py
│ │ └── megatron_checkpoint_manager.py
│ ├── config.py
│ ├── dataset/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── dataset_utils.py
│ │ ├── multiturn_sft_dataset.py
│ │ ├── rl_dataset.py
│ │ ├── rm_dataset.py
│ │ └── vision_utils.py
│ ├── debug/
│ │ ├── __init__.py
│ │ ├── metrics.py
│ │ ├── performance.py
│ │ └── trajectory_tracker.py
│ ├── device.py
│ ├── distributed.py
│ ├── experimental/
│ │ ├── __init__.py
│ │ └── torch_functional.py
│ ├── flops_counter.py
│ ├── fp8_utils.py
│ ├── fs.py
│ ├── fsdp_utils.py
│ ├── groupwise.py
│ ├── hdfs_io.py
│ ├── import_utils.py
│ ├── kernel/
│ │ ├── __init__.py
│ │ ├── fp8_kernel.py
│ │ ├── kernels.py
│ │ └── linear_cross_entropy.py
│ ├── logger/
│ │ ├── __init__.py
│ │ └── aggregate_logger.py
│ ├── logging_utils.py
│ ├── megatron/
│ │ ├── __init__.py
│ │ ├── dist_checkpointing.py
│ │ ├── memory.py
│ │ ├── optimizer.py
│ │ ├── pipeline_parallel.py
│ │ ├── router_replay_patch.py
│ │ ├── router_replay_utils.py
│ │ ├── sequence_parallel.py
│ │ └── tensor_parallel.py
│ ├── megatron_peft_utils.py
│ ├── megatron_utils.py
│ ├── memory_utils.py
│ ├── metric/
│ │ ├── __init__.py
│ │ └── utils.py
│ ├── model.py
│ ├── net_utils.py
│ ├── npu_flash_attn_utils.py
│ ├── profiler/
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── empty_annotations.py
│ │ ├── mstx_profile.py
│ │ ├── nvtx_profile.py
│ │ ├── performance.py
│ │ ├── profile.py
│ │ └── torch_profile.py
│ ├── py_functional.py
│ ├── qat/
│ │ ├── __init__.py
│ │ ├── core.py
│ │ ├── linear.py
│ │ ├── quantizer.py
│ │ └── vllm_patch.py
│ ├── ray_utils.py
│ ├── rendezvous/
│ │ ├── __init__.py
│ │ └── ray_backend.py
│ ├── reward_score/
│ │ ├── __init__.py
│ │ ├── geo3k.py
│ │ ├── gsm8k.py
│ │ ├── math_batch.py
│ │ ├── math_dapo.py
│ │ ├── math_reward.py
│ │ ├── math_verify.py
│ │ ├── prime_code/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── testing_util.py
│ │ │ └── utils.py
│ │ ├── prime_math/
│ │ │ ├── __init__.py
│ │ │ ├── grader.py
│ │ │ └── math_normalize.py
│ │ ├── rlla.py
│ │ ├── sandbox_fusion/
│ │ │ ├── __init__.py
│ │ │ └── utils.py
│ │ └── search_r1_like_qa_em.py
│ ├── rollout_skip.py
│ ├── rollout_trace.py
│ ├── seqlen_balancing.py
│ ├── sglang/
│ │ └── sglang_fp8_utils.py
│ ├── tensordict_utils.py
│ ├── tokenizer.py
│ ├── torch_dtypes.py
│ ├── torch_functional.py
│ ├── tracking.py
│ ├── transformers_compat.py
│ ├── trtllm/
│ │ └── trtllm_fp8_utils.py
│ ├── ulysses.py
│ └── vllm/
│ ├── __init__.py
│ ├── npu_vllm_patch.py
│ ├── patch.py
│ ├── utils.py
│ └── vllm_fp8_utils.py
├── version/
│ └── version
└── workers/
├── __init__.py
├── actor/
│ ├── __init__.py
│ ├── base.py
│ ├── dp_actor.py
│ └── megatron_actor.py
├── config/
│ ├── __init__.py
│ ├── actor.py
│ ├── critic.py
│ ├── engine.py
│ ├── megatron_peft.py
│ ├── model.py
│ ├── optimizer.py
│ ├── reward.py
│ └── rollout.py
├── critic/
│ ├── __init__.py
│ ├── base.py
│ ├── dp_critic.py
│ └── megatron_critic.py
├── engine/
│ ├── __init__.py
│ ├── automodel/
│ │ ├── __init__.py
│ │ ├── transformer_impl.py
│ │ └── utils.py
│ ├── base.py
│ ├── fsdp/
│ │ ├── __init__.py
│ │ ├── transformer_impl.py
│ │ └── utils.py
│ ├── megatron/
│ │ ├── __init__.py
│ │ ├── transformer_impl.py
│ │ └── utils.py
│ ├── mindspeed/
│ │ ├── __init__.py
│ │ └── transformer_impl.py
│ ├── torchtitan/
│ │ ├── __init__.py
│ │ ├── transformer_impl.py
│ │ └── utils.py
│ ├── utils.py
│ └── veomni/
│ ├── __init__.py
│ ├── transformer_impl.py
│ └── utils.py
├── engine_workers.py
├── fsdp_workers.py
├── megatron_workers.py
├── reward_manager/
│ ├── __init__.py
│ ├── abstract.py
│ ├── batch.py
│ ├── dapo.py
│ ├── naive.py
│ ├── prime.py
│ └── registry.py
├── rollout/
│ ├── __init__.py
│ ├── base.py
│ ├── hf_rollout.py
│ ├── naive/
│ │ ├── __init__.py
│ │ └── naive_rollout.py
│ ├── replica.py
│ ├── schemas.py
│ ├── sglang_rollout/
│ │ ├── __init__.py
│ │ ├── async_sglang_server.py
│ │ ├── http_server_engine.py
│ │ ├── sglang_rollout.py
│ │ └── utils.py
│ ├── tokenizer.py
│ ├── trtllm_rollout/
│ │ ├── trtllm_async_rollout.md
│ │ ├── trtllm_async_server.py
│ │ ├── trtllm_rollout.py
│ │ └── trtllm_worker_extension.py
│ ├── utils.py
│ └── vllm_rollout/
│ ├── __init__.py
│ ├── bucketed_weight_transfer.py
│ ├── utils.py
│ ├── vllm_async_server.py
│ └── vllm_rollout.py
├── sharding_manager/
│ ├── __init__.py
│ ├── base.py
│ └── fsdp_ulysses.py
└── utils/
├── __init__.py
├── losses.py
└── padding.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gemini/config.yaml
================================================
have_fun: false
code_review:
disable: false
comment_severity_threshold: HIGH
max_review_comments: -1
pull_request_opened:
help: false
summary: false
code_review: true
ignore_patterns: []
================================================
FILE: .git-blame-ignore-revs
================================================
# Local usage: git config blame.ignoreRevsFile .git-blame-ignore-revs
# [dev] feat: immigrate from yapf & pylint to ruff based on pre-commit
# Changed 268 files, +10k/-9k lines. This is the biggest formatter change.
b00f77d8559b48d57a33c0132a5ba1c81891a536
# [ci] refactor: reduce ruff line-length from 300 to 120
# Changed 238 files, +6k/-1k lines. Global formatting change.
00a10a8ef389556f957a2f36132b2358fd6a109f
# [Lint] fix: linting errors in all files
# Changed 179 files, +1k/-3k lines. Global lint fix.
8e5ad4688a13de81727c014a3c2e2fb26324bc20
================================================
FILE: .github/CODEOWNERS
================================================
/docs @eric-haibin-lin @zhaochenyang20 @hongpeng-guo
/docs/amd_tutorial @yushengsu-thu
/docs/slang_multiturn @zhaochenyang20 @SwordFaith
/docs/ascend_tutorial @FightingZhen
/third_party/sglang @zhaochenyang20 @SwordFaith
/third_party/vllm @PeterSH6 @wuxibin89
/examples/grpo_trainer @vermouth1992 @PeterSH6 @tardis-key @FightingZhen @ji-huazhong
/verl/single_controller @zw0610 @wuxibin89 @hongpeng-guo
/verl/trainer @eric-haibin-lin @vermouth1992 @tongyx361 @PeterSH6
/verl/models/mcore @ISEEKYAN @vermouth1992
/verl/models/transformers @vermouth1992 @PeterSH6 @tardis-key @FightingZhen @ji-huazhong
/verl/workers/engine @eric-haibin-lin @vermouth1992 @ZihengJiang
/verl/workers/roles @eric-haibin-lin @vermouth1992 @ZihengJiang
/verl/workers/engine/fsdp @eric-haibin-lin @vermouth1992 @ZihengJiang
/verl/workers/rollout/vllm_rollout @wuxibin89 @PeterSH6 @chenhaiq
/verl/workers/rollout/sglang_rollout @zhaochenyang20 @SwordFaith @chenhaiq
/verl/workers/actor/megatron_actor.py @ISEEKYAN @vermouth1992
/verl/workers/critic/megatron_critic.py @ISEEKYAN @vermouth1992
/verl/workers/megatron_workers.py @ISEEKYAN @vermouth1992
/verl/experimental @wuxibin89 @ArronHZG
/tests/single_controller @zw0610 @wuxibin89
/tests/trainer @eric-haibin-lin @vermouth1992 @tongyx361 @PeterSH6
/tests/workers/rollout/vllm_rollout @wuxibin89 @PeterSH6 @chenhaiq
================================================
FILE: .github/ISSUE_TEMPLATE/bug-report.yml
================================================
# modified from https://github.com/huggingface/transformers/blob/main/.github/ISSUE_TEMPLATE/bug-report.yml?plain=1
name: "\U0001F41B Bug Report"
description: Submit a bug report to help us improve verl
labels: [ "bug" ]
body:
- type: markdown
attributes:
value: |
Thanks for taking the time to fill out this bug report! 🤗
- type: textarea
id: system-info
attributes:
label: System Info
description: Please share your system info with us. You can run the command `python scripts/diagnose.py` and copy-paste its output below.
placeholder: verl version, platform, python version, ...
validations:
required: true
- type: checkboxes
id: information-scripts-examples
attributes:
label: Information
description: 'The problem arises when using:'
options:
- label: "The official example scripts"
- label: "My own modified scripts"
- type: checkboxes
id: information-tasks
attributes:
label: Tasks
description: "The tasks I am working on are:"
options:
- label: "An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)"
- label: "My own task or dataset (give details below)"
- type: textarea
id: reproduction
validations:
required: true
attributes:
label: Reproduction
description: |
Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
Please include relevant config information with your code.
If you have code snippets, error messages, stack traces please provide them here as well.
Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.
placeholder: |
Steps to reproduce the behavior:
1.
2.
3.
- type: textarea
id: expected-behavior
validations:
required: true
attributes:
label: Expected behavior
description: "A clear and concise description of what you would expect to happen."
================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: true
version: 0.1
================================================
FILE: .github/ISSUE_TEMPLATE/feature-request.yml
================================================
# modified from https://github.com/huggingface/transformers/blob/main/.github/ISSUE_TEMPLATE/feature-request.yml?plain=1
name: "\U0001F680 Feature request"
description: Submit a proposal/request for a new verl feature
labels: [ "Feature request" ]
body:
- type: textarea
id: feature-request
validations:
required: true
attributes:
label: Feature request
description: |
A clear and concise description of the feature proposal. Please provide a link to the paper and code in case they exist.
- type: textarea
id: motivation
validations:
required: true
attributes:
label: Motivation
description: |
Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too.
- type: textarea
id: contribution
validations:
required: true
attributes:
label: Your contribution
description: |
Is there any way that you could help, e.g. by submitting a PR? Make sure to read the CONTRIBUTING.MD [readme](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md)
================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
### What does this PR do?
> Add **concise** overview of what this PR aims to achieve or accomplish. Reference related GitHub issues and PRs that help with the review.
### Checklist Before Starting
- [ ] Search for similar PRs. Paste at least one query link here: ...
- [ ] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI)
- `{modules}` include `fsdp`, `megatron`, `veomni`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`, `cfg`, `reward`, `fully_async`, `one_step_off`
- If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]`
- `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test`
- If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title.
- Example: `[BREAKING][fsdp, megatron] feat: dynamic batching`
### Test
> For changes that can not be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results like training curve plots, evaluation results, etc.
### API and Usage Example
> Demonstrate how the API changes if any, and provide usage example(s) if possible.
```python
# Add code snippet or script demonstrating how to use this
```
### Design & Code Changes
> Demonstrate the high-level design if this PR is complex, and list the specific changes.
### Checklist Before Submitting
> [!IMPORTANT]
> Please check all the following items before requesting a review, otherwise the reviewer might deprioritize this PR for review.
- [ ] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md).
- [ ] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always`
- [ ] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs).
- [ ] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ...
- [ ] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ). (If not accessible, please try [the Feishu group (飞书群)](https://applink.larkoffice.com/client/chat/chatter/add_by_link?link_token=772jd4f1-cd91-441e-a820-498c6614126a).)
- [ ] If your PR is related to the `recipe` submodule, please also update the reference to the submodule commit via `git submodule update --remote` or `cd recipe && git pull origin main`.
================================================
FILE: .github/dependabot.yml
================================================
## Enabled the dependabot to check the dependencies of the project
## Dependabot will open pull requests to update dependencies automatically
version: 2
updates:
- package-ecosystem: pip
directory: "/"
schedule:
interval: weekly
================================================
FILE: .github/workflows/README.md
================================================
### Adding a New Workflow
When adding a new workflow for continuous integration (CI), you have two runner options: a fixed runner or a machine from Vemlp.
- **Fixed Runner**: To use a fixed runner, specify it in your workflow using the `runs-on` keyword, like `runs-on: [L20x8]`.
- **Vemlp Runner**: Opting for a Vemlp machine allows you to launch tasks elastically.
Here is a template to assist you. This template is designed for using Vemlp machines. Currently, for each workflow, you need to create a `setup` and a `cleanup` job. When using this template, the main parts you need to modify are the `IMAGE` environment variable and the specific `job steps`.
```yaml
name: Your Default Workflow
on:
push:
branches:
- main
- v0.*
pull_request:
branches:
- main
- v0.*
paths:
- "**/*.py"
- ".github/workflows/template.yml"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
permissions:
contents: read
env:
IMAGE: "your vemlp image" # e.g. "verl-ci-cn-beijing.cr.volces.com/verlai/verl:sgl059.dev2"
DYNAMIC_RUNNER_URL: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" # public veFaas api
jobs:
setup:
if: github.repository_owner == 'verl-project'
runs-on: ubuntu-latest
outputs:
runner-label: ${{ steps.create-runner.outputs.runner-label }}
task-id: ${{ steps.create-runner.outputs.task-id }}
steps:
- uses: actions/checkout@v4
- id: create-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "create"
faas-url: "${{ env.DYNAMIC_RUNNER_URL }}"
image: "${{ env.IMAGE }}"
your_job:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'default-runner' }}"]
steps:
xxxx # your jobs
cleanup:
runs-on: ubuntu-latest
needs: [setup, your_job]
if: always()
steps:
- id: destroy-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "destroy"
faas-url: "${{ env.DYNAMIC_RUNNER_URL }}"
task-id: "${{ needs.setup.outputs.task-id }}"
```
### Model and Dataset
To avoid CI depending on the network, we pre-download datasets to an NFS share on the CI machine. The path for models is \${HOME}/models and the path for datasets is \${HOME}/models/hf_data.
================================================
FILE: .github/workflows/check-pr-title.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of test that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all test scripts whose file name does not have the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default runs all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
on:
pull_request:
types: [opened, edited, synchronize]
jobs:
check-title:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Run PR title checker
run: python3 tests/special_sanity/check_pr_title.py
env:
PR_TITLE: ${{ github.event.pull_request.title }}
- name: Run PR description checker
run: python3 tests/special_sanity/check_pr_description.py
env:
PR_TITLE: ${{ github.event.pull_request.title }}
GITHUB_EVENT_PATH: ${{ github.event_path }}
================================================
FILE: .github/workflows/cpu_unit_tests.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of test that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all test scripts whose file name does not have the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default runs all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
name: cpu_unit_tests
on:
# Trigger the workflow on push or pull request,
# but only for the main and v0.* branches
push:
branches:
- main
- v0.*
pull_request:
branches:
- main
- v0.*
paths:
- "**/*.py"
- .github/workflows/cpu_unit_tests.yml
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare read-only permission for repository contents.
permissions:
contents: read
env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm017.dev2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs:
setup:
if: github.repository_owner == 'verl-project'
runs-on: ubuntu-latest
outputs:
runner-label: ${{ steps.create-runner.outputs.runner-label }}
mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
steps:
- uses: actions/checkout@v4
- id: create-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "create"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-image: "${{ env.IMAGE }}"
cpu_unit_tests:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 20 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
TORCH_COMPILE_DISABLE: 1
TORCHINDUCTOR_DISABLE: 1
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
pip3 install --upgrade "transformers>=5.0.0"
- name: Download datasets
run: |
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
python3 examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/models/hf_data/hiyouga/geometry3k
- name: Running CPU unit tests
run: |
echo '[pytest]' > pytest.ini
echo 'python_files = *_on_cpu.py' >> pytest.ini
pytest -s -x --asyncio-mode=auto tests/
cleanup:
runs-on: ubuntu-latest
needs: [setup, cpu_unit_tests]
if: always()
steps:
- id: destroy-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "destroy"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
================================================
FILE: .github/workflows/doc.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of test that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all test scripts whose file name does not have the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default runs all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
name: doc_test
on:
# Trigger the workflow on push or pull request,
# but only for the main and v0.* branches
push:
branches:
- main
- v0.*
pull_request:
branches:
- main
- v0.*
paths:
- "**/*.py"
- "docs/**"
- .github/workflows/doc.yml
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions: read-only for contents, plus the write scopes required to deploy pages.
permissions:
contents: read # for checkout
pages: write # for deploy-pages
id-token: write # for deploy-pages
jobs:
doc_test:
runs-on: ubuntu-latest
timeout-minutes: 5 # Increase this timeout value as needed
strategy:
matrix:
python-version: ["3.10"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
pip install -r docs/requirements-docs.txt
- name: Run doc make html
run: |
cd docs
make clean
make html SPHINXOPTS="--keep-going -w _build/sphinx.log"
if grep -q ": ERROR:" _build/sphinx.log; then
echo "🚨 Sphinx doc build contained ERRORs - see _build/sphinx.log"
exit 1
fi
if grep -q "WARNING: document isn't included in any toctree" _build/sphinx.log; then
echo "🚨 Sphinx doc build contained WARNING. Please include newly added docs in index.rst. See _build/sphinx.log for details"
exit 1
fi
if grep -q "WARNING: Inline emphasis" _build/sphinx.log; then
echo "🚨 Sphinx doc build contained WARNING. Please check inline emphasis is correct. See _build/sphinx.log for details"
exit 1
fi
if grep -q "WARNING: Definition list ends without a blank line" _build/sphinx.log; then
echo "🚨 Sphinx doc build contained WARNING. Please check if the indentation is correct. See _build/sphinx.log for details"
exit 1
fi
================================================
FILE: .github/workflows/docker-build-ascend-a2.yml
================================================
name: docker-build-ascend-a2
on:
workflow_dispatch:
push:
branches: ["main"]
paths:
- "docker/ascend/Dockerfile.ascend_8.5.0_a2"
- ".github/workflows/docker-build-ascend-a2.yml"
release:
types: [published]
schedule:
- cron: "0 16 * * *"
jobs:
build-ascend-image-a2:
if: ${{ github.event_name != 'pull_request' && github.repository_owner == 'verl-project' }}
runs-on: ubuntu-latest
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-build-ascend-image-a2
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
steps:
- name: Remove unnecessary parts in github actions runners to free up disk space
uses: jlumbroso/free-disk-space@v1.3.1
with:
tool-cache: true
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Get base image name and tag
id: base_image
run: |
BASE_IMAGE_FULL=$(grep '^FROM' ./docker/ascend/Dockerfile.ascend_8.5.0_a2 | head -1 | cut -d' ' -f2)
echo "Base image full: $BASE_IMAGE_FULL"
BASE_IMAGE_TAG=$(echo "$BASE_IMAGE_FULL" | cut -d':' -f2)
echo "Base image tag: $BASE_IMAGE_TAG"
NEW_IMAGE_NAME="verl-$BASE_IMAGE_TAG"
echo "New image name: $NEW_IMAGE_NAME"
echo "base_image_tag=$BASE_IMAGE_TAG" >> "$GITHUB_OUTPUT"
echo "new_image_name=$NEW_IMAGE_NAME" >> "$GITHUB_OUTPUT"
- name: Get image tag
id: version
run: |
BRANCH_NAME=$(echo "${{ github.ref }}" | sed 's/refs\/heads\///g' | sed 's/[^a-zA-Z0-9._-]/_/g')
if [ "${{ github.event_name }}" = "release" ]; then
echo "tag=${{ steps.base_image.outputs.new_image_name }}-${{ github.event.release.tag_name }}" >> "$GITHUB_OUTPUT"
elif [ "$BRANCH_NAME" = "main" ]; then
echo "tag=${{ steps.base_image.outputs.new_image_name }}-latest" >> "$GITHUB_OUTPUT"
fi
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Quay.io
uses: docker/login-action@v3
with:
registry: quay.io
username: ${{ secrets.QUAY_USERNAME }}
password: ${{ secrets.QUAY_PASSWORD }}
- name: Clean Docker cache before build
run: |
docker system prune -a -f --volumes || true
- name: Build and push images Quay
uses: docker/build-push-action@v6
with:
context: .
platforms: linux/amd64,linux/arm64
file: ./docker/ascend/Dockerfile.ascend_8.5.0_a2
push: true
tags: |
quay.io/ascend/verl:${{ steps.version.outputs.tag }}
cache-from: type=gha
cache-to: type=gha,mode=max
build-args: |
BUILDKIT_INLINE_CACHE=1
================================================
FILE: .github/workflows/docker-build-ascend-a3.yml
================================================
name: docker-build-ascend-a3
on:
workflow_dispatch:
push:
branches: ["main"]
paths:
- "docker/ascend/Dockerfile.ascend_8.5.0_a3"
- ".github/workflows/docker-build-ascend-a3.yml"
release:
types: [published]
schedule:
- cron: "0 19 * * *"
jobs:
build-ascend-image-a3:
if: ${{ github.event_name != 'pull_request' && github.repository_owner == 'verl-project' }}
runs-on: ubuntu-latest
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-build-ascend-image-a3
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
steps:
- name: Remove unnecessary parts in github actions runners to free up disk space
uses: jlumbroso/free-disk-space@v1.3.1
with:
tool-cache: true
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Get base image name and tag
id: base_image
run: |
BASE_IMAGE_FULL=$(grep '^FROM' ./docker/ascend/Dockerfile.ascend_8.5.0_a3 | head -1 | cut -d' ' -f2)
echo "Base image full: $BASE_IMAGE_FULL"
BASE_IMAGE_TAG=$(echo "$BASE_IMAGE_FULL" | cut -d':' -f2)
echo "Base image tag: $BASE_IMAGE_TAG"
NEW_IMAGE_NAME="verl-$BASE_IMAGE_TAG"
echo "New image name: $NEW_IMAGE_NAME"
echo "base_image_tag=$BASE_IMAGE_TAG" >> "$GITHUB_OUTPUT"
echo "new_image_name=$NEW_IMAGE_NAME" >> "$GITHUB_OUTPUT"
- name: Get image tag
id: version
run: |
BRANCH_NAME=$(echo "${{ github.ref }}" | sed 's/refs\/heads\///g' | sed 's/[^a-zA-Z0-9._-]/_/g')
if [ "${{ github.event_name }}" = "release" ]; then
echo "tag=${{ steps.base_image.outputs.new_image_name }}-${{ github.event.release.tag_name }}" >> "$GITHUB_OUTPUT"
elif [ "$BRANCH_NAME" = "main" ]; then
echo "tag=${{ steps.base_image.outputs.new_image_name }}-latest" >> "$GITHUB_OUTPUT"
fi
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Quay.io
uses: docker/login-action@v3
with:
registry: quay.io
username: ${{ secrets.QUAY_USERNAME }}
password: ${{ secrets.QUAY_PASSWORD }}
- name: Clean Docker cache before build
run: |
docker system prune -a -f --volumes || true
- name: Build and push images Quay
uses: docker/build-push-action@v6
with:
context: .
platforms: linux/amd64,linux/arm64
file: ./docker/ascend/Dockerfile.ascend_8.5.0_a3
push: true
tags: |
quay.io/ascend/verl:${{ steps.version.outputs.tag }}
cache-from: type=gha
cache-to: type=gha,mode=max
build-args: |
BUILDKIT_INLINE_CACHE=1
================================================
FILE: .github/workflows/e2e_ascend.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default, tests are run with GPUs available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all scripts with file without the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default runs all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
name: e2e_ascend
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
- v0.*
pull_request:
branches:
- main
paths:
- ".github/workflows/e2e_ascend.yml"
- "examples/data_preprocess/**"
- "examples/grpo_trainer/**"
- "examples/ppo_trainer/**"
- "examples/sft/**"
- "verl/experimental/one_step_off_policy/**"
- "tests/special_npu/**"
- "tests/special_sanity/check_device_api_usage.py"
- "verl/**"
- "pyproject.toml"
- "requirements-npu.txt"
- "setup.py"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
permissions:
contents: read
jobs:
llm_rl_job:
if: github.repository_owner == 'verl-project'
name: E2E Ascend testing for RL training scenarios of LLM models
runs-on: linux-aarch64-a2b3-8
timeout-minutes: 120
container:
image: swr.cn-southwest-2.myhuaweicloud.com/modelfoundry/ascend-ci/verl/verl:verl-8.5.0-910b-ubuntu22.04-py3.11-latest
options: >-
--shm-size 16g
env:
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- name: Check npu and CANN info
run: |
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
npu-smi info
- name: Check initial pip list from image
run: |
pip list
# Check out the repository under test with full history (fetch-depth: 0) into a cleaned workspace.
# The step name refers to verl-project/verl, matching the owner this job is restricted to.
- name: Checkout verl-project/verl repo
uses: actions/checkout@v4
with:
fetch-depth: 0
clean: true
- name: Install the current repository
run: |
pip install -r requirements-npu.txt
pip install -e .
- name: Check final pip list
run: |
pip list
- name: Preprocess gsm8k dataset
run: |
python examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k
- name: Running gsm8k e2e training tests with PPO on ASCEND NPU (FSDP backend)
run: |
ray stop --force
bash tests/special_npu/run_qwen3_06b_ppo.sh
rm -rf $HOME/ckpts
- name: Running gsm8k e2e training tests with GRPO on ASCEND NPU (FSDP backend)
run: |
ray stop --force
bash tests/special_npu/run_qwen2_5_05b_grpo.sh
rm -rf $HOME/ckpts
- name: Running gsm8k e2e training tests with GRPO on ASCEND NPU (MindSpeed backend)
run: |
ray stop --force
USE_DIST_CKPT=True bash tests/special_npu/run_qwen2_5_05b_grpo_mindspeed.sh
rm -rf $HOME/dist_ckpt/qwen2_5_05b_grpo_mindspeed
rm -rf $HOME/ckpts
- name: Running gsm8k e2e training tests with GRPO on ASCEND NPU (MindSpeed backend, MoE Model)
run: |
ray stop --force
USE_DIST_CKPT=True USE_DUMMY_MODEL=True DUMMY_MODEL_CONFIG_PATH=tests/special_e2e/ppo_trainer/expert_parallel/qwen3moe_minimal.json DUMMY_MODEL_PATH=$HOME/dist_ckpt/qwen3_30b_grpo_mindspeed bash tests/special_npu/run_qwen3_30b_grpo_mindspeed.sh
vlm_rl_job:
if: github.repository_owner == 'verl-project'
name: E2E Ascend testing for RL training scenarios of VLM models
runs-on: linux-aarch64-a2b3-8
timeout-minutes: 120
container:
image: swr.cn-southwest-2.myhuaweicloud.com/modelfoundry/ascend-ci/verl/verl:verl-8.5.0-910b-ubuntu22.04-py3.11-latest
options: >-
--shm-size 16g
env:
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- name: Check npu and CANN info
run: |
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
npu-smi info
- name: Check initial pip list from image
run: |
pip list
# Check out the repository under test with full history (fetch-depth: 0) into a cleaned workspace.
# The step name refers to verl-project/verl, matching the owner this job is restricted to.
- name: Checkout verl-project/verl repo
uses: actions/checkout@v4
with:
fetch-depth: 0
clean: true
- name: Install the current repository
run: |
pip install -r requirements-npu.txt
pip install -e .
- name: Check final pip list
run: |
pip list
- name: Preprocess geo3k dataset
run: |
python examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/.cache/datasets/hiyouga/geometry3k
- name: Running geo3k e2e training tests with GRPO on ASCEND NPU
run: |
ray stop --force
bash tests/special_npu/run_qwen2_5_vl_3b_npu.sh
rm -rf $HOME/ckpts
================================================
FILE: .github/workflows/e2e_fully_async_policy.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default, tests are run with GPUs available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all scripts with file without the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default runs all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
name: e2e_fully_async_policy
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
# For push, for now only anti-patterns are specified so it is more conservative
# and achieves higher coverage.
push:
branches:
- main
- v0.*
paths:
- "**/*.py"
- "!**/*.md"
- "!**/*.sh"
# Other entrypoints
- "!examples/*trainer*"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# Home: re-include everything under the feature directory after the exclusions above.
# The trailing `/**` is required; a bare directory path only matches a file with that exact name.
- "verl/experimental/fully_async_policy/**"
pull_request:
branches:
- main
- v0.*
paths:
- "**/*.py"
- "!**/*.md"
- "!**/*.sh"
# Other entrypoints
- "!examples/**"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# Home (the trailing `/**` is required to match files inside the directory)
- "verl/experimental/fully_async_policy/**"
# Entrypoints
- ".github/workflows/e2e_fully_async_policy.yml"
- "examples/data_preprocess/gsm8k.py"
- "tests/special_e2e/run_fully_async_policy.sh"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm017.dev2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs:
# Calls volcengine/vemlp-github-runner in "create" mode with the workflow-level IMAGE and
# DYNAMIC_RUNNER_ENDPOINT values, and re-exports the action's `runner-label` and `mlp-task-id`
# outputs as job outputs. Downstream jobs read `runner-label` for `runs-on`, and the cleanup
# job reads `mlp-task-id`. Only runs when the repository owner is verl-project.
setup:
if: github.repository_owner == 'verl-project'
runs-on: ubuntu-latest
outputs:
runner-label: ${{ steps.create-runner.outputs.runner-label }}
mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
steps:
- uses: actions/checkout@v4
- id: create-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "create"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-image: "${{ env.IMAGE }}"
# Test FSDP2 strategy
e2e_fully_async_policy_fsdp2:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 10 # Increase timeout for async training
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
ACTOR_STRATEGY: "fsdp2"
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
pip3 install cupy-cuda12x==13.6.0
- name: Prepare GSM8K dataset
run: |
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
- name: Running the E2E test with fully_async_policy algorithm (FSDP2)
run: |
ray stop --force
bash tests/special_e2e/run_fully_async_policy.sh
# Test Megatron strategy
e2e_fully_async_policy_megatron:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 10 # Increase timeout for async training
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
ACTOR_STRATEGY: "megatron"
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
pip3 install cupy-cuda12x==13.6.0
- name: Prepare GSM8K dataset
run: |
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
- name: Running the E2E test with fully_async_policy algorithm (Megatron)
run: |
ray stop --force
bash tests/special_e2e/run_fully_async_policy.sh
# Calls volcengine/vemlp-github-runner in "destroy" mode for the task id created by `setup`.
cleanup:
runs-on: ubuntu-latest
# Must wait for EVERY job that runs on the dynamic runner; otherwise the runner can be
# destroyed while the Megatron job is still executing on it.
needs: [setup, e2e_fully_async_policy_fsdp2, e2e_fully_async_policy_megatron]
# always() so this job still runs when a test job fails or is cancelled.
if: always()
steps:
- id: destroy-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "destroy"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
================================================
FILE: .github/workflows/e2e_fully_async_policy_ascend.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default, tests are run with GPUs available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all scripts with file without the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default runs all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
name: e2e_fully_async_policy_ascend
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
# For push, for now only anti-patterns are specified so it is more conservative
# and achieves higher coverage.
push:
branches:
- main
- v0.*
paths:
- "**/*.py"
- "!**/*.md"
- "!**/*.sh"
# Other entrypoints
- "!examples/*trainer*"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# Home: re-include everything under the feature directory after the exclusions above.
# The trailing `/**` is required; a bare directory path only matches a file with that exact name.
- "verl/experimental/fully_async_policy/**"
pull_request:
branches:
- main
- v0.*
paths:
- "**/*.py"
- "!**/*.md"
- "!**/*.sh"
# Other entrypoints
- "!examples/**"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# Home (the trailing `/**` is required to match files inside the directory)
- "verl/experimental/fully_async_policy/**"
# Entrypoints
- ".github/workflows/e2e_fully_async_policy_ascend.yml"
- "examples/data_preprocess/gsm8k.py"
- "tests/special_e2e/run_fully_async_policy.sh"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
jobs:
# Test FSDP2 strategy
e2e_fully_async_policy_fsdp2_ascend:
if: github.repository_owner == 'verl-project'
runs-on: linux-aarch64-a2b3-8
timeout-minutes: 60 # Increase this timeout value as needed
container:
image: swr.cn-southwest-2.myhuaweicloud.com/modelfoundry/ascend-ci/verl/verl:verl-8.5.0-910b-ubuntu22.04-py3.11-latest
options: >-
--shm-size 16g
env:
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
ACTOR_STRATEGY: "fsdp2"
device_name: "npu"
steps:
- name: Check npu and CANN info
run: |
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
npu-smi info
- name: Check initial pip list from image
run: |
pip list
- name: Checkout verl-project/verl repo
uses: actions/checkout@v4
with:
fetch-depth: 0
clean: true
- name: Install the current repository
run: |
pip install -r requirements-npu.txt
pip install --no-deps -e .
- name: Check final pip list
run: |
pip list
- name: Prepare weights
run: |
ln -s /root/.cache/models ~/models
- name: Prepare GSM8K dataset
run: |
python examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k
- name: Running the E2E test with fully_async_policy algorithm (FSDP2)
run: |
ray stop --force
bash tests/special_e2e/run_fully_async_policy.sh
# Test Megatron strategy
e2e_fully_async_policy_megatron_ascend:
if: github.repository_owner == 'verl-project'
runs-on: linux-aarch64-a2b3-8
timeout-minutes: 60 # Increase this timeout value as needed
container:
image: swr.cn-southwest-2.myhuaweicloud.com/modelfoundry/ascend-ci/verl/verl:verl-8.5.0-910b-ubuntu22.04-py3.11-latest
options: >-
--shm-size 16g
env:
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
ACTOR_STRATEGY: "megatron"
device_name: "npu"
steps:
- name: Check npu and CANN info
run: |
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
npu-smi info
- name: Check initial pip list from image
run: |
pip list
- name: Checkout verl-project/verl repo
uses: actions/checkout@v4
with:
fetch-depth: 0
clean: true
- name: Install the current repository
run: |
pip install -r requirements-npu.txt
pip install --no-deps -e .
- name: Check final pip list
run: |
pip list
- name: Prepare weights
run: |
ln -s /root/.cache/models ~/models
- name: Prepare GSM8K dataset
run: |
python examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k
- name: Running the E2E test with fully_async_policy algorithm (Megatron)
run: |
ray stop --force
bash tests/special_e2e/run_fully_async_policy.sh
================================================
FILE: .github/workflows/e2e_one_step_off_policy.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default, tests are run with GPUs available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all scripts with file without the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default runs all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
name: e2e_one_step_off_policy
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
# For push, for now only anti-patterns are specified so it is more conservative
# and achieves higher coverage.
push:
branches:
- main
- v0.*
paths:
- "**/*.py"
- "!**/*.md"
- "!**/*.sh"
# Other entrypoints
- "!examples/*trainer*"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# Home: re-include everything under the feature directory after the exclusions above.
# The trailing `/**` is required; a bare directory path only matches a file with that exact name.
- "verl/experimental/one_step_off_policy/**"
pull_request:
branches:
- main
- v0.*
paths:
- "**/*.py"
- "!**/*.md"
- "!**/*.sh"
# Other entrypoints
- "!examples/**"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# Home (the trailing `/**` is required to match files inside the directory)
- "verl/experimental/one_step_off_policy/**"
# Entrypoints
- ".github/workflows/e2e_one_step_off_policy.yml"
- "examples/data_preprocess/gsm8k.py"
- "tests/special_e2e/run_one_step_off_policy.sh"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm017.dev2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs:
setup:
if: github.repository_owner == 'verl-project'
runs-on: ubuntu-latest
outputs:
runner-label: ${{ steps.create-runner.outputs.runner-label }}
mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
steps:
- uses: actions/checkout@v4
- id: create-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "create"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-image: "${{ env.IMAGE }}"
# Test FSDP2 strategy
e2e_one_step_off_policy_fsdp2:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 10 # Increase timeout for async training
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
ACTOR_STRATEGY: "fsdp2"
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
pip3 install cupy-cuda12x==13.6.0
- name: Prepare GSM8K dataset
run: |
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
- name: Running the E2E test with one_step_off_policy algorithm (FSDP2)
run: |
ray stop --force
bash tests/special_e2e/run_one_step_off_policy.sh
# Test Megatron strategy
e2e_one_step_off_policy_megatron:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 10 # Increase timeout for async training
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
ACTOR_STRATEGY: "megatron"
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
pip3 install cupy-cuda12x==13.6.0
- name: Prepare GSM8K dataset
run: |
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
- name: Running the E2E test with one_step_off_policy algorithm (Megatron)
run: |
ray stop --force
bash tests/special_e2e/run_one_step_off_policy.sh
cleanup:
runs-on: ubuntu-latest
needs:
[setup, e2e_one_step_off_policy_fsdp2, e2e_one_step_off_policy_megatron]
if: always()
steps:
- id: destroy-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "destroy"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
================================================
FILE: .github/workflows/e2e_one_step_off_policy_ascend.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default, tests are run with GPUs available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all scripts with file without the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default runs all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
name: e2e_one_step_off_policy_ascend
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
# For push, for now only anti-patterns are specified so it is more conservative
# and achieves higher coverage.
push:
branches:
- main
- v0.*
paths:
- "**/*.py"
- "!**/*.md"
- "!**/*.sh"
# Other entrypoints
- "!examples/*trainer*"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# Home: re-include everything under the feature directory after the exclusions above.
# The trailing `/**` is required; a bare directory path only matches a file with that exact name.
- "verl/experimental/one_step_off_policy/**"
pull_request:
branches:
- main
- v0.*
paths:
- "**/*.py"
- "!**/*.md"
- "!**/*.sh"
# Other entrypoints
- "!examples/**"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# Home (the trailing `/**` is required to match files inside the directory)
- "verl/experimental/one_step_off_policy/**"
# Entrypoints
- ".github/workflows/e2e_one_step_off_policy_ascend.yml"
- "examples/data_preprocess/gsm8k.py"
- "tests/special_e2e/run_one_step_off_policy.sh"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
jobs:
# Test FSDP2 strategy
e2e_one_step_off_policy_fsdp2_ascend:
if: github.repository_owner == 'verl-project'
runs-on: linux-aarch64-a2b3-8
timeout-minutes: 60 # Increase this timeout value as needed
container:
image: swr.cn-southwest-2.myhuaweicloud.com/modelfoundry/ascend-ci/verl/verl:verl-8.5.0-910b-ubuntu22.04-py3.11-latest
options: >-
--shm-size 16g
env:
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
ACTOR_STRATEGY: "fsdp2"
device_name: "npu"
steps:
- name: Check npu and CANN info
run: |
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
npu-smi info
- name: Check initial pip list from image
run: |
pip list
- name: Checkout verl-project/verl repo
uses: actions/checkout@v4
with:
fetch-depth: 0
clean: true
- name: Install the current repository
run: |
pip install -r requirements-npu.txt
pip install --no-deps -e .
- name: Check final pip list
run: |
pip list
- name: Prepare weights
run: |
ln -s /root/.cache/models ~/models
- name: Prepare GSM8K dataset
run: |
python examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k
- name: Running the E2E test with one_step_off_policy algorithm (FSDP2)
run: |
ray stop --force
bash tests/special_e2e/run_one_step_off_policy.sh
# Test Megatron strategy
e2e_one_step_off_policy_megatron_ascend:
if: github.repository_owner == 'verl-project'
runs-on: linux-aarch64-a2b3-8
timeout-minutes: 60 # Increase this timeout value as needed
container:
image: swr.cn-southwest-2.myhuaweicloud.com/modelfoundry/ascend-ci/verl/verl:verl-8.5.0-910b-ubuntu22.04-py3.11-latest
options: >-
--shm-size 16g
env:
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
ACTOR_STRATEGY: "megatron"
device_name: "npu"
steps:
- name: Check npu and CANN info
run: |
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
npu-smi info
- name: Check initial pip list from image
run: |
pip list
- name: Checkout verl-project/verl repo
uses: actions/checkout@v4
with:
fetch-depth: 0
clean: true
- name: Install the current repository
run: |
pip install -r requirements-npu.txt
pip install --no-deps -e .
- name: Check final pip list
run: |
pip list
- name: Prepare weights
run: |
ln -s /root/.cache/models ~/models
- name: Prepare GSM8K dataset
run: |
python examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k
- name: Running the E2E test with one_step_off_policy algorithm (Megatron)
run: |
ray stop --force
export PYTHONPATH=$PYTHONPATH:/Megatron-LM
bash tests/special_e2e/run_one_step_off_policy.sh
================================================
FILE: .github/workflows/e2e_ppo_grpo_trainer_trtllm.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default, tests are run with GPUs available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all scripts with file without the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default runs all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
name: e2e_ppo_trainer_megatron_trtllm
on:
# Trigger the workflow on push or pull request,
# but only for the main branch.
# For push, for now only anti-patterns are specified so it is more conservative
# and achieves higher coverage.
push:
branches:
- main
- v0.*
paths:
- "**/*.py"
# Other entrypoints
- "!verl/trainer/fsdp_sft_trainer.py"
# Recipes
- "!recipe/**"
# FSDP
- "!verl/workers/**/*dp_*.py"
pull_request:
branches:
- main
- v0.*
paths:
- "**/*.py"
# Other entrypoints
- "!docker/**"
# Docs
- "!**/*.md"
- "!docs/**"
- "!examples/**"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# Recipes
- "!recipe/**"
# FSDP
- "!verl/workers/**/*dp_*.py"
# Entrypoints
- "verl/workers/rollout/trtllm_rollout/**"
- "tests/workers/rollout/rollout_trtllm/**"
- ".github/workflows/e2e_ppo_grpo_trainer_trtllm.yml"
- "examples/data_preprocess/gsm8k.py"
- "examples/data_preprocess/geo3k.py"
- "examples/data_preprocess/dapo_multiturn_w_tool.py"
- "examples/data_preprocess/aime2024_multiturn_w_tool.py"
- "examples/grpo_trainer/run_qwen2-7b_math_trtllm.sh"
- "examples/grpo_trainer/run_qwen2-7b_math_megatron_trtllm.sh"
- "examples/grpo_trainer/run_qwen3-30b_dapo_megatron_fp8_trtllm.sh"
# add back when ppo flow is ready
# - "tests/special_e2e/run_ppo_trainer_megatron.sh"
# - "verl/trainer/main_ppo.py"
# - "verl/trainer/config/ppo_megatron_trainer.yaml"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:trtllm1.3.0rc4"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs:
setup:
if: github.repository_owner == 'verl-project'
runs-on: ubuntu-latest
outputs:
runner-label: ${{ steps.create-runner.outputs.runner-label }}
mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
steps:
- uses: actions/checkout@v4
- id: create-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "create"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-image: "${{ env.IMAGE }}"
trtllm_unit_tests:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 30 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install pytest-asyncio
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
- name: Run TRTLLM unit tests
run: |
export TRTLLM_TEST_MODEL_PATH_ROOT="${HOME}/models"
ray stop --force
pytest -v -s \
tests/workers/rollout/rollout_trtllm/test_adapter.py \
tests/workers/rollout/rollout_trtllm/test_async_server.py \
tests/workers/rollout/rollout_trtllm/test_trtllm_rollout_utils.py
e2e_grpo_trainer_fsdp-qwen2:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 30 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
- name: Prepare GSM8K dataset
run: |
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k --local_save_dir ${PWD}/data/gsm8k
- name: Running GSM8K E2E training tests with FSDP on 8 L20 GPUs (Qwen)
run: |
ray stop --force
DATADIR=${HOME}/data \
bash examples/grpo_trainer/run_qwen2-7b_math_trtllm.sh 2 \
trainer.total_training_steps=1 \
data.train_files="['${PWD}/data/gsm8k/train.parquet']" \
data.val_files="['${PWD}/data/gsm8k/test.parquet']" \
trainer.logger='["console"]' \
actor_rollout_ref.model.path="${HOME}/models/Qwen/Qwen2.5-0.5B-Instruct"
- name: clean up
run: |
rm -rf checkpoints
e2e_grpo_trainer_megatron-qwen2:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 30 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
- name: Prepare GSM8K dataset
run: |
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k --local_save_dir ${PWD}/data/gsm8k
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen)
run: |
ray stop --force
DATADIR=${HOME}/data \
ACTOR_TP=2 \
bash examples/grpo_trainer/run_qwen2-7b_math_megatron_trtllm.sh 2 \
trainer.total_training_steps=1 \
data.train_files="['${PWD}/data/gsm8k/train.parquet']" \
data.val_files="['${PWD}/data/gsm8k/test.parquet']" \
trainer.logger='["console"]' \
actor_rollout_ref.model.path="${HOME}/models/Qwen/Qwen2.5-0.5B-Instruct"
- name: clean up
run: |
rm -rf checkpoints
e2e_grpo_trainer_fsdp-vlm:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 30 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
pip3 install qwen_vl_utils
pip3 install mathruler
- name: Prepare GEO3K dataset
run: |
python3 examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/models/hf_data/hiyouga/geometry3k --local_save_dir ${PWD}/data/geo3k
- name: Running GEO3K E2E training tests with FSDP on 8 L20 GPUs (VLM)
run: |
ray stop --force
DATADIR=${HOME}/data \
bash examples/grpo_trainer/run_qwen2_5_vl_3b_trtllm.sh 2 \
trainer.total_training_steps=1 \
data.train_files="['${PWD}/data/geo3k/train.parquet']" \
data.val_files="['${PWD}/data/geo3k/test.parquet']" \
trainer.logger='["console"]' \
actor_rollout_ref.model.path="${HOME}/models/Qwen/Qwen3-VL-2B-Instruct"
- name: clean up
run: |
rm -rf checkpoints
- name: Prepare DAPO-Math-17k and AIME-2024 datasets (data_preprocess)
run: |
python3 examples/data_preprocess/dapo_multiturn_w_tool.py --local_save_dir ${PWD}/data/dapo-math-17k
python3 examples/data_preprocess/aime2024_multiturn_w_tool.py --local_save_dir ${PWD}/data/aime-2024
- name: Running DAPO E2E with FP8 TRT-LLM rollout (Qwen3-0.6B)
run: |
ray stop --force
export INFER_TP=2 ACTOR_TP=2 ACTOR_PP=2 ACTOR_VPP=2 ACTOR_EP=1 ACTOR_CP=2 REF_TP=2 REF_PP=2 REF_VPP=2 REF_EP=1 REF_CP=2 GEN_MOE_TP=null GEN_MOE_EP=null
export NNODES=1 GPUS_PER_NODE=8 TRTLLM_MOE_BACKEND=CUTLASS
export DATA_DIR=${PWD} DAPO_MATH_TRAIN=${PWD}/data/dapo-math-17k/train.parquet AIME_VAL=${PWD}/data/aime-2024/train.parquet MODEL_PATH=${HOME}/models/Qwen/Qwen3-0.6B
bash examples/grpo_trainer/run_qwen3-30b_dapo_megatron_fp8_trtllm.sh \
reward_model.reward_kwargs.overlong_buffer_cfg.len=258 \
reward_model.reward_kwargs.max_resp_len=512 \
data.max_prompt_length=512 \
data.max_response_length=512 \
data.train_batch_size=32 \
actor_rollout_ref.rollout.n=4 \
actor_rollout_ref.rollout.max_num_seqs=16 \
actor_rollout_ref.rollout.max_num_batched_tokens=1024 \
actor_rollout_ref.rollout.max_model_len=1024 \
actor_rollout_ref.actor.megatron.override_transformer_config.moe_grouped_gemm=False \
actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=False \
trainer.total_training_steps=1 \
trainer.logger='["console"]'
- name: clean up
run: |
rm -rf checkpoints
cleanup:
runs-on: ubuntu-latest
needs: [setup, trtllm_unit_tests, e2e_grpo_trainer_fsdp-qwen2, e2e_grpo_trainer_megatron-qwen2, e2e_grpo_trainer_fsdp-vlm]
if: always()
steps:
- id: destroy-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "destroy"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
================================================
FILE: .github/workflows/e2e_ppo_trainer.yml
================================================
name: e2e_ppo_trainer
on:
# Trigger the workflow on push or pull request,
# but only for the main and v0.* branches.
# For push, for now only anti-patterns are specified so it is more conservative
# and achieves higher coverage.
push:
branches:
- main
- v0.*
paths:
- "**/*.py"
# Other entrypoints
- "!verl/trainer/fsdp_sft_trainer.py"
# Megatron
- "!verl/workers/**/megatron_*.py"
pull_request:
branches:
- main
- v0.*
paths:
- "**/*.py"
# Other entrypoints
- "!**/*.md"
- "!docker/**"
- "!examples/**"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# Docs
- "!docs/**"
# Megatron
- "!verl/workers/**/megatron_*.py"
# Entrypoints
- ".github/workflows/e2e_ppo_trainer.yml"
- "examples/data_preprocess/gsm8k.py"
- "examples/data_preprocess/geo3k.py"
- "tests/special_e2e/ppo_trainer/**"
- "verl/trainer/main_ppo.py"
- "verl/trainer/config/ppo_trainer.yaml"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Grant the workflow token read-only access to repository contents.
permissions:
contents: read
jobs:
pre_commit_for_ppo:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.12"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install the current repository
run: |
pip install pre-commit hydra-core
pip3 install --no-deps -e .
- name: Set ruff --output-format=github
run: |
sed -i 's/--output-format=full/--output-format=github/' .pre-commit-config.yaml
git add .pre-commit-config.yaml
- uses: pre-commit/action@v3.0.1
with:
extra_args: "" # Overriding default "--all-files"
================================================
FILE: .github/workflows/e2e_ppo_trainer_megatron_sglang.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml` runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml` runs pytest on all test scripts whose file name lacks the `on_cpu.py` suffix.
# - Since the cpu/gpu unit tests run all tests under `tests` by default, please make sure tests are manually excluded from them when
# - a new workflow yaml is added to `.github/workflows`
# - new tests are added to a workflow mentioned in 2.
name: e2e_ppo_trainer_megatron_sglang
on:
# Trigger the workflow on push or pull request,
# but only for the main and v0.* branches.
# For push, for now only anti-patterns are specified so it is more conservative
# and achieves higher coverage.
push:
branches:
- main
- v0.*
paths:
- "**/*.py"
# Other entrypoints
- "!verl/trainer/fsdp_sft_trainer.py" # FSDP
- "!verl/workers/**/*dp_*.py"
- "!verl/utils/fsdp_utils.py"
- "!verl/utils/checkpoint/fsdp_checkpoint_manager.py"
- "!verl/model_merger/fsdp_model_merger.py"
pull_request:
branches:
- main
- v0.*
paths:
- "**/*.py"
# Other entrypoints
- "!docker/**"
# Docs
- "!**/*.md"
- "!docs/**"
- "!examples/**"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py" # FSDP
- "!verl/workers/**/*dp_*.py"
- "!verl/utils/fsdp_utils.py"
- "!verl/utils/checkpoint/fsdp_checkpoint_manager.py"
- "!verl/model_merger/fsdp_model_merger.py"
# Entrypoints
- "verl/workers/rollout/sglang_rollout/*"
- ".github/workflows/e2e_ppo_trainer_megatron_sglang.yml"
- "examples/data_preprocess/gsm8k.py"
- "examples/data_preprocess/geo3k.py"
- "tests/special_e2e/run_ppo_trainer_megatron.sh"
- "verl/trainer/main_ppo.py"
- "verl/trainer/config/ppo_megatron_trainer.yaml"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Grant the workflow token read-only access to repository contents.
permissions:
contents: read
env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:sgl059.dev2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs:
setup:
if: github.repository_owner == 'verl-project'
runs-on: ubuntu-latest
outputs:
runner-label: ${{ steps.create-runner.outputs.runner-label }}
mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
steps:
- uses: actions/checkout@v4
- id: create-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "create"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-image: "${{ env.IMAGE }}"
e2e_ppo_trainer_megatron-deepseek:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 60 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
ENGINE: sglang
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install git+https://github.com/ISEEKYAN/mbridge.git@main --no-deps --no-build-isolation
pip3 install --no-deps -e .
- name: Prepare GSM8K dataset
run: |
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek)
run: |
ray stop --force
OPTIM_MEMORY_EFFICIENT=True ENGINE=sglang SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek), async mode with auto resume
run: |
ray stop --force
export VLLM_USE_V1=1
ray start --head
ENGINE=sglang MODE=async RESUME_MODE=auto MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOTAL_TRAIN_STEPS=2 bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: Profiling GRPO GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Deepseek)
run: |
ray stop --force
PROFILE_ENABLE=True ENGINE=sglang ADV_ESTIMATOR=grpo USE_DYNAMIC_BSZ=False MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/special_e2e/run_ppo_trainer_megatron.sh
if [ -z "$( ls -A '/tmp/ray/session_latest/logs/nsight/' )" ]; then
echo "[ERROR] not found any profiling files"
exit 1
else
echo "[SUCCESS] profile success"
fi
- name: clean up
run: |
rm -rf checkpoints
# Qwen3-0.6B: dense, tie_word_embeddings=True
e2e_ppo_trainer_megatron-qwen3:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 60 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
ENGINE: sglang
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
- name: Prepare GSM8K dataset
run: |
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen3) testing learning rate scheduler
run: |
ray stop --force
ALL_OFFLOAD=True VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 LR_WARMUP_STEPS=1 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with FP8 rollout
run: |
ray stop --force
export VLLM_USE_V1=1
ROLLOUT_QUANTIZATION=fp8 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: clean up
run: |
rm -rf checkpoints
cleanup:
runs-on: ubuntu-latest
needs:
[setup, e2e_ppo_trainer_megatron-deepseek, e2e_ppo_trainer_megatron-qwen3]
if: always()
steps:
- id: destroy-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "destroy"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
================================================
FILE: .github/workflows/e2e_ppo_trainer_megatron_sglang_2.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml` runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml` runs pytest on all test scripts whose file name lacks the `on_cpu.py` suffix.
# - Since the cpu/gpu unit tests run all tests under `tests` by default, please make sure tests are manually excluded from them when
# - a new workflow yaml is added to `.github/workflows`
# - new tests are added to a workflow mentioned in 2.
name: e2e_ppo_trainer_megatron_sglang_2
on:
# Trigger the workflow on push or pull request,
# but only for the main and v0.* branches.
# For push, for now only anti-patterns are specified so it is more conservative
# and achieves higher coverage.
push:
branches:
- main
- v0.*
paths:
- "**/*.py"
# Other entrypoints
- "!verl/trainer/fsdp_sft_trainer.py" # FSDP
- "!verl/workers/**/*dp_*.py"
- "!verl/utils/fsdp_utils.py"
- "!verl/utils/checkpoint/fsdp_checkpoint_manager.py"
- "!verl/model_merger/fsdp_model_merger.py"
pull_request:
branches:
- main
- v0.*
paths:
- "**/*.py"
# Other entrypoints
- "!docker/**"
# Docs
- "!**/*.md"
- "!docs/**"
- "!examples/**"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py" # FSDP
- "!verl/workers/**/*dp_*.py"
- "!verl/utils/fsdp_utils.py"
- "!verl/utils/checkpoint/fsdp_checkpoint_manager.py"
- "!verl/model_merger/fsdp_model_merger.py"
# Entrypoints
- "verl/workers/rollout/sglang_rollout/*"
- ".github/workflows/e2e_ppo_trainer_megatron_sglang.yml"
- ".github/workflows/e2e_ppo_trainer_megatron_sglang_2.yml"
- "examples/data_preprocess/gsm8k.py"
- "examples/data_preprocess/geo3k.py"
- "tests/special_e2e/run_ppo_trainer_megatron.sh"
- "verl/trainer/main_ppo.py"
- "verl/trainer/config/ppo_megatron_trainer.yaml"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Grant the workflow token read-only access to repository contents.
permissions:
contents: read
env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:sgl059.dev2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs:
setup:
if: github.repository_owner == 'verl-project'
runs-on: ubuntu-latest
outputs:
runner-label: ${{ steps.create-runner.outputs.runner-label }}
mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
steps:
- uses: actions/checkout@v4
- id: create-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "create"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-image: "${{ env.IMAGE }}"
e2e_ppo_trainer_fsdp_sglang:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 40 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
- name: Prepare gsm8k dataset
run: |
ray stop --force
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
- name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm and save ckpt
run: |
ray stop --force
ENGINE=sglang bash tests/special_e2e/ppo_trainer/run_function_reward.sh
e2e_ppo_trainer_fsdp-qwen2_5vl-3b:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 60 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
# Geo3k
- name: Prepare GEO3K dataset
run: |
ray stop --force
python3 examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/models/hf_data/hiyouga/geometry3k/
- name: Running GEO3K VLM E2E training tests on 8 L20 GPUs with rmpad using function rm
run: |
ray stop --force
TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct \
ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
ENGINE=sglang ROLLOUT_MODE=async GPU_MEMORY_UTILIZATION=0.6 ACTOR_FSDP_PARAM_OFFLOAD=True \
ACTOR_FSDP_OPTIMIZER_OFFLOAD=True REF_FSDP_PARAM_OFFLOAD=True \
bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Running GEO3K VLM E2E with rmpad using torch fused kernel (Qwen2.5-VL)
run: |
ray stop --force
FUSED_KERNELS=True TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct \
ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
ENGINE=sglang ROLLOUT_MODE=async GPU_MEMORY_UTILIZATION=0.6 ACTOR_FSDP_PARAM_OFFLOAD=True \
ACTOR_FSDP_OPTIMIZER_OFFLOAD=True REF_FSDP_PARAM_OFFLOAD=True \
bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Running GEO3K VLM E2E with rmpad using triton fused kernel (Qwen2.5-VL)
run: |
ray stop --force
FUSED_KERNELS=True FUSED_KERNEL_BACKEND=triton \
TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct \
ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
ENGINE=sglang ROLLOUT_MODE=async GPU_MEMORY_UTILIZATION=0.6 ACTOR_FSDP_PARAM_OFFLOAD=True \
ACTOR_FSDP_OPTIMIZER_OFFLOAD=True REF_FSDP_PARAM_OFFLOAD=True \
bash tests/special_e2e/ppo_trainer/run_function_reward.sh
cleanup:
runs-on: ubuntu-latest
needs:
[setup, e2e_ppo_trainer_fsdp-qwen2_5vl-3b, e2e_ppo_trainer_fsdp_sglang]
if: always()
steps:
- id: destroy-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "destroy"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
================================================
FILE: .github/workflows/e2e_ppo_trainer_megatron_vllm.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml` runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml` runs pytest on all test scripts whose file name lacks the `on_cpu.py` suffix.
# - Since the cpu/gpu unit tests run all tests under `tests` by default, please make sure tests are manually excluded from them when
# - a new workflow yaml is added to `.github/workflows`
# - new tests are added to a workflow mentioned in 2.
name: e2e_ppo_trainer_megatron_vllm
on:
# Trigger the workflow on push or pull request,
# but only for the main and v0.* branches.
# For push, for now only anti-patterns are specified so it is more conservative
# and achieves higher coverage.
push:
branches:
- main
- v0.*
paths:
- "**/*.py"
# Other entrypoints
- "!verl/trainer/fsdp_sft_trainer.py"
# FSDP
- "!verl/workers/**/*dp_*.py"
- "!verl/utils/fsdp_utils.py"
- "!verl/utils/checkpoint/fsdp_checkpoint_manager.py"
- "!verl/model_merger/fsdp_model_merger.py"
pull_request:
branches:
- main
- v0.*
paths:
- "**/*.py"
# Other entrypoints
- "!docker/**"
# Docs
- "!**/*.md"
- "!docs/**"
- "!examples/**"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# FSDP
- "!verl/workers/**/*dp_*.py"
- "!verl/utils/fsdp_utils.py"
- "!verl/utils/checkpoint/fsdp_checkpoint_manager.py"
- "!verl/model_merger/fsdp_model_merger.py"
# Entrypoints
- ".github/workflows/e2e_ppo_trainer_megatron_vllm.yml"
- "examples/data_preprocess/gsm8k.py"
- "examples/data_preprocess/geo3k.py"
- "tests/special_e2e/run_ppo_trainer_megatron.sh"
- "verl/trainer/main_ppo.py"
- "verl/trainer/config/ppo_megatron_trainer.yaml"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Grant the workflow token read-only access to repository contents.
permissions:
contents: read
env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm017.dev2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs:
setup:
if: github.repository_owner == 'verl-project'
runs-on: ubuntu-latest
outputs:
runner-label: ${{ steps.create-runner.outputs.runner-label }}
mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
steps:
- uses: actions/checkout@v4
- id: create-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "create"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-image: "${{ env.IMAGE }}"
# deepseek-ai/deepseek-coder-1.3b-instruct: dense, tie_word_embeddings=False
e2e_ppo_trainer_megatron-deepseek:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 60 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps --force-reinstall .
pip3 install git+https://github.com/ISEEKYAN/mbridge.git@main --no-deps --no-build-isolation
pip3 install math-verify
- name: Prepare GSM8K dataset
run: |
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
# Full training save&load
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron, use mbridge e2e to pre-load and save (Deepseek)
run: |
ray stop --force
ALL_OFFLOAD=True SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 USE_MBRIDGE=True USE_DIST_CKPT=False \
bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron, use mbridge e2e to pre-load and save, auto resume (Deepseek)
run: |
ray stop --force
RESUME_MODE=auto MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOTAL_TRAIN_STEPS=2 SAVE_FREQ=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 USE_MBRIDGE=True USE_DIST_CKPT=False \
bash tests/special_e2e/run_ppo_trainer_megatron.sh
# LoRA training save&load
- name: clean up and install Megatron-Bridge
run: |
rm -rf checkpoints
pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@83a7c11 --no-deps --no-build-isolation
pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@5455f0a --no-deps --no-build-isolation
pip3 install "nvidia-modelopt[torch]>=0.37.0" transformers==4.57.1
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron, use Megatron-Bridge LoRA e2e to pre-load and save (Deepseek)
run: |
ray stop --force
ALL_OFFLOAD=True SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct COMMON_PP=4 LORA_RANK=8 COMMON_VPP=null COMMON_CP=1 USE_MBRIDGE=True VANILLA_MBRIDGE=False VALUE_VANILLA_MBRIDGE=False USE_DIST_CKPT=False \
bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron, use Megatron-Bridge LoRA e2e to pre-load and save, auto resume (Deepseek)
run: |
ray stop --force
RESUME_MODE=auto MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOTAL_TRAIN_STEPS=2 SAVE_FREQ=1 COMMON_PP=4 LORA_RANK=8 COMMON_VPP=null COMMON_CP=1 USE_MBRIDGE=True VANILLA_MBRIDGE=False VALUE_VANILLA_MBRIDGE=False USE_DIST_CKPT=False \
bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: clean up
run: |
rm -rf checkpoints
# Qwen3-0.6B: dense, tie_word_embeddings=True
# Job: GSM8K end-to-end runs of tests/special_e2e/run_ppo_trainer_megatron.sh with MODEL_ID=Qwen/Qwen3-0.6B.
# Waits for `setup` and runs on the runner label it outputs, falling back to 'L20x8' when that output is empty.
e2e_ppo_trainer_megatron-qwen3:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 60 # Increase this timeout value as needed
# Proxy endpoints are read from repository secrets; Hugging Face traffic is pointed at hf-mirror.com,
# which is also listed in NO_PROXY so it bypasses the proxy.
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
# Checkout is pinned to a full commit SHA; fetch-depth 0 fetches the complete history.
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
# Installs test requirements, then the repository itself in editable mode without resolving dependencies.
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
pip3 install math-verify
- name: Prepare GSM8K dataset
run: |
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
# Scenario 1: two training steps with LR_WARMUP_STEPS=1, validation before training, and save/test frequency of 1.
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen3) testing learning rate scheduler
run: |
ray stop --force
ALL_OFFLOAD=True VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 LR_WARMUP_STEPS=1 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh
# Scenario 2: same script with ROLLOUT_QUANTIZATION=fp8; VLLM_USE_V1=1 is exported for this step only.
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with FP8 rollout
run: |
ray stop --force
export VLLM_USE_V1=1
ROLLOUT_QUANTIZATION=fp8 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh
# Deletes the local `checkpoints` directory.
# NOTE(review): this step has no `if: always()`, so it is skipped when an earlier step fails — confirm that is intended.
- name: clean up
run: |
rm -rf checkpoints
# Job: asks the dynamic-runner service to destroy the runner identified by the `mlp-task-id` output of `setup`.
# `if: always()` makes this job run regardless of the outcome of the jobs listed in `needs`.
# NOTE(review): `always()` also applies when `setup` was skipped (its owner check is false), in which case
# `mlp-task-id` would be empty — confirm the destroy action tolerates that.
cleanup:
runs-on: ubuntu-latest
needs:
[setup, e2e_ppo_trainer_megatron-deepseek, e2e_ppo_trainer_megatron-qwen3]
if: always()
steps:
- id: destroy-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "destroy"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
================================================
FILE: .github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are tested on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all scripts whose file name does not have the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default run all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
# Workflow header: name, triggers with path filters, concurrency, permissions and shared env for all jobs below.
name: e2e_ppo_trainer_megatron_vllm_2
on:
# Trigger the workflow on push or pull request,
# but only for the `main` and `v0.*` branches.
# For push, for now only anti-patterns are specified so it is more conservative
# and achieves higher coverage.
push:
branches:
- main
- v0.*
# Any Python file triggers the workflow, except the `!`-prefixed exclusions that follow.
# NOTE(review): FSDP sources are excluded here although this workflow also defines FSDP jobs
# (e2e_ppo_trainer_fsdp_vllm, e2e_ppo_trainer_fsdp-qwen2_5vl-3b) — confirm the exclusion is intended.
paths:
- "**/*.py"
# Other entrypoints
- "!verl/trainer/fsdp_sft_trainer.py"
# FSDP
- "!verl/workers/**/*dp_*.py"
- "!verl/utils/fsdp_utils.py"
- "!verl/utils/checkpoint/fsdp_checkpoint_manager.py"
- "!verl/model_merger/fsdp_model_merger.py"
pull_request:
branches:
- main
- v0.*
# Order matters: the entries under "# Entrypoints" come after the `!` exclusions and re-include
# specific files that the broader exclusions (e.g. "!examples/**", "!tests/**") removed.
paths:
- "**/*.py"
# Other entrypoints
- "!docker/**"
# Docs
- "!**/*.md"
- "!docs/**"
- "!examples/**"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# FSDP
- "!verl/workers/**/*dp_*.py"
- "!verl/utils/fsdp_utils.py"
- "!verl/utils/checkpoint/fsdp_checkpoint_manager.py"
- "!verl/model_merger/fsdp_model_merger.py"
# Entrypoints
- ".github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml"
- "examples/data_preprocess/gsm8k.py"
- "examples/data_preprocess/geo3k.py"
- "tests/special_e2e/run_ppo_trainer_megatron.sh"
- "verl/trainer/main_ppo.py"
- "verl/trainer/config/ppo_megatron_trainer.yaml"
# Cancel jobs on the same ref if a new one is triggered
# (runs on refs/heads/main are never cancelled, because the expression below is false there).
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
# Workflow-level variables consumed by the `setup` and `cleanup` jobs:
# IMAGE is passed as `mlp-image`, DYNAMIC_RUNNER_ENDPOINT as `faas-url`.
env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm017.dev2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs:
# Job: requests a dynamic runner (mode "create") built from the workflow-level IMAGE and exposes
# its label and task id as job outputs for the test jobs and the `cleanup` job.
# Only runs when the repository owner is 'verl-project'.
setup:
if: github.repository_owner == 'verl-project'
runs-on: ubuntu-latest
outputs:
runner-label: ${{ steps.create-runner.outputs.runner-label }}
mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
steps:
# Pinned to the same full commit SHA that the test jobs in this workflow use, instead of the mutable `v4` tag.
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- id: create-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "create"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-image: "${{ env.IMAGE }}"
# Job: three GSM8K runs of tests/special_e2e/run_ppo_trainer_megatron.sh with expert-parallel settings
# (COMMON_EP / COMMON_ETP) and USE_MBRIDGE=True, all using ADV_ESTIMATOR=grpo.
# NOTE(review): step titles name Qwen3-30B-A3B-Instruct-2507, but every run sets USE_DUMMY_MODEL=True with
# DUMMY_MODEL_CONFIG_PATH=.../qwen2moe_minimal.json — presumably a small dummy model is built; confirm in the script.
e2e_ppo_trainer_megatron-moe-expert-parallel:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 60 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
# Unlike the other jobs in this workflow, the repository is installed non-editable with --force-reinstall,
# and Megatron-Bridge / Megatron-LM are installed from specific git commits without dependency resolution.
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps --force-reinstall .
pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@83a7c11 --no-deps --no-build-isolation
pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@5455f0a --no-deps --no-build-isolation
pip3 install "nvidia-modelopt[torch]>=0.37.0" transformers==4.57.1
- name: Prepare GSM8K dataset
run: |
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
# Scenario 1: PP=2, TP=4, EP=4, ETP=1, INFER_TP=8, USE_DIST_CKPT=True.
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron-Bridge (Qwen3-30B-A3B-Instruct-2507)
run: |
ray stop --force
ADV_ESTIMATOR=grpo USE_DUMMY_MODEL=True DUMMY_MODEL_CONFIG_PATH=tests/special_e2e/ppo_trainer/expert_parallel/qwen2moe_minimal.json \
PPO_MAX_TOKEN_LEN=1024 FWD_MAX_TOKEN_LEN=1024 \
MAX_PROMPT_LENGTH=512 MAX_RESPONSE_LENGTH=512 \
MODEL_ID=Qwen/Qwen3-30B-A3B-Instruct-2507 USE_MBRIDGE=True VANILLA_MBRIDGE=False VALUE_VANILLA_MBRIDGE=False \
COMMON_PP=2 COMMON_VPP=null COMMON_CP=1 COMMON_TP=4 COMMON_EP=4 COMMON_ETP=1 INFER_TP=8 \
USE_DIST_CKPT=True ALL_OFFLOAD=True SKIP_SAVE_HF_MODEL=1 bash tests/special_e2e/run_ppo_trainer_megatron.sh
# Scenario 2: same as scenario 1 but with ROLLOUT_QUANTIZATION=fp8 and INFER_TP=2.
- name: Running GSM8K E2E training tests with 3D parallelism with FP8 rollout on 8 L20 GPUs with Megatron-Bridge (Qwen3-30B-A3B-Instruct-2507)
run: |
ray stop --force
ADV_ESTIMATOR=grpo USE_DUMMY_MODEL=True DUMMY_MODEL_CONFIG_PATH=tests/special_e2e/ppo_trainer/expert_parallel/qwen2moe_minimal.json \
PPO_MAX_TOKEN_LEN=1024 FWD_MAX_TOKEN_LEN=1024 \
MAX_PROMPT_LENGTH=512 MAX_RESPONSE_LENGTH=512 \
MODEL_ID=Qwen/Qwen3-30B-A3B-Instruct-2507 USE_MBRIDGE=True VANILLA_MBRIDGE=False VALUE_VANILLA_MBRIDGE=False \
COMMON_PP=2 COMMON_VPP=null COMMON_CP=1 COMMON_TP=4 COMMON_EP=4 COMMON_ETP=1 INFER_TP=2 \
USE_DIST_CKPT=True ALL_OFFLOAD=True SKIP_SAVE_HF_MODEL=1 ROLLOUT_QUANTIZATION=fp8 bash tests/special_e2e/run_ppo_trainer_megatron.sh
# Deletes the local `checkpoints` directory before the LoRA scenario starts.
- name: clean up
run: |
rm -rf checkpoints
# Scenario 3: LoRA for actor and critic (LORA_RANK=8, CRITIC_LORA_RANK=8) with LORA_MERGE=True,
# COMMON_EP=2 and USE_DIST_CKPT=False.
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron-Bridge LoRA (Qwen3-30B-A3B-Instruct-2507)
run: |
ray stop --force
ADV_ESTIMATOR=grpo USE_DUMMY_MODEL=True DUMMY_MODEL_CONFIG_PATH=tests/special_e2e/ppo_trainer/expert_parallel/qwen2moe_minimal.json \
PPO_MAX_TOKEN_LEN=1024 FWD_MAX_TOKEN_LEN=1024 \
MAX_PROMPT_LENGTH=512 MAX_RESPONSE_LENGTH=512 LORA_RANK=8 CRITIC_LORA_RANK=8 \
MODEL_ID=Qwen/Qwen3-30B-A3B-Instruct-2507 USE_MBRIDGE=True VANILLA_MBRIDGE=False VALUE_VANILLA_MBRIDGE=False \
COMMON_PP=2 COMMON_VPP=null COMMON_CP=1 COMMON_TP=4 COMMON_EP=2 COMMON_ETP=1 INFER_TP=8 \
USE_DIST_CKPT=False LORA_MERGE=True ALL_OFFLOAD=True SKIP_SAVE_HF_MODEL=1 bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: clean up
run: |
rm -rf checkpoints
# Job: GSM8K runs of tests/special_e2e/ppo_trainer/run_function_reward.sh under several configurations
# (FSDP sizes, fsdp2 strategy, rmpad off, GRPO, LoRA), interleaved with `verl.model_merger` checks on the
# checkpoints written under checkpoints/verl-test/<experiment name>/global_step_1/actor.
e2e_ppo_trainer_fsdp_vllm:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 60 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
- name: Prepare GSM8K dataset
run: |
ray stop --force
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
# Function RM
# First run saves checkpoints (SAVE_FREQ=1, SAVE_HF_MODEL=True) under the experiment name reused by the next two steps.
- name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with validation and saving (FSDP_SIZE=8)
run: |
ray stop --force
VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 SAVE_HF_MODEL=True VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-fsdp-size8" bash tests/special_e2e/ppo_trainer/run_function_reward.sh
# Re-runs with RESUME_MODE=auto and the same VERL_EXP_NAME as the previous step.
- name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm after resuming
run: |
ray stop --force
RESUME_MODE=auto VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-fsdp-size8" bash tests/special_e2e/ppo_trainer/run_function_reward.sh
# Runs the model merger in `test` mode on the global_step_1 actor checkpoint, with the saved
# `huggingface` sub-directory passed as --test_hf_dir.
- name: Test merging FSDP checkpoints (Qwen Actor)
run: |
exp_name="qwen2.5-0.5b-function-reward-minimal-fsdp-size8"
python -m verl.model_merger test --backend fsdp --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface
- name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with validation and saving (DDP_SIZE=2, FSDP_SIZE=4)
run: |
ray stop --force
VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 SAVE_HF_MODEL=True FSDP_SIZE=4 USE_KL=True VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-ddp-size2-fsdp-size4" bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Test merging DDP+FSDP checkpoints (Qwen Actor)
run: |
exp_name="qwen2.5-0.5b-function-reward-minimal-ddp-size2-fsdp-size4"
python -m verl.model_merger test --backend fsdp --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface
- name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with validation and saving (FSDP2)
run: |
ray stop --force
VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 SAVE_HF_MODEL=True VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-fsdp2-size8" STRATEGY=fsdp2 bash tests/special_e2e/ppo_trainer/run_function_reward.sh
# The FSDP2 checkpoint is also merged with `--backend fsdp`.
- name: Test merging FSDP2 checkpoints (Qwen Actor)
run: |
exp_name="qwen2.5-0.5b-function-reward-minimal-fsdp2-size8"
python -m verl.model_merger test --backend fsdp --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface
- name: Running GSM8K E2E without rmpad using function rm
run: |
ray stop --force
RM_PAD=False bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm (GRPO)
run: |
ray stop --force
CUSTOM_REWARD_FN=True ADV_ESTIMATOR=grpo USE_KL=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh
# The ReMax variant below is currently disabled (commented out).
# - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm (ReMax)
# run: |
# ray stop --force
# ADV_ESTIMATOR=remax USE_KL=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh
# LoRA tests
- name: Running GSM8K E2E training tests on 8 L20 GPUs with grpo lora using function rm with use_shm
run: |
ray stop --force
ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors bash tests/special_e2e/ppo_trainer/run_function_reward.sh
# This run saves a step-1 checkpoint under VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal",
# which the following merge step reads.
- name: Running GSM8K E2E training tests on 8 L20 GPUs with grpo lora using function rm with use_shm and layered_summon
run: |
ray stop --force
ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors LAYERED_SUMMON=True TOTAL_TRAIN_STEPS=1 SAVE_FREQ=1 FSDP_SIZE=4 VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal" bash tests/special_e2e/ppo_trainer/run_function_reward.sh
# Lists the checkpoint, prints its HF config.json, then runs the merger in `merge` mode writing into
# the checkpoint's own `huggingface` directory.
- name: Test GRPO LoRA checkpoints merging function
run: |
export EXP_NAME="qwen2.5-0.5b-function-reward-minimal"
ls checkpoints/verl-test/${EXP_NAME}/global_step_1/actor
cat checkpoints/verl-test/${EXP_NAME}/global_step_1/actor/huggingface/config.json
python3 -m verl.model_merger merge --backend fsdp --local_dir checkpoints/verl-test/${EXP_NAME}/global_step_1/actor/ --target_dir checkpoints/verl-test/${EXP_NAME}/global_step_1/actor/huggingface
- name: Running GSM8K E2E training tests on 8 L20 GPUs with grpo lora using function rm with use_shm and layered_summon with fsdp2
run: |
ray stop --force
ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors LAYERED_SUMMON=True STRATEGY=fsdp2 bash tests/special_e2e/ppo_trainer/run_function_reward.sh
# Job: three GEO3K runs of tests/special_e2e/ppo_trainer/run_function_reward.sh with
# MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct: GRPO, GAE ("PPO" in the step title), and GRPO with LoRA.
# All three read train/val parquet files from $HOME/data/geo3k and set SP_SIZE=2.
e2e_ppo_trainer_fsdp-qwen2_5vl-3b:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 40 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
# Geo3k
# NOTE(review): the training steps read $HOME/data/geo3k/*.parquet — presumably the default output
# location of geo3k.py; confirm against the script.
- name: Prepare GEO3K dataset
run: |
python3 examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/models/hf_data/hiyouga/geometry3k/
- name: Running GEO3K VLM GRPO E2E training tests on 8 L20 GPUs with rmpad using function rm
run: |
ray stop --force
TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct \
ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
SP_SIZE=2 \
bash tests/special_e2e/ppo_trainer/run_function_reward.sh
# Same settings as the GRPO step except ADV_ESTIMATOR=gae.
- name: Running GEO3K VLM PPO E2E training tests on 8 L20 GPUs with rmpad using function rm
run: |
ray stop --force
TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct \
ADV_ESTIMATOR=gae RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
SP_SIZE=2 \
bash tests/special_e2e/ppo_trainer/run_function_reward.sh
# GRPO settings plus LORA_RANK=32; LORA_EXCLUDE is the regex ".*visual.*".
- name: Running GEO3K VLM GRPO E2E lora training tests on 8 L20 GPUs with rmpad using function rm
run: |
ray stop --force
TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct \
ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
SP_SIZE=2 \
LORA_RANK=32 LORA_EXCLUDE=".*visual.*" \
bash tests/special_e2e/ppo_trainer/run_function_reward.sh
# Job: asks the dynamic-runner service to destroy the runner identified by the `mlp-task-id` output of `setup`.
# `if: always()` makes this job run regardless of the outcome of the jobs listed in `needs`.
# NOTE(review): `always()` also applies when `setup` was skipped (its owner check is false), in which case
# `mlp-task-id` would be empty — confirm the destroy action tolerates that.
cleanup:
runs-on: ubuntu-latest
needs:
[
setup,
e2e_ppo_trainer_megatron-moe-expert-parallel,
e2e_ppo_trainer_fsdp-qwen2_5vl-3b,
e2e_ppo_trainer_fsdp_vllm,
]
if: always()
steps:
- id: destroy-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "destroy"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
================================================
FILE: .github/workflows/e2e_ppo_trainer_megatron_vllm_2_ascend.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are tested on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all scripts whose file name does not have the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default run all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
# Workflow header: name, triggers with path filters, concurrency and permissions for the jobs below.
# NOTE(review): the workflow name and the re-included Megatron entrypoints refer to Megatron, while the jobs
# in this file only invoke tests/special_e2e/ppo_trainer/run_function_reward.sh — confirm the filters are intended.
name: e2e_ppo_trainer_megatron_vllm_2_ascend
on:
# Trigger the workflow on push or pull request,
# but only for the `main` and `v0.*` branches.
# For push, for now only anti-patterns are specified so it is more conservative
# and achieves higher coverage.
push:
branches:
- main
- v0.*
# Any Python file triggers the workflow, except the `!`-prefixed exclusions that follow.
paths:
- "**/*.py"
# Other entrypoints
- "!verl/trainer/fsdp_sft_trainer.py"
# FSDP
- "!verl/workers/**/*dp_*.py"
- "!verl/utils/fsdp_utils.py"
- "!verl/utils/checkpoint/fsdp_checkpoint_manager.py"
- "!verl/model_merger/fsdp_model_merger.py"
pull_request:
branches:
- main
- v0.*
# Order matters: the entries under "# Entrypoints" come after the `!` exclusions and re-include
# specific files that the broader exclusions (e.g. "!examples/**", "!tests/**") removed.
paths:
- "**/*.py"
# Other entrypoints
- "!docker/**"
# Docs
- "!**/*.md"
- "!docs/**"
- "!examples/**"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# FSDP
- "!verl/workers/**/*dp_*.py"
- "!verl/utils/fsdp_utils.py"
- "!verl/utils/checkpoint/fsdp_checkpoint_manager.py"
- "!verl/model_merger/fsdp_model_merger.py"
# Entrypoints
- ".github/workflows/e2e_ppo_trainer_megatron_vllm_2_ascend.yml"
- "examples/data_preprocess/gsm8k.py"
- "examples/data_preprocess/geo3k.py"
- "tests/special_e2e/run_ppo_trainer_megatron.sh"
- "verl/trainer/main_ppo.py"
- "verl/trainer/config/ppo_megatron_trainer.yaml"
# Cancel jobs on the same ref if a new one is triggered
# (runs on refs/heads/main are never cancelled, because the expression below is false there).
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
jobs:
# Job: GSM8K runs of tests/special_e2e/ppo_trainer/run_function_reward.sh inside an Ascend (910b) container,
# interleaved with `verl.model_merger` checks on the saved checkpoints.
# Step titles say "Ascend NPUs": the job runs in the Ascend image and queries `npu-smi`, not L20 GPUs.
e2e_ppo_trainer_fsdp_vllm_ascend:
if: github.repository_owner == 'verl-project'
runs-on: linux-aarch64-a2b3-8
timeout-minutes: 90 # Increase this timeout value as needed
container:
image: swr.cn-southwest-2.myhuaweicloud.com/modelfoundry/ascend-ci/verl/verl:verl-8.5.0-910b-ubuntu22.04-py3.11-latest
options: >-
--shm-size 16g
env:
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
# Prints the installed CANN toolkit info and the NPU status for debugging.
- name: Check npu and CANN info
run: |
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
npu-smi info
- name: Check initial pip list from image
run: |
pip list
# Pinned to a full commit SHA (v4.2.2) instead of the mutable `v4` tag.
- name: Checkout verl-project/verl repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
clean: true
- name: Install the current repository
run: |
pip install -r requirements-npu.txt
pip install --no-deps -e .
- name: Check final pip list
run: |
pip list
# Exposes the model cache at ~/models via a symlink.
- name: Prepare weights
run: |
ln -s /root/.cache/models ~/models
- name: Prepare GSM8K dataset
run: |
python examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k
# Function RM
- name: Running GSM8K E2E training tests on Ascend NPUs with rmpad using function rm with validation and saving (DDP_SIZE=2, FSDP_SIZE=4)
run: |
ray stop --force
VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 SAVE_HF_MODEL=True FSDP_SIZE=4 USE_KL=True VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-ddp-size2-fsdp-size4" bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Test merging DDP+FSDP checkpoints (Qwen Actor)
run: |
exp_name="qwen2.5-0.5b-function-reward-minimal-ddp-size2-fsdp-size4"
python -m verl.model_merger test --backend fsdp --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface
- name: Running GSM8K E2E training tests on Ascend NPUs with rmpad using function rm with validation and saving (FSDP2)
run: |
ray stop --force
VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 SAVE_HF_MODEL=True VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-fsdp2-size8" STRATEGY=fsdp2 bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Test merging FSDP2 checkpoints (Qwen Actor)
run: |
exp_name="qwen2.5-0.5b-function-reward-minimal-fsdp2-size8"
python -m verl.model_merger test --backend fsdp --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface
- name: Running GSM8K E2E without rmpad using function rm
run: |
ray stop --force
RM_PAD=False bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Running GSM8K E2E training tests on Ascend NPUs with rmpad using function rm (GRPO)
run: |
ray stop --force
CUSTOM_REWARD_FN=True ADV_ESTIMATOR=grpo USE_KL=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh
# This run saves a step-1 checkpoint under VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal",
# which the following merge step reads.
- name: Running GSM8K E2E training tests on Ascend NPUs with grpo lora using function rm with use_shm and layered_summon
run: |
ray stop --force
ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors LAYERED_SUMMON=True TOTAL_TRAIN_STEPS=1 SAVE_FREQ=1 FSDP_SIZE=4 VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal" bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Test GRPO LoRA checkpoints merging function
run: |
export EXP_NAME="qwen2.5-0.5b-function-reward-minimal"
ls checkpoints/verl-test/${EXP_NAME}/global_step_1/actor
cat checkpoints/verl-test/${EXP_NAME}/global_step_1/actor/huggingface/config.json
python3 -m verl.model_merger merge --backend fsdp --local_dir checkpoints/verl-test/${EXP_NAME}/global_step_1/actor/ --target_dir checkpoints/verl-test/${EXP_NAME}/global_step_1/actor/huggingface
- name: Running GSM8K E2E training tests on Ascend NPUs with grpo lora using function rm with use_shm and layered_summon with fsdp2
run: |
ray stop --force
ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors LAYERED_SUMMON=True STRATEGY=fsdp2 bash tests/special_e2e/ppo_trainer/run_function_reward.sh
# Job: three GEO3K runs of tests/special_e2e/ppo_trainer/run_function_reward.sh with
# MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct (GRPO, GAE, GRPO + LoRA) inside an Ascend (910b) container.
# Step titles say "Ascend NPUs": the job runs in the Ascend image and queries `npu-smi`, not L20 GPUs.
e2e_ppo_trainer_fsdp-qwen2_5vl-3b_ascend:
if: github.repository_owner == 'verl-project'
runs-on: linux-aarch64-a2b3-8
timeout-minutes: 60 # Increase this timeout value as needed
container:
image: swr.cn-southwest-2.myhuaweicloud.com/modelfoundry/ascend-ci/verl/verl:verl-8.5.0-910b-ubuntu22.04-py3.11-latest
options: >-
--shm-size 16g
env:
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
# Prints the installed CANN toolkit info and the NPU status for debugging.
- name: Check npu and CANN info
run: |
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
npu-smi info
- name: Check initial pip list from image
run: |
pip list
# Pinned to a full commit SHA (v4.2.2) instead of the mutable `v4` tag.
- name: Checkout verl-project/verl repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
clean: true
- name: Install the current repository
run: |
pip install -r requirements-npu.txt
pip install --no-deps -e .
pip install trl==0.26.0
- name: Check final pip list
run: |
pip list
# Exposes the model cache at ~/models via a symlink.
- name: Prepare weights
run: |
ln -s /root/.cache/models ~/models
# Geo3k
- name: Prepare GEO3K dataset
run: |
python examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/.cache/datasets/hiyouga/geometry3k
- name: Running GEO3K VLM GRPO E2E training tests on Ascend NPUs with rmpad using function rm
run: |
ray stop --force
TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct \
ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
SP_SIZE=2 \
bash tests/special_e2e/ppo_trainer/run_function_reward.sh
# Same settings as the GRPO step except ADV_ESTIMATOR=gae.
- name: Running GEO3K VLM PPO E2E training tests on Ascend NPUs with rmpad using function rm
run: |
ray stop --force
TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct \
ADV_ESTIMATOR=gae RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
SP_SIZE=2 \
bash tests/special_e2e/ppo_trainer/run_function_reward.sh
# GRPO settings plus LORA_RANK=32; LORA_EXCLUDE is the regex ".*visual.*".
- name: Running GEO3K VLM GRPO E2E lora training tests on Ascend NPUs with rmpad using function rm
run: |
ray stop --force
TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct \
ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
SP_SIZE=2 \
LORA_RANK=32 LORA_EXCLUDE=".*visual.*" \
bash tests/special_e2e/ppo_trainer/run_function_reward.sh
================================================
FILE: .github/workflows/e2e_ppo_trainer_veomni_vllm.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are tested on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all scripts whose file name does not have the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default run all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
# Workflow header: name, triggers with path filters, concurrency, permissions and shared env for all jobs below.
name: e2e_ppo_trainer_veomni_vllm
on:
# Trigger the workflow on push or pull request,
# but only for the `main` and `v0.*` branches.
# For push, for now only anti-patterns are specified so it is more conservative
# and achieves higher coverage.
push:
branches:
- main
- v0.*
# Any Python file triggers the workflow, except the `!`-prefixed exclusions that follow.
paths:
- "**/*.py"
# Other entrypoints
- "!verl/trainer/fsdp_sft_trainer.py"
# Megatron
- "!verl/workers/**/megatron_*.py"
pull_request:
branches:
- main
- v0.*
# Order matters: the entries under "# Entrypoints" come after the `!` exclusions and re-include
# specific files that the broader exclusions (e.g. "!examples/**", "!tests/**") removed.
paths:
- "**/*.py"
# Other entrypoints
- "!docker/**"
# Docs
- "!**/*.md"
- "!docs/**"
- "!examples/**"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# Megatron
- "!verl/workers/**/megatron_*.py"
# Entrypoints
- ".github/workflows/e2e_ppo_trainer_veomni_vllm.yml"
- "examples/data_preprocess/gsm8k.py"
- "examples/data_preprocess/geo3k.py"
- "tests/special_e2e/run_ppo_trainer_veomni.sh"
- "verl/trainer/main_ppo.py"
- "verl/trainer/config/ppo_trainer.yaml"
# Cancel jobs on the same ref if a new one is triggered
# (runs on refs/heads/main are never cancelled, because the expression below is false there).
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
# Workflow-level variables consumed by the `setup` and `cleanup` jobs:
# IMAGE is passed as `mlp-image`, DYNAMIC_RUNNER_ENDPOINT as `faas-url`.
env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm017.dev2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs:
# Job: requests a dynamic runner (mode "create") built from the workflow-level IMAGE and exposes
# its label and task id as job outputs for the test job and the `cleanup` job.
# Only runs when the repository owner is 'verl-project'.
setup:
if: github.repository_owner == 'verl-project'
runs-on: ubuntu-latest
outputs:
runner-label: ${{ steps.create-runner.outputs.runner-label }}
mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
steps:
# Pinned to the same full commit SHA that the test job in this workflow uses, instead of the mutable `v4` tag.
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- id: create-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "create"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-image: "${{ env.IMAGE }}"
# Job: runs tests/special_e2e/run_ppo_trainer_veomni.sh twice — once with the script's default model/data
# (FSDP_SIZE=4, SP_SIZE=2) and once on GEO3K with MODEL_ID=Qwen/Qwen3-VL-2B-Instruct (FSDP_SIZE=8, SP_SIZE=1).
e2e_ppo_trainer_veomni_vllm:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 60 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
# VeOmni is installed from its git repository at tag v0.1.4.
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
pip3 install git+https://github.com/ByteDance-Seed/VeOmni.git@v0.1.4
- name: Prepare GSM8K dataset
run: |
ray stop --force
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
- name: Prepare GEO3K dataset
run: |
ray stop --force
python3 examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/models/hf_data/hiyouga/geometry3k/
- name: Running GSM8K E2E training tests on 8 L20 GPUs with veomni engine (FSDP_SIZE=4, USP=2)
run: |
ray stop --force
FSDP_SIZE=4 SP_SIZE=2 bash tests/special_e2e/run_ppo_trainer_veomni.sh
# Train and validation files both come from GEO3K so validation matches the training task
# (previously VAL_FILES pointed at the GSM8K test split).
- name: Running GEO3K E2E training tests on 8 L20 GPUs with veomni engine (FSDP_SIZE=8, USP=1)
run: |
ray stop --force
MODEL_ID=Qwen/Qwen3-VL-2B-Instruct TRAIN_FILES=${HOME}/data/geo3k/train.parquet VAL_FILES=${HOME}/data/geo3k/test.parquet FSDP_SIZE=8 SP_SIZE=1 bash tests/special_e2e/run_ppo_trainer_veomni.sh
# Job: asks the dynamic-runner service to destroy the runner identified by the `mlp-task-id` output of `setup`.
# `if: always()` makes this job run regardless of the outcome of the jobs listed in `needs`.
# NOTE(review): `always()` also applies when `setup` was skipped (its owner check is false), in which case
# `mlp-task-id` would be empty — confirm the destroy action tolerates that.
cleanup:
runs-on: ubuntu-latest
needs:
[
setup,
e2e_ppo_trainer_veomni_vllm,
]
if: always()
steps:
- id: destroy-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "destroy"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
================================================
FILE: .github/workflows/e2e_sft_llm.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are tested on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all scripts with file without the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default runs all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
name: e2e_sft_llm
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
- v0.*
pull_request:
branches:
- main
- v0.*
paths:
# Pattern order matters: a later matching pattern overrides an earlier one,
# so the "Entrypoints" group re-includes files excluded by the "!" patterns above it.
- "**/*.py"
# Other entrypoints
- "!examples/**"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# Megatron
- "!verl/workers/**/megatron_*.py"
# Entrypoints
- ".github/workflows/e2e_sft_llm.yml"
- "examples/data_preprocess/gsm8k.py"
# Dataset script actually invoked by the "Prepare gsm8k dataset" step.
- "examples/data_preprocess/gsm8k_multiturn_sft.py"
# A bare directory name only matches a path with exactly that name; use /** to match the files inside it.
- "tests/special_e2e/sft/**"
- "verl/trainer/fsdp_sft_trainer.py"
- "verl/trainer/config/sft_trainer.yaml"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:sgl059.dev2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs:
setup:
if: github.repository_owner == 'verl-project'
runs-on: ubuntu-latest
outputs:
runner-label: ${{ steps.create-runner.outputs.runner-label }}
mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
steps:
- uses: actions/checkout@v4
- id: create-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "create"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-image: "${{ env.IMAGE }}"
e2e_sft_llm:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 30 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install peft
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
pip3 install git+https://github.com/ByteDance-Seed/VeOmni.git@v0.1.4
- name: Prepare gsm8k dataset
run: |
ray stop --force
python3 examples/data_preprocess/gsm8k_multiturn_sft.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
- name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm
run: |
ray stop --force
bash tests/special_e2e/sft/run_sft.sh
- name: Running GSM8K E2E training tests on 8 L20 GPUs w/o rmpad using function rm
run: |
ray stop --force
RM_PAD=False bash tests/special_e2e/sft/run_sft.sh
- name: Running GSM8K E2E training tests on 8 L20 GPUs with sequence parallelism
run: |
ray stop --force
SP_SIZE=2 bash tests/special_e2e/sft/run_sft.sh
- name: Running GSM8K E2E training tests on 8 L20 GPUs with sequence parallelism and liger
run: |
ray stop --force
SP_SIZE=2 LIGER=True bash tests/special_e2e/sft/run_sft.sh
- name: Running GSM8K E2E training tests with LoRA
run: |
ray stop --force
LORA_RANK=32 bash tests/special_e2e/sft/run_sft.sh
- name: Run GSM8K E2E training and resume tests resuming from the checkpoint manager
run: |
ray stop --force
LORA_RANK=32 RESUME_MODE=auto TOTAL_TRAIN_STEP=2 bash tests/special_e2e/sft/run_sft.sh
# TODO: multiturn
- name: Running GSM8K E2E training tests with multiturn and various configs and compare results
run: |
bash tests/special_e2e/sft/test_sft_engine_all.sh
cleanup:
runs-on: ubuntu-latest
needs: [setup, e2e_sft_llm]
if: always()
steps:
- id: destroy-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "destroy"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
================================================
FILE: .github/workflows/e2e_sft_llm_ascend.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all test scripts without the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default run all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
name: e2e_sft_llm_ascend
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
- v0.*
pull_request:
branches:
- main
- v0.*
paths:
# Pattern order matters: a later matching pattern overrides an earlier one,
# so the "Entrypoints" group re-includes files excluded by the "!" patterns above it.
- "**/*.py"
# Other entrypoints
- "!examples/**"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# Megatron
- "!verl/workers/**/megatron_*.py"
# Entrypoints
- ".github/workflows/e2e_sft_llm_ascend.yml"
- "examples/data_preprocess/gsm8k.py"
# Dataset script actually invoked by the "Prepare gsm8k dataset" step.
- "examples/data_preprocess/gsm8k_multiturn_sft.py"
# A bare directory name only matches a path with exactly that name; use /** to match the files inside it.
- "tests/special_e2e/sft/**"
- "verl/trainer/fsdp_sft_trainer.py"
- "verl/trainer/config/sft_trainer.yaml"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
jobs:
e2e_sft_llm_ascend:
if: github.repository_owner == 'verl-project'
runs-on: linux-aarch64-a2b3-8
timeout-minutes: 90 # Increase this timeout value as needed
container:
image: swr.cn-southwest-2.myhuaweicloud.com/modelfoundry/ascend-ci/verl/verl:verl-8.5.0-910b-ubuntu22.04-py3.11-latest
options: >-
--shm-size 16g
env:
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- name: Check npu and CANN info
run: |
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
npu-smi info
- name: Check initial pip list from image
run: |
pip list
- name: Checkout verl-project/verl repo
uses: actions/checkout@v4
with:
fetch-depth: 0
clean: true
- name: Install the current repository
run: |
pip install -r requirements-npu.txt
pip install -e .
pip install git+https://github.com/ByteDance-Seed/VeOmni.git@v0.1.4
pip install pandas==2.3.3
pip uninstall -y mbridge
pip install git+https://github.com/ISEEKYAN/mbridge.git@89eb10
- name: Check final pip list
run: |
pip list
- name: Prepare weights
run: |
ln -s /root/.cache/models ~/models
- name: Prepare gsm8k dataset
run: |
python3 examples/data_preprocess/gsm8k_multiturn_sft.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k
- name: Running GSM8K E2E training tests on 8 NPUs with rmpad using function rm
run: |
ray stop --force
bash tests/special_e2e/sft/run_sft.sh
- name: Running GSM8K E2E training tests on 8 NPUs w/o rmpad using function rm
run: |
ray stop --force
RM_PAD=False bash tests/special_e2e/sft/run_sft.sh
- name: Running GSM8K E2E training tests on 8 NPUs with sequence parallelism
run: |
ray stop --force
SP_SIZE=2 bash tests/special_e2e/sft/run_sft.sh
- name: Running GSM8K E2E training tests with LoRA
run: |
ray stop --force
LORA_RANK=32 bash tests/special_e2e/sft/run_sft.sh
- name: Run GSM8K E2E training and resume tests resuming from the checkpoint manager
run: |
ray stop --force
LORA_RANK=32 RESUME_MODE=auto TOTAL_TRAIN_STEP=2 bash tests/special_e2e/sft/run_sft.sh
- name: Running GSM8K E2E training tests with multiturn and various configs and compare results
# Runs run_sft_engine.sh once per backend/parallel layout. Each echo announces the
# settings of the command that immediately follows it, so the two must stay in sync.
run: |
ray stop --force
rm -rf ~/verl/test/log
mkdir -p ~/verl/test/log
export VERL_FILE_LOGGER_ROOT=~/verl/test/log
# test with single gpu as golden; its log is written to golden.jsonl
echo "run with single gpu as golden"
BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=1 NUM_GPUS=1 FSDP_STRATEGY=fsdp VERL_FILE_LOGGER_PATH=~/verl/test/log/golden.jsonl bash tests/special_e2e/sft/run_sft_engine.sh
# test with fsdp 1
echo "run with sp2 fsdp_size2 num_gpus8 fsdp_strategy fsdp pad_mode no_padding"
BACKEND=fsdp SP_SIZE=2 FSDP_SIZE=2 NUM_GPUS=8 FSDP_STRATEGY=fsdp PAD_MODE=no_padding bash tests/special_e2e/sft/run_sft_engine.sh
# test with fsdp 1, use_remove_padding disabled and pad_mode no_padding
echo "run with sp1 fsdp_size-1 num_gpus8 fsdp_strategy fsdp pad_mode no_padding use_remove_padding False"
BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=-1 NUM_GPUS=8 FSDP_STRATEGY=fsdp PAD_MODE=no_padding USE_REMOVE_PADDING=False bash tests/special_e2e/sft/run_sft_engine.sh
# test with fsdp 2
echo "run with sp2 fsdp_size2 num_gpus8 fsdp_strategy fsdp2"
BACKEND=fsdp SP_SIZE=2 FSDP_SIZE=2 NUM_GPUS=8 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine.sh
# test with veomni
echo "run with sp2 fsdp_size4 num_gpus8 fsdp_strategy fsdp2"
BACKEND=veomni SP_SIZE=2 FSDP_SIZE=4 NUM_GPUS=8 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine.sh
# test with megatron (VPP_SIZE is passed as NULL here)
echo "run with tp2 pp2 vppNULL cp2 num_gpus8"
BACKEND=megatron TP_SIZE=2 PP_SIZE=2 VPP_SIZE=NULL CP_SIZE=2 NUM_GPUS=8 bash tests/special_e2e/sft/run_sft_engine.sh
# test with cp in ray
echo "run with tp2 pp2 vppNULL cp2 num_gpus8 mode=ray"
BACKEND=megatron TP_SIZE=2 PP_SIZE=2 VPP_SIZE=NULL CP_SIZE=2 NUM_GPUS=8 mode=ray bash tests/special_e2e/sft/run_sft_engine.sh
rm -rf ~/verl/test/log
================================================
FILE: .github/workflows/e2e_sft_vlm.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all test scripts without the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default run all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
name: e2e_sft_vlm
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
- v0.*
pull_request:
branches:
- main
- v0.*
paths:
# Pattern order matters: a later matching pattern overrides an earlier one,
# so the "Entrypoints" group re-includes files excluded by the "!" patterns above it.
- "**/*.py"
# Other entrypoints
- "!examples/**"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# Megatron
- "!verl/workers/**/megatron_*.py"
# Entrypoints
- ".github/workflows/e2e_sft_vlm.yml"
- "examples/data_preprocess/gsm8k.py"
# Dataset script actually invoked by the "Prepare pokemon-gpt4o-captions dataset" step.
- "examples/data_preprocess/pokemon.py"
# A bare directory name only matches a path with exactly that name; use /** to match the files inside it.
- "tests/special_e2e/sft/**"
- "verl/trainer/fsdp_sft_trainer.py"
- "verl/trainer/config/sft_trainer.yaml"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:sgl059.dev2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs:
setup:
if: github.repository_owner == 'verl-project'
runs-on: ubuntu-latest
outputs:
runner-label: ${{ steps.create-runner.outputs.runner-label }}
mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
steps:
- uses: actions/checkout@v4
- id: create-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "create"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-image: "${{ env.IMAGE }}"
e2e_sft_vlm:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 30 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install peft
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
pip3 install git+https://github.com/ByteDance-Seed/VeOmni.git@v0.1.4
- name: Prepare pokemon-gpt4o-captions dataset
run: |
ray stop --force
python3 examples/data_preprocess/pokemon.py --local_dataset_path ${HOME}/models/hf_data/pokemon-gpt4o-captions
- name: Running Pokemon E2E training tests with multiturn and various configs and compare results
run: |
MODEL_ID=Qwen/Qwen3-VL-2B-Instruct DATASET_DIR=~/data/pokemon-gpt4o-captions VPP_SIZE=null bash tests/special_e2e/sft/test_sft_engine_all.sh
cleanup:
runs-on: ubuntu-latest
needs: [setup, e2e_sft_vlm]
if: always()
steps:
- id: destroy-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "destroy"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
================================================
FILE: .github/workflows/gpu_unit_tests.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all test scripts without the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default run all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
name: GPU unit tests
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
- v0.4.x
paths:
- "**/*.py"
- .github/workflows/gpu_unit_tests.yml
pull_request:
branches:
- main
- v0.4.x
paths:
# The order that you define paths patterns matters:
# A matching negative pattern (prefixed with !) after a positive match will exclude the path.
# A matching positive pattern after a negative match will include the path again.
- "**/*.py"
# Other entrypoints
- "!examples/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# Entrypoints
- .github/workflows/gpu_unit_tests.yml
- "tests/**test_*.py"
# Ignore CPU tests.
# A single "*" does not match "/", so "**/" is needed to exclude CPU-only tests in
# subdirectories of tests/ as well as those directly under it.
- "!tests/**/*_on_cpu.py"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:sgl059.dev2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs:
setup:
if: github.repository_owner == 'verl-project'
runs-on: ubuntu-latest
outputs:
runner-label: ${{ steps.create-runner.outputs.runner-label }}
mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
steps:
- uses: actions/checkout@v4
- id: create-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "create"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-image: "${{ env.IMAGE }}"
gpu_unit_tests:
if: github.repository_owner == 'verl-project'
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 60 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1"
HF_HUB_ENABLE_HF_TRANSFER: 1
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install hf_transfer
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
pip3 install cupy-cuda12x==13.6.0 pytest-asyncio
pip3 install --ignore-installed blinker
pip3 install --ignore-installed mlflow "numpy<2.0"
- name: Run all GPU unit tests
run: |
pytest -s -x --ignore-glob="*on_npu.py" --ignore-glob="*test_special_*.py" --ignore-glob='*on_cpu.py' --ignore-glob="*test_vllm*" --ignore-glob="*_sglang*" --ignore-glob="*_hf_rollout*" --ignore-glob="tests/models/" --ignore-glob='tests/special*' --ignore-glob="tests/experimental" --ignore-glob="tests/workers/reward_model" --ignore-glob="*test_shared_memory*" --ignore-glob="tests/workers/rollout/rollout_trtllm" --ignore-glob="*test_bucketed_weight_transfer*" tests/
- name: Testing LinearCrossEntropyTP Correctness, Computation Time and Memory Consumption
run: |
LOW_MEMORY=True torchrun --standalone --nnodes=1 --nproc-per-node=8 tests/utils/test_special_linear_cross_entropy_tp.py
- name: Testing FSDP2 actor functionality
run: |
torchrun --standalone --nnodes=1 --nproc-per-node=2 tests/workers/actor/test_special_dp_actor.py
- name: Testing FSDP2 critic functionality
run: |
torchrun --standalone --nnodes=1 --nproc-per-node=2 tests/workers/critic/test_special_dp_critic.py
cleanup:
runs-on: ubuntu-latest
needs: [setup, gpu_unit_tests]
if: always()
steps:
- id: destroy-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "destroy"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
================================================
FILE: .github/workflows/model.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all test scripts without the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default run all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
# name: Check PR Title
name: model
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
- v0.*
pull_request:
branches:
- main
- v0.*
paths:
- "verl/**/*.py"
# Entrypoints
- ".github/workflows/model.yml"
- "tests/special_distributed/test_fsdp_ckpt.py"
- "tests/special_distributed/test_tensor_dict.py"
- "tests/models/**"
- "tests/special_distributed/run_all.sh"
# Declare permissions just read content.
permissions:
contents: read
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm017.dev2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs:
setup:
if: github.repository_owner == 'verl-project'
runs-on: ubuntu-latest
outputs:
runner-label: ${{ steps.create-runner.outputs.runner-label }}
mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
steps:
- uses: actions/checkout@v4
- id: create-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "create"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-image: "${{ env.IMAGE }}"
model_rmpad:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 20 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
# NOTE(review): the step name below mentions transformers 4.54.0, but the command
# installs the newest release satisfying "transformers<5.0.0" — confirm which is intended.
- name: Install the current repository and upgrade to latest transformers(4.54.0)/flash_attn, transformers 4.55.0 has strange behavior with model backward
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
pip3 install --upgrade "transformers<5.0.0"
# NOTE(review): this step and the next one run the identical pytest command, and no
# install step sits between them, so both appear to exercise the same flash_attn
# build despite their different names — TODO confirm whether one can be dropped.
- name: Running rmpad model tests on 8 L20 GPUs + flash_attn 2.5.8
run: |
pytest -s tests/models/test_transformer.py
- name: Running rmpad model tests on 8 L20 GPUs + latest flash_attn
run: |
pytest -s tests/models/test_transformer.py
- name: Running FSDP rmpad model tests on 8 L20 GPUs + latest flash_attn
run: |
STRATEGY=fsdp torchrun --nproc_per_node=8 tests/special_distributed/test_fsdp_ckpt.py
- name: Running transformers ulysses tests on 8 L20 GPUs + latest transformers
run: |
torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
- name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.54.1
run: |
pip3 install transformers==4.54.1
torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
- name: Run distributed test
run: |
bash tests/special_distributed/run_all.sh
# TODO: Move this back to model_rmpad once FSDP2 is stable.
# NOTE: List as an independent job to make rerun easier.
model_rmpad_fsdp2_unstable:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 20 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository and upgrade to latest transformers/flash_attn
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
- name: Running FSDP2 rmpad model tests on 8 L20 GPUs + latest flash_attn
run: |
STRATEGY=fsdp2 torchrun --nproc_per_node=8 tests/special_distributed/test_fsdp_ckpt.py
model_engine:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 20 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
- name: Download model config files
run: |
hf download Qwen/Qwen2.5-0.5B-Instruct --local-dir $HOME/models/Qwen/Qwen2.5-0.5B-Instruct
- name: Running mcore engine tests on 8 L20 GPUs
run: |
ray stop --force
pytest -s -x tests/models/test_engine.py
cleanup:
runs-on: ubuntu-latest
needs: [setup, model_rmpad, model_rmpad_fsdp2_unstable, model_engine]
if: always()
steps:
- id: destroy-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "destroy"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
================================================
FILE: .github/workflows/model_ascend.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all test scripts without the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default run all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
# name: Check PR Title
name: model_ascend
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
- v0.*
pull_request:
branches:
- main
- v0.*
paths:
- "verl/**/*.py"
# Entrypoints
- ".github/workflows/model_ascend.yml"
- "tests/special_distributed/test_fsdp_ckpt.py"
- "tests/special_distributed/test_tensor_dict.py"
- "tests/models/**"
- "tests/special_distributed/run_all.sh"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
permissions:
contents: read
jobs:
model_rmpad_ascend:
if: github.repository_owner == 'verl-project'
runs-on: linux-aarch64-a2b3-8
timeout-minutes: 60 # Increase this timeout value as needed
container:
image: swr.cn-southwest-2.myhuaweicloud.com/modelfoundry/ascend-ci/verl/verl:verl-8.5.0-910b-ubuntu22.04-py3.11-latest
options: >-
--shm-size 16g
env:
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- name: Check npu and CANN info
run: |
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
npu-smi info
- name: Check initial pip list from image
run: |
pip list
- name: Checkout verl-project/verl repo
uses: actions/checkout@v4
with:
fetch-depth: 0
clean: true
# Installs the NPU requirements, then the repository itself in editable mode.
# NOTE(review): with --no-deps, pip skips dependency installation, so the [test]
# extra presumably adds nothing here — confirm whether the test deps are expected
# to come from requirements-npu.txt or the image.
- name: Install the current repository
run: |
pip install -r requirements-npu.txt
pip install --no-deps -e .[test]
- name: Check final pip list
run: |
pip list
- name: Prepare weights
run: |
ln -s /root/.cache/models ~/models
- name: Running rmpad model tests on 8 NPUs
run: |
pytest -s tests/models/test_transformer.py
- name: Running FSDP rmpad model tests on 8 NPUs
run: |
STRATEGY=fsdp torchrun --nproc_per_node=8 tests/special_distributed/test_fsdp_ckpt.py
- name: Running transformers ulysses tests on 8 NPUs
run: |
torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
- name: Run distributed test
run: |
bash tests/special_distributed/run_all.sh
# TODO: Move this back to model_rmpad once FSDP2 is stable.
# NOTE: List as an independent job to make rerun easier.
model_rmpad_fsdp2_unstable_ascend:
if: github.repository_owner == 'verl-project'
runs-on: linux-aarch64-a2b3-8
timeout-minutes: 60
container:
image: swr.cn-southwest-2.myhuaweicloud.com/modelfoundry/ascend-ci/verl/verl:verl-8.5.0-910b-ubuntu22.04-py3.11-latest
options: >-
--shm-size 16g
env:
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
# Installs the NPU requirements, then the repository itself in editable mode.
# NOTE(review): with --no-deps, pip skips dependency installation, so the [test]
# extra presumably adds nothing here — confirm whether the test deps are expected
# to come from requirements-npu.txt or the image.
- name: Install the current repository
run: |
pip install -r requirements-npu.txt
pip install --no-deps -e .[test]
- name: Prepare weights
run: |
ln -s /root/.cache/models ~/models
- name: Running FSDP2 rmpad model tests on 8 NPUs
run: |
STRATEGY=fsdp2 torchrun --nproc_per_node=8 tests/special_distributed/test_fsdp_ckpt.py
================================================
FILE: .github/workflows/nightly_ascend.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts whose names end with `on_cpu.py` are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all test scripts whose file names do not have the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default run all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
name: nightly_ci_ascend
on:
# This workflow is triggered only by the schedule below; it has no push or
# pull_request triggers.
# The cron expression fires once a day at minute 0 of hour 17
# (GitHub Actions evaluates schedules in UTC).
schedule:
- cron: "0 17 * * *"
# Declare permissions just read content.
permissions:
contents: read
jobs:
# Test ppo qwen3-8b fsdp+vllm
nightlyCI_ppo-qwen3-8b-fsdp-vllm_ascend:
# Nightly job that runs tests/special_npu/nightly_ci_ascend/run_ppo_qwen3-8b_fsdp_npu.sh.
# Skipped on forks: only runs when the repository owner is verl-project.
if: github.repository_owner == 'verl-project'
runs-on: linux-aarch64-a2b3-8
timeout-minutes: 180 # Increase this timeout value as needed
container:
image: swr.cn-southwest-2.myhuaweicloud.com/modelfoundry/ascend-ci/verl/verl:verl-8.5.0-910b-ubuntu22.04-py3.11-latest
options: >-
--shm-size 16g
env:
# Point Hugging Face Hub traffic at the hf-mirror.com endpoint.
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
# Print the Ascend toolkit install info file and the npu-smi report for debugging.
- name: Check npu and CANN info
run: |
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
npu-smi info
# Record the packages present in the image before anything is installed.
- name: Check initial pip list from image
run: |
pip list
# Full-history checkout (fetch-depth 0) with `clean: true`.
- name: Checkout verl-project/verl repo
uses: actions/checkout@v4
with:
fetch-depth: 0
clean: true
# Install the NPU requirements, then the repo in editable mode;
# --no-deps keeps pip from installing the package's own dependencies.
- name: Install the current repository
run: |
pip install -r requirements-npu.txt
pip install --no-deps -e .
# Record the package set after installation.
- name: Check final pip list
run: |
pip list
# Symlink the model cache directory to ~/models.
- name: Prepare weights
run: |
ln -s /root/.cache/models ~/models
# Run the GSM8K preprocessing script, pointing it at a dataset path under ~/.cache/datasets.
- name: Prepare GSM8K dataset
run: |
python examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k
# Force-stop Ray before launching the run script
# (NOTE(review): presumably to clear leftover Ray processes — confirm).
- name: Running nightlyCI_ppo-qwen3-8b-fsdp-vllm_ascend
run: |
ray stop --force
bash tests/special_npu/nightly_ci_ascend/run_ppo_qwen3-8b_fsdp_npu.sh
# Test grpo qwen25-7b-Instruct fsdp+vllm
nightlyCI_grpo-qwen25-7b-Instruct-fsdp-vllm_ascend:
# Nightly job that runs tests/special_npu/nightly_ci_ascend/run_grpo_qwen25-7b-instruct_fsdp_npu.sh.
# Skipped on forks: only runs when the repository owner is verl-project.
if: github.repository_owner == 'verl-project'
runs-on: linux-aarch64-a2b3-8
timeout-minutes: 180 # Increase this timeout value as needed
container:
image: swr.cn-southwest-2.myhuaweicloud.com/modelfoundry/ascend-ci/verl/verl:verl-8.5.0-910b-ubuntu22.04-py3.11-latest
options: >-
--shm-size 16g
env:
# Point Hugging Face Hub traffic at the hf-mirror.com endpoint.
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
# Print the Ascend toolkit install info file and the npu-smi report for debugging.
- name: Check npu and CANN info
run: |
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
npu-smi info
# Record the packages present in the image before anything is installed.
- name: Check initial pip list from image
run: |
pip list
# Full-history checkout (fetch-depth 0) with `clean: true`.
- name: Checkout verl-project/verl repo
uses: actions/checkout@v4
with:
fetch-depth: 0
clean: true
# Install the NPU requirements, then the repo in editable mode;
# --no-deps keeps pip from installing the package's own dependencies.
- name: Install the current repository
run: |
pip install -r requirements-npu.txt
pip install --no-deps -e .
# Record the package set after installation.
- name: Check final pip list
run: |
pip list
# Symlink the model cache directory to ~/models.
- name: Prepare weights
run: |
ln -s /root/.cache/models ~/models
# Run the GSM8K preprocessing script, pointing it at a dataset path under ~/.cache/datasets.
- name: Prepare GSM8K dataset
run: |
python examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k
# Force-stop Ray before launching the run script
# (NOTE(review): presumably to clear leftover Ray processes — confirm).
- name: Running nightlyCI_grpo-qwen25-7b-Instruct-fsdp-vllm_ascend
run: |
ray stop --force
bash tests/special_npu/nightly_ci_ascend/run_grpo_qwen25-7b-instruct_fsdp_npu.sh
# Test grpo qwen25-vl-3b-Instruct fsdp+vllm
nightlyCI_grpo-qwen25-vl-3b-Instruct-fsdp-vllm_ascend:
# Nightly job that runs tests/special_npu/nightly_ci_ascend/run_grpo_qwen25-vl-3b-instruct_fsdp_npu.sh.
# Skipped on forks: only runs when the repository owner is verl-project.
if: github.repository_owner == 'verl-project'
runs-on: linux-aarch64-a2b3-8
timeout-minutes: 180 # Increase this timeout value as needed
container:
image: swr.cn-southwest-2.myhuaweicloud.com/modelfoundry/ascend-ci/verl/verl:verl-8.5.0-910b-ubuntu22.04-py3.11-latest
options: >-
--shm-size 16g
env:
# Point Hugging Face Hub traffic at the hf-mirror.com endpoint.
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
# Print the Ascend toolkit install info file and the npu-smi report for debugging.
- name: Check npu and CANN info
run: |
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
npu-smi info
# Record the packages present in the image before anything is installed.
- name: Check initial pip list from image
run: |
pip list
# Full-history checkout (fetch-depth 0) with `clean: true`.
- name: Checkout verl-project/verl repo
uses: actions/checkout@v4
with:
fetch-depth: 0
clean: true
# Install the NPU requirements, then the repo in editable mode;
# --no-deps keeps pip from installing the package's own dependencies.
- name: Install the current repository
run: |
pip install -r requirements-npu.txt
pip install --no-deps -e .
# Record the package set after installation.
- name: Check final pip list
run: |
pip list
# Symlink the model cache directory to ~/models.
- name: Prepare weights
run: |
ln -s /root/.cache/models ~/models
# Run the geo3k preprocessing script, pointing it at a dataset path under ~/.cache/datasets.
- name: Preprocess geo3k dataset
run: |
python examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/.cache/datasets/hiyouga/geometry3k
# Force-stop Ray before launching the run script
# (NOTE(review): presumably to clear leftover Ray processes — confirm).
- name: Running nightlyCI_grpo-qwen25-vl-3b-Instruct-fsdp-vllm_ascend
run: |
ray stop --force
bash tests/special_npu/nightly_ci_ascend/run_grpo_qwen25-vl-3b-instruct_fsdp_npu.sh
================================================
FILE: .github/workflows/npu_unit_tests.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts whose names end with `on_cpu.py` are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all test scripts whose file names do not have the `on_cpu.py` suffix.
# - `npu_unit_tests.yml`, run pytest on all test scripts whose file names do not have the `on_cpu.py` suffix, on Ascend devices.
# - Since cpu/gpu/npu unit tests by default run all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
name: NPU unit tests
on:
# Run on pushes to main and v0.* branches and on pull requests targeting main,
# restricted by the path filters below.
push:
branches:
- main
- v0.*
paths:
- "**/*.py"
- .github/workflows/npu_unit_tests.yml
pull_request:
branches:
- main
paths:
# The order that you define paths patterns matters:
# A matching negative pattern (prefixed with !) after a positive match will exclude the path.
# A matching positive pattern after a negative match will include the path again.
- "**/*.py"
# Other entrypoints
- "!examples/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
- "!recipe/**"
# Entrypoints
- .github/workflows/npu_unit_tests.yml
- "tests/**test_*.py"
# Ignore CPU tests
# `*` does not match `/`, so `**/` is required for this exclusion to reach
# `*_on_cpu.py` files in sub-folders of tests/ and not only files directly under it.
- "!tests/**/*_on_cpu.py"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
# Runs on refs/heads/main are never cancelled; every other ref cancels its in-progress run.
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
jobs:
npu_unit_tests:
# Runs the unit test suite on an Ascend NPU runner.
# Skipped on forks: only runs when the repository owner is verl-project.
if: github.repository_owner == 'verl-project'
runs-on: linux-aarch64-a2b3-8
timeout-minutes: 60 # Increase this timeout value as needed
container:
image: swr.cn-southwest-2.myhuaweicloud.com/modelfoundry/ascend-ci/verl/verl:verl-8.5.0-910b-ubuntu22.04-py3.11-latest
options: >-
--shm-size 16g
env:
# Point Hugging Face Hub traffic at the hf-mirror.com endpoint.
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
# Print the Ascend toolkit install info file and the npu-smi report for debugging.
- name: Check npu and CANN info
run: |
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
npu-smi info
# Record the packages present in the image before anything is installed.
- name: Check initial pip list from image
run: |
pip list
# Step name uses the verl-project organization, matching the job guard above
# and the checkout steps of the other Ascend workflows.
- name: Checkout verl-project/verl repo
uses: actions/checkout@v4
with:
fetch-depth: 0
clean: true
# Install the NPU requirements, the repo in editable mode with the `test` extra
# (--no-deps keeps pip from installing its dependencies), then extra test-only packages.
- name: Install the current repository
run: |
pip install -r requirements-npu.txt
pip install --no-deps -e .[test]
pip install mlflow pytest-asyncio
# Record the package set after installation.
- name: Check final pip list
run: |
pip list
# Symlink the model cache directory to ~/models.
- name: Prepare weights
run: |
ln -s /root/.cache/models ~/models
# Run everything under tests/ except the paths excluded via --ignore-glob;
# -x stops at the first failure. Some of the ignored files are run by the dedicated steps below.
- name: Run all NPU unit tests
run: |
pytest -s -x --ignore-glob="*test_special_*.py" --ignore-glob="*on_cpu.py" --ignore-glob="*test_vllm*" --ignore-glob="*_sglang*" --ignore-glob="*_hf_rollout*" --ignore-glob="tests/models/" --ignore-glob="tests/special*" --ignore-glob="tests/experimental" --ignore-glob="tests/workers/reward_model" --ignore-glob="*test_rvdz*" --ignore-glob="*test_ray_collectives*" --ignore-glob="*test_nvtx_profile*" --ignore-glob="tests/checkpoint_engine" --ignore-glob="*test_shared_memory*" --ignore-glob="tests/workers/rollout/rollout_trtllm" --ignore-glob="*test_fsdp_lora_merge*" --ignore-glob="*test_activation_offload*" --ignore-glob="*test_normalize_peft_param_name.py*" tests/
# Excluded from the bulk run above; executed in its own pytest invocation.
- name: Testing activation offload
run: |
pytest -s -x tests/utils/test_activation_offload.py
# Excluded from the bulk run above; executed in its own pytest invocation.
- name: Testing normalize peft param name
run: |
pytest -s -x tests/utils/test_normalize_peft_param_name.py
# Launched through torchrun with 2 processes on a single node.
- name: Testing FSDP2 actor functionality
run: |
torchrun --standalone --nnodes=1 --nproc-per-node=2 tests/workers/actor/test_special_dp_actor.py
# Launched through torchrun with 2 processes on a single node.
- name: Testing FSDP2 critic functionality
run: |
torchrun --standalone --nnodes=1 --nproc-per-node=2 tests/workers/critic/test_special_dp_critic.py
# Matches the `*test_special_*.py` exclusion of the bulk run, so it is run explicitly here.
- name: Running NPU profiling unit tests
run: |
pytest -s -x tests/utils/test_special_mstx_profile.py
================================================
FILE: .github/workflows/pre-commit.yml
================================================
# c.f. https://github.com/pre-commit/action?tab=readme-ov-file#using-this-action
name: pre-commit
# No need to avoid / cancel lightweight pre-commit jobs
on:
# Cron fires at 00:00 on day-of-week 0 (Sunday); GitHub Actions evaluates schedules in UTC.
schedule:
- cron: "0 0 * * 0"
# Every pull request, with no branch or path filter.
pull_request:
# Pushes to main and v0.* branches.
push:
branches:
- main
- v0.*
# Allow manual triggering
workflow_dispatch:
# Declare permissions just read content.
permissions:
contents: read
jobs:
pre-commit:
# Runs the repository's pre-commit hooks on a GitHub-hosted runner.
runs-on: ubuntu-latest
strategy:
matrix:
# Single-entry matrix; add versions here to run the hooks on more interpreters.
python-version: ["3.12"]
steps:
# Both actions below are pinned to full commit SHAs, with the tag noted alongside.
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
# Install pre-commit and hydra-core, then the repo in editable mode without its dependencies.
- name: Install the current repository
run: |
pip install pre-commit hydra-core
pip install --no-deps -e .
# Rewrite the hook config so ruff uses --output-format=github instead of full, then stage the edit
# (NOTE(review): staging is presumably so the modified config is not treated as unstaged by pre-commit — confirm).
- name: Set ruff --output-format=github
run: |
sed -i 's/--output-format=full/--output-format=github/' .pre-commit-config.yaml
git add .pre-commit-config.yaml
# Check "--all-files" by default
- uses: pre-commit/action@v3.0.1
================================================
FILE: .github/workflows/precommit-autofix.yml
================================================
name: scheduled pre-commit autofix
on:
schedule:
# Every hour
- cron: "0 * * * *"
# Also allow manual runs.
workflow_dispatch:
# Write access to contents and pull requests is granted here; the job below
# pushes a branch and opens or updates a pull request.
permissions:
contents: write
pull-requests: write
jobs:
precommit:
# Runs pre-commit over the whole repo and proposes the resulting changes as a pull request.
# Skipped on forks: only runs when the repository owner is verl-project.
if: github.repository_owner == 'verl-project'
runs-on: ubuntu-latest
steps:
# Full-history checkout (fetch-depth 0).
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
# Upgrade pip, then install pre-commit and hydra-core.
- name: Install pre-commit
run: |
python -m pip install --upgrade pip
pip install pre-commit hydra-core
# `|| true` makes this step succeed even when pre-commit exits non-zero,
# so the job continues to the pull-request step with whatever the hooks changed.
- name: Run pre-commit
run: |
pre-commit run --all-files || true
# Commit the working-tree changes to the bot/precommit-autofix branch and open or update a PR for it.
- name: Create or update PR
uses: peter-evans/create-pull-request@v6
with:
branch: bot/precommit-autofix
delete-branch: true
title: "[ci] chore: scheduled pre-commit autofix"
commit-message: "chore: auto-fix pre-commit issues"
body: |
This PR was created automatically by a scheduled GitHub Action.
- Runs `pre-commit run --all-files`
- Triggered hourly
labels: |
automated
pre-commit
================================================
FILE: .github/workflows/reward_model_sglang.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts whose names end with `on_cpu.py` are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all test scripts whose file names do not have the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default run all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
# name: Check PR Title
name: reward_model_sglang
on:
# Trigger the workflow on push or pull request,
# for the main branch and v0.* branches.
# The push trigger has no path filter; only pull requests are filtered by path.
push:
branches:
- main
- v0.*
pull_request:
branches:
- main
- v0.*
paths:
- "verl/**/*.py"
# Entrypoints
- ".github/workflows/reward_model_sglang.yml"
- "tests/experimental/reward_loop/**"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
# Runs on refs/heads/main are never cancelled; every other ref cancels its in-progress run.
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
# Workflow-level values read by the setup and cleanup jobs:
# the image passed as `mlp-image` and the endpoint passed as `faas-url`.
env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:sgl059.dev2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs:
setup:
# Calls the vemlp-github-runner action in "create" mode and publishes its outputs for later jobs.
# Skipped on forks: only runs when the repository owner is verl-project.
if: github.repository_owner == 'verl-project'
runs-on: ubuntu-latest
outputs:
# runner-label is consumed by the test job's `runs-on`; mlp-task-id by the cleanup job.
runner-label: ${{ steps.create-runner.outputs.runner-label }}
mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
steps:
- uses: actions/checkout@v4
- id: create-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "create"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-image: "${{ env.IMAGE }}"
reward_model_sglang:
# Runs the reward-loop tests with ROLLOUT_NAME=sglang on the runner created by `setup`.
needs: setup
# Use the label reported by `setup`; fall back to 'L20x8' when that output is empty.
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 30 # Increase this timeout value as needed
env:
# Proxy settings come from repository secrets; NO_PROXY lists hosts reached without the proxy.
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK: "True"
NCCL_SHM_DISABLE: "1"
NCCL_P2P_DISABLE: "1"
steps:
# actions/checkout pinned to a full commit SHA; fetch-depth 0 fetches the complete history.
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
# Install the test requirements, the repo in editable mode without its dependencies,
# and a pinned sglang-router version.
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
pip3 install sglang-router==0.2.2
# Force-stop Ray, then run the GSM8K preprocessing script, writing to ~/data/gsm8k.
- name: Prepare gsm8k dataset
run: |
ray stop --force
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k --local_dir ${HOME}/data/gsm8k
# Each test step below first unsets the proxy variables so the tests run without the proxy configured above.
- name: Running sglang generative reward model tests on 8 L20 GPUs
run: |
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
ROLLOUT_NAME=sglang pytest -s -x tests/experimental/reward_loop/test_reward_model_genrm.py
- name: Running sglang discriminative reward model tests on 8 L20 GPUs
run: |
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
ROLLOUT_NAME=sglang pytest -s -x tests/experimental/reward_loop/test_reward_model_disrm.py
- name: Running sglang agent loop with reward manager tests on 8 L20 GPUs
run: |
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
ROLLOUT_NAME=sglang pytest -s -x tests/experimental/reward_loop/test_agent_reward_loop_standalone.py
- name: Running sglang agent loop with reward model colocate tests on 8 L20 GPUs
run: |
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
ROLLOUT_NAME=sglang pytest -s -x tests/experimental/reward_loop/test_agent_reward_loop_colocate.py
cleanup:
# Destroys the runner that `setup` created, using the task id it published.
runs-on: ubuntu-latest
needs: [setup, reward_model_sglang]
# always() keeps cleanup running after failed or cancelled tests. The owner check
# mirrors the guard on `setup`: on forks `setup` is skipped and no runner exists,
# so the destroy call (which would receive an empty mlp-task-id) is skipped too.
if: always() && github.repository_owner == 'verl-project'
steps:
- id: destroy-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "destroy"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
================================================
FILE: .github/workflows/reward_model_vllm.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts whose names end with `on_cpu.py` are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all test scripts whose file names do not have the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default run all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
# name: Check PR Title
name: reward_model_vllm
on:
# Trigger the workflow on push or pull request,
# for the main branch and v0.* branches.
# The push trigger has no path filter; only pull requests are filtered by path.
push:
branches:
- main
- v0.*
pull_request:
branches:
- main
- v0.*
paths:
- "verl/**/*.py"
# Entrypoints
- ".github/workflows/reward_model_vllm.yml"
- "tests/experimental/reward_loop/**"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
# Runs on refs/heads/main are never cancelled; every other ref cancels its in-progress run.
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
# Workflow-level values read by the setup and cleanup jobs:
# the image passed as `mlp-image` and the endpoint passed as `faas-url`.
env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm017.dev2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs:
setup:
# Calls the vemlp-github-runner action in "create" mode and publishes its outputs for later jobs.
# Skipped on forks: only runs when the repository owner is verl-project.
if: github.repository_owner == 'verl-project'
runs-on: ubuntu-latest
outputs:
# runner-label is consumed by the test job's `runs-on`; mlp-task-id by the cleanup job.
runner-label: ${{ steps.create-runner.outputs.runner-label }}
mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
steps:
- uses: actions/checkout@v4
- id: create-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "create"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-image: "${{ env.IMAGE }}"
reward_model_vllm:
# Runs the reward-loop tests with ROLLOUT_NAME=vllm on the runner created by `setup`.
needs: setup
# Use the label reported by `setup`; fall back to 'L20x8' when that output is empty.
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 30 # Increase this timeout value as needed
env:
# Proxy settings come from repository secrets; NO_PROXY lists hosts reached without the proxy.
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
# NOTE(review): SGL_-prefixed variable in the vLLM workflow; presumably carried over from the sglang workflow — confirm it is still needed.
SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK: "True"
NCCL_SHM_DISABLE: "1"
NCCL_P2P_DISABLE: "1"
steps:
# actions/checkout pinned to a full commit SHA; fetch-depth 0 fetches the complete history.
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
# Install the test requirements, then the repo in editable mode without its dependencies.
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
# Force-stop Ray, then run the GSM8K preprocessing script, writing to ~/data/gsm8k.
- name: Prepare gsm8k dataset
run: |
ray stop --force
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k --local_dir ${HOME}/data/gsm8k
# Each test step below first unsets the proxy variables so the tests run without the proxy configured above.
- name: Running vllm generative reward model tests on 8 L20 GPUs
run: |
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
ROLLOUT_NAME=vllm pytest -s -x tests/experimental/reward_loop/test_reward_model_genrm.py
- name: Running vllm discriminative reward model tests on 8 L20 GPUs
run: |
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
ROLLOUT_NAME=vllm pytest -s -x tests/experimental/reward_loop/test_reward_model_disrm.py
- name: Running vllm agent loop with reward manager tests on 8 L20 GPUs
run: |
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
ROLLOUT_NAME=vllm pytest -s -x tests/experimental/reward_loop/test_agent_reward_loop_standalone.py
- name: Running vllm agent loop with reward model colocate tests on 8 L20 GPUs
run: |
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
ROLLOUT_NAME=vllm pytest -s -x tests/experimental/reward_loop/test_agent_reward_loop_colocate.py
cleanup:
# Destroys the runner that `setup` created, using the task id it published.
runs-on: ubuntu-latest
needs: [setup, reward_model_vllm]
# always() keeps cleanup running after failed or cancelled tests. The owner check
# mirrors the guard on `setup`: on forks `setup` is skipped and no runner exists,
# so the destroy call (which would receive an empty mlp-task-id) is skipped too.
if: always() && github.repository_owner == 'verl-project'
steps:
- id: destroy-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "destroy"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
================================================
FILE: .github/workflows/reward_model_vllm_ascend.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts whose names end with `on_cpu.py` are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all test scripts whose file names do not have the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default run all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
# name: Check PR Title
name: reward_model_vllm_ascend
on:
# Trigger the workflow on push or pull request,
# for the main branch and v0.* branches.
# The push trigger has no path filter; only pull requests are filtered by path.
push:
branches:
- main
- v0.*
pull_request:
branches:
- main
- v0.*
paths:
- "verl/**/*.py"
# Entrypoints
- ".github/workflows/reward_model_vllm_ascend.yml"
- "tests/experimental/reward_loop/**"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
# Runs on refs/heads/main are never cancelled; every other ref cancels its in-progress run.
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
jobs:
reward_model_vllm_ascend:
# Runs the reward-loop tests with ROLLOUT_NAME=vllm on an Ascend NPU runner.
# Skipped on forks: only runs when the repository owner is verl-project.
if: github.repository_owner == 'verl-project'
runs-on: linux-aarch64-a2b3-8
timeout-minutes: 60 # Increase this timeout value as needed
container:
image: swr.cn-southwest-2.myhuaweicloud.com/modelfoundry/ascend-ci/verl/verl:verl-8.5.0-910b-ubuntu22.04-py3.11-latest
options: >-
--shm-size 16g
env:
# Point Hugging Face Hub traffic at the hf-mirror.com endpoint.
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
# Print the Ascend toolkit install info file and the npu-smi report for debugging.
- name: Check npu and CANN info
run: |
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
npu-smi info
# Record the packages present in the image before anything is installed.
- name: Check initial pip list from image
run: |
pip list
# Full-history checkout (fetch-depth 0) with `clean: true`.
- name: Checkout verl-project/verl repo
uses: actions/checkout@v4
with:
fetch-depth: 0
clean: true
# Install the NPU requirements, then the repo in editable mode with the `test` extra;
# --no-deps keeps pip from installing the package's own dependencies.
- name: Install the current repository
run: |
pip install -r requirements-npu.txt
pip install --no-deps -e .[test]
# Record the package set after installation.
- name: Check final pip list
run: |
pip list
# Symlink the model cache directory to ~/models.
- name: Prepare weights
run: |
ln -s /root/.cache/models ~/models
# Force-stop Ray, then run the GSM8K preprocessing script, writing to ~/data/gsm8k.
- name: Prepare gsm8k dataset
run: |
ray stop --force
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k --local_dir ${HOME}/data/gsm8k
- name: Running vllm generative reward model tests on 8 NPUs
run: |
ROLLOUT_NAME=vllm pytest -s -x tests/experimental/reward_loop/test_reward_model_genrm.py
- name: Running vllm discriminative reward model tests on 8 NPUs
run: |
ROLLOUT_NAME=vllm pytest -s -x tests/experimental/reward_loop/test_reward_model_disrm.py
- name: Running vllm agent loop with reward manager tests on 8 NPUs
run: |
ROLLOUT_NAME=vllm pytest -s -x tests/experimental/reward_loop/test_agent_reward_loop_standalone.py
# Only this step exports the two HCCL socket port-range variables (set to `auto`) before running pytest.
- name: Running vllm agent loop with reward model colocate tests on 8 NPUs
run: |
export HCCL_HOST_SOCKET_PORT_RANGE=auto
export HCCL_NPU_SOCKET_PORT_RANGE=auto
ROLLOUT_NAME=vllm pytest -s -x tests/experimental/reward_loop/test_agent_reward_loop_colocate.py
================================================
FILE: .github/workflows/sanity.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default, tests are run with a GPU available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`: runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`: runs pytest on all test scripts whose file name does not end with the `on_cpu.py` suffix
# - Since the CPU/GPU unit tests run all tests under `tests` by default, please make sure tests are manually excluded from them when
# - a new workflow yaml is added to `.github/workflows`
# - new tests are added to a workflow mentioned in 2.
# name: Check PR Title
name: sanity
on:
# Trigger the workflow on push or pull request,
# but only for the `main` and `v0.*` branches
push:
branches:
- main
- v0.*
pull_request:
branches:
- main
- v0.*
paths:
- "**/*.py"
- .github/workflows/sanity.yml
- "tests/special_sanity/**"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Grant the workflow read-only access to repository contents.
permissions:
contents: read
jobs:
sanity:
runs-on: ubuntu-latest
timeout-minutes: 5 # Increase this timeout value as needed
strategy:
matrix:
python-version: ["3.10"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install the current repository
run: |
pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cpu
pip3 install -r requirements.txt
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
- name: Run sanity test
run: |
pytest -s -x tests/special_sanity
- name: Run license test
run: |
python3 tests/special_sanity/check_license.py --directories .
- name: Assert naming convention
run: |
if grep -rIn --exclude-dir=.git --exclude-dir=.github --exclude-dir=venv --exclude-dir=__pycache__ 'veRL' .; then
echo "Please use verl instead of veRL in the codebase"
exit 1
fi
- name: Assert SGLang naming convention
run: |
if grep -rIn --exclude-dir=.git --exclude-dir=.github --exclude-dir=venv --exclude-dir=__pycache__ --exclude=ascend_sglang_best_practices.rst -E 'Sglang|sgLang|sglAng|sglaNg|sglanG' .; then
echo "Please use SGLang or sglang as the formal name of SGLang rollout engine"
exit 1
fi
- name: Validate test folder structure
run: python3 tests/special_sanity/validate_structure.py
- name: Assert documentation requirement for functions
run: python3 tests/special_sanity/validate_imported_docs.py
- name: Assert device api usage in verl/verl
run: python3 tests/special_sanity/check_device_api_usage.py --directory ./verl
- name: Assert documentation time info
run: python3 tests/special_sanity/check_docs_time_info.py
- name: Check docstrings for specified files
run: python3 tests/special_sanity/check_docstrings.py
- name: Check DataProto for specified folders
run: python3 tests/special_sanity/check_dataproto_usage.py -d ./verl/workers/engine
================================================
FILE: .github/workflows/scorecard.yml
================================================
# This workflow uses actions that are not certified by GitHub. They are provided
# by a third-party and are governed by separate terms of service, privacy
# policy, and support documentation.
name: Scorecard supply-chain security
on:
# For Branch-Protection check. Only the default branch is supported. See
# https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
branch_protection_rule:
# To guarantee Maintained check is occasionally updated. See
# https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
schedule:
- cron: "27 7 * * 1"
push:
branches:
- main
- v0.*
# Declare default permissions as read only.
permissions: read-all
jobs:
analysis:
name: Scorecard analysis
runs-on: ubuntu-latest
permissions:
# Needed to upload the results to code-scanning dashboard.
security-events: write
# Needed to publish results and get a badge (see publish_results below).
id-token: write
# Uncomment the permissions below if installing in a private repository.
# contents: read
# actions: read
steps:
- name: "Checkout code"
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
with:
persist-credentials: false
- name: "Run analysis"
uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1
with:
results_file: results.sarif
results_format: sarif
# (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
# - you want to enable the Branch-Protection check on a *public* repository, or
# - you are installing Scorecard on a *private* repository
# To create the PAT, follow the steps in https://github.com/ossf/scorecard-action?tab=readme-ov-file#authentication-with-fine-grained-pat-optional.
# repo_token: ${{ secrets.SCORECARD_TOKEN }}
# Public repositories:
# - Publish results to OpenSSF REST API for easy access by consumers
# - Allows the repository to include the Scorecard badge.
# - See https://github.com/ossf/scorecard-action#publishing-results.
# For private repositories:
# - `publish_results` will always be set to `false`, regardless
# of the value entered here.
publish_results: true
# Upload the results to GitHub's code scanning dashboard (optional).
# Commenting out will disable upload of results to your repo's Code Scanning dashboard
- name: "Upload to code-scanning"
uses: github/codeql-action/upload-sarif@9e8d0789d4a0fa9ceb6b1738f7e269594bdd67f0 #v3.28.9
with:
sarif_file: results.sarif
================================================
FILE: .github/workflows/secrets_scan.yml
================================================
on:
push:
branches:
- main
- v0.*
pull_request:
permissions:
contents: read
jobs:
test:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
with:
fetch-depth: 0
- name: Secret Scanning
uses: trufflesecurity/trufflehog@7dc056a193116ba8d82154bf0549381c8fb8545c # v3.88.14
with:
extra_args: --results=verified,unknown
================================================
FILE: .github/workflows/sgl.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default, tests are run with a GPU available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`: runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`: runs pytest on all test scripts whose file name does not end with the `on_cpu.py` suffix
# - Since the CPU/GPU unit tests run all tests under `tests` by default, please make sure tests are manually excluded from them when
# - a new workflow yaml is added to `.github/workflows`
# - new tests are added to a workflow mentioned in 2.
name: sgl
on:
# workflow_dispatch: # Manual
# Trigger the workflow on push or pull request,
# but only for the `main` and `v0.*` branches
push:
branches:
- main
- v0.*
paths:
- "**/*.py"
- .github/workflows/sgl.yml
pull_request:
branches:
- main
- v0.*
paths:
- "**/*.py"
# Other entrypoints
- "!examples/**"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py" # FSDP
- "!verl/workers/**/*dp_*.py"
# Megatron
- "!verl/workers/**/megatron_*.py"
# vLLM
- "!**/*vllm*"
# Entrypoints
- ".github/workflows/sgl.yml"
- "tests/rollout/*sglang*"
- "tests/rollout/async_rollout_utils.py"
- "tests/workers/rollout/*interaction*"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Grant the workflow read-only access to repository contents.
permissions:
contents: read
env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:sgl059.dev2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs:
setup:
if: github.repository_owner == 'verl-project'
runs-on: ubuntu-latest
outputs:
runner-label: ${{ steps.create-runner.outputs.runner-label }}
mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
steps:
- uses: actions/checkout@v4
- id: create-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "create"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-image: "${{ env.IMAGE }}"
sgl:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 35 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: 1
SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK: "True"
NCCL_SHM_DISABLE: "1"
NCCL_P2P_DISABLE: "1"
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install cupy-cuda12x==13.6.0 pytest-asyncio
pip3 install hf_transfer fastmcp pytest-asyncio
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
- name: Prepare gsm8k dataset
run: |
ray stop --force
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
- name: Test the latest SGLang Rollout async with agent loop
run: |
ROLLOUT_NAME=sglang pytest -svvv tests/experimental/agent_loop
sgl_checkpoint_engine:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 35 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: 1
SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK: "True"
NCCL_SHM_DISABLE: "1"
NCCL_P2P_DISABLE: "1"
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install cupy-cuda12x==13.6.0 pytest-asyncio
pip3 install hf_transfer fastmcp pytest-asyncio
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
- name: Test SGLang ServerAdapter with Checkpoint Engine (NCCL)
run: |
ROLLOUT_NAME=sglang pytest -svvv tests/checkpoint_engine/test_special_server_adapter.py
cleanup:
runs-on: ubuntu-latest
needs: [setup, sgl, sgl_checkpoint_engine]
if: always()
steps:
- id: destroy-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "destroy"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
================================================
FILE: .github/workflows/type-coverage-check.yml
================================================
name: Type Annotation and Docstring Coverage
on:
pull_request:
paths:
- '**/*.py'
- '.github/workflows/type-coverage-check.yml'
jobs:
type-coverage-check:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # 🚨 Important: fetch full history so `origin/main` is available
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Install dependencies
run: |
pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cpu
pip3 install -r requirements.txt
pip3 install --no-deps -e .
- name: Run type annotation coverage check
run: |
python3 tests/special_sanity/type_coverage_check.py
- name: Run docstring coverage check
run: |
python3 tests/special_sanity/check_api_docs.py verl
================================================
FILE: .github/workflows/vllm.yml
================================================
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests that are designed to run in dedicated environments
# Accelerators for tests
# - By default, tests are run with a GPU available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`: runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`: runs pytest on all test scripts whose file name does not end with the `on_cpu.py` suffix
# - Since the CPU/GPU unit tests run all tests under `tests` by default, please make sure tests are manually excluded from them when
# - a new workflow yaml is added to `.github/workflows`
# - new tests are added to a workflow mentioned in 2.
name: vllm
on:
# Trigger the workflow on push or pull request,
# but only for the `main` and `v0.*` branches
push:
branches:
- main
- v0.*
pull_request:
branches:
- main
- v0.*
paths:
- "**/*.py"
# Other entrypoints
- "!examples/**"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# FSDP
- "!verl/workers/**/*dp_*.py"
# Megatron
- "!verl/workers/**/megatron_*.py"
# SGLang
- "!**/*sglang*"
# Entrypoints
- ".github/workflows/vllm.yml"
- "tests/special_e2e/generation"
- "tests/workers/rollout"
- "verl/trainer/main_generation.py"
- "verl/trainer/config/generation.yaml"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Grant the workflow read-only access to repository contents.
permissions:
contents: read
env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm017.dev2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs:
setup:
if: github.repository_owner == 'verl-project'
runs-on: ubuntu-latest
outputs:
runner-label: ${{ steps.create-runner.outputs.runner-label }}
mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
steps:
- uses: actions/checkout@v4
- id: create-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "create"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-image: "${{ env.IMAGE }}"
vllm:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 35 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
pip3 install --upgrade "transformers<5.0"
# - name: Download Model to Use
# run: |
# hf download Qwen/Qwen2.5-0.5B-Instruct --local-dir ${HOME}/models/Qwen/Qwen2.5-0.5B-Instruct
# hf download Qwen/Qwen2.5-1.5B-Instruct --local-dir ${HOME}/models/Qwen/Qwen2.5-1.5B-Instruct
# hf download Qwen/Qwen2.5-VL-3B-Instruct --local-dir ${HOME}/models/Qwen/Qwen2.5-VL-3B-Instruct
# hf download OldKingMeister/Qwen2.5-1.5B-Instruct-YaRN --local-dir ${HOME}/models/OldKingMeister/Qwen2.5-1.5B-Instruct-YaRN
# export HF_HUB_OFFLINE=1
- name: Prepare gsm8k dataset
run: |
ray stop --force
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
- name: Test the latest vLLM Rollout async with agent loop
run: |
ROLLOUT_NAME=vllm pytest -svvv tests/experimental/agent_loop
- name: Test vllm server abort functionality
run: |
pytest tests/workers/rollout/rollout_vllm/test_vllm_abort.py -v -s
vllm_checkpoint_engine:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 35 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install pytest-asyncio
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
pip3 install --upgrade "transformers<5.0"
pip3 install cupy-cuda12x==13.6.0
- name: Test vLLM ServerAdapter with Checkpoint Engine (NCCL)
run: |
ROLLOUT_NAME=vllm pytest -svvv tests/checkpoint_engine/test_special_server_adapter.py
- name: Test bucketed weight transfer
run: |
pytest -svvv tests/utils/test_bucketed_weight_transfer.py
cleanup:
runs-on: ubuntu-latest
needs: [setup, vllm, vllm_checkpoint_engine]
if: always()
steps:
- id: destroy-runner
uses: volcengine/vemlp-github-runner@v1
with:
mode: "destroy"
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
================================================
FILE: .gitignore
================================================
**/*.pt
**/checkpoints
**/wget-log
**/_build/
**/*.ckpt
**/outputs
**/*.tar.gz
**/playground
**/wandb
/pyrightconfig.json
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
dataset/*
tensorflow/my_graph/*
.idea/
# C extensions
*.so
# Distribution / packaging
.Python
# env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
tmp/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
pytest.ini
output.txt
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# IPython Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# dotenv
.env
# virtualenv
venv/
.venv/
ENV/
# Spyder project settings
.spyderproject
# Rope project settings
.ropeproject
# vscode
.vscode
# Mac
.DS_Store
# vim
*.swp
# emacs
*~
# ckpt
*.lock
# data
*.parquet
# local logs
logs
log
outputs
.history
================================================
FILE: .gitmodules
================================================
[submodule "recipe"]
path = recipe
url = https://github.com/verl-project/verl-recipe.git
================================================
FILE: .pre-commit-config.yaml
================================================
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: "v0.12.2"
hooks:
- id: ruff
args: ["--fix", "--show-fixes", "--output-format=full"]
exclude: ^.*\.(ipynb)$
- id: ruff-format
- repo: https://github.com/pre-commit/mirrors-mypy
rev: "v1.17.0"
hooks:
- id: mypy
- repo: local
hooks:
- id: autogen-trainer-cfg
name: Generate and verify verl/trainer/config/_generated_*.yaml
entry: scripts/generate_trainer_config.sh
language: script
pass_filenames: false
- repo: local
hooks:
- id: check-docstrings
name: Check doc string coverage
entry: python3 tests/special_sanity/check_docstrings.py
language: python
pass_filenames: false
- repo: local
hooks:
- id: check-license
name: Check license
entry: python3 tests/special_sanity/check_license.py --directories examples scripts tests verl setup.py
language: python
pass_filenames: false
- repo: local
hooks:
- id: compileall
name: Compile all python files
entry: sh -c 'PYTHONWARNINGS=error python3 -m compileall -q . -x "(^|[\\/])(\.venv|venv|\.git)([\\/]|$)"'
language: python
pass_filenames: false
================================================
FILE: .readthedocs.yaml
================================================
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
version: 2
build:
os: ubuntu-22.04
tools:
python: "3.11"
rust: "1.70"
sphinx:
configuration: docs/conf.py
python:
install:
- requirements: docs/requirements-docs.txt
- method: pip
path: .
================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to verl
Thank you for considering a contribution to verl! We welcome contributions of any kind - bug fixes, enhancements, documentation improvements, or even just feedback. Whether you're an experienced developer or this is your first open-source project, your help is invaluable.
Your support can take many forms:
- Report issues or unexpected behaviors.
- Suggest or implement new features.
- Improve or expand documentation.
- Review pull requests and assist other contributors.
- Spread the word: share verl in blog posts, social media, or give the repo a ⭐.
## Finding Issues to Contribute
Looking for ways to dive in? Check out these issues:
- [Good first issues](https://github.com/verl-project/verl/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22)
- [Call for contribution](https://github.com/verl-project/verl/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22call%20for%20contribution%22)
Furthermore, you can learn about the development plan and roadmap via [RFC](https://github.com/verl-project/verl/issues?q=is%3Aissue%20state%3Aopen%20label%3ARFC) and [Roadmap](https://github.com/verl-project/verl/issues?q=state%3Aopen%20label%3A%22roadmap%22).
## Developing
- **Python-only**: install verl via `pip install -e .[test,vllm]` or `pip install -e .[test,sglang]` and iterate quickly. For full dependency setup, check out the verl [installation doc](https://verl.readthedocs.io/en/latest/start/install.html).
## Code Linting and Formatting
We rely on pre-commit to keep our code consistent. To set it up:
```bash
pip install pre-commit
pre-commit install
# for staged changes
pre-commit run
# for all files in the repo
pre-commit run --all-files
# run a specific hook with pre-commit:
# pre-commit run --all-files --show-diff-on-failure --color=always <hook-id>
pre-commit run --all-files --show-diff-on-failure --color=always ruff
pre-commit run --all-files --show-diff-on-failure --color=always autogen-trainer-cfg
```
## Testing
Our test suites run on GitHub Actions. Check these workflows for details:
- [GPU unit tests](https://github.com/verl-project/verl/blob/main/.github/workflows/gpu_unit_tests.yml)
- [CPU unit tests](https://github.com/verl-project/verl/blob/main/.github/workflows/cpu_unit_tests.yml)
- [vLLM tests](https://github.com/verl-project/verl/blob/main/.github/workflows/vllm.yml)
- [SGLang tests](https://github.com/verl-project/verl/blob/main/.github/workflows/sgl.yml)
### Adding CI tests
If possible, please add CI test(s) for your new feature:
1. Find the most relevant workflow yml file, which usually corresponds to a `hydra` default config (e.g. `ppo_trainer`, `ppo_megatron_trainer`, `sft_trainer`, etc).
2. Add related path patterns to the `paths` section if not already included.
3. Minimize the workload of the test script(s) (see existing scripts for examples).
## Building the Docs
```
# Ensure verl is on your PYTHONPATH, e.g.:
pip install -e .[test]
# Install documentation dependencies
cd docs
pip install -r requirements-docs.txt
# Generate HTML docs
make clean
make html
# Preview locally
python -m http.server -d _build/html/
```
Open your browser at http://localhost:8000 to explore the docs.
## Pull Requests & Code Reviews
Thanks for submitting a PR! To streamline reviews:
- Follow our Pull Request Template for title format and checklist.
- Adhere to our pre-commit lint rules and ensure all checks pass.
- Update docs for any user-facing changes.
- Add or update tests in the CI workflows, or explain why tests aren't applicable.
## License
See the [LICENSE](https://github.com/verl-project/verl/blob/main/LICENSE) file for full details.
## Thank You
We appreciate your contributions to verl. Your efforts help make the project stronger and more user-friendly. Happy coding!
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: Notice.txt
================================================
Copyright 2023-2024 Bytedance Ltd. and/or its affiliates
================================================
FILE: README.md
================================================
👋 Hi, everyone!
verl is an RL training library initiated by ByteDance Seed team and maintained by the verl community.

verl: Volcano Engine Reinforcement Learning for LLMs
verl is a flexible, efficient and production-ready RL training library for large language models (LLMs).
verl is the open-source version of **[HybridFlow: A Flexible and Efficient RLHF Framework](https://arxiv.org/abs/2409.19256v2)** paper.
verl is flexible and easy to use with:
- **Easy extension of diverse RL algorithms**: The hybrid-controller programming model enables flexible representation and efficient execution of complex post-training dataflows. Build RL dataflows such as GRPO, PPO in a few lines of code.
- **Seamless integration of existing LLM infra with modular APIs**: Decouples computation and data dependencies, enabling seamless integration with existing LLM frameworks, such as FSDP, Megatron-LM, vLLM, SGLang, etc
- **Flexible device mapping**: Supports various placement of models onto different sets of GPUs for efficient resource utilization and scalability across different cluster sizes.
- Ready integration with popular HuggingFace models
verl is fast with:
- **State-of-the-art throughput**: SOTA LLM training and inference engine integrations and SOTA RL throughput.
- **Efficient actor model resharding with 3D-HybridEngine**: Eliminates memory redundancy and significantly reduces communication overhead during transitions between training and generation phases.
## News
- [2026/01] verl has been migrated to the [verl-project](https://github.com/verl-project)
- [2026/01] verl's first meetup was successfully held in Shanghai on 01/10, hosted by Volcengine and NVIDIA; the slides have been uploaded to [verl-data](https://github.com/verl-project/verl-data).
- [2026/01] The `recipe` directory has been migrated to a dedicated repository: [verl-recipe](https://github.com/verl-project/verl-recipe) and added as a submodule. See https://github.com/volcengine/verl/pull/4795. It can be used as it was after `git submodule update --init --recursive recipe`. Note that [`transfer_queue`](verl/experimental/transfer_queue), [`fully_async_policy`](verl/experimental/fully_async_policy), [`one_step_off_policy`](verl/experimental/one_step_off_policy) and [`vla`](verl/experimental/vla) are kept under [`verl/experimental`](verl/experimental) since they are planned to be merged into the main library. Use them through `verl.experimental.{module}`.
- [2025/12] [Mind Lab](https://macaron.im/mindlab) successfully used [verl](https://github.com/volcengine/verl) and [Megatron-bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge) to train GRPO LoRA for a trillion-parameter model on 64 H800 GPUs - See their [techblog](https://macaron.im/mindlab/research/building-trillion-parameter-reasoning-rl-with-10-gpus).
- [2025/10] verl is presented in the [PyTorch Conference 2025](https://pytorch.org/event/pytorch-conference-2025/).
- [2025/08] verl is presented in the [PyTorch Expert Exchange Webinar](https://www.youtube.com/watch?v=Vd79NmmqY3Q&t=2s). [Slides](https://github.com/eric-haibin-lin/verl-community/blob/main/slides/verl_talk_pytorch_2025_08.pdf) available.
- [2025/07] The [ReTool](https://arxiv.org/pdf/2504.11536) recipe is fully open sourced. [Blog](https://www.notion.so/verl-reTool-recipe-Using-multi-round-conversations-and-code-sandboxing-to-improve-the-math-of-large-23a8b5b7feba80b386b2e5b5e3c1cde0)
- [2025/07] The first verl meetup will be held at ICML Vancouver on July 16th! Please [join us](https://lu.ma/0ek2nyao) if you are at ICML! (onsite only)
- [2025/06] verl with Megatron backend enables large MoE models such as [DeepSeek-671B and Qwen3-235B](https://verl.readthedocs.io/en/latest/perf/dpsk.html).
- [2025/03] [DAPO](https://dapo-sia.github.io/) is the open-sourced SOTA RL algorithm that achieves 50 points on AIME 2024 based on the Qwen2.5-32B pre-trained model, surpassing the previous SOTA achieved by DeepSeek's GRPO (DeepSeek-R1-Zero-Qwen-32B). DAPO's training is fully powered by verl and the reproduction code is available in `recipe/dapo` now.
more...
- [2025/04] [Seed-Thinking-v1.5](https://github.com/ByteDance-Seed/Seed-Thinking-v1.5/blob/main/seed-thinking-v1.5.pdf) tech report is released! Trained with verl, Seed-Thinking-v1.5 achieves 86.7 on AIME 2024, 55.0 on Codeforces and 77.3 on GPQA, demonstrating excellent reasoning abilities in STEM and coding. Beyond reasoning tasks, the method demonstrates notable generalization across diverse domains.
- [2025/07] verl keynote at [AWS AI Hours Singapore](https://pages.awscloud.com/aws-ai-hours-sg.html#agenda) on 7/8, verl & verl-agent project updates at [Agent for SWE meetup](https://lu.ma/e498qhsi) by LF AI & Data Singapore on 7/11.
- [2025/06] verl team will provide latest project updates at [PyTorch Day China](https://www.lfasiallc.com/pytorch-day-china/) on June 7th. Meet our dev team in Beijing!
- [2025/04] [VAPO](https://arxiv.org/pdf/2504.05118) (value-based augmented PPO) paper covers our latest RL method for reasoning models. Trained from Qwen-32B-base model, VAPO achieves 60.4 on AIME 2024, outperforming DAPO-32B.
- [2025/05] [PF-PPO](https://arxiv.org/abs/2409.06957), accepted to ICML 2025, is now supported in verl! PF-PPO enhances policy learning efficiency and robustness by filtering potentially noisy reward signals and reusing high-quality experiences via a replay buffer.
- [2025/04] We will give a tutorial about latest post-training techniques and programming guide for verl at [ICLR 2025 Expo](https://iclr.cc/virtual/2025/calendar?filter_events=Expo+Talk+Panel&filter_rooms=), [SCI-FM workshop](https://open-foundation-model.github.io/) and [LMSys afterparty](https://lu.ma/d23nyynm). Talk materials available [here](https://github.com/eric-haibin-lin/verl-community/tree/main/iclr25).
- [2025/03] verl v0.3.0.post1 is released! See [release note](https://github.com/volcengine/verl/releases/) for details. It achieves [~1.4x speedup](https://tongyx361.github.io/blogs/posts/verl-intro/#/verl-flexible-and-efficient-rl-for-llms) compared to prev versions.
- [2025/05] verl will be presented at [A2M Shanghai](https://a2m.msup.com.cn/home/?aid=4488&city=shanghai) on 5/16 - 5/17.
- [2025/05] verl will be presented at [GOSIM x PyTorch Day 2025](https://paris2025.gosim.org/). See you in Paris!
- [2025/03] We introduced the programming model of verl at the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg) and [verl intro and updates](https://github.com/eric-haibin-lin/verl-community/blob/main/slides/verl-lmsys-meetup.pdf) at the [SGLang-LMSYS Org Meetup](https://lu.ma/ntjrr7ig) in Sunnyvale mid-March.
- [2025/03] We will present verl(HybridFlow) at EuroSys 2025. See you in Rotterdam!
- [2025/02] verl v0.2.0.post2 is released!
- [2025/02] We presented verl in the Bytedance/NVIDIA/Anyscale Ray Meetup. See you in San Jose!
- [2025/01] [Doubao-1.5-pro](https://team.doubao.com/zh/special/doubao_1_5_pro) is released with SOTA-level performance on LLM & VLM. The RL scaling preview model is trained using verl, reaching OpenAI O1-level performance on math benchmarks (70.0 pass@1 on AIME).
- [2024/12] verl is presented at Ray Forward 2024. Slides available here
- [2024/12] The team presented Post-training LLMs: From Algorithms to Infrastructure at NeurIPS 2024. Slides and video available.
- [2024/10] verl is presented at Ray Summit. Youtube video available.
- [2024/08] HybridFlow (verl) is accepted to EuroSys 2025.
## Key Features
- **FSDP**, **FSDP2** and **Megatron-LM** for training.
- **vLLM**, **SGLang** and **HF Transformers** for rollout generation.
- Compatible with Hugging Face Transformers and Modelscope Hub: [Qwen-3](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen3-8b.sh), Qwen-2.5, Llama3.1, Gemma2, DeepSeek-LLM, etc
- Supervised fine-tuning.
- Reinforcement learning with [PPO](examples/ppo_trainer/), [GRPO](examples/grpo_trainer/), [GSPO](https://github.com/verl-project/verl-recipe/tree/main/gspo/), [ReMax](examples/remax_trainer/), [REINFORCE++](https://verl.readthedocs.io/en/latest/examples/config.html#algorithm), [RLOO](examples/rloo_trainer/), [PRIME](https://github.com/verl-project/verl-recipe/tree/main/prime/), [DAPO](https://github.com/verl-project/verl-recipe/tree/main/dapo/), [DrGRPO](https://github.com/verl-project/verl-recipe/tree/main/drgrpo), [KL_Cov & Clip_Cov](https://github.com/verl-project/verl-recipe/tree/main/entropy) etc.
- Support model-based reward and function-based reward (verifiable reward) for math, [coding](https://github.com/verl-project/verl-recipe/tree/main/dapo), etc
- Support vision-language models (VLMs) and [multi-modal RL](examples/grpo_trainer/run_qwen2_5_vl-7b.sh) with Qwen2.5-vl, Kimi-VL
- [Multi-turn with tool calling](https://github.com/volcengine/verl/tree/main/examples/sglang_multiturn)
- LLM alignment recipes such as [Self-play preference optimization (SPPO)](https://github.com/verl-project/verl-recipe/tree/main/sppo)
- Flash attention 2, [sequence packing](examples/ppo_trainer/run_qwen2-7b_seq_balance.sh), [sequence parallelism](examples/ppo_trainer/run_deepseek7b_llm_sp2.sh) support via DeepSpeed Ulysses, [LoRA](examples/sft/gsm8k/run_qwen_05_peft.sh), [Liger-kernel](examples/sft/gsm8k/run_qwen_05_sp2_liger.sh).
- Scales up to 671B models and hundreds of GPUs with [expert parallelism](https://github.com/volcengine/verl/pull/1467)
- Multi-gpu [LoRA RL](https://verl.readthedocs.io/en/latest/advance/ppo_lora.html) support to save memory.
- Experiment tracking with wandb, swanlab, mlflow and tensorboard.
- Hardware Support: Supports NVIDIA, AMD, [Ascend](https://github.com/volcengine/verl/blob/main/docs/ascend_tutorial/quick_start/ascend_quick_start.rst)
## Upcoming Features and Changes
- Q3 Roadmap https://github.com/volcengine/verl/issues/2388
- DeepSeek 671b optimizations with Megatron https://github.com/volcengine/verl/issues/1033
- Multi-turn rollout and tool-use optimizations https://github.com/volcengine/verl/issues/1882
- [Agent integration](https://github.com/volcengine/verl/tree/main/verl/experimental/agent_loop)
- Async and off-policy architecture https://github.com/volcengine/verl/pull/2231
- List of breaking changes since v0.4 https://github.com/volcengine/verl/discussions/2270
## Getting Started
Documentation
**Quickstart:**
- [Installation](https://verl.readthedocs.io/en/latest/start/install.html)
- [Quickstart](https://verl.readthedocs.io/en/latest/start/quickstart.html)
- [Programming Guide](https://verl.readthedocs.io/en/latest/hybrid_flow.html) & [Tech Talk](https://hcqnc.xetlk.com/sl/3vACOK) (in Chinese)
- [PPO in verl](https://verl.readthedocs.io/en/latest/algo/ppo.html)
- [GRPO in verl](https://verl.readthedocs.io/en/latest/algo/grpo.html)
**Running a PPO example step-by-step:**
- [Prepare Data for Post-Training](https://verl.readthedocs.io/en/latest/preparation/prepare_data.html)
- [Implement Reward Function for Dataset](https://verl.readthedocs.io/en/latest/preparation/reward_function.html)
- [PPO Example Architecture](https://verl.readthedocs.io/en/latest/examples/ppo_code_architecture.html)
- [Config Explanation](https://verl.readthedocs.io/en/latest/examples/config.html)
**Reproducible algorithm baselines:**
- [RL performance on coding, math](https://verl.readthedocs.io/en/latest/algo/baseline.html)
**For code explanation and advanced usage (extension):**
- PPO Trainer and Workers
- [PPO Ray Trainer](https://verl.readthedocs.io/en/latest/workers/ray_trainer.html)
- [PyTorch FSDP Backend](https://verl.readthedocs.io/en/latest/workers/fsdp_workers.html)
- [Megatron-LM Backend](https://verl.readthedocs.io/en/latest/workers/megatron_workers.html)
- Advanced Usage and Extension
- [Add Models with the FSDP Backend](https://verl.readthedocs.io/en/latest/advance/fsdp_extension.html)
- [Add Models with the Megatron-LM Backend](https://verl.readthedocs.io/en/latest/advance/megatron_extension.html)
- [Multi-turn Rollout Support](https://verl.readthedocs.io/en/latest/sglang_multiturn/multiturn.html)
- [Search Tool Integration](https://verl.readthedocs.io/en/latest/sglang_multiturn/search_tool_example.html)
- [Sandbox Fusion Integration](https://verl.readthedocs.io/en/latest/examples/sandbox_fusion_example.html)
- [Deployment using Separate GPU Resources](https://github.com/volcengine/verl/tree/main/examples/split_placement)
- [Extend to Other RL(HF) algorithms](https://verl.readthedocs.io/en/latest/advance/dpo_extension.html)
- [Ray API design tutorial](https://verl.readthedocs.io/en/latest/advance/placement.html)
**Blogs from the community**
- [When Reasoning Models Break Tokenization: The Hidden Complexity of Multiturn Training](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/blob/main/rlhf/verl/multi-turn/fast_tokenization/multiturn_tokenization_and_masking.md)
- [verl deployment on AWS SageMaker](https://medium.com/@kaige.yang0110/run-verl-on-sagemaker-using-4x8-l40s-gpus-8e6d5c3c61d3)
- [verl x SGLang Multi-turn Code Walkthrough](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/blob/main/rlhf/verl/multi-turn/code-walk-through/readme_EN.md)
- [Optimizing SGLang Memory Usage in verl](https://hebiao064.github.io/rl-memory-management)
- [SGLang, verl, OpenBMB and Tsinghua University: Pioneering End-to-End Multi-Turn RLHF](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/blob/main/rlhf/verl/multi-turn/verl-multiturn-rollout-Release.md)
- [Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm Integration](https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html)
- [veMLP x verl :玩转强化学习训练](https://mp.weixin.qq.com/s/7nbqxk4knMGd-hQE9ls2tA)
- [使用 verl 进行 GRPO 分布式强化学习训练最佳实践](https://www.volcengine.com/docs/6459/1463942)
- [HybridFlow verl 原文浅析](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/blob/main/rlhf/verl/readme.md)
- [最高提升 20 倍吞吐量!豆包大模型团队发布全新 RLHF 框架,现已开源!](https://team.doubao.com/en/blog/%E6%9C%80%E9%AB%98%E6%8F%90%E5%8D%8720%E5%80%8D%E5%90%9E%E5%90%90%E9%87%8F-%E8%B1%86%E5%8C%85%E5%A4%A7%E6%A8%A1%E5%9E%8B%E5%9B%A2%E9%98%9F%E5%8F%91%E5%B8%83%E5%85%A8%E6%96%B0-rlhf-%E6%A1%86%E6%9E%B6-%E7%8E%B0%E5%B7%B2%E5%BC%80%E6%BA%90)
## Performance Tuning Guide
Performance is essential for on-policy RL algorithms. We have written a detailed [performance tuning guide](https://verl.readthedocs.io/en/latest/perf/perf_tuning.html) to help you optimize performance.
## Upgrade to vLLM >= v0.8.2
verl now supports vLLM>=0.8.2 when using FSDP as the training backend. Please refer to [this document](https://github.com/volcengine/verl/blob/main/docs/README_vllm0.8.md) for the installation guide and more information. Please avoid vllm 0.7.x, which contains bugs that may lead to OOMs and unexpected errors.
## Use Latest SGLang
SGLang is fully supported with verl, and SGLang RL Group is working extensively on building unique features, including multi-turn agentic RL, VLM RLHF, server-based RL, and partial rollout. Please refer to [this document](https://verl.readthedocs.io/en/latest/workers/sglang_worker.html) for the installation guide and more information.
## Upgrade to FSDP2
verl is fully embracing FSDP2! FSDP2 is recommended by the torch distributed team, providing better throughput and memory usage, and is composable with other features (e.g. torch.compile). To enable FSDP2, simply use verl main and set the following options:
```
actor_rollout_ref.ref.strategy=fsdp2
actor_rollout_ref.actor.strategy=fsdp2
critic.strategy=fsdp2
```
Furthermore, FSDP2 cpu offloading is compatible with gradient accumulation. You can turn it on to save memory with `actor_rollout_ref.actor.fsdp_config.offload_policy=True`. For more details, see https://github.com/volcengine/verl/pull/1026
## AMD Support (ROCm Kernel)
verl now supports FSDP as the training engine (Megatron support coming soon) and integrates with both vLLM and SGLang as inference engines. Please refer to [this document](https://github.com/volcengine/verl/blob/main/docs/amd_tutorial/amd_build_dockerfile_page.rst) for the installation guide and more information, and [this document](https://github.com/volcengine/verl/blob/main/docs/amd_tutorial/amd_vllm_page.rst) for the vLLM performance tuning for ROCm.
## Citation and acknowledgement
If you find the project helpful, please cite:
- [HybridFlow: A Flexible and Efficient RLHF Framework](https://arxiv.org/abs/2409.19256v2)
- [A Framework for Training Large Language Models for Code Generation via Proximal Policy Optimization](https://i.cs.hku.hk/~cwu/papers/gmsheng-NL2Code24.pdf)
```bibtex
@article{sheng2024hybridflow,
title = {HybridFlow: A Flexible and Efficient RLHF Framework},
author = {Guangming Sheng and Chi Zhang and Zilingfeng Ye and Xibin Wu and Wang Zhang and Ru Zhang and Yanghua Peng and Haibin Lin and Chuan Wu},
year = {2024},
journal = {arXiv preprint arXiv: 2409.19256}
}
```
verl is inspired by the design of Nemo-Aligner, Deepspeed-chat and OpenRLHF. The project is adopted and contributed to by Bytedance, Anyscale, LMSys.org, [Alibaba Qwen team](https://github.com/QwenLM/), Shanghai AI Lab, Tsinghua University, UC Berkeley, UCLA, UIUC, University of Hong Kong, ke.com, [All Hands AI](https://www.all-hands.dev/), [ModelBest](http://modelbest.cn/), JD AI Lab, Microsoft Research, [StepFun](https://www.stepfun.com/), Amazon, LinkedIn, Meituan, [Camel-AI](https://www.camel-ai.org/), [OpenManus](https://github.com/OpenManus), Xiaomi, NVIDIA research, [Baichuan](https://www.baichuan-ai.com/home), [RedNote](https://www.xiaohongshu.com/), [SwissAI](https://www.swiss-ai.org/), [Moonshot AI (Kimi)](https://www.moonshot-ai.com/), Baidu, Snowflake, Skywork.ai, JetBrains, [IceSword Lab](https://www.iceswordlab.com), and many more.
## Awesome Projects Built with `verl`
Welcome to register your awesome project built with `verl` for other developers' reference!
- [TinyZero](https://github.com/Jiayi-Pan/TinyZero): a reproduction of **DeepSeek R1 Zero** recipe for reasoning tasks 
- [SkyThought](https://github.com/NovaSky-AI/SkyThought): RL training for Sky-T1-7B by NovaSky AI team. 
- [simpleRL-reason](https://github.com/hkust-nlp/simpleRL-reason): SimpleRL-Zoo: Investigating and Taming Zero Reinforcement Learning for Open Base Models in the Wild 
- [Easy-R1](https://github.com/hiyouga/EasyR1): **Multi-modal** RL training framework 
- [OpenManus-RL](https://github.com/OpenManus/OpenManus-RL): LLM Agents RL tuning framework for multiple agent environments. 
- [rllm](https://github.com/agentica-project/rllm): async RL training with [verl-pipeline](https://github.com/agentica-project/verl-pipeline) 
- [RAGEN](https://github.com/ZihanWang314/ragen): a general-purpose reasoning **agent** training framework 
- [Search-R1](https://github.com/PeterGriffinJin/Search-R1): RL with reasoning and **searching (tool-call)** interleaved LLMs 
- [ReSearch](https://github.com/Agent-RL/ReSearch): Learning to **Re**ason with **Search** for LLMs via Reinforcement Learning 
- [Skywork-OR1](https://github.com/SkyworkAI/Skywork-OR1): Skywork open reasoner series 
- [ToRL](https://github.com/GAIR-NLP/ToRL): Scaling tool-integrated RL 
- [Absolute Zero Reasoner](https://github.com/LeapLabTHU/Absolute-Zero-Reasoner): [A no human curated data self-play framework for reasoning](https://arxiv.org/abs/2505.03335) 
- [verl-agent](https://github.com/langfengQ/verl-agent): A scalable training framework for **long-horizon LLM/VLM agents**, along with a new algorithm **GiGPO** 
- [RL-Factory](https://github.com/Simple-Efficient/RL-Factory): An easy and efficient RL post-training framework for Agentic Learning 
- [ReTool](https://retool-rl.github.io/): ReTool: reinforcement learning for strategic tool use in LLMs. Code release is in progress...
- [verl-tool](https://github.com/TIGER-AI-Lab/verl-tool): A unified and easy-to-extend tool-agent training framework based on verl
- [PRIME](https://github.com/PRIME-RL/PRIME): Process reinforcement through implicit rewards 
- [MemAgent](https://github.com/BytedTsinghua-SIA/MemAgent): MemAgent: Reshaping Long-Context LLM with Multi-Conv RL based Memory Agent 
- [POLARIS](https://github.com/ChenxinAn-fdu/POLARIS): A Post-training recipe for scaling RL on Advanced Reasoning models 
- [GUI-R1](https://github.com/ritzz-ai/GUI-R1): **GUI-R1**: A Generalist R1-style Vision-Language Action Model For **GUI Agents** 
- [DeepRetrieval](https://github.com/pat-jj/DeepRetrieval): RL Training of **Search Agent** with **Search/Retrieval Outcome** 
- [Code-R1](https://github.com/ganler/code-r1): Reproducing R1 for **Code** with Reliable Rewards 
- [DeepResearcher](https://github.com/GAIR-NLP/DeepResearcher): Scaling deep research via reinforcement learning in real-world environments 
- [VAGEN](https://github.com/RAGEN-AI/VAGEN): Training VLM agents with multi-turn reinforcement learning 
- [RM-R1](https://arxiv.org/abs/2505.02387): RL training of reasoning reward models 
- [Dr. MAS](https://arxiv.org/pdf/2602.08847): Stable **end-to-end RL** post-training for **multi-agent LLM systems** 
- [LUFFY](https://arxiv.org/pdf/2504.14945): Learning to Reason under Off-Policy Guidance
- [DeepMath](https://github.com/zwhe99/DeepMath): DeepMath-103K data and series models for math reasoning
- [PACS](https://github.com/ritzz-ai/PACS): Implicit Actor Critic Coupling via a Supervised Learning Framework for RLVR 
- [Entropy Mechanism of RL](https://github.com/PRIME-RL/Entropy-Mechanism-of-RL): The Entropy Mechanism of Reinforcement Learning for Large Language Model Reasoning
- [LLaSA-TTS-GRPO](https://github.com/channel-io/ch-tts-llasa-rl-grpo): TTS fine-tuning with GRPO optimization based on LLASA models 
- [PF-PPO](https://arxiv.org/abs/2409.06957): Policy Filtration for PPO based on the reliability of reward signals for more efficient and robust RLHF.
- [RACRO](https://github.com/gyhdog99/RACRO2): Build multi-modal reasoning models via decoupling it into query-conditioned captioning and text-only reasoning 
- [Agent Lightning](https://github.com/microsoft/agent-lightning): A flexible and extensible framework that enables seamless agent optimization for any existing agent framework. 
- [VTool-R1](https://github.com/VTOOL-R1/vtool-r1): VLMs Learn to Think with Images via Reinforcement Learning on Multimodal Tool Use. 
- [Kimina-Prover-RL](https://github.com/project-numina/kimina-prover-rl/tree/main/recipe/kimina_prover_rl): Training pipeline for formal theorem proving, based on a paradigm inspired by DeepSeek-R1.
- [RL-PLUS](https://github.com/YihongDong/RL-PLUS): Countering Capability Boundary Collapse of LLMs in Reinforcement Learning with Hybrid-policy Optimization.
- [rStar2-Agent](https://github.com/microsoft/rStar): Using reinforcement learning with multi-step tool-calling for math tasks, rStar2-Agent-14B reaches frontier-level math reasoning in just 510 RL training steps 
- [Vision-SR1](https://github.com/zli12321/Vision-SR1): Self-Rewarding Vision-Language Model via Reasoning Decomposition 
- [SimpleVLA-RL](https://github.com/PRIME-RL/SimpleVLA-RL): SimpleVLA-RL: A Simple yet Effective Vision-Language Action Model for Reinforcement Learning 
- [Table-R1](https://github.com/Table-R1/Table-R1): Table-R1: Inference-Time Scaling for Table Reasoning 
- [Revisual-R1](https://github.com/CSfufu/Revisual-R1): Revisual-R1: Advancing Multimodal Reasoning From Optimized Cold Start to Staged Reinforcement Learning 
- [ARES](https://github.com/shawn0728/ARES): ARES: Multimodal Adaptive Reasoning via Difficulty-Aware Token-Level Entropy Shaping 
- [Meta-Bandit-LLM](https://github.com/sanxing-chen/meta-bandit-llm): Meta-Bandit-LLM: Long-horizon multiturn interactive training for meta-bandit agents 
- [PokeeResearch](https://github.com/Pokee-AI/PokeeResearchOSS): PokeeResearch: State-of-the-art 7B DeepResearch Agent that leverages web search and content reading capabilities to answer complex questions using the most up-to-date information available online. 
- [Search Self-play](https://github.com/Alibaba-Quark/SSP): Pushing the Frontier of Agent Capability without Supervision 
- [OneThinker](https://github.com/tulerfeng/OneThinker): All-in-one Reasoning Model for Image and Video 
- [OpenTinker](https://github.com/open-tinker/OpenTinker): Democratizing Agentic Reinforcement Learning as a Service 
- [FlowRL](https://github.com/Xuekai-Zhu/FlowRL): Matching reward distributions via **flow balance** for diverse exploration and generalizable reasoning 
- [Logic-RL](https://github.com/Unakar/Logic-RL): a reproduction of DeepSeek R1 Zero on 2K Tiny Logic Puzzle Dataset. 
- [Seed-Coder](https://github.com/ByteDance-Seed/Seed-Coder): RL training of Seed-Coder boosts performance on competitive programming 
- [all-hands/openhands-lm-32b-v0.1](https://www.all-hands.dev/blog/introducing-openhands-lm-32b----a-strong-open-coding-agent-model): A strong, open coding agent model, trained with [multi-turn fine-tuning](https://github.com/volcengine/verl/pull/195)
- [s3](https://github.com/pat-jj/s3) **Efficient Yet Effective** Search Agent Training via RL 
- [Rec-R1](https://arxiv.org/pdf/2503.24289): Bridging Generative Large Language Models and Recommendation Systems via Reinforcement Learning
- [Explore RL Data Scaling](https://arxiv.org/abs/2503.22230): Exploring Data Scaling Trends and Effects in Reinforcement Learning from Human Feedback
- [FIRE](https://arxiv.org/abs/2410.21236): Flaming-hot initiation with regular execution sampling for large language models
- [DQO](https://arxiv.org/abs/2410.09302): Enhancing multi-Step reasoning abilities of language models through direct Q-function optimization
- [ProRL](https://arxiv.org/abs/2505.24864): Prolonged Reinforcement Learning Expands Reasoning Boundaries in Large Language Models
- [cognition-engineering](https://github.com/gair-nlp/cognition-engineering): Test time scaling drives cognition engineering. 
- [Trust Region Preference Approximation](https://github.com/XueruiSu/Trust-Region-Preference-Approximation): A simple and stable **reinforcement learning algorithm** for LLM reasoning. 
- [AdaRFT](https://github.com/uscnlp-lime/verl): Efficient Reinforcement Finetuning via **Adaptive Curriculum Learning** 
- [critic-rl](https://github.com/HKUNLP/critic-rl): LLM critics for code generation 
- [self-rewarding-reasoning-LLM](https://arxiv.org/pdf/2502.19613): self-rewarding and correction with **generative reward models** 
- [DeepEnlighten](https://github.com/DolbyUUU/DeepEnlighten): Reproduce R1 with **social reasoning** tasks and analyze key findings 
- [MetaSpatial](https://github.com/PzySeere/MetaSpatial): Reinforcing **3D Spatial Reasoning** in **VLMs** for the **Metaverse** 
- [PURE](https://github.com/CJReinforce/PURE): **Credit assignment** is the key to successful reinforcement fine-tuning using **process reward model** 
- [cognitive-behaviors](https://github.com/kanishkg/cognitive-behaviors): Cognitive Behaviors that Enable Self-Improving Reasoners, or, Four Habits of Highly Effective STaRs 
- [deepscaler](https://github.com/agentica-project/rllm/tree/deepscaler): iterative context scaling with GRPO 
- [DAPO](https://dapo-sia.github.io/): the fully open source SOTA RL algorithm that beats DeepSeek-R1-zero-32B 
- [NoisyRollout](https://github.com/NUS-TRAIL/NoisyRollout): Reinforcing Visual Reasoning with Data Augmentation 
- [SPEAR](https://github.com/TencentYoutuResearch/SPEAR): **Self-imitation** with **Progressive Exploration** for Agentic Reinforcement Learning (ICLR 2026) 
- [RuleReasoner](https://github.com/bigai-nlco/RuleReasoner): **RuleReasoner:** Reinforced Rule-based Reasoning via **Domain-aware Dynamic Sampling** (ICLR 2026) 
- [MetaphorStar](https://metaphorstar.github.io/): **Image Metaphor** Understanding and Reasoning with End-to-End **Visual Reinforcement Learning** 
## Contribution Guide
See [contributions guide](CONTRIBUTING.md)
## About [ByteDance Seed Team](https://team.doubao.com/)
Founded in 2023, ByteDance Seed Team is dedicated to crafting the industry's most advanced AI foundation models. The team aspires to become a world-class research team and make significant contributions to the advancement of science and society. You can get to know Bytedance Seed better through the following channels👇
We are HIRING! Send us an [email](mailto:the.verl.project@gmail.com) if you are interested in internship/FTE opportunities in RL for agents.
================================================
FILE: docker/Dockerfile.isaaclab230
================================================
# Alternative base image (commented out):
#FROM nvcr.nju.edu.cn/nvidia/isaac-lab:2.3.0
FROM isaac-lab-base:latest

# Accept the EULA non-interactively and reset the entrypoint to empty.
ENV ACCEPT_EULA=Y
ENTRYPOINT []

# desktop
# Point apt at the ivolces mirror, generate the en_US.UTF-8 locale, then
# install download tools, the XFCE desktop, X11 pieces, TigerVNC and fonts.
RUN --mount=type=cache,target=/var/cache/apt \
    sed -i \
        -e 's/archive.ubuntu.com/mirrors.ivolces.com/g' \
        -e 's/security.ubuntu.com/mirrors.ivolces.com/g' \
        /etc/apt/sources.list && \
    apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y locales && \
    locale-gen en_US.UTF-8 && \
    update-locale LANG=en_US.UTF-8 LC_CTYPE=en_US.UTF-8 && \
    apt-get install -y \
        wget curl \
        xfce4 xfce4-goodies \
        xorg dbus-x11 x11-xserver-utils \
        tigervnc-standalone-server tigervnc-common tigervnc-tools \
        fonts-dejavu fonts-liberation
# cuda 12.2
# Register NVIDIA's CUDA apt repository for ubuntu2204/x86_64 (signing key plus
# sources list entry), then install cuSPARSELt, a pinned NCCL build and the
# GLFW / Mesa GL / OSMesa libraries. The downloaded key file is removed afterwards.
RUN --mount=type=cache,target=/var/cache/apt \
    cd /tmp && \
    CUDA_REPO=https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 && \
    wget "${CUDA_REPO}/3bf863cc.pub" && \
    apt-key add 3bf863cc.pub && \
    echo "deb ${CUDA_REPO} /" > /etc/apt/sources.list.d/cuda.list && \
    apt-get update && \
    apt-get install -y \
        libcusparselt0 \
        libnccl2=2.27.3-1+cuda12.2 \
        libglfw3 \
        libgl1-mesa-glx \
        libosmesa6 && \
    rm -f 3bf863cc.pub
# libero
# Pinned Python packages installed through Isaac Sim's python.sh wrapper.
RUN --mount=type=cache,target=/root/.cache/pip \
    /workspace/isaaclab/_isaac_sim/python.sh -m pip install \
        easydict==1.9 robosuite==1.4.0 bddl==1.0.1 future==0.18.2 cloudpickle==2.1.0

# Extras are quoted so the shell never treats `[...]` as a glob pattern.
RUN --mount=type=cache,target=/root/.cache/pip \
    /workspace/isaaclab/_isaac_sim/python.sh -m pip install "transformers[hf_xet]"

# Requirement specifiers containing `>` MUST be quoted: unquoted, the shell parses
# `>=19.0.0` as a stdout redirection, which silently drops the version constraint
# and writes pip's output to a file literally named `=19.0.0`.
RUN --mount=type=cache,target=/root/.cache/pip \
    /workspace/isaaclab/_isaac_sim/python.sh -m pip install --upgrade numpy==1.26.4 "ray[default]" \
        accelerate codetiming datasets dill hydra-core pandas peft "pyarrow>=19.0.0" pybind11 pylatexenc

# openvla-oft
# Same quoting rule applies to `packaging>=20.0`.
RUN --mount=type=cache,target=/root/.cache/pip \
    /workspace/isaaclab/_isaac_sim/python.sh -m pip install \
        pre-commit torchdata "packaging>=20.0" uvicorn fastapi latex2sympy2_extended math_verify tensorboard
# flash_attn
# Download the prebuilt flash-attention v2.8.0.post2 wheel into /tmp, install it
# through Isaac Sim's python.sh, then delete the wheel file.
RUN cd /tmp && \
    FA_WHEEL=flash_attn-2.8.0.post2+cu12torch2.7cxx11abiFALSE-cp311-cp311-linux_x86_64.whl && \
    wget -nv "https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.0.post2/${FA_WHEEL}" && \
    /workspace/isaaclab/_isaac_sim/python.sh -m pip install "/tmp/${FA_WHEEL}" && \
    rm -f "/tmp/${FA_WHEEL}"
# Pin protobuf and timm to exact versions; --upgrade lets pip replace whatever versions are already installed.
RUN --mount=type=cache,target=/root/.cache/pip \
/workspace/isaaclab/_isaac_sim/python.sh -m pip install --upgrade protobuf==3.20.3 timm==0.9.16
# Install exact versions of orjson, pyvers and tensordict with --no-deps, so their dependencies are left untouched.
# NOTE(review): `--force` is presumably accepted by pip as an abbreviation of `--force-reinstall` — confirm.
RUN --mount=type=cache,target=/root/.cache/pip \
/workspace/isaaclab/_isaac_sim/python.sh -m pip install orjson==3.11.3 pyvers==0.1.0 tensordict==0.10.0 --force --no-deps
# Generate /root/.vnc/xstartup: it unsets the session-manager and D-Bus session
# variables, loads ~/.Xresources when that file is readable, paints the root
# window grey and finally execs the XFCE session.
# The heredoc delimiter is quoted ('EOP'), so the body is written verbatim with no
# expansion at build time. `$HOME` therefore must NOT be backslash-escaped: an
# escaped `\$HOME` would land in the script literally and the .Xresources test
# would look for a file named "$HOME/.Xresources" and never match.
RUN mkdir -p /root/.vnc && \
    cat <<'EOP' > /root/.vnc/xstartup
#!/bin/sh
unset SESSION_MANAGER
unset DBUS_SESSION_BUS_ADDRESS
[ -r $HOME/.Xresources ] && xrdb $HOME/.Xresources
xsetroot -solid grey
exec startxfce4
EOP
# Write /root/.vnc/config with the VNC defaults used by this image: 1920x1080
# geometry, 24-bit depth, 96 dpi, desktop name "Isaac-Sim-Desktop" and localhost=no.
# The quoted delimiter ('EOP') means the body is written verbatim.
RUN cat <<'EOP' > /root/.vnc/config
geometry=1920x1080
depth=24
desktop=Isaac-Sim-Desktop
dpi=96
localhost=no
EOP
# Generate /root/start_isaac_vnc.sh. The script exports DISPLAY=:1, starts a VNC
# server on display :1 when no process matching "Xvnc.*:1" is found (then waits
# 3 seconds), and finally launches Isaac Sim with --allow-root.
# The Chinese comments inside the heredoc are part of the generated script and
# read, in order: "set the display variable", "check whether VNC is running",
# "start Isaac Sim".
RUN cat <<'EOP' > /root/start_isaac_vnc.sh
#!/bin/bash
# 设置显示变量
export DISPLAY=:1
# 检查VNC是否运行
if ! pgrep -f "Xvnc.*:1" > /dev/null; then
echo "Starting VNC server..."
vncserver :1 -localhost no -geometry 1920x1080 -depth 24 -desktop "Isaac-Sim-Desktop"
sleep 3
fi
# 启动Isaac Sim
echo "Starting Isaac Sim..."
/workspace/isaaclab/_isaac_sim/isaac-sim.sh --allow-root
EOP
# Make both generated scripts executable.
RUN chmod +x /root/.vnc/xstartup && \
chmod +x /root/start_isaac_vnc.sh
# Run Isaac Sim once at build time with --ext-precache-mode.
# NOTE(review): presumably this warms the extension cache so first start-up is faster — confirm.
RUN /workspace/isaaclab/_isaac_sim/isaac-sim.sh --allow-root --ext-precache-mode
# Clone LIBERO into /root and patch its setup.py so that `packages` becomes the
# fixed list ["libero"] (the original find_packages() filter is kept as a comment),
# then install the checkout in editable mode.
# NOTE(review): unified-diff context lines must begin with a single space; verify
# that the heredoc below still carries that leading whitespace, otherwise
# `git apply` will reject the patch.
RUN cd /root && \
git clone https://github.com/Lifelong-Robot-Learning/LIBERO.git && \
cd LIBERO && \
git apply <<'EOP'
diff --git a/setup.py b/setup.py
index 59d4900..dbe9811 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,8 @@ long_description = "".join(lines)
setup(
name="libero",
- packages=[package for package in find_packages() if package.startswith("libero")],
+ #packages=[package for package in find_packages() if package.startswith("libero")],
+ packages=["libero"],
install_requires=[],
eager_resources=["*"],
include_package_data=True,
EOP
# Editable install of the patched LIBERO checkout through Isaac Sim's python.sh.
RUN cd /root/LIBERO && \
/workspace/isaaclab/_isaac_sim/python.sh -m pip install -e .
# libero config
# Write /root/.libero/config.yaml with the assets, BDDL files, benchmark root,
# datasets and init-state paths; every path points inside the /root/LIBERO checkout.
# The quoted delimiter ('EOP') means the YAML is written verbatim.
RUN mkdir -p /root/.libero && \
cat <<'EOP' > /root/.libero/config.yaml
assets: /root/LIBERO/libero/libero/./assets
bddl_files: /root/LIBERO/libero/libero/./bddl_files
benchmark_root: /root/LIBERO/libero/libero
datasets: /root/LIBERO/libero/libero/../datasets
init_states: /root/LIBERO/libero/libero/./init_files
EOP
# from https://github.com/nvidia-china-sae/RobotLearningLab
# The RobotLearningLab checkout must be present in the build context.
COPY RobotLearningLab/ /root/RobotLearningLab/

# Replace Isaac Lab's `source` tree with a symlink to the copied RobotLearningLab
# sources, then install the isaaclab package from it in editable mode.
RUN rm -rf /workspace/isaaclab/source && \
    ln -s /root/RobotLearningLab/source /workspace/isaaclab/source && \
    cd /workspace/isaaclab/ && \
    /workspace/isaaclab/_isaac_sim/python.sh -m pip install -e ./source/isaaclab
# Ray cmd
# Install colorama, then append a `ray` alias to /root/.bashrc that runs ray's
# scripts.py (from the python3.11 site-packages under Isaac Sim's kit) through python.sh.
RUN /workspace/isaaclab/_isaac_sim/python.sh -m pip install colorama && \
cat <<'EOP' >> /root/.bashrc
alias ray='/workspace/isaaclab/_isaac_sim/python.sh /workspace/isaaclab/_isaac_sim/kit/python/lib/python3.11/site-packages/ray/scripts/scripts.py'
EOP
================================================
FILE: docker/Dockerfile.stable.sglang
================================================
# sgl059
FROM lmsysorg/sglang:v0.5.9

# Build-time setting: PIP_NO_CACHE_DIR=1 is visible to the RUN steps below.
ARG PIP_NO_CACHE_DIR=1

RUN pip install pybind11
RUN pip install nvidia-mathdx

# Apex from git, built without build isolation and with the --cpp_ext / --cuda_ext build options.
RUN MAX_JOBS=128 pip install -v \
        --disable-pip-version-check \
        --no-build-isolation \
        --config-settings "--build-option=--cpp_ext" \
        --config-settings "--build-option=--cuda_ext" \
        git+https://github.com/NVIDIA/apex.git

# TransformerEngine release_v2.12 with NVTE_FRAMEWORK=pytorch.
RUN export NVTE_FRAMEWORK=pytorch && \
    MAX_JOBS=128 NVTE_BUILD_THREADS_PER_JOB=4 \
    pip3 install --resume-retries 999 --no-build-isolation \
        git+https://github.com/NVIDIA/TransformerEngine.git@release_v2.12

# RUN pip install --upgrade transformers tokenizers
RUN pip install codetiming mathruler pylatexenc qwen_vl_utils cachetools pytest-asyncio
RUN pip install --no-build-isolation flash_attn==2.8.3
# Install Nsight Systems 2025.6.1. The .deb name suffix is "arm64" on aarch64
# hosts and "amd64" otherwise. After installation the nsys / nsys-ui entries under
# /usr/local/cuda/bin are replaced by symlinks to the freshly installed binaries.
# The downloaded .deb and the apt package lists are removed in the same layer so
# they do not add to the image size.
RUN NSIGHT_VERSION=2025.6.1_2025.6.1.190-1_$(if [ "$(uname -m)" = "aarch64" ]; then echo "arm64"; else echo "amd64"; fi) && \
    wget https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_6/nsight-systems-${NSIGHT_VERSION}.deb && \
    apt-get update && apt-get install -y libxcb-cursor0 && \
    apt-get install -y ./nsight-systems-${NSIGHT_VERSION}.deb && \
    rm -rf /usr/local/cuda/bin/nsys && \
    ln -s /opt/nvidia/nsight-systems/2025.6.1/nsys /usr/local/cuda/bin/nsys && \
    rm -rf /usr/local/cuda/bin/nsys-ui && \
    ln -s /opt/nvidia/nsight-systems/2025.6.1/nsys-ui /usr/local/cuda/bin/nsys-ui && \
    rm nsight-systems-${NSIGHT_VERSION}.deb && \
    rm -rf /var/lib/apt/lists/*
# sglang image has already installed DeepEP
# trl is pinned and installed without its dependencies.
RUN pip3 install --no-deps trl==0.27.0
RUN pip3 install nvtx matplotlib liger_kernel
# mbridge from git (upgraded if already present) and Megatron-LM core_v0.16.0 without dependencies.
RUN pip install -U git+https://github.com/ISEEKYAN/mbridge.git
RUN pip install --no-deps git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.16.0
# Install verl v0.7.0 and uninstall it again right away; what remains are the
# packages pip pulled in as verl's dependencies.
RUN pip install git+https://github.com/volcengine/verl.git@v0.7.0 && \
pip uninstall -y verl
# Delete the nvidia-cudnn-cu12 line(s) from torch's installed METADATA, then
# force-reinstall nvidia-cudnn-cu12 9.16.0.29 without dependencies.
# NOTE(review): presumably this stops pip from flagging the cuDNN version as conflicting with torch's requirement — confirm.
RUN sed -i '/nvidia-cudnn-cu12/d' /usr/local/lib/python3.12/dist-packages/torch-2.9.1+cu129.dist-info/METADATA && \
pip install --no-deps --force-reinstall nvidia-cudnn-cu12==9.16.0.29
# for packages compiled from source code
# System cuDNN runtime, dev and header packages pinned to 9.16.0.29-1; downgrades
# and changes to held packages are explicitly allowed. Apt lists are removed afterwards.
RUN apt-get update && \
apt-get install -y --allow-downgrades --allow-change-held-packages \
libcudnn9-cuda-12=9.16.0.29-1 \
libcudnn9-dev-cuda-12=9.16.0.29-1 \
libcudnn9-headers-cuda-12=9.16.0.29-1 && \
rm -rf /var/lib/apt/lists/*
================================================
FILE: docker/Dockerfile.stable.trtllm
================================================
# Base image from NGC TensorRT-LLM, which includes a pre-installed TensorRT-LLM.
# For available images, visit: https://nvidia.github.io/TensorRT-LLM/installation/containers.html
# Use TRTLLM_BASE_IMAGE to specify the base image (default: release:1.3.0rc4)
ARG TRTLLM_BASE_IMAGE=nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc4
FROM ${TRTLLM_BASE_IMAGE}
# ==============================================================================
# Install Megatron dependencies
# ==============================================================================
# DeepEP is required for IBGDA support.
# Clone and build gdrcopy and deepep-nvshmem dependencies.
WORKDIR /home/dpsk_a2a
# Steps, all in one layer:
#   1. build and install the gdrcopy v2.5.1 libraries under /usr/local, then drop the checkout;
#   2. install the nvidia-nvshmem-cu13 wheel and expose its lib/ and bin/ directories;
#   3. create the unversioned libnvshmem_host.so symlink (`-f` keeps the step
#      re-runnable if the link already exists);
#   4. clone DeepEP v1.2.1, apply Megatron-LM's deepep.patch, build and install it,
#      then delete the checkout. The directory is named `DeepEP` (case-sensitive),
#      so that exact name must be used for the cleanup to take effect.
# NOTE(review): pushd/popd are bash builtins; this assumes the build shell
# inherited from the base image is bash — confirm.
RUN git clone -b v2.5.1 https://github.com/NVIDIA/gdrcopy.git && \
    pushd gdrcopy && \
    make prefix=/usr/local lib_install && \
    popd && rm -rf gdrcopy && \
    pip install nvidia-nvshmem-cu13==3.3.20 && \
    export NVSHMEM_DIR=/usr/local/lib/python3.12/dist-packages/nvidia/nvshmem && \
    export LD_LIBRARY_PATH="${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH" && \
    export PATH="${NVSHMEM_DIR}/bin:$PATH" && \
    pushd ${NVSHMEM_DIR}/lib && \
    ln -sf libnvshmem_host.so.3 libnvshmem_host.so && \
    popd && \
    git clone -b v1.2.1 https://github.com/deepseek-ai/DeepEP.git && \
    pushd DeepEP && \
    wget https://raw.githubusercontent.com/NVIDIA/Megatron-LM/refs/tags/core_v0.15.0/docker/patches/deepep.patch && \
    patch -p1 < deepep.patch && \
    TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" python setup.py install && \
    popd && rm -rf DeepEP
# Install Python dependencies
# trl and Megatron-LM (core_v0.15.0) are installed with --no-deps; mbridge is
# installed from git and upgraded if already present. No pip cache is kept.
RUN pip3 install --no-cache-dir --no-deps trl && \
    pip3 install --no-cache-dir \
        nvtx matplotlib liger_kernel cachetools && \
    pip install --no-cache-dir -U \
        git+https://github.com/ISEEKYAN/mbridge.git && \
    pip install --no-deps --no-cache-dir \
        git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.15.0
# ==============================================================================
# Install verl dependencies
# ==============================================================================
# Install verl v0.7.0 and then uninstall it; what remains are the packages pip
# pulled in as its dependencies.
RUN pip install git+https://github.com/volcengine/verl.git@v0.7.0
RUN pip uninstall -y verl
# Same install/uninstall pair with the `mcore` extra, to also pull in that extra's dependencies.
RUN pip install "verl[mcore] @ git+https://github.com/volcengine/verl.git@v0.7.0"
RUN pip uninstall -y verl
# ==============================================================================
# Install a specific TensorRT-LLM on demand
# ==============================================================================
# Note: The NGC image already includes a pre-installed TensorRT-LLM, but you can install a specific version if needed.
# Refer to https://nvidia.github.io/TensorRT-LLM/installation/index.html for more details.
================================================
FILE: docker/Dockerfile.stable.vllm
================================================
# vllm017
FROM nvidia/cuda:12.9.1-devel-ubuntu22.04

# Build-time settings visible to the RUN steps below.
ARG DEBIAN_FRONTEND=noninteractive
ARG PIP_NO_CACHE_DIR=1

# System tooling (git, wget, cmake, compiler toolchain, ibverbs/rdmacm/numa
# packages, vim), followed by Python 3.12 from the deadsnakes PPA.
# Apt lists are removed at the end of the layer.
RUN apt-get update && \
    apt-get install -y \
        git wget cmake build-essential \
        libibverbs-dev libnuma-dev librdmacm-dev numactl \
        software-properties-common vim && \
    add-apt-repository ppa:deadsnakes/ppa -y && \
    apt-get update && \
    apt-get install -y python3.12 python3.12-dev && \
    rm -rf /var/lib/apt/lists/*

# Bootstrap pip for Python 3.12 and remove the installer script.
RUN wget https://bootstrap.pypa.io/get-pip.py && \
    python3.12 get-pip.py && \
    rm get-pip.py

# Make python3.12 the interpreter behind both `python3` and `python`.
RUN ln -sf /usr/bin/python3.12 /usr/bin/python3 && \
    ln -sf /usr/bin/python3.12 /usr/bin/python
# PyTorch 2.10.0 (with torchvision / torchaudio) from the cu129 wheel index, then vLLM 0.17.0.
RUN pip install torch==2.10.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu129
RUN pip install vllm==0.17.0
RUN pip install pybind11

# Register NVIDIA's CUDA apt repository through the cuda-keyring package and
# install cuDNN. The keyring .deb is deleted once installed, and the apt lists
# are removed, so neither stays in the image layer.
# NOTE(review): the keyring is fetched from the debian12 repository although the
# base image is ubuntu22.04 — confirm this is intentional.
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb && \
    dpkg -i cuda-keyring_1.1-1_all.deb && \
    rm -f cuda-keyring_1.1-1_all.deb && \
    apt-get update && \
    apt-get -y install cudnn && \
    rm -rf /var/lib/apt/lists/*
RUN pip install nvidia-mathdx

# Apex from git, built without build isolation and with the --cpp_ext / --cuda_ext build options.
RUN MAX_JOBS=128 pip install -v \
        --disable-pip-version-check \
        --no-build-isolation \
        --config-settings "--build-option=--cpp_ext" \
        --config-settings "--build-option=--cuda_ext" \
        git+https://github.com/NVIDIA/apex.git

# TransformerEngine release_v2.12 with NVTE_FRAMEWORK=pytorch.
RUN export NVTE_FRAMEWORK=pytorch && \
    MAX_JOBS=128 NVTE_BUILD_THREADS_PER_JOB=4 \
    pip3 install --resume-retries 999 --no-build-isolation \
        git+https://github.com/NVIDIA/TransformerEngine.git@release_v2.12

# RUN pip install --upgrade transformers tokenizers
RUN pip install codetiming mathruler pylatexenc qwen_vl_utils cachetools pytest-asyncio

# flash_attn 2.8.3 with FLASH_ATTENTION_FORCE_BUILD=TRUE and at most 16 parallel build jobs.
RUN export FLASH_ATTENTION_FORCE_BUILD="TRUE" && \
    MAX_JOBS=16 pip install --no-build-isolation flash_attn==2.8.3
# Install Nsight Systems 2025.6.1. The package suffix is "arm64" on aarch64 hosts
# and "amd64" otherwise. The nsys / nsys-ui entries under /usr/local/cuda/bin are
# replaced by symlinks to the newly installed binaries; the .deb and the apt
# lists are removed at the end of the layer.
RUN if [ "$(uname -m)" = "aarch64" ]; then NSIGHT_ARCH=arm64; else NSIGHT_ARCH=amd64; fi && \
    NSIGHT_VERSION=2025.6.1_2025.6.1.190-1_${NSIGHT_ARCH} && \
    wget https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_6/nsight-systems-${NSIGHT_VERSION}.deb && \
    apt-get update && \
    apt-get install -y libxcb-cursor0 && \
    apt-get install -y ./nsight-systems-${NSIGHT_VERSION}.deb && \
    rm -rf /usr/local/cuda/bin/nsys && \
    ln -s /opt/nvidia/nsight-systems/2025.6.1/nsys /usr/local/cuda/bin/nsys && \
    rm -rf /usr/local/cuda/bin/nsys-ui && \
    ln -s /opt/nvidia/nsight-systems/2025.6.1/nsys-ui /usr/local/cuda/bin/nsys-ui && \
    rm nsight-systems-${NSIGHT_VERSION}.deb && \
    rm -rf /var/lib/apt/lists/*
# =========================
# Install DeepEP
# =========================
# Clone and build deepep and deepep-nvshmem
WORKDIR /home/dpsk_a2a
# Build and install the gdrcopy v2.5.1 libraries under /usr/local, then remove the checkout.
RUN git clone -b v2.5.1 https://github.com/NVIDIA/gdrcopy.git && \
cd gdrcopy && \
make prefix=/usr/local lib_install && \
cd .. && rm -rf gdrcopy
# Persisted in the image environment; points at the prefix gdrcopy was installed to.
ENV GDRCOPY_HOME=/usr/local
# Clone the `hybrid-ep` branch of DeepEP and build it against the nvshmem files
# found under the Python 3.12 dist-packages directory:
#   - expose nvshmem's lib/ and bin/ directories for this build step only;
#   - create (or refresh) the unversioned libnvshmem_host.so symlink;
#   - add the CUDA cccl include directory to CPATH before running setup.py install.
# NOTE(review): the nvshmem directory is assumed to have been installed by an earlier pip step — confirm which package provides it.
RUN git clone -b hybrid-ep https://github.com/deepseek-ai/DeepEP.git && \
export NVSHMEM_DIR=/usr/local/lib/python3.12/dist-packages/nvidia/nvshmem && \
export LD_LIBRARY_PATH="${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH" && \
export PATH="${NVSHMEM_DIR}/bin:$PATH" && \
cd ${NVSHMEM_DIR}/lib && \
ln -sf libnvshmem_host.so.3 libnvshmem_host.so && \
cd /home/dpsk_a2a/DeepEP && \
export CPATH=/usr/local/cuda/targets/x86_64-linux/include/cccl:$CPATH && \
python setup.py install
# trl is pinned and installed without its dependencies.
RUN pip3 install --no-deps trl==0.27.0
RUN pip3 install nvtx matplotlib liger_kernel

# mbridge from git (upgraded if present) and Megatron-LM core_v0.16.0 without dependencies.
RUN pip install -U git+https://github.com/ISEEKYAN/mbridge.git
RUN pip install --no-deps git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.16.0

# Install verl v0.7.0 and uninstall it again; its dependencies stay installed.
RUN pip install git+https://github.com/volcengine/verl.git@v0.7.0 && \
    pip uninstall -y verl

RUN apt-get update && \
    apt-get install -y curl && \
    rm -rf /var/lib/apt/lists/*

# System cuDNN runtime, dev and header packages pinned to 9.16.0.29-1;
# downgrades and changes to held packages are explicitly allowed.
RUN apt-get update && \
    apt-get install -y --allow-downgrades --allow-change-held-packages \
        libcudnn9-cuda-12=9.16.0.29-1 \
        libcudnn9-dev-cuda-12=9.16.0.29-1 \
        libcudnn9-headers-cuda-12=9.16.0.29-1 && \
    rm -rf /var/lib/apt/lists/*
================================================
FILE: docker/README.md
================================================
# Dockerfiles of verl
We provide pre-built Docker images for quick setup. Starting from this version, we use a new image release hierarchy for productivity and stability.
Starting from v0.6.0, we use the vLLM and SGLang release images as our base images.
Starting from v0.7.0, we use nvidia/cuda:12.9.1-devel-ubuntu22.04 as the base image for vLLM, because vllm/vllm-openai:v0.12.0 is a minimal image that lacks some essential libraries.
## Base Image
- vLLM: https://hub.docker.com/r/nvidia/cuda
- SGLang: https://hub.docker.com/r/lmsysorg/sglang
## Application Image
On top of the base image, the following packages are added:
- flash_attn
- Megatron-LM
- Apex
- TransformerEngine
- DeepEP
Latest docker file:
- [Dockerfile.stable.vllm](https://github.com/volcengine/verl/blob/main/docker/Dockerfile.stable.vllm)
- [Dockerfile.stable.sglang](https://github.com/volcengine/verl/blob/main/docker/Dockerfile.stable.sglang)
All pre-built images are available in dockerhub: https://hub.docker.com/r/verlai/verl. For example, `verlai/verl:sgl059.latest`, `verlai/verl:vllm017.latest`.
You can find the latest images used for development and ci in our github workflows:
- [.github/workflows/vllm.yml](https://github.com/volcengine/verl/blob/main/.github/workflows/vllm.yml)
- [.github/workflows/sgl.yml](https://github.com/volcengine/verl/blob/main/.github/workflows/sgl.yml)
## Installation from Docker
After pulling the desired Docker image and installing desired inference and training frameworks, you can run it with the following steps:
1. Launch the desired Docker image and attach into it:
```sh
# Replace <image:tag> with the image you pulled, e.g. verlai/verl:vllm017.latest
docker create --runtime=nvidia --gpus all --net=host --shm-size="10g" --cap-add=SYS_ADMIN -v .:/workspace/verl --name verl <image:tag> sleep infinity
docker start verl
docker exec -it verl bash
```
2. If you use the images provided, you only need to install verl itself without dependencies:
```sh
# install the nightly version (recommended)
git clone https://github.com/volcengine/verl && cd verl
pip3 install --no-deps -e .
```
[Optional] If you want to switch between different frameworks, you can install verl with the following commands:
```sh
# install the nightly version (recommended)
git clone https://github.com/volcengine/verl && cd verl
pip3 install -e .[vllm]
pip3 install -e .[sglang]
```
## Release History
- 2026/03/10: update vllm stable image to vllm==0.17.0; update sglang stable image to sglang==0.5.9
- 2026/01/17: update vllm stable image to torch==2.9.1, cudnn==9.16, deepep==1.2.1
- 2025/12/23: update vllm stable image to vllm==0.12.0; update sglang stable image to sglang==0.5.6
- 2025/11/18: update vllm stable image to vllm==0.11.1; update sglang stable image to sglang==0.5.5
================================================
FILE: docker/ascend/Dockerfile.ascend.sglang_8.3.rc1_a2
================================================
# Pull base image
FROM swr.cn-south-1.myhuaweicloud.com/ascendhub/cann:8.3.rc1-910b-ubuntu22.04-py3.11

# Build arguments: CANN install root, pip index mirror, and the torch_npu wheel
# base name plus the release URL it is downloaded from.
ARG ASCEND_CANN_PATH="/usr/local/Ascend"
ARG PIP_INDEX_URL="https://mirrors.aliyun.com/pypi/simple"
ARG PTA_BASE_VERSION="torch_npu-2.7.1.post2-cp311-cp311-manylinux_2_28"
ARG PTA_URL="https://gitcode.com/Ascend/pytorch/releases/download/v7.3.0-pytorch2.7.1"

# Prepare required system dependencies
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends \
        gcc g++ cmake libnuma-dev wget git curl jq vim \
        build-essential net-tools iputils-ping && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    # Route pip through the configured mirror before upgrading the packaging tools
    pip config set global.index-url ${PIP_INDEX_URL} && \
    pip config set install.trusted-host mirrors.aliyun.com && \
    pip install --upgrade pip setuptools packaging && \
    pip cache purge
# Prepare repositories with low update frequency
# Clones sglang (tag v0.5.8, shallow), sgl-kernel-npu (commit 46b73de) and
# MindSpeed (commit f2b0977e) into the current working directory.
RUN ARCH=$(uname -m) && \
echo "[LOG INFO] Detected architecture: $ARCH" && \
# Set extra pip index for x86_64 platform
if [ "$ARCH" = "x86_64" ]; then \
pip config set global.extra-index-url "https://download.pytorch.org/whl/cpu/"; \
fi && \
# Clone libs
git clone --depth 1 --branch v0.5.8 https://github.com/sgl-project/sglang.git && \
git clone https://github.com/sgl-project/sgl-kernel-npu.git && cd sgl-kernel-npu && git checkout 46b73de && cd .. && \
git clone https://gitcode.com/Ascend/MindSpeed.git && \
cd MindSpeed && git checkout f2b0977e && cd ..
# Install repositories with low update frequency
RUN cd sglang && \
# Install sglang
# pyproject_other.toml becomes the active pyproject.toml (the original is kept as
# pyproject.toml.backup) before the editable install with the srt_npu extra
mv python/pyproject.toml python/pyproject.toml.backup && \
mv python/pyproject_other.toml python/pyproject.toml && \
pip install -e "python[srt_npu]" && \
# Pin torch / torchvision after the sglang install
pip install torch==2.7.1 torchvision==0.22.1 && \
# Install torch_npu
# The wheel file name is PTA_BASE_VERSION plus the machine architecture reported by uname -m
ARCH=$(uname -m) && wget ${PTA_URL}/${PTA_BASE_VERSION}_${ARCH}.whl && pip install ${PTA_BASE_VERSION}_${ARCH}.whl && \
echo "[LOG INFO] Torch_npu version is: ${PTA_BASE_VERSION}_${ARCH}.whl" && \
cd ..
# Install sgl-kernel-npu
# NOTE(review): `source` is not available in every /bin/sh; this assumes the build shell of the base image supports it — confirm.
RUN ARCH=$(uname -m) && \
# Export and source env
# Adds the architecture-specific CANN 8.3.RC1 devlib directory to LD_LIBRARY_PATH,
# then loads the ascend-toolkit and nnal/atb environment scripts
export LD_LIBRARY_PATH=${ASCEND_CANN_PATH}/ascend-toolkit/8.3.RC1/${ARCH}-linux/devlib/linux/${ARCH}:$LD_LIBRARY_PATH && \
source ${ASCEND_CANN_PATH}/ascend-toolkit/set_env.sh && \
source ${ASCEND_CANN_PATH}/nnal/atb/set_env.sh && \
pip install pybind11 && \
cd sgl-kernel-npu && \
# First build: install the torch_memory_saver and sgl_kernel_npu wheels from output/
bash build.sh && \
pip install output/torch_memory_saver*.whl && \
pip install output/sgl_kernel_npu*.whl && \
# Deep_ep package is compiled for A3 by default; Recompile in deepep2 mode for A2, following https://github.com/sgl-project/sgl-kernel-npu/blob/main/python/deep_ep/README.md.
bash build.sh -a deepep2 && \
pip install output/deep_ep*.whl && \
# In the directory pip reports as deep-ep's install location, create a symlink to
# deep_ep/deep_ep_cpp*.so next to the package directory, then return to the previous directory
cd "$(pip show deep-ep | grep -E '^Location:' | awk '{print $2}')" && ln -s deep_ep/deep_ep_cpp*.so && cd - && \
cd ..
# Install MindSpeed & Megatron
# MindSpeed is installed in editable mode from the local checkout; Megatron-LM core_v0.12.1 is installed from git.
RUN pip install -e MindSpeed && \
pip install git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.1 && \
# Remove triton and timm if some third-party package pulled them in
pip uninstall -y triton timm && \
# Install mbridge
pip install mbridge && \
# Clear extra files
rm -rf /tmp/* /var/tmp/* && \
pip cache purge
# Prepare and install verl (update frequently)
# Clone verl with its submodules, install the NPU requirements and verl itself
# in editable mode, then pin ray / click and add cachetools.
RUN git clone --recursive https://github.com/volcengine/verl.git && \
    cd verl && \
    pip install -r requirements-npu.txt && \
    pip install -v -e . && \
    cd .. && \
    pip install ray==2.46.0 click==8.2.1 cachetools && \
    # Clear extra files
    rm -rf /tmp/* /var/tmp/* && \
    pip cache purge

# Show install results
RUN pip list

# Setting Default Commands
CMD ["/bin/bash"]
================================================
FILE: docker/ascend/Dockerfile.ascend.sglang_8.3.rc1_a3
================================================
# Pull base image
FROM swr.cn-south-1.myhuaweicloud.com/ascendhub/cann:8.3.rc1-a3-ubuntu22.04-py3.11

# Build arguments: CANN install root, pip index mirror, and the torch_npu wheel
# base name plus the release URL it is downloaded from.
ARG ASCEND_CANN_PATH="/usr/local/Ascend"
ARG PIP_INDEX_URL="https://mirrors.aliyun.com/pypi/simple"
ARG PTA_BASE_VERSION="torch_npu-2.7.1.post2-cp311-cp311-manylinux_2_28"
ARG PTA_URL="https://gitcode.com/Ascend/pytorch/releases/download/v7.3.0-pytorch2.7.1"

# Prepare required system dependencies
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends \
        gcc g++ cmake libnuma-dev wget git curl jq vim \
        build-essential net-tools iputils-ping && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    # Route pip through the configured mirror before upgrading the packaging tools
    pip config set global.index-url ${PIP_INDEX_URL} && \
    pip config set install.trusted-host mirrors.aliyun.com && \
    pip install --upgrade pip setuptools packaging && \
    pip cache purge
# Prepare repositories with low update frequency
# Clones sglang (tag v0.5.8, shallow), sgl-kernel-npu (commit 46b73de) and
# MindSpeed (commit f2b0977e) into the current working directory.
RUN ARCH=$(uname -m) && \
echo "[LOG INFO] Detected architecture: $ARCH" && \
# Set extra pip index for x86_64 platform
if [ "$ARCH" = "x86_64" ]; then \
pip config set global.extra-index-url "https://download.pytorch.org/whl/cpu/"; \
fi && \
# Clone libs
git clone --depth 1 --branch v0.5.8 https://github.com/sgl-project/sglang.git && \
git clone https://github.com/sgl-project/sgl-kernel-npu.git && cd sgl-kernel-npu && git checkout 46b73de && cd .. && \
git clone https://gitcode.com/Ascend/MindSpeed.git && \
cd MindSpeed && git checkout f2b0977e && cd ..
# Install repositories with low update frequency
RUN cd sglang && \
# Install sglang
# pyproject_other.toml becomes the active pyproject.toml (the original is kept as
# pyproject.toml.backup) before the editable install with the srt_npu extra
mv python/pyproject.toml python/pyproject.toml.backup && \
mv python/pyproject_other.toml python/pyproject.toml && \
pip install -e "python[srt_npu]" && \
# Pin torch / torchvision after the sglang install
pip install torch==2.7.1 torchvision==0.22.1 && \
# Install torch_npu
# The wheel file name is PTA_BASE_VERSION plus the machine architecture reported by uname -m
ARCH=$(uname -m) && wget ${PTA_URL}/${PTA_BASE_VERSION}_${ARCH}.whl && pip install ${PTA_BASE_VERSION}_${ARCH}.whl && \
echo "[LOG INFO] Torch_npu version is: ${PTA_BASE_VERSION}_${ARCH}.whl" && \
cd ..
# Install sgl-kernel-npu
# NOTE(review): `source` is not available in every /bin/sh; this assumes the build shell of the base image supports it — confirm.
RUN ARCH=$(uname -m) && \
# Export and source env
# Adds the architecture-specific CANN 8.3.RC1 devlib directory to LD_LIBRARY_PATH,
# then loads the ascend-toolkit and nnal/atb environment scripts
export LD_LIBRARY_PATH=${ASCEND_CANN_PATH}/ascend-toolkit/8.3.RC1/${ARCH}-linux/devlib/linux/${ARCH}:$LD_LIBRARY_PATH && \
source ${ASCEND_CANN_PATH}/ascend-toolkit/set_env.sh && \
source ${ASCEND_CANN_PATH}/nnal/atb/set_env.sh && \
pip install pybind11 && \
cd sgl-kernel-npu && \
# A single build produces all three wheels installed below (no deepep2 rebuild in this A3 image)
bash build.sh && \
pip install output/torch_memory_saver*.whl && \
pip install output/sgl_kernel_npu*.whl && \
pip install output/deep_ep*.whl && \
# In the directory pip reports as deep-ep's install location, create a symlink to
# deep_ep/deep_ep_cpp*.so next to the package directory, then return to the previous directory
cd "$(pip show deep-ep | grep -E '^Location:' | awk '{print $2}')" && ln -s deep_ep/deep_ep_cpp*.so && cd - && \
cd ..
# Install MindSpeed & Megatron
# MindSpeed is installed in editable mode from the local checkout; Megatron-LM core_v0.12.1 is installed from git.
RUN pip install -e MindSpeed && \
pip install git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.1 && \
# Remove triton and timm if some third-party package pulled them in
pip uninstall -y triton timm && \
# Install mbridge
pip install mbridge && \
# Clear extra files
rm -rf /tmp/* /var/tmp/* && \
pip cache purge
# Prepare and install verl (update frequently)
# Clone verl with its submodules, install the NPU requirements and verl itself
# in editable mode, then pin ray / click and add cachetools.
RUN git clone --recursive https://github.com/volcengine/verl.git && \
    cd verl && \
    pip install -r requirements-npu.txt && \
    pip install -v -e . && \
    cd .. && \
    pip install ray==2.46.0 click==8.2.1 cachetools && \
    # Clear extra files
    rm -rf /tmp/* /var/tmp/* && \
    pip cache purge

# Show install results
RUN pip list

# Setting Default Commands
CMD ["/bin/bash"]
================================================
FILE: docker/ascend/Dockerfile.ascend_8.2.rc1_a2
================================================
# Pull base image
FROM swr.cn-south-1.myhuaweicloud.com/ascendhub/cann:8.2.rc1-910b-ubuntu22.04-py3.11

# Prepare required system dependencies
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends \
        gcc g++ cmake libnuma-dev wget git curl jq vim build-essential && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    pip install --upgrade pip setuptools packaging && \
    pip cache purge

# Prepare repositories with low update frequency
RUN ARCH=$(uname -m) && \
    echo "[LOG INFO] Detected architecture: $ARCH" && \
    # x86_64 only: register the PyTorch CPU wheel index as an extra pip index
    if [ "$ARCH" = "x86_64" ]; then \
        pip config set global.extra-index-url "https://download.pytorch.org/whl/cpu/"; \
    fi && \
    # Clone libs: vllm and vllm-ascend at tag v0.9.1 (shallow)
    git clone --depth 1 --branch v0.9.1 https://github.com/vllm-project/vllm && \
    git clone --depth 1 --branch v0.9.1 https://github.com/vllm-project/vllm-ascend.git && \
    # MindSpeed is cloned with full history so commit f2b0977e can be checked out
    git clone https://gitcode.com/Ascend/MindSpeed.git && \
    git -C MindSpeed checkout f2b0977e && \
    # Megatron-LM at branch/tag core_v0.12.1 (shallow)
    git clone --depth 1 --branch core_v0.12.1 https://github.com/NVIDIA/Megatron-LM.git
# Install repositories with low update frequency
# NOTE(review): `source` is not available in every /bin/sh; this assumes the build shell of the base image supports it — confirm.
RUN ARCH=$(uname -m) && \
# Export and source env
# Only aarch64 and x86_64 get the CANN 8.2.RC1 devlib directory added to
# LD_LIBRARY_PATH; on any other architecture the variable is left unchanged
if [ "$ARCH" = "aarch64" ]; then \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/8.2.RC1/aarch64-linux/devlib/linux/aarch64:$LD_LIBRARY_PATH; \
elif [ "$ARCH" = "x86_64" ]; then \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/8.2.RC1/x86_64-linux/devlib/linux/x86_64/:$LD_LIBRARY_PATH; \
fi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
# Install torch & torch_npu & torchvision
pip install torch==2.5.1 torch_npu==2.5.1 torchvision==0.20.1 && \
# Install vllm
# NOTE(review): VLLM_TARGET_DEVICE=empty presumably skips building device-specific kernels — confirm
cd vllm && VLLM_TARGET_DEVICE=empty pip install -v -e . && cd .. && \
# Install vllm-ascend
cd vllm-ascend && pip install -v -e . && cd .. && \
# Install MindSpeed & Megatron
# Only MindSpeed is pip-installed here; Megatron-LM is exposed through PYTHONPATH by the ENV line after this RUN
pip install -e MindSpeed && \
# Clear extra files
rm -rf /tmp/* /var/tmp/* && \
pip cache purge
# Prepend /Megatron-LM to PYTHONPATH, keeping any existing value after a colon.
# NOTE(review): assumes the Megatron-LM clone lives at /Megatron-LM, i.e. the clone step ran with / as working directory — confirm.
ENV PYTHONPATH="/Megatron-LM${PYTHONPATH:+:${PYTHONPATH}}"
# Prepare and install verl (update frequently)
# Shallow-clone verl, install the NPU requirements, then verl itself in editable mode.
RUN git clone --depth 1 https://github.com/volcengine/verl.git && \
    cd verl && \
    pip install -r requirements-npu.txt && \
    pip install -v -e . && \
    cd .. && \
    # Clear extra files
    rm -rf /tmp/* /var/tmp/* && \
    pip cache purge

# Show install results
RUN pip list

# Setting Default Commands
CMD ["/bin/bash"]
================================================
FILE: docker/ascend/Dockerfile.ascend_8.2.rc1_a3
================================================
# Pull base image
FROM swr.cn-south-1.myhuaweicloud.com/ascendhub/cann:8.2.rc1-a3-ubuntu22.04-py3.11

# Prepare required system dependencies
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends \
        gcc g++ cmake libnuma-dev wget git curl jq vim build-essential && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    pip install --upgrade pip setuptools packaging && \
    pip cache purge

# Prepare repositories with low update frequency
RUN ARCH=$(uname -m) && \
    echo "[LOG INFO] Detected architecture: $ARCH" && \
    # x86_64 only: register the PyTorch CPU wheel index as an extra pip index
    if [ "$ARCH" = "x86_64" ]; then \
        pip config set global.extra-index-url "https://download.pytorch.org/whl/cpu/"; \
    fi && \
    # Clone libs: vllm and vllm-ascend at tag v0.9.1 (shallow)
    git clone --depth 1 --branch v0.9.1 https://github.com/vllm-project/vllm && \
    git clone --depth 1 --branch v0.9.1 https://github.com/vllm-project/vllm-ascend.git && \
    # MindSpeed is cloned with full history so commit f2b0977e can be checked out
    git clone https://gitcode.com/Ascend/MindSpeed.git && \
    git -C MindSpeed checkout f2b0977e && \
    # Megatron-LM at branch/tag core_v0.12.1 (shallow)
    git clone --depth 1 --branch core_v0.12.1 https://github.com/NVIDIA/Megatron-LM.git
# Install repositories with low update frequency
# NOTE(review): `source` is not available in every /bin/sh; this assumes the build shell of the base image supports it — confirm.
RUN ARCH=$(uname -m) && \
# Export and source env
# Only aarch64 and x86_64 get the CANN 8.2.RC1 devlib directory added to
# LD_LIBRARY_PATH; on any other architecture the variable is left unchanged
if [ "$ARCH" = "aarch64" ]; then \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/8.2.RC1/aarch64-linux/devlib/linux/aarch64:$LD_LIBRARY_PATH; \
elif [ "$ARCH" = "x86_64" ]; then \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/8.2.RC1/x86_64-linux/devlib/linux/x86_64/:$LD_LIBRARY_PATH; \
fi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
# Install torch & torch_npu & torchvision
pip install torch==2.5.1 torch_npu==2.5.1 torchvision==0.20.1 && \
# Install vllm
# NOTE(review): VLLM_TARGET_DEVICE=empty presumably skips building device-specific kernels — confirm
cd vllm && VLLM_TARGET_DEVICE=empty pip install -v -e . && cd .. && \
# Install vllm-ascend
cd vllm-ascend && pip install -v -e . && cd .. && \
# Install MindSpeed & Megatron
# Only MindSpeed is pip-installed here; Megatron-LM is exposed through PYTHONPATH by the ENV line after this RUN
pip install -e MindSpeed && \
# Clear extra files
rm -rf /tmp/* /var/tmp/* && \
pip cache purge
# Prepend /Megatron-LM to PYTHONPATH, keeping any existing value after a colon.
# NOTE(review): assumes the Megatron-LM clone lives at /Megatron-LM, i.e. the clone step ran with / as working directory — confirm.
ENV PYTHONPATH="/Megatron-LM${PYTHONPATH:+:${PYTHONPATH}}"
# Prepare and install verl (update frequently)
# Shallow-clone verl, install the NPU requirements, then verl itself in editable mode.
RUN git clone --depth 1 https://github.com/volcengine/verl.git && \
    cd verl && \
    pip install -r requirements-npu.txt && \
    pip install -v -e . && \
    cd .. && \
    # Clear extra files
    rm -rf /tmp/* /var/tmp/* && \
    pip cache purge

# Show install results
RUN pip list

# Setting Default Commands
CMD ["/bin/bash"]
================================================
FILE: docker/ascend/Dockerfile.ascend_8.3.rc1_a2
================================================
# Pull base image
FROM swr.cn-south-1.myhuaweicloud.com/ascendhub/cann:8.3.rc1-910b-ubuntu22.04-py3.11

# Prepare required system dependencies
# setuptools is pinned to 80.10.2; pip and packaging are upgraded to their latest versions.
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends \
        gcc g++ cmake libnuma-dev wget git curl jq vim build-essential && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    pip install --upgrade pip packaging setuptools==80.10.2 && \
    pip cache purge

# Prepare repositories with low update frequency
RUN ARCH=$(uname -m) && \
    echo "[LOG INFO] Detected architecture: $ARCH" && \
    # x86_64 only: register the PyTorch CPU wheel index as an extra pip index
    if [ "$ARCH" = "x86_64" ]; then \
        pip config set global.extra-index-url "https://download.pytorch.org/whl/cpu/"; \
    fi && \
    # Clone libs: vllm and vllm-ascend at tag v0.11.0 (shallow)
    git clone --depth 1 --branch v0.11.0 https://github.com/vllm-project/vllm.git && \
    git clone --depth 1 --branch v0.11.0 https://github.com/vllm-project/vllm-ascend.git && \
    # MindSpeed is cloned with full history so commit f2b0977e can be checked out
    git clone https://gitcode.com/Ascend/MindSpeed.git && \
    git -C MindSpeed checkout f2b0977e && \
    # Megatron-LM at branch/tag core_v0.12.1 (shallow)
    git clone --depth 1 --branch core_v0.12.1 https://github.com/NVIDIA/Megatron-LM.git
# Install repositories with low update frequency
# NOTE(review): `source` is not available in every /bin/sh; this assumes the build shell of the base image supports it — confirm.
RUN ARCH=$(uname -m) && \
# Export and source env
# Only aarch64 and x86_64 get the CANN 8.3.RC1 devlib directory added to
# LD_LIBRARY_PATH; on any other architecture the variable is left unchanged
if [ "$ARCH" = "aarch64" ]; then \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/8.3.RC1/aarch64-linux/devlib/linux/aarch64:$LD_LIBRARY_PATH; \
elif [ "$ARCH" = "x86_64" ]; then \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/8.3.RC1/x86_64-linux/devlib/linux/x86_64/:$LD_LIBRARY_PATH; \
fi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
# Install torch & torch_npu & torchvision
# transformers is pinned in the same command
pip install torch==2.7.1 torch_npu==2.7.1 torchvision==0.22.1 transformers==4.57.6 && \
# Install vllm
# NOTE(review): VLLM_TARGET_DEVICE=empty presumably skips building device-specific kernels — confirm
cd vllm && VLLM_TARGET_DEVICE=empty pip install -v -e . && cd .. && \
# Install vllm-ascend
cd vllm-ascend && pip install -v -e . && cd .. && \
# Install MindSpeed & Megatron
# Only MindSpeed is pip-installed here; Megatron-LM is exposed through PYTHONPATH by the ENV line after this RUN
pip install -e MindSpeed && \
# Remove existing triton or triton-ascend installed by some third-party packages
pip uninstall -y triton triton-ascend && \
# Install mbridge
pip install mbridge && \
# Clear extra files
rm -rf /tmp/* /var/tmp/* && \
pip cache purge
# Prepend /Megatron-LM to PYTHONPATH, keeping any existing value after a colon.
# NOTE(review): assumes the Megatron-LM clone lives at /Megatron-LM, i.e. the clone step ran with / as working directory — confirm.
ENV PYTHONPATH="/Megatron-LM${PYTHONPATH:+:${PYTHONPATH}}"
# Prepare and install verl (update frequently)
# Shallow-clone verl, install the NPU requirements, then verl itself in editable mode.
RUN git clone --depth 1 https://github.com/volcengine/verl.git && \
    cd verl && \
    pip install -r requirements-npu.txt && \
    pip install -v -e . && \
    cd .. && \
    # Clear extra files
    rm -rf /tmp/* /var/tmp/* && \
    pip cache purge

# Show install results
RUN pip list

# Setting Default Commands
CMD ["/bin/bash"]
================================================
FILE: docker/ascend/Dockerfile.ascend_8.3.rc1_a3
================================================
# Pull base image
FROM swr.cn-south-1.myhuaweicloud.com/ascendhub/cann:8.3.rc1-a3-ubuntu22.04-py3.11

# Prepare required system dependencies
# setuptools is pinned to 80.10.2; pip and packaging are upgraded to their latest versions.
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends \
        gcc g++ cmake libnuma-dev wget git curl jq vim build-essential && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    pip install --upgrade pip packaging setuptools==80.10.2 && \
    pip cache purge

# Prepare repositories with low update frequency
RUN ARCH=$(uname -m) && \
    echo "[LOG INFO] Detected architecture: $ARCH" && \
    # x86_64 only: register the PyTorch CPU wheel index as an extra pip index
    if [ "$ARCH" = "x86_64" ]; then \
        pip config set global.extra-index-url "https://download.pytorch.org/whl/cpu/"; \
    fi && \
    # Clone libs: vllm and vllm-ascend at tag v0.11.0 (shallow)
    git clone --depth 1 --branch v0.11.0 https://github.com/vllm-project/vllm.git && \
    git clone --depth 1 --branch v0.11.0 https://github.com/vllm-project/vllm-ascend.git && \
    # MindSpeed is cloned with full history so commit f2b0977e can be checked out
    git clone https://gitcode.com/Ascend/MindSpeed.git && \
    git -C MindSpeed checkout f2b0977e && \
    # Megatron-LM at branch/tag core_v0.12.1 (shallow)
    git clone --depth 1 --branch core_v0.12.1 https://github.com/NVIDIA/Megatron-LM.git
# Install repositories with low update frequency
# NOTE(review): `source` is not available in every /bin/sh; this assumes the build shell of the base image supports it — confirm.
RUN ARCH=$(uname -m) && \
# Export and source env
# Only aarch64 and x86_64 get the CANN 8.3.RC1 devlib directory added to
# LD_LIBRARY_PATH; on any other architecture the variable is left unchanged
if [ "$ARCH" = "aarch64" ]; then \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/8.3.RC1/aarch64-linux/devlib/linux/aarch64:$LD_LIBRARY_PATH; \
elif [ "$ARCH" = "x86_64" ]; then \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/8.3.RC1/x86_64-linux/devlib/linux/x86_64/:$LD_LIBRARY_PATH; \
fi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
# Install torch & torch_npu & torchvision
# transformers is pinned in the same command
pip install torch==2.7.1 torch_npu==2.7.1 torchvision==0.22.1 transformers==4.57.6 && \
# Install vllm
# NOTE(review): VLLM_TARGET_DEVICE=empty presumably skips building device-specific kernels — confirm
cd vllm && VLLM_TARGET_DEVICE=empty pip install -v -e . && cd .. && \
# Install vllm-ascend
cd vllm-ascend && pip install -v -e . && cd .. && \
# Install MindSpeed & Megatron
# Only MindSpeed is pip-installed here; Megatron-LM is exposed through PYTHONPATH by the ENV line after this RUN
pip install -e MindSpeed && \
# Remove existing triton or triton-ascend installed by some third-party packages
pip uninstall -y triton triton-ascend && \
# Install mbridge
pip install mbridge && \
# Clear extra files
rm -rf /tmp/* /var/tmp/* && \
pip cache purge
# Prepend /Megatron-LM to PYTHONPATH, keeping any existing value after a colon.
# NOTE(review): assumes the Megatron-LM clone lives at /Megatron-LM, i.e. the clone step ran with / as working directory — confirm.
ENV PYTHONPATH="/Megatron-LM${PYTHONPATH:+:${PYTHONPATH}}"
# Prepare and install verl (update frequently)
# Shallow-clone the default branch of verl, install its NPU requirements file,
# then install verl itself in editable mode from that checkout.
RUN git clone --depth 1 https://github.com/volcengine/verl.git && \
cd verl && pip install -r requirements-npu.txt && pip install -v -e . && cd .. && \
# Clear extra files
rm -rf /tmp/* /var/tmp/* && \
# Empty pip's download/wheel cache so it is not stored in this layer
pip cache purge
# Show install results
RUN pip list
# Setting Default Commands
CMD ["/bin/bash"]
================================================
FILE: docker/ascend/Dockerfile.ascend_8.5.0_a2
================================================
# Pull base image
FROM swr.cn-south-1.myhuaweicloud.com/ascendhub/cann:8.5.0-910b-ubuntu22.04-py3.11
ARG SOC_VERSION="ascend910b1"
# Prepare required system dependencies
# Install the compiler toolchain and CLI utilities through apt, remove the apt
# lists and temp files in the same layer, then refresh the Python packaging
# tools (setuptools is pinned).
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        curl \
        g++ \
        gcc \
        git \
        jq \
        libnuma-dev \
        vim \
        wget && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    pip install --upgrade pip packaging setuptools==80.10.2 && \
    pip cache purge
# Prepare repositories with low update frequency
# Detect the CPU architecture, register the PyTorch CPU wheel index on x86_64,
# and fetch the sources of vllm, vllm-ascend, MindSpeed and Megatron-LM.
RUN ARCH=$(uname -m) && \
# Set extra pip index for x86_64 platform
echo "[LOG INFO] Detected architecture: $ARCH" && \
if [ "$ARCH" = "x86_64" ]; then \
pip config set global.extra-index-url "https://download.pytorch.org/whl/cpu/"; \
fi && \
# Clone libs
# vllm: shallow clone of v0.13.0; vllm-ascend: full clone of releases/v0.13.0
git clone --depth 1 --branch v0.13.0 https://github.com/vllm-project/vllm.git && \
git clone -b releases/v0.13.0 https://github.com/vllm-project/vllm-ascend.git && \
# MindSpeed: cloned with full history, then switched to 2.3.0_core_r0.12.1
git clone https://gitcode.com/Ascend/MindSpeed.git && \
cd MindSpeed && git checkout 2.3.0_core_r0.12.1 && cd .. && \
# Megatron-LM: shallow clone of core_v0.12.1
git clone --depth 1 --branch core_v0.12.1 https://github.com/NVIDIA/Megatron-LM.git
# Install repositories with low update frequency
# Set up the Ascend build environment for this RUN and install the pinned,
# rarely changing dependencies (transformers, vllm, vllm-ascend, MindSpeed,
# Megatron-LM, mbridge, torch_npu) in a single layer.
# NOTE(review): the SOC_VERSION build ARG declared after FROM is visible to this
# RUN as an environment variable; presumably the vllm-ascend build reads it - confirm.
RUN ARCH=$(uname -m) && \
# Export and source env
# The LD_LIBRARY_PATH export below only lives for the duration of this RUN shell.
if [ "$ARCH" = "aarch64" ]; then \
export LD_LIBRARY_PATH=/usr/local/Ascend/cann-8.5.0/aarch64-linux/devlib/linux/aarch64:$LD_LIBRARY_PATH; \
elif [ "$ARCH" = "x86_64" ]; then \
export LD_LIBRARY_PATH=/usr/local/Ascend/cann-8.5.0/x86_64-linux/devlib/linux/x86_64/:$LD_LIBRARY_PATH; \
fi && \
# NOTE(review): `source` assumes the RUN shell inherited from the base image is bash-compatible - confirm
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
# Install transformers
pip install transformers==4.57.6 && \
# Install vllm: build requirements first, then an editable install with
# VLLM_TARGET_DEVICE=empty set for the build ("-e." is parsed by pip as "-e .")
cd vllm && pip install -r requirements/build.txt && \
VLLM_TARGET_DEVICE=empty pip install -v -e. && cd .. && \
# Install vllm-ascend: requirements first, then an editable install with
# COMPILE_CUSTOM_KERNELS=1 exported for the build
cd vllm-ascend && pip install -r requirements.txt && \
export COMPILE_CUSTOM_KERNELS=1 && pip install -v -e . && cd .. && \
# Install MindSpeed & Megatron (both editable)
pip install -e MindSpeed && \
pip install -e Megatron-LM && \
# Remove existing triton installed by some third-party packages
pip uninstall -y triton && \
# Install mbridge and pin torch_npu to 2.8.0
pip install mbridge torch_npu==2.8.0 && \
# Clear extra files
rm -rf /tmp/* /var/tmp/* && \
pip cache purge
# Prepare and install verl (update frequently)
# Clone verl together with its submodules (full history, default branch),
# install its NPU requirements file, then install verl in editable mode.
RUN git clone --recursive https://github.com/volcengine/verl.git && \
cd verl && pip install -r requirements-npu.txt && pip install -v -e . && cd .. && \
# Clear extra files
rm -rf /tmp/* /var/tmp/* && \
# Empty pip's download/wheel cache so it is not stored in this layer
pip cache purge
# Show install results
RUN pip list
# Setting Default Commands
CMD ["/bin/bash"]
================================================
FILE: docker/ascend/Dockerfile.ascend_8.5.0_a3
================================================
# Pull base image
FROM swr.cn-south-1.myhuaweicloud.com/ascendhub/cann:8.5.0-a3-ubuntu22.04-py3.11
ARG SOC_VERSION="ascend910_9392"
# Prepare required system dependencies
# Install the compiler toolchain and CLI utilities through apt, remove the apt
# lists and temp files in the same layer, then refresh the Python packaging
# tools (setuptools is pinned).
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        curl \
        g++ \
        gcc \
        git \
        jq \
        libnuma-dev \
        vim \
        wget && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    pip install --upgrade pip packaging setuptools==80.10.2 && \
    pip cache purge
# Prepare repositories with low update frequency
# Detect the CPU architecture, register the PyTorch CPU wheel index on x86_64,
# and fetch the sources of vllm, vllm-ascend, MindSpeed and Megatron-LM.
RUN ARCH=$(uname -m) && \
# Set extra pip index for x86_64 platform
echo "[LOG INFO] Detected architecture: $ARCH" && \
if [ "$ARCH" = "x86_64" ]; then \
pip config set global.extra-index-url "https://download.pytorch.org/whl/cpu/"; \
fi && \
# Clone libs
# vllm: shallow clone of v0.13.0; vllm-ascend: full clone of releases/v0.13.0
git clone --depth 1 --branch v0.13.0 https://github.com/vllm-project/vllm.git && \
git clone -b releases/v0.13.0 https://github.com/vllm-project/vllm-ascend.git && \
# MindSpeed: cloned with full history, then switched to 2.3.0_core_r0.12.1
git clone https://gitcode.com/Ascend/MindSpeed.git && \
cd MindSpeed && git checkout 2.3.0_core_r0.12.1 && cd .. && \
# Megatron-LM: shallow clone of core_v0.12.1
git clone --depth 1 --branch core_v0.12.1 https://github.com/NVIDIA/Megatron-LM.git
# Install repositories with low update frequency
# Set up the Ascend build environment for this RUN and install the pinned,
# rarely changing dependencies (transformers, vllm, vllm-ascend, MindSpeed,
# Megatron-LM, mbridge, torch_npu) in a single layer.
# NOTE(review): the SOC_VERSION build ARG declared after FROM is visible to this
# RUN as an environment variable; presumably the vllm-ascend build reads it - confirm.
RUN ARCH=$(uname -m) && \
# Export and source env
# The LD_LIBRARY_PATH export below only lives for the duration of this RUN shell.
if [ "$ARCH" = "aarch64" ]; then \
export LD_LIBRARY_PATH=/usr/local/Ascend/cann-8.5.0/aarch64-linux/devlib/linux/aarch64:$LD_LIBRARY_PATH; \
elif [ "$ARCH" = "x86_64" ]; then \
export LD_LIBRARY_PATH=/usr/local/Ascend/cann-8.5.0/x86_64-linux/devlib/linux/x86_64/:$LD_LIBRARY_PATH; \
fi && \
# NOTE(review): `source` assumes the RUN shell inherited from the base image is bash-compatible - confirm
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
# Install transformers
pip install transformers==4.57.6 && \
# Install vllm: build requirements first, then an editable install with
# VLLM_TARGET_DEVICE=empty set for the build ("-e." is parsed by pip as "-e .")
cd vllm && pip install -r requirements/build.txt && \
VLLM_TARGET_DEVICE=empty pip install -v -e. && cd .. && \
# Install vllm-ascend: requirements first, then an editable install with
# COMPILE_CUSTOM_KERNELS=1 exported for the build
cd vllm-ascend && pip install -r requirements.txt && \
export COMPILE_CUSTOM_KERNELS=1 && pip install -v -e . && cd .. && \
# Install MindSpeed & Megatron (both editable)
pip install -e MindSpeed && \
pip install -e Megatron-LM && \
# Remove existing triton installed by some third-party packages
pip uninstall -y triton && \
# Install mbridge and pin torch_npu to 2.8.0
pip install mbridge torch_npu==2.8.0 && \
# Clear extra files
rm -rf /tmp/* /var/tmp/* && \
pip cache purge
# Prepare and install verl (update frequently)
# Clone verl together with its submodules (full history, default branch),
# install its NPU requirements file, then install verl in editable mode.
RUN git clone --recursive https://github.com/volcengine/verl.git && \
cd verl && pip install -r requirements-npu.txt && pip install -v -e . && cd .. && \
# Clear extra files
rm -rf /tmp/* /var/tmp/* && \
# Empty pip's download/wheel cache so it is not stored in this layer
pip cache purge
# Show install results
RUN pip list
# Setting Default Commands
CMD ["/bin/bash"]
================================================
FILE: docker/aws/Dockerfile.extention.awsefa
================================================
# Base Image support aws EFA
# Build Image with frameworks based on this
FROM verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2
# For aws instances with EFA net interface (Sagemaker AI Pod)
# install EFA driver:
######## AWS EFA ############
ENV NCCL_VERSION=2.25.1-1
ENV DEBIAN_FRONTEND=noninteractive
ENV EFA_INSTALLER_VERSION=1.40.0
ENV AWS_OFI_NCCL_VERSION=1.14.2
ENV FI_EFA_SET_CUDA_SYNC_MEMOPS=0
ENV FI_PROVIDER=efa
RUN apt update && apt install -y linux-image-generic libhwloc-dev
# Download the EFA installer for EFA_INSTALLER_VERSION into /tmp, unpack it and
# run it with the flags below, then refresh the dynamic-linker cache.
RUN cd /tmp && \
    curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
    tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
    cd aws-efa-installer && \
    ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify && \
    ldconfig && \
    # Step out of the installer directory before deleting it, and also remove
    # the downloaded tarball (previously left behind in /tmp in this layer).
    cd /tmp && \
    rm -rf /tmp/aws-efa-installer \
        /tmp/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
        /var/lib/apt/lists/*
# NCCL EFA Plugin
# Build the aws-ofi-nccl plugin from the release tarball for
# AWS_OFI_NCCL_VERSION, install it under /opt/amazon/efa, and delete the
# build tree afterwards.
RUN cd /tmp && \
    curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
    tar -xzf /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
    rm /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
    mv aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} aws-ofi-nccl && \
    cd /tmp/aws-ofi-nccl && \
    ./autogen.sh && \
    ./configure --prefix=/opt/amazon/efa \
        --with-libfabric=/opt/amazon/efa \
        --with-cuda=/usr/local/cuda \
        --enable-platform-aws \
        --with-mpi=/opt/amazon/openmpi && \
    make -j$(nproc) install && \
    # The cleanup used to target /tmp/aws-ofi/nccl, a path that never exists,
    # so the unpacked sources and build artifacts stayed in the image.
    cd /tmp && \
    rm -rf /tmp/aws-ofi-nccl
# NCCL
# Register /usr/local/lib and the EFA OpenMPI library directory with the
# dynamic linker (appending to files under /etc/ld.so.conf.d) and rebuild its cache.
RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/local.conf && \
echo "/opt/amazon/openmpi/lib" >> /etc/ld.so.conf.d/efa.conf && \
ldconfig
ENV OMPI_MCA_pml=^cm,ucx \
OMPI_MCA_btl=tcp,self \
OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent \
OPAL_PREFIX=/opt/amazon/openmpi \
NCCL_SOCKET_IFNAME=^docker,lo,veth_def_agent \
FI_EFA_USE_HUGE_PAGE=0
# docker build -t verl:awsefa --label "commit=$(git rev-parse --short HEAD)" .
# on aws:
# docker run --ipc=host --privileged --name verldev --gpus all --network=host --shm-size=1800gb -itd verl:awsefa
================================================
FILE: docker/aws/Dockerfile.ngc.vllm0.8.sagemaker
================================================
# Using a pre-built image from AWS DLC which contains the current version of python (3.10) and supported cuda version (12.1)
FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:2.1.0-transformers4.36.0-gpu-py310-cu121-ubuntu20.04
# uninstall nv-pytorch fork
# Remove the torch-related packages listed below from the base image so the
# pinned versions installed later in this file replace them.
RUN pip3 uninstall -y pytorch-quantization \
pytorch-triton torch torch-tensorrt torchvision \
xgboost transformer_engine flash_attn apex megatron-core
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini && \
apt-get clean
# Install torch-2.6.0 + vllm-0.8.2
# Installs the pinned torch/vllm versions together with the training and
# tooling dependencies. Requirements containing shell metacharacters are quoted:
# unquoted, the shell reads ">=4.49.0" / ">=15.0.0" as output redirections (to
# files named "=4.49.0" / "=15.0.0"), which silently drops the version
# constraint, and "ray[default]" is subject to glob expansion.
RUN pip install --no-cache-dir vllm==0.8.2 torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata==0.11.0 \
    "transformers>=4.49.0" accelerate datasets peft hf-transfer \
    "ray[default]" codetiming hydra-core pandas "pyarrow>=15.0.0" pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler \
    pytest pre-commit py-spy pyext ruff tensorboard
# Install flash_attn-2.7.4.post1
# Drop any existing transformer-engine / flash-attn, then install the pinned
# flash-attn release without pip's isolated build environment
# (--no-build-isolation).
RUN pip uninstall -y transformer-engine flash-attn && \
pip install flash-attn==2.7.4.post1 --no-build-isolation
# Fix cv2
# Replace pynvml/nvidia-ml-py with a recent nvidia-ml-py, pin the headless
# OpenCV and FastAPI versions, and upgrade optree.
# The ">=" specifiers are quoted: unquoted, the shell treats ">=12.560.30" and
# ">=0.13.0" as output redirections, so the minimum versions were never passed
# to pip.
RUN pip uninstall -y pynvml nvidia-ml-py && \
    pip install --no-cache-dir "nvidia-ml-py>=12.560.30" opencv-python-headless==4.8.0.74 fastapi==0.115.6 && \
    pip install --no-cache-dir --upgrade "optree>=0.13.0"
# Install verl
RUN pip install --no-cache-dir verl[vllm] -U
# Reset pip config
# Remove the index-url and extra-index-url entries from pip's configuration.
# NOTE(review): nothing in this Dockerfile sets these keys; confirm the base
# image defines them, since unsetting a missing key may make this step fail.
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
================================================
FILE: docker/rocm/Apptainerfile.rocm
================================================
Bootstrap: docker
# Support - Training: fsdp; Inference: vllm
# From: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
# Support - Training: fsdp; Inference: vllm, sglang
# Definition-file header entries use the "Key: value" form, so the base image
# is given with "From:" (a Dockerfile-style "FROM image" line is not a valid
# header keyword).
From: lmsysorg/sglang:v0.4.5-rocm630

%environment
    # ROCm build settings exported into the container's runtime environment.
    export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
    export HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
    export CFLAGS="-D__HIP_PLATFORM_AMD__"
    export CXXFLAGS="-D__HIP_PLATFORM_AMD__"

%post
    # Create source directory
    mkdir -p /opt/src

    # Uninstall and reinstall vllm: build v0.6.3 from source, then delete the
    # checkout once it has been installed.
    pip uninstall -y vllm
    cd /opt/src
    git clone -b v0.6.3 https://github.com/vllm-project/vllm.git
    cd vllm
    MAX_JOBS=$(nproc) python3 setup.py install
    cd /opt
    rm -rf /opt/src/vllm

    # Install dependencies (tensordict is installed without its own dependencies)
    pip install "tensordict<0.6" --no-deps
    pip install accelerate \
        codetiming \
        datasets \
        dill \
        hydra-core \
        liger-kernel \
        numpy \
        pandas \
        peft \
        "pyarrow>=15.0.0" \
        pylatexenc \
        "ray[data,train,tune,serve]" \
        torchdata \
        transformers \
        wandb \
        orjson \
        pybind11

    # Clone and install verl from GitHub (editable, without resolving dependencies)
    cd /opt
    git clone https://github.com/volcengine/verl.git
    cd verl
    # Uncomment to use a specific version
    # git checkout v0.3.0.post0
    pip install -e . --no-deps

    # Install torch_memory_saver
    pip install git+https://github.com/ExtremeViscent/torch_memory_saver.git --no-deps
================================================
FILE: docker/rocm/Dockerfile.rocm
================================================
# FROM "compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-rel-6.4:94_ubuntu22.04_py3.10_pytorch_release-2.7_575e247"
# FROM "rlfoundation.azurecr.io/rocm6.3.4:vllm-0.8.5-numa-patch-ubuntu-22.04"
FROM "rlsys/rocm-6.3.4-patch:rocm6.3.4-numa-patch_ubuntu-22.04"
SHELL ["/bin/bash", "-ceuxo", "pipefail"]
ENV MAX_JOBS=512
ENV PATH="/usr/local/python3.12/bin:$PATH"
# Point /usr/bin/python and /usr/bin/pip at the 3.12 executables, replacing any
# existing links (-f).
RUN ln -sf /usr/bin/python3.12 /usr/bin/python && \
ln -sf /usr/bin/pip3.12 /usr/bin/pip
############################################
############################################
# Refresh the package index and install in the same RUN: a standalone
# "RUN apt-get update" layer can be reused from the build cache, leaving the
# following install to work against a stale index.
RUN apt-get update && \
    apt-get install -y pkg-config liblzma-dev
############################################
############################################
###########################################
##########Install TransformerEngine########
###########################################
WORKDIR /workspace/
# transformer-engine install
# https://github.com/ROCm/TransformerEngine
RUN rm -rf TransformerEngine
RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git
WORKDIR /workspace/TransformerEngine
RUN git checkout 236178e5
# git checkout bb061ade
# git checkout 864405c
ENV NVTE_FRAMEWORK=pytorch
ENV NVTE_ROCM_ARCH=gfx942
ENV NVTE_USE_HIPBLASLT=1
ENV NVTE_USE_ROCM=1
# export CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr:${CMAKE_PREFIX_PATH:-}"
ENV CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr"
# ENV NVTE_BUILD_MAX_JOBS=$(MAX_JOBS)
# Build and install TransformerEngine from the current WORKDIR.
# MAX_JOBS is forwarded with parameter expansion; "$(MAX_JOBS)" is command
# substitution, which tries to execute a program called MAX_JOBS and hands the
# build an empty value instead of the number set by ENV MAX_JOBS above.
RUN MAX_JOBS=${MAX_JOBS} pip install . -vvv
WORKDIR /workspace/
###########################################
###########################################
###########################################
####################################################################################
################Install vllm - sglang require vllm 0.6.7 dependency#################
####################################################################################
#### Require vllm 0.6.7 - checkout 113274a0
# Replace any pre-installed vllm with a source build of ROCm/vllm at commit
# 113274a0.
WORKDIR /workspace/
# Start from a clean state: no leftover checkout, no installed vllm package
RUN rm -rf vllm
RUN pip uninstall -y vllm
# Refer to here (down-grade vllm to 0.6.3): https://docs.vllm.ai/en/v0.6.3/getting_started/amd-installation.html
# NOTE(review): the comments in this section mention both 0.6.3 and 0.6.7;
# which version commit 113274a0 corresponds to is not visible here - confirm.
RUN git clone https://github.com/ROCm/vllm.git
# git clone https://github.com/vllm-project/vllm.git
WORKDIR /workspace/vllm
RUN git checkout 113274a0
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
#ENV MAX_JOBS=512
# Re-assigns MAX_JOBS to its current value (512, set near the top of this
# file), so this line does not change it.
ENV MAX_JOBS=${MAX_JOBS}
# Build prerequisites installed before running setup.py
RUN pip install "boto3>=1.26.0"
RUN pip install setuptools_scm
# will add src into py. You can delete the repo
RUN python3 setup.py install
WORKDIR /workspace/
####################################################################################
####################################################################################
####################################################################################
###########################################
############For hack docker################
###########################################
RUN pip install setuptools==75.8.0
###########################################
###########################################
###########################################
###########################################
############build sglang###################
###########################################
# Set environment variables
ENV BASE_DIR=/sgl-workspace
ENV BUILD_TYPE=all
ENV SGL_REPO=https://github.com/sgl-project/sglang
ENV SGL_BRANCH=v0.4.6.post5
ENV TRITON_REPO=https://github.com/ROCm/triton.git
ENV TRITON_COMMIT=improve_fa_decode_3.0.0
ENV AITER_REPO=https://github.com/ROCm/aiter.git
ENV AITER_COMMIT=v0.1.2
# v0.1.2 version - commit id: 9d11f47
# ENV AITER_COMMIT=9d11f47
ENV HIP_FORCE_DEV_KERNARG=1
ENV HSA_NO_SCRATCH_RECLAIM=1
ENV SGLANG_SET_CPU_AFFINITY=1
ENV SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
ENV NCCL_MIN_NCHANNELS=112
ENV MOE_PADDING=1
ENV VLLM_FP8_PADDING=1
ENV VLLM_FP8_ACT_PADDING=1
ENV VLLM_FP8_WEIGHT_PADDING=1
ENV VLLM_FP8_REDUCE_CONV=1
ENV TORCHINDUCTOR_MAX_AUTOTUNE=1
ENV TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE=1
ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"
ENV AMDGPU_TARGETS=gfx942
ENV ROCM_ARCH=gfx942
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
# Switch to working directory
WORKDIR /sgl-workspace
# Clean and create directory
RUN rm -rf /sgl-workspace && mkdir -p /sgl-workspace
# Clone and build sglang
# Clone sglang, try to check out SGL_BRANCH, build the ROCm sgl-kernel, install
# the Python package with the extras selected by BUILD_TYPE, and keep a copy of
# the checkout at /sglang.
RUN git clone ${SGL_REPO} \
    && cd sglang \
    # The fallback is grouped so it only applies to a failed checkout. Without
    # the braces, "a && b && c || echo ..." also swallowed a failed clone or cd
    # and let the chain continue from the wrong directory.
    && { git checkout ${SGL_BRANCH} || echo "Using default branch"; } \
    && cd sgl-kernel \
    # Swap in the ROCm build configuration before building the kernel package
    && rm -f pyproject.toml \
    && mv pyproject_rocm.toml pyproject.toml \
    && python setup_rocm.py install \
    && cd .. \
    && if [ "$BUILD_TYPE" = "srt" ]; then \
         python -m pip --no-cache-dir install -e "python[srt_hip]"; \
       else \
         python -m pip --no-cache-dir install -e "python[all_hip]"; \
       fi \
    && cd /sgl-workspace \
    && cp -r /sgl-workspace/sglang /sglang \
    && python -m pip cache purge
# Install common Python packages
RUN pip install IPython orjson python-multipart torchao pybind11
# Rebuild Triton
# Remove any installed triton (a failed uninstall is ignored via "|| true"),
# then build and install Triton from TRITON_REPO at TRITON_COMMIT.
RUN pip uninstall -y triton || true \
&& git clone ${TRITON_REPO} \
&& cd triton \
&& git checkout ${TRITON_COMMIT} \
# setup.py is run from the repository's python/ subdirectory
&& cd python \
&& python3 setup.py install \
&& cd /sgl-workspace
# ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942 --amdgpu-lower-module-lds-strategy=1"
# ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"
# Build aiter
#version: Commit 9d11f47
# && git checkout ${AITER_COMMIT} \
# Remove any installed aiter (a failed uninstall is ignored), then build aiter
# from AITER_REPO at AITER_COMMIT with its submodules checked out.
RUN pip uninstall -y aiter || true
RUN git clone ${AITER_REPO} \
&& cd aiter \
&& git checkout ${AITER_COMMIT} \
# Bring the submodules in line with the checked-out revision
&& git submodule sync \
&& git submodule update --init --recursive \
# PREBUILD_KERNELS and GPU_ARCHS are passed to setup.py as environment variables
&& PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py install \
&& cd /sgl-workspace
# && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop \
# && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop \
# Copy MI300X config
# For every regular file whose name contains "MI300X" in the sglang
# quantization and fused-MoE Triton config directories, create a copy whose
# path has the first "MI300X" replaced by "MI300X_VF". The original files are
# left in place.
RUN find /sgl-workspace/sglang/python/sglang/srt/layers/quantization/configs/ \
/sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/ \
-type f -name '*MI300X*' | \
# Each path is handed to the inline sh script as "$1"
xargs -I {} sh -c 'vf_config=$(echo "$1" | sed "s/MI300X/MI300X_VF/"); cp "$1" "$vf_config"' -- {}
# Environment setup complete.
RUN echo "Environment setup complete."
WORKDIR /workspace/
###########################################
###########################################
###########################################
###########################################
###############vllm v0.8.5#################
###########################################
# ENV GITHUB_USERNAME=yushengsu-thu
# ENV GITHUB_MAIL=yushengsu@gmail.com
# RUN git config --global user.name "${GITHUB_USERNAME}" \
# && git config --global user.email "${GITHUB_MAIL}"
WORKDIR /workspace/
ENV VLLM_TARGET_DEVICE=rocm
ENV ROCM_PATH=/opt/rocm
ENV SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev
# Find the repo path in: DockerFile/Dockerfile.rocm_yang
# RUN git clone https://github.com/RLFoundation/vllm-patch.git
# Replace the installed vllm with a source build of RLFoundation/vllm-patch at
# v0.8.5-sleep-numa.
RUN pip uninstall -y vllm || true
RUN rm -rf vllm-patch
RUN git clone https://github.com/RLFoundation/vllm-patch.git \
&& cd vllm-patch \
&& git checkout v0.8.5-sleep-numa \
# Remove artifacts of any previous build
&& rm -rf build/ dist/ *.egg-info \
# Make the ROCm HIP runtime library available under /usr/lib
&& ln -sf /opt/rocm/lib/libamdhip64.so /usr/lib/libamdhip64.so \
# The version, target GPU architectures and job count are passed to setup.py as environment variables
&& SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py install
# RUN SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py develop
WORKDIR /workspace/
###########################################
###########################################
###########################################
#########################################
#### Install megatron-core###############
#########################################
# Replace any installed megatron-core with an editable install of the
# Megatron-LM-amd_version fork (default branch; no revision is pinned).
RUN pip uninstall -y megatron-core && \
git clone https://github.com/yushengsu-thu/Megatron-LM-amd_version.git && \
cd Megatron-LM-amd_version && \
pip install -vvv -e . && \
cd /workspace/
#########################################
#########################################
#########################################
#######################################
################apex###################
#######################################
WORKDIR /workspace/
# Replace any installed apex with a source build of ROCm/apex (default branch;
# no revision is pinned).
RUN pip uninstall -y apex && \
git clone https://github.com/ROCm/apex.git && \
cd apex && \
python setup.py install && \
cd /workspace/
#######################################
#######################################
#######################################
################################################################################
###########################Add torch_memory_saver###############################
################################################################################
# Set environment variables
ENV HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
ENV CFLAGS="-D__HIP_PLATFORM_AMD__"
ENV CXXFLAGS="-D__HIP_PLATFORM_AMD__"
RUN pip install "git+https://github.com/YangWang92/torch_memory_saver_numa.git@numa"
################################################################################
################################################################################
################################################################################
########################################
######Install ray#######################
########################################
# need to add this patch: https://github.com/ray-project/ray/pull/53531/files
RUN pip uninstall ray -y
RUN pip install "ray[data,train,tune,serve]>=2.47.0"
########################################
########################################
########################################
##########################################
#######Install other dependencies#########
##########################################
# tensordict is pinned and installed without pulling its own dependencies; the
# remaining Python packages are installed with normal dependency resolution.
RUN pip install --no-deps "tensordict==0.6.2" && \
    pip install \
        accelerate codetiming datasets dill hydra-core liger-kernel \
        numpy pandas peft "pyarrow>=15.0.0" pylatexenc torchdata \
        wandb orjson pybind11
WORKDIR /workspace/
RUN git clone https://github.com/volcengine/verl.git && \
cd verl && \
pip install -e .
##########################################
##########################################
##########################################
WORKDIR /workspace/
CMD ["/usr/bin/bash"]
================================================
FILE: docker/rocm/Dockerfile.rocm7
================================================
# default base image
ARG REMOTE_VLLM="1"
ARG COMMON_WORKDIR=/app
ARG BASE_IMAGE=rocm/vllm-dev:base
FROM ${BASE_IMAGE} AS base
ARG ARG_PYTORCH_ROCM_ARCH
ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
# Install some basic utilities
# Refresh the apt index and install development libraries and download tools
# in one layer (-q: quieter output, -y: assume yes).
RUN apt-get update -q -y && apt-get install -q -y \
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
apt-transport-https ca-certificates wget curl
# Remove sccache
RUN python3 -m pip install --upgrade pip
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
ARG COMMON_WORKDIR
WORKDIR ${COMMON_WORKDIR}
# -----------------------
# vLLM fetch stages
# Two alternative ways of obtaining the vLLM sources. REMOTE_VLLM (declared
# before the first FROM, default "1") picks the one the fetch_vllm stage derives
# from. The ONBUILD instructions are deferred until the stage is used as a base
# image, i.e. by the final FROM of this group.
# REMOTE_VLLM=0: copy the build context into vllm/
FROM base AS fetch_vllm_0
ONBUILD COPY ./ vllm/
# REMOTE_VLLM=1: clone VLLM_REPO and check out VLLM_BRANCH
FROM base AS fetch_vllm_1
#ARG VLLM_REPO="https://github.com/ROCm/vllm.git"
#ARG VLLM_BRANCH="main"
ARG VLLM_REPO=https://github.com/HollowMan6/vllm.git
ARG VLLM_BRANCH="sleep_amd"
ONBUILD RUN git clone ${VLLM_REPO} \
&& cd vllm \
&& git checkout ${VLLM_BRANCH}
FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
# -----------------------
# vLLM build stages
FROM fetch_vllm AS build_vllm
# Build vLLM
# Install the ROCm requirements, clean previous build output, and build a vLLM
# wheel into vllm/dist.
RUN cd vllm \
&& python3 -m pip install -r requirements/rocm.txt \
&& python3 setup.py clean --all \
# Make the ROCm HIP runtime library available under /usr/lib
&& ln -sf /opt/rocm/lib/libamdhip64.so /usr/lib/libamdhip64.so \
# Build settings are passed as environment variables; the reported package
# version is forced to 0.11.0.dev through SETUPTOOLS_SCM_PRETEND_VERSION
&& VLLM_TARGET_DEVICE=rocm ROCM_PATH=/opt/rocm/ VLLM_GPU_LANG=HIP SETUPTOOLS_SCM_PRETEND_VERSION=0.11.0.dev python3 setup.py bdist_wheel --dist-dir=dist
#&& python3 setup.py bdist_wheel --dist-dir=dist
FROM scratch AS export_vllm
ARG COMMON_WORKDIR
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements /requirements
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
# -----------------------
# Test vLLM image
FROM base AS test
RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
# Install vLLM
#RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
# Copy the wheel(s) and supporting directories from the export stage into
# /install. The wildcard copy targets "/install/" with a trailing slash so the
# destination is always treated as a directory; without it a single matching
# wheel can be written as a file named /install, which breaks the copies into
# /install/... and the "cd /install" that follow.
COPY --from=export_vllm /*.whl /install/
COPY --from=export_vllm /requirements /install/requirements
COPY --from=export_vllm /benchmarks /install/benchmarks
COPY --from=export_vllm /tests /install/tests
COPY --from=export_vllm /examples /install/examples
COPY --from=export_vllm /.buildkite /install/.buildkite
# Install/upgrade the ROCm runtime and test requirements, then replace any
# installed vllm with the wheel(s) found in /install.
RUN cd /install \
&& pip install -U -r requirements/rocm.txt \
&& pip install -U -r requirements/rocm-test.txt \
&& pip uninstall -y vllm \
&& pip install *.whl
WORKDIR /vllm-workspace
ARG COMMON_WORKDIR
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
# install development dependencies (for testing)
# install development dependencies (for testing)
RUN cd /vllm-workspace \
# Delete the vllm/ subdirectory of the copied source tree; the package itself
# was already installed from the wheel in an earlier step of this stage
&& rm -rf vllm \
&& python3 -m pip install -e tests/vllm_test_utils \
&& python3 -m pip install lm-eval[api]==0.4.4 \
&& python3 -m pip install pytest-shard
# -----------------------
# Final vLLM image
FROM base AS final
RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
# Manually remove it so that later steps of numpy upgrade can continue
# Only when python3 resolves to a path inside the /opt/conda/envs/py_3.9
# environment: delete the numpy-1.20.3.dist-info directory. For any other
# interpreter location nothing is done.
RUN case "$(which python3)" in \
*"/opt/conda/envs/py_3.9"*) \
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
*) ;; esac
RUN python3 -m pip install --upgrade huggingface-hub[cli]
# Install vLLM
# The root of the export_vllm stage is bind-mounted at /install for the
# duration of this RUN only, so the wheel and requirements files are not copied
# into the image.
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
cd /install \
&& pip install -U -r requirements/rocm.txt \
# Replace any vllm already present with the wheel(s) from the mount
&& pip uninstall -y vllm \
&& pip install *.whl
ARG COMMON_WORKDIR
# Copy over the benchmark scripts as well
COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
ENV TOKENIZERS_PARALLELISM=false
# ENV that can improve safe tensor loading, and end-to-end time
ENV SAFETENSORS_FAST_GPU=1
# Performance environment variable.
ENV HIP_FORCE_DEV_KERNARG=1
# -----------------------
# Install verl
ARG VERL_REPO=https://github.com/volcengine/verl.git
ARG VERL_BRANCH=main
# tensordict is pinned and installed without pulling its own dependencies; the
# remaining Python packages are installed with normal dependency resolution.
RUN pip install --no-deps "tensordict==0.6.2" && \
    pip install \
        accelerate codetiming datasets dill hydra-core liger-kernel \
        numpy pandas peft "pyarrow>=15.0.0" pylatexenc torchdata \
        wandb orjson pybind11
WORKDIR /workspace/
# Clone verl from VERL_REPO, switch to VERL_BRANCH (both are build ARGs with
# defaults declared above), and install it in editable mode.
RUN git clone ${VERL_REPO} && \
cd verl && \
git checkout ${VERL_BRANCH} && \
pip install -e .
CMD ["/bin/bash"]
================================================
FILE: docker/rocm/Dockerfile.rocm_verl-0.3.0.post1
================================================
# Build the docker in the repo dir:
# docker build -f docker/Dockerfile.rocm -t verl-rocm:03.04.2015 .
# docker images # you can find your built docker
# Support - Training: fsdp; Inference: vllm
# FROM rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
# Support - Training: fsdp; Inference: vllm, sglang
FROM lmsysorg/sglang:v0.4.6.post5-rocm630
# Set working directory
# WORKDIR $PWD/app
# Set environment variables
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
ENV HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
ENV CFLAGS="-D__HIP_PLATFORM_AMD__"
ENV CXXFLAGS="-D__HIP_PLATFORM_AMD__"
# Install vllm
# Replace the installed vllm with a source build of tag v0.6.3, using one
# build job per available CPU, and delete the checkout afterwards.
RUN pip uninstall -y vllm && \
rm -rf vllm && \
git clone -b v0.6.3 https://github.com/vllm-project/vllm.git && \
cd vllm && \
MAX_JOBS=$(nproc) python3 setup.py install && \
cd .. && \
rm -rf vllm
# Copy the entire project directory
COPY . .
# Install dependencies
# tensordict is pinned and installed without pulling its own dependencies; the
# remaining Python packages (ray capped below 2.45.0) are installed with normal
# dependency resolution.
RUN pip install --no-deps "tensordict==0.6.2" && \
    pip install \
        accelerate codetiming datasets dill hydra-core liger-kernel \
        numpy pandas peft "pyarrow>=15.0.0" pylatexenc \
        "ray[data,train,tune,serve]<2.45.0" \
        torchdata transformers wandb orjson pybind11
RUN git clone https://github.com/volcengine/verl.git && \
cd verl && \
pip install -e .
# Install torch_memory_saver
RUN pip install git+https://github.com/ExtremeViscent/torch_memory_saver.git --no-deps
================================================
FILE: docker/rocm/Dockerfile.rocm_verl-0.4.1
================================================
# FROM "compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-rel-6.4:94_ubuntu22.04_py3.10_pytorch_release-2.7_575e247"
# FROM "rlfoundation.azurecr.io/rocm6.3.4:vllm-0.8.5-numa-patch-ubuntu-22.04"
FROM "rlsys/rocm-6.3.4-patch:rocm6.3.4-numa-patch_ubuntu-22.04"
SHELL ["/bin/bash", "-ceuxo", "pipefail"]
ENV MAX_JOBS=512
ENV PATH="/usr/local/python3.12/bin:$PATH"
RUN ln -sf /usr/bin/python3.12 /usr/bin/python && \
ln -sf /usr/bin/pip3.12 /usr/bin/pip
############################################
############################################
# Refresh the package index and install in the same RUN: a standalone
# "RUN apt-get update" layer can be reused from the build cache, leaving the
# following install to work against a stale index.
RUN apt-get update && \
    apt-get install -y pkg-config liblzma-dev
############################################
############################################
###########################################
##########Install TransformerEngine########
###########################################
WORKDIR /workspace/
# transformer-engine install
# https://github.com/ROCm/TransformerEngine
RUN rm -rf TransformerEngine
RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git
WORKDIR /workspace/TransformerEngine
RUN git checkout 236178e5
# git checkout bb061ade
# git checkout 864405c
ENV NVTE_FRAMEWORK=pytorch
ENV NVTE_ROCM_ARCH=gfx942
ENV NVTE_USE_HIPBLASLT=1
ENV NVTE_USE_ROCM=1
# export CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr:${CMAKE_PREFIX_PATH:-}"
ENV CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr"
# ENV NVTE_BUILD_MAX_JOBS=$(MAX_JOBS)
# Build and install TransformerEngine from the current WORKDIR.
# MAX_JOBS is forwarded with parameter expansion; "$(MAX_JOBS)" is command
# substitution, which tries to execute a program called MAX_JOBS and hands the
# build an empty value instead of the number set by ENV MAX_JOBS above.
RUN MAX_JOBS=${MAX_JOBS} pip install . -vvv
WORKDIR /workspace/
###########################################
###########################################
###########################################
####################################################################################
################Install vllm - sglang require vllm 0.6.7 dependency#################
####################################################################################
#### Require vllm 0.6.7 - checkout 113274a0
# Replace any pre-installed vllm with a source build of ROCm/vllm at commit
# 113274a0.
WORKDIR /workspace/
# Start from a clean state: no leftover checkout, no installed vllm package
RUN rm -rf vllm
RUN pip uninstall -y vllm
# Refer to here (down-grade vllm to 0.6.3): https://docs.vllm.ai/en/v0.6.3/getting_started/amd-installation.html
# NOTE(review): the comments in this section mention both 0.6.3 and 0.6.7;
# which version commit 113274a0 corresponds to is not visible here - confirm.
RUN git clone https://github.com/ROCm/vllm.git
# git clone https://github.com/vllm-project/vllm.git
WORKDIR /workspace/vllm
RUN git checkout 113274a0
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
#ENV MAX_JOBS=512
# Re-assigns MAX_JOBS to its current value (512, set near the top of this
# file), so this line does not change it.
ENV MAX_JOBS=${MAX_JOBS}
# Build prerequisites installed before running setup.py
RUN pip install "boto3>=1.26.0"
RUN pip install setuptools_scm
# will add src into py. You can delete the repo
RUN python3 setup.py install
WORKDIR /workspace/
####################################################################################
####################################################################################
####################################################################################
###########################################
############For hack docker################
###########################################
RUN pip install setuptools==75.8.0
###########################################
###########################################
###########################################
###########################################
############build sglang###################
###########################################
# Set environment variables
ENV BASE_DIR=/sgl-workspace
ENV BUILD_TYPE=all
ENV SGL_REPO=https://github.com/sgl-project/sglang
ENV SGL_BRANCH=v0.4.6.post5
ENV TRITON_REPO=https://github.com/ROCm/triton.git
ENV TRITON_COMMIT=improve_fa_decode_3.0.0
ENV AITER_REPO=https://github.com/ROCm/aiter.git
ENV AITER_COMMIT=v0.1.2
# v0.1.2 version - commit id: 9d11f47
# ENV AITER_COMMIT=9d11f47
ENV HIP_FORCE_DEV_KERNARG=1
ENV HSA_NO_SCRATCH_RECLAIM=1
ENV SGLANG_SET_CPU_AFFINITY=1
ENV SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
ENV NCCL_MIN_NCHANNELS=112
ENV MOE_PADDING=1
ENV VLLM_FP8_PADDING=1
ENV VLLM_FP8_ACT_PADDING=1
ENV VLLM_FP8_WEIGHT_PADDING=1
ENV VLLM_FP8_REDUCE_CONV=1
ENV TORCHINDUCTOR_MAX_AUTOTUNE=1
ENV TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE=1
ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"
ENV AMDGPU_TARGETS=gfx942
ENV ROCM_ARCH=gfx942
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
# Switch to working directory
WORKDIR /sgl-workspace
# Clean and create directory
RUN rm -rf /sgl-workspace && mkdir -p /sgl-workspace
# Clone and build sglang
# Clone sglang, build the ROCm sgl-kernel extension, then install the Python
# package in editable mode with the HIP extras selected by BUILD_TYPE.
# The "Using default branch" fallback is grouped so that it applies ONLY to a
# failed `git checkout`; ungrouped, a failed clone or cd would also fall
# through to the echo and the error would surface later at `cd sgl-kernel`.
RUN git clone ${SGL_REPO} \
    && cd sglang \
    && (git checkout ${SGL_BRANCH} || echo "Using default branch") \
    && cd sgl-kernel \
    && rm -f pyproject.toml \
    && mv pyproject_rocm.toml pyproject.toml \
    && python setup_rocm.py install \
    && cd .. \
    && if [ "$BUILD_TYPE" = "srt" ]; then \
         python -m pip --no-cache-dir install -e "python[srt_hip]"; \
       else \
         python -m pip --no-cache-dir install -e "python[all_hip]"; \
       fi \
    && cd /sgl-workspace \
    && cp -r /sgl-workspace/sglang /sglang \
    && python -m pip cache purge
# Install common Python packages
RUN pip install IPython orjson python-multipart torchao pybind11
# Rebuild Triton
RUN pip uninstall -y triton || true \
&& git clone ${TRITON_REPO} \
&& cd triton \
&& git checkout ${TRITON_COMMIT} \
&& cd python \
&& python3 setup.py install \
&& cd /sgl-workspace
# ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942 --amdgpu-lower-module-lds-strategy=1"
# ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"
# Build aiter
#version: Commit 9d11f47
# && git checkout ${AITER_COMMIT} \
RUN pip uninstall -y aiter || true
RUN git clone ${AITER_REPO} \
&& cd aiter \
&& git checkout ${AITER_COMMIT} \
&& git submodule sync \
&& git submodule update --init --recursive \
&& PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py install \
&& cd /sgl-workspace
# && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop \
# && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop \
# Copy MI300X config: duplicate every MI300X tuning config as an MI300X_VF variant.
# `find` lists regular files whose name contains "MI300X" under the quantization
# and fused-MoE Triton config directories. For each path, `sed` rewrites the
# first "MI300X" occurrence to "MI300X_VF" and `cp` writes the copy next to the
# original; the originals are kept.
# NOTE(review): "VF" presumably refers to virtual-function (SR-IOV) devices — confirm.
RUN find /sgl-workspace/sglang/python/sglang/srt/layers/quantization/configs/ \
/sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/ \
-type f -name '*MI300X*' | \
xargs -I {} sh -c 'vf_config=$(echo "$1" | sed "s/MI300X/MI300X_VF/"); cp "$1" "$vf_config"' -- {}
# Environment setup complete.
RUN echo "Environment setup complete."
WORKDIR /workspace/
###########################################
###########################################
###########################################
###########################################
###############vllm v0.8.5#################
###########################################
# ENV GITHUB_USERNAME=yushengsu-thu
# ENV GITHUB_MAIL=yushengsu@gmail.com
# RUN git config --global user.name "${GITHUB_USERNAME}" \
# && git config --global user.email "${GITHUB_MAIL}"
WORKDIR /workspace/
ENV VLLM_TARGET_DEVICE=rocm
ENV ROCM_PATH=/opt/rocm
ENV SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev
# Find the repo path in: DockerFile/Dockerfile.rocm_yang
# RUN git clone https://github.com/RLFoundation/vllm-patch.git
# Remove the vllm that was built earlier in this file; `|| true` keeps the step
# green when no vllm is installed.
RUN pip uninstall -y vllm || true
# Drop any leftover checkout so the clone below starts clean.
RUN rm -rf vllm-patch
# Clone the patched vllm, check out v0.8.5-sleep-numa, clear stale build
# artefacts, expose libamdhip64.so under /usr/lib, and install from source.
# The version, target GPU architectures and build parallelism are passed inline
# on the final command.
RUN git clone https://github.com/RLFoundation/vllm-patch.git \
&& cd vllm-patch \
&& git checkout v0.8.5-sleep-numa \
&& rm -rf build/ dist/ *.egg-info \
&& ln -sf /opt/rocm/lib/libamdhip64.so /usr/lib/libamdhip64.so \
&& SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py install
# RUN SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py develop
WORKDIR /workspace/
###########################################
###########################################
###########################################
#########################################
#### Install megatron-core###############
#########################################
# Swap any preinstalled megatron-core for the AMD fork, installed in editable
# mode straight from its checkout. No trailing `cd` is needed: a RUN step's
# working directory does not carry over to the next instruction.
RUN pip uninstall -y megatron-core \
    && git clone https://github.com/yushengsu-thu/Megatron-LM-amd_version.git \
    && cd Megatron-LM-amd_version \
    && pip install -vvv -e .
#########################################
#########################################
#########################################
#######################################
################apex###################
#######################################
WORKDIR /workspace/
# Replace any preinstalled apex with a source build of the ROCm fork. No
# trailing `cd` is needed: a RUN step's working directory does not carry over
# to the next instruction.
RUN pip uninstall -y apex \
    && git clone https://github.com/ROCm/apex.git \
    && cd apex \
    && python setup.py install
#######################################
#######################################
#######################################
################################################################################
###########################Add torch_memory_saver###############################
################################################################################
# Set environment variables
ENV HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
ENV CFLAGS="-D__HIP_PLATFORM_AMD__"
ENV CXXFLAGS="-D__HIP_PLATFORM_AMD__"
RUN pip install "git+https://github.com/YangWang92/torch_memory_saver_numa.git@numa"
################################################################################
################################################################################
################################################################################
########################################
######Install ray#######################
########################################
# need to add this patch: https://github.com/ray-project/ray/pull/53531/files
RUN pip uninstall ray -y
RUN pip install "ray[data,train,tune,serve]>=2.47.0"
########################################
########################################
########################################
##########################################
#######Install other dependencies#########
##########################################
RUN pip install "tensordict==0.6.2" --no-deps && \
pip install accelerate \
codetiming \
datasets \
dill \
hydra-core \
liger-kernel \
numpy \
pandas \
peft \
"pyarrow>=15.0.0" \
pylatexenc \
torchdata \
wandb \
orjson \
pybind11
WORKDIR /workspace/
RUN git clone https://github.com/volcengine/verl.git && \
cd verl && \
pip install -e .
##########################################
##########################################
##########################################
WORKDIR /workspace/
# Default command: start an interactive bash shell.
# (Only one CMD takes effect per image; the duplicate instruction was removed.)
CMD ["/usr/bin/bash"]
================================================
FILE: docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.sglang.vllm.mcore0.12
================================================
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install sglang-0.4.6.post5 and torch-memory-saver
RUN pip install --resume-retries 999 "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir
# Some sglang operations in 0.4.6.post5 require vllm
# [Warning] vllm can have some packages not compatible with sglang, for example, flashinfer
RUN pip install --resume-retries 999 --no-cache-dir vllm==0.8.5.post1
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
# Fix for transformers 4.53.0
RUN pip3 install --no-cache-dir "transformers[hf_xet]<4.52.0"
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
================================================
FILE: docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.sglang.vllm.mcore0.12.deepep
================================================
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install sglang-0.4.6.post5 and torch-memory-saver
RUN pip install --resume-retries 999 "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir
# Some sglang operations in 0.4.6.post5 require vllm
# [Warning] vllm can have some packages not compatible with sglang, for example, flashinfer
RUN pip install --resume-retries 999 --no-cache-dir vllm==0.8.5.post1
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
# Fix for transformers 4.53.0
RUN pip3 install --no-cache-dir "transformers[hf_xet]<4.52.0"
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
# Install DeepEP
## IBGDA dependency: expose the versioned libmlx5.so.1 under the unversioned
## name libmlx5.so (presumably so that linking against -lmlx5 resolves — confirm)
RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
## Fetch sources: gdrcopy at tag v2.3.1 and DeepEP pinned to commit a84a248.
## gdrcopy is only cloned here; no build step for it is visible in this file.
RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
git clone https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && git checkout a84a248
# Prepare nvshmem: download the 3.2.5-1 source tarball, unpack it as
# deepep-nvshmem, and apply the nvshmem patch shipped in DeepEP/third-party
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch
ENV CUDA_HOME=/usr/local/cuda
### Set MPI environment variables. Having errors when not set.
ENV CPATH=/usr/local/mpi/include:$CPATH
ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
# NOTE(review): /usr/local/x86_64-linux-gnu looks unusual (libmlx5 is linked under
# /usr/lib/x86_64-linux-gnu above) — confirm this path is intended
ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH
# NOTE(review): assumes the gdrcopy clone above ran in /workspace — confirm the base image's WORKDIR
ENV GDRCOPY_HOME=/workspace/gdrcopy
## Build deepep-nvshmem with IBGDA and GDRCopy support enabled and SHMEM, UCX,
## NCCL, MPI and PMIx support disabled; install into /workspace/deepep-nvshmem/install
RUN cd deepep-nvshmem && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install
## Add the NVSHMEM install prefix to the loader path and PATH.
## NVSHMEM_DIR is presumably read by DeepEP's setup.py — confirm.
ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install
ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH
ENV PATH=$NVSHMEM_DIR/bin:$PATH
## Build deepep
RUN cd DeepEP && \
python setup.py install
================================================
FILE: docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.sglang.vllm.mcore0.13.preview
================================================
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install sglang-0.4.6.post5 and torch-memory-saver
RUN pip install --resume-retries 999 "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir
# Some sglang operations in 0.4.6.post5 require vllm
# [Warning] vllm can have some packages not compatible with sglang, for example, flashinfer
RUN pip install --resume-retries 999 --no-cache-dir vllm==0.8.5.post1
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@release_v2.5
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_r0.13.0
# Fix for transformers 4.53.0
RUN pip3 install --no-cache-dir "transformers[hf_xet]<4.52.0"
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
# Install DeepEP
## IBGDA dependency: expose the versioned libmlx5.so.1 under the unversioned
## name libmlx5.so (presumably so that linking against -lmlx5 resolves — confirm)
RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
## Fetch sources: gdrcopy at tag v2.3.1 and DeepEP pinned to commit a84a248.
## gdrcopy is only cloned here; no build step for it is visible in this file.
RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
git clone https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && git checkout a84a248
# Prepare nvshmem: download the 3.2.5-1 source tarball, unpack it as
# deepep-nvshmem, and apply the nvshmem patch shipped in DeepEP/third-party
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch
ENV CUDA_HOME=/usr/local/cuda
### Set MPI environment variables. Having errors when not set.
ENV CPATH=/usr/local/mpi/include:$CPATH
ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
# NOTE(review): /usr/local/x86_64-linux-gnu looks unusual (libmlx5 is linked under
# /usr/lib/x86_64-linux-gnu above) — confirm this path is intended
ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH
# NOTE(review): assumes the gdrcopy clone above ran in /workspace — confirm the base image's WORKDIR
ENV GDRCOPY_HOME=/workspace/gdrcopy
## Build deepep-nvshmem with IBGDA and GDRCopy support enabled and SHMEM, UCX,
## NCCL, MPI and PMIx support disabled; install into /workspace/deepep-nvshmem/install
RUN cd deepep-nvshmem && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install
## Add the NVSHMEM install prefix to the loader path and PATH.
## NVSHMEM_DIR is presumably read by DeepEP's setup.py — confirm.
ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install
ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH
ENV PATH=$NVSHMEM_DIR/bin:$PATH
## Build deepep
RUN cd DeepEP && \
python setup.py install
================================================
FILE: docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.vllm.mcore0.12
================================================
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install torch-2.6.0+cu124 + vllm-0.8.5.post1
# torch-2.6.0+cu124: cxx11abi=False
# torch-2.6.0+cu126: cxx11abi=True
# see https://github.com/flashinfer-ai/flashinfer/issues/911
RUN pip install --resume-retries 999 --no-cache-dir vllm==0.8.5.post1
# Install flashinfer-0.2.2.post1+cu124torch2.6 (cxx11abi=False, matching torch-2.6.0+cu124)
# vllm-0.8.3 does not support flashinfer>=0.2.3
# see https://github.com/vllm-project/vllm/pull/15777
RUN aria2c --max-tries=9999 https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.2.post1/flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
pip install --no-cache-dir flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
rm flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
# Fix for transformers 4.53.0
RUN pip3 install --no-cache-dir "transformers[hf_xet]<4.52.0"
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
================================================
FILE: docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.vllm.mcore0.12.deepep
================================================
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install torch-2.6.0+cu124 + vllm-0.8.5.post1
# torch-2.6.0+cu124: cxx11abi=False
# torch-2.6.0+cu126: cxx11abi=True
# see https://github.com/flashinfer-ai/flashinfer/issues/911
RUN pip install --resume-retries 999 --no-cache-dir vllm==0.8.5.post1
# Install flashinfer-0.2.2.post1+cu124torch2.6 (cxx11abi=False, matching torch-2.6.0+cu124)
# vllm-0.8.3 does not support flashinfer>=0.2.3
# see https://github.com/vllm-project/vllm/pull/15777
RUN aria2c --max-tries=9999 https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.2.post1/flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
pip install --no-cache-dir flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
rm flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
# Fix for transformers 4.53.0
RUN pip3 install --no-cache-dir "transformers[hf_xet]<4.52.0"
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
# Install DeepEP
## IBGDA dependency: expose the versioned libmlx5.so.1 under the unversioned
## name libmlx5.so (presumably so that linking against -lmlx5 resolves — confirm)
RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
## Fetch sources: gdrcopy at tag v2.3.1 and DeepEP pinned to commit a84a248.
## gdrcopy is only cloned here; no build step for it is visible in this file.
RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
git clone https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && git checkout a84a248
# Prepare nvshmem: download the 3.2.5-1 source tarball, unpack it as
# deepep-nvshmem, and apply the nvshmem patch shipped in DeepEP/third-party
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch
ENV CUDA_HOME=/usr/local/cuda
### Set MPI environment variables. Having errors when not set.
ENV CPATH=/usr/local/mpi/include:$CPATH
ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
# NOTE(review): /usr/local/x86_64-linux-gnu looks unusual (libmlx5 is linked under
# /usr/lib/x86_64-linux-gnu above) — confirm this path is intended
ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH
# NOTE(review): assumes the gdrcopy clone above ran in /workspace — confirm the base image's WORKDIR
ENV GDRCOPY_HOME=/workspace/gdrcopy
## Build deepep-nvshmem with IBGDA and GDRCopy support enabled and SHMEM, UCX,
## NCCL, MPI and PMIx support disabled; install into /workspace/deepep-nvshmem/install
RUN cd deepep-nvshmem && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install
## Add the NVSHMEM install prefix to the loader path and PATH.
## NVSHMEM_DIR is presumably read by DeepEP's setup.py — confirm.
ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install
ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH
ENV PATH=$NVSHMEM_DIR/bin:$PATH
## Build deepep
RUN cd DeepEP && \
python setup.py install
================================================
FILE: docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.vllm.mcore0.13.preview
================================================
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install torch-2.6.0+cu124 + vllm-0.8.5.post1
# torch-2.6.0+cu124: cxx11abi=False
# torch-2.6.0+cu126: cxx11abi=True
# see https://github.com/flashinfer-ai/flashinfer/issues/911
RUN pip install --resume-retries 999 --no-cache-dir vllm==0.8.5.post1
# Install flashinfer-0.2.2.post1+cu124torch2.6 (cxx11abi=False, matching torch-2.6.0+cu124)
# vllm-0.8.3 does not support flashinfer>=0.2.3
# see https://github.com/vllm-project/vllm/pull/15777
RUN aria2c --max-tries=9999 https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.2.post1/flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
pip install --no-cache-dir flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
rm flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@release_v2.5
# Install Megatron-LM from the core_r0.13.0 release branch. This file builds the
# mcore0.13 preview image, so it must not install the 0.12.2 tag used by the
# non-preview images.
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_r0.13.0
# Fix for transformers 4.53.0: cap transformers below 4.52.0, the same pin the
# other app images built on this base apply after installing Megatron-LM.
RUN pip3 install --no-cache-dir "transformers[hf_xet]<4.52.0"
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
# Install DeepEP
## IBGDA dependency: expose the versioned libmlx5.so.1 under the unversioned
## name libmlx5.so (presumably so that linking against -lmlx5 resolves — confirm)
RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
## Fetch sources: gdrcopy at tag v2.3.1 and DeepEP pinned to commit a84a248.
## gdrcopy is only cloned here; no build step for it is visible in this file.
RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
git clone https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && git checkout a84a248
# Prepare nvshmem: download the 3.2.5-1 source tarball, unpack it as
# deepep-nvshmem, and apply the nvshmem patch shipped in DeepEP/third-party
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch
ENV CUDA_HOME=/usr/local/cuda
### Set MPI environment variables. Having errors when not set.
ENV CPATH=/usr/local/mpi/include:$CPATH
ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
# NOTE(review): /usr/local/x86_64-linux-gnu looks unusual (libmlx5 is linked under
# /usr/lib/x86_64-linux-gnu above) — confirm this path is intended
ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH
# NOTE(review): assumes the gdrcopy clone above ran in /workspace — confirm the base image's WORKDIR
ENV GDRCOPY_HOME=/workspace/gdrcopy
## Build deepep-nvshmem with IBGDA and GDRCopy support enabled and SHMEM, UCX,
## NCCL, MPI and PMIx support disabled; install into /workspace/deepep-nvshmem/install
RUN cd deepep-nvshmem && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install
## Add the NVSHMEM install prefix to the loader path and PATH.
## NVSHMEM_DIR is presumably read by DeepEP's setup.py — confirm.
ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install
ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH
ENV PATH=$NVSHMEM_DIR/bin:$PATH
## Build deepep
RUN cd DeepEP && \
python setup.py install
================================================
FILE: docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.base
================================================
# Base Docker Image of verl, with CUDA/cuDNN/Torch/FlashAttn/Apex, without other frameworks
# (TransformerEngine and Megatron-LM are installed by the Dockerfile.app.* images)
# Target: verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=16
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini aria2 && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
# Reinstall CUDA 12.4
# Step 1: install NVIDIA's apt pin file as /etc/apt/preferences.d/cuda-repository-pin-600.
RUN aria2c https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
# Step 2: download the CUDA 12.4.1 local-repo .deb (with resume and retries),
# register it and its keyring, install cuda-toolkit-12-4 from it, delete the
# downloaded .deb, make /usr/local/cuda-12.4 the selected `cuda` alternative,
# and remove the /usr/local/cuda-12.6 directory.
RUN aria2c --always-resume=true --max-tries=99999 https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
dpkg -i cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
cp /var/cuda-repo-ubuntu2204-12-4-local/cuda-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cuda-toolkit-12-4 && \
rm cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
update-alternatives --set cuda /usr/local/cuda-12.4 && \
rm -rf /usr/local/cuda-12.6
RUN pip install --resume-retries 999 --no-cache-dir torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0
RUN pip install --resume-retries 999 --no-cache-dir "tensordict==0.6.2" torchdata "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
# Install flash-attn-2.7.4.post1 (cxx11abi=False) from the prebuilt cp310 wheel.
# The downloaded wheel is deleted in the same layer so it does not stay in the
# image, matching how the CUDA, cuDNN and Nsight installers are cleaned up.
RUN wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
    pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
    rm flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# Fix packages
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install cudnn
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cudnn-cuda-12 && \
rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb
# Install Apex
RUN git clone https://github.com/NVIDIA/apex.git && \
cd apex && \
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
# Profiling tools
RUN aria2c --always-resume=true --max-tries=99999 https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
apt-get update && apt-get install -y libxcb-cursor0 && \
dpkg -i ./nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
rm -rf /usr/local/cuda/bin/nsys && \
ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys /usr/local/cuda/bin/nsys && \
rm -rf /usr/local/cuda/bin/nsys-ui && \
ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys-ui /usr/local/cuda/bin/nsys-ui && \
rm nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb
# Fix opencv
RUN pip install --resume-retries 999 --no-cache-dir opencv-python
RUN pip install --resume-retries 999 --no-cache-dir opencv-fixer && \
python -c "from opencv_fixer import AutoFix; AutoFix()"
RUN pip install --resume-retries 999 --no-cache-dir cuda-bindings
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
RUN apt-get update && \
apt-get install -y libfreeimage3 libfreeimage-dev zlib1g htop
================================================
FILE: docker/verl0.4-cu124-torch2.6-fa2.7.4/README.md
================================================
# verl image with verl v0.4.x
## Important packages version
```txt
cuda==12.4
cudnn==9.8.0
torch==2.6.0
flash_attn==2.7.4.post1
sglang==0.4.6.post5
vllm==0.8.5.post1
nvidia-cudnn-cu12==9.8.0.87
transformer_engine==2.3
megatron.core==core_v0.12.2
# Preview
transformer_engine==2.5
megatron.core==core_r0.13.0
```
## Target
- Base image:
- `verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4`
- App image:
- `verlai/verl:app-verl0.4-sglang0.4.6.post5-vllm0.8.5-mcore0.12.2-te2.2`: SGLang 0.4.6.post5 requires vLLM, so this image ships both; note that vLLM can have some package conflicts with SGLang
- `verlai/verl:app-verl0.4-sglang0.4.6.post5-vllm0.8.5-mcore0.12.2-te2.2-deepep`: Built with deepep
- `verlai/verl:app-verl0.4-vllm0.8.5-mcore0.12.2-te2.2`
- `verlai/verl:app-verl0.4-vllm0.8.5-mcore0.12.2-te2.2-deepep`: Built with deepep
- Preview image:
- `verlai/verl:app-verl0.4-sglang0.4.6.post5-vllm0.8.5-mcore0.13.0-te2.2-preview`
- `verlai/verl:app-verl0.4-vllm0.8.5-mcore0.13.0-te2.2-preview`
================================================
FILE: docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.10.post2.mcore0.13
================================================
# verl application image: SGLang 0.4.10.post2 + TransformerEngine v2.2.1 + Megatron-LM core_v0.13.0.
# Layered on top of the verl base image (see Dockerfile.base).
FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4

# Environment shared by the build steps below and by the resulting container
ENV MAX_JOBS=8 \
    VLLM_WORKER_MULTIPROC_METHOD=spawn \
    DEBIAN_FRONTEND=noninteractive \
    NODE_OPTIONS="" \
    PIP_ROOT_USER_ACTION=ignore \
    HF_HUB_ENABLE_HF_TRANSFER="1"

# SGLang stack: refresh the packaging tools, then FlashInfer, then SGLang itself
RUN pip install --upgrade \
        pip \
        setuptools \
        packaging
RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation \
        flashinfer-python==0.2.9rc1
RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation \
        "sglang[all]==0.4.10.post2"

# Pin / repair the Python dependencies used alongside SGLang
RUN pip install --no-cache-dir \
        "tensordict==0.6.2" \
        "transformers[hf_xet]==4.55.4" \
        accelerate \
        datasets \
        peft \
        hf-transfer \
        "numpy<2.0.0" \
        "pyarrow>=19.0.1" \
        pandas \
        ray[default] \
        codetiming \
        hydra-core \
        pylatexenc \
        qwen-vl-utils \
        wandb \
        dill \
        pybind11 \
        liger-kernel \
        mathruler \
        blobfile \
        xgrammar \
        pytest \
        py-spy \
        pyext \
        pre-commit \
        ruff

# Replace pynvml with nvidia-ml-py and raise the minimum versions of a few shared libraries
RUN pip uninstall -y pynvml nvidia-ml-py && \
    pip install --resume-retries 999 --no-cache-dir --upgrade \
        "nvidia-ml-py>=12.560.30" \
        "fastapi[standard]>=0.115.0" \
        "optree>=0.13.0" \
        "pydantic>=2.9" \
        "grpcio>=1.62.1"

# cuDNN Python wheel
RUN pip install --resume-retries 999 --no-cache-dir \
        nvidia-cudnn-cu12==9.8.0.87

# TransformerEngine v2.2.1, built from source for the PyTorch framework only
RUN export NVTE_FRAMEWORK=pytorch && \
    pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation \
        git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1

# Megatron-LM core_v0.13.0 (installed without dependency resolution)
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation \
        git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.13.0

# mbridge
RUN pip3 install --no-cache-dir \
        mbridge
================================================
FILE: docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.post6.mcore0.13
================================================
# verl application image: SGLang 0.4.9.post6 + TransformerEngine v2.2.1 + Megatron-LM core_v0.13.0.
# Layered on top of the verl base image (see Dockerfile.base).
FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4

# Environment shared by the build steps below and by the resulting container
ENV MAX_JOBS=8 \
    VLLM_WORKER_MULTIPROC_METHOD=spawn \
    DEBIAN_FRONTEND=noninteractive \
    NODE_OPTIONS="" \
    PIP_ROOT_USER_ACTION=ignore \
    HF_HUB_ENABLE_HF_TRANSFER="1"

# SGLang 0.4.9.post6 stack: refresh the packaging tools, then FlashInfer, then SGLang itself
RUN pip install --upgrade \
        pip \
        setuptools \
        packaging
RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation \
        flashinfer-python==0.2.9rc1
RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation \
        "sglang[all]==0.4.9.post6"

# Pin / repair the Python dependencies used alongside SGLang
RUN pip install --no-cache-dir \
        "tensordict==0.6.2" \
        "transformers[hf_xet]==4.55.4" \
        accelerate \
        datasets \
        peft \
        hf-transfer \
        "numpy<2.0.0" \
        "pyarrow>=19.0.1" \
        pandas \
        ray[default] \
        codetiming \
        hydra-core \
        pylatexenc \
        qwen-vl-utils \
        wandb \
        dill \
        pybind11 \
        liger-kernel \
        mathruler \
        blobfile \
        xgrammar \
        pytest \
        py-spy \
        pyext \
        pre-commit \
        ruff

# Replace pynvml with nvidia-ml-py and raise the minimum versions of a few shared libraries
RUN pip uninstall -y pynvml nvidia-ml-py && \
    pip install --resume-retries 999 --no-cache-dir --upgrade \
        "nvidia-ml-py>=12.560.30" \
        "fastapi[standard]>=0.115.0" \
        "optree>=0.13.0" \
        "pydantic>=2.9" \
        "grpcio>=1.62.1"

# cuDNN Python wheel
RUN pip install --resume-retries 999 --no-cache-dir \
        nvidia-cudnn-cu12==9.8.0.87

# TransformerEngine v2.2.1, built from source for the PyTorch framework only
RUN export NVTE_FRAMEWORK=pytorch && \
    pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation \
        git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1

# Megatron-LM core_v0.13.0 (installed without dependency resolution)
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation \
        git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.13.0

# mbridge
RUN pip3 install --no-cache-dir \
        mbridge
================================================
FILE: docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.vllm.mcore0.13
================================================
# verl application image: vLLM 0.10.0 + TransformerEngine v2.2.1 + Megatron-LM core_v0.13.0.
# Layered on top of the verl base image (see Dockerfile.base).
FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4

# Environment shared by the build steps below and by the resulting container
ENV MAX_JOBS=32 \
    VLLM_WORKER_MULTIPROC_METHOD=spawn \
    DEBIAN_FRONTEND=noninteractive \
    NODE_OPTIONS="" \
    PIP_ROOT_USER_ACTION=ignore \
    HF_HUB_ENABLE_HF_TRANSFER="1"

# vLLM 0.10.0 (targets torch-2.7.1+cu126)
RUN pip install --resume-retries 999 --no-cache-dir \
        vllm==0.10.0

# Pin / repair the Python dependencies used alongside vLLM.
# transformers 4.54.0 is still not supported, hence the >=4.55.4 floor.
RUN pip install --no-cache-dir \
        "tensordict==0.6.2" \
        "transformers[hf_xet]>=4.55.4" \
        accelerate \
        datasets \
        peft \
        hf-transfer \
        "numpy<2.0.0" \
        "pyarrow>=19.0.1" \
        pandas \
        ray[default] \
        codetiming \
        hydra-core \
        pylatexenc \
        qwen-vl-utils \
        wandb \
        dill \
        pybind11 \
        liger-kernel \
        mathruler \
        blobfile \
        xgrammar \
        pytest \
        py-spy \
        pyext \
        pre-commit \
        ruff

# Replace pynvml with nvidia-ml-py and raise the minimum versions of a few shared libraries
RUN pip uninstall -y pynvml nvidia-ml-py && \
    pip install --resume-retries 999 --no-cache-dir --upgrade \
        "nvidia-ml-py>=12.560.30" \
        "fastapi[standard]>=0.115.0" \
        "optree>=0.13.0" \
        "pydantic>=2.9" \
        "grpcio>=1.62.1"

# cuDNN Python wheel
RUN pip install --resume-retries 999 --no-cache-dir \
        nvidia-cudnn-cu12==9.8.0.87

# TransformerEngine v2.2.1, built from source for the PyTorch framework only
RUN export NVTE_FRAMEWORK=pytorch && \
    pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation \
        git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1

# Megatron-LM core_v0.13.0 (installed without dependency resolution)
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation \
        git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.13.0

# mbridge
RUN pip3 install --no-cache-dir \
        mbridge

# Fix qwen vl: trl is installed without pulling in its dependencies
RUN pip3 install --no-cache-dir --no-deps \
        trl
================================================
FILE: docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.vllm.mcore0.15
================================================
# verl application image: vLLM 0.10.0 + TransformerEngine release_v2.7 + Megatron-LM core_v0.15.0rc4.
# Start from the verl base image (Dockerfile.base).
# NOTE(review): this base image lives in a personal Docker Hub namespace (iseekyan/..., "-h100" tag),
# unlike the verlai/ base used by the sibling Dockerfiles -- confirm this is intended.
FROM iseekyan/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4-h100
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install torch-2.7.1+cu126 + vllm-0.10.0
RUN pip install --resume-retries 999 --no-cache-dir vllm==0.10.0
# Fix packages
# transformers 4.54.0 is still not supported, hence the >=4.55.4 floor
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.55.4" accelerate datasets peft hf-transfer \
    "numpy<2.0.0" "pyarrow>=19.0.1" pandas \
    ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
    pytest py-spy pyext pre-commit ruff
# Replace pynvml with nvidia-ml-py and raise the minimum versions of a few shared libraries
RUN pip uninstall -y pynvml nvidia-ml-py && \
    pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine (built for the PyTorch framework only; installed with --no-deps)
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@release_v2.7
# onnxscript is installed separately right after TransformerEngine (which is installed with --no-deps).
# --no-cache-dir keeps the pip download cache out of the image layer, like every other install here.
RUN pip install --no-cache-dir onnxscript
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.15.0rc4
# Install mbridge
RUN pip3 install --no-cache-dir mbridge==v0.15.0
# Fix qwen vl
RUN pip3 install --no-cache-dir --no-deps trl
================================================
FILE: docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.base.torch2.7.1
================================================
# Base Docker image of verl: CUDA, cuDNN, PyTorch, FlashAttention, Apex and DeepEP, without inference frameworks.
# transformer_engine is uninstalled below and not reinstalled in this file; the Dockerfile.app.* files
# in this directory install TransformerEngine on top of this image.
# Target: verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments (persist into the final image as well as the build steps)
ENV MAX_JOBS=16
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments: apt / pip mirrors, overridable with --build-arg
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source: back up the original sources.list, then replace it with the mirror's jammy suites
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemd (provides systemctl); --force-confdef keeps default config files without prompting
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini plus aria2 (used for the downloads below), FreeImage, zlib and htop
RUN apt-get update && \
apt-get install -y tini aria2 libfreeimage3 libfreeimage-dev zlib1g htop && \
apt-get clean
# Change pip source to the mirror for the duration of the build
# (undone by the "Reset pip config" step at the end of this file)
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork and the packages bundled with it, so the pinned versions below replace them
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
# Install the pinned PyTorch release
RUN pip install --resume-retries 999 --no-cache-dir torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1
# Install flash-attn-2.7.4.post1, although built with torch2.6, it is compatible with torch2.7
# https://github.com/Dao-AILab/flash-attention/issues/1644#issuecomment-2899396361
# ABI_FLAG is read from the installed torch so the wheel matching its C++11 ABI setting is downloaded.
# NOTE(review): the downloaded wheel file is not removed after installation.
RUN ABI_FLAG=$(python -c "import torch; print('TRUE' if torch._C._GLIBCXX_USE_CXX11_ABI else 'FALSE')") && \
URL="https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \
FILE="flash_attn-2.7.4.post1+cu12torch2.6cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \
wget -nv "${URL}" && \
pip install --no-cache-dir "${FILE}"
# Fix packages: replace pynvml with nvidia-ml-py and raise the minimum versions of a few shared libraries
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install cudnn 9.8.0 from NVIDIA's local apt repository package:
# register the local repo, copy its signing key, install cudnn-cuda-12, then delete the downloaded .deb
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cudnn-cuda-12 && \
rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb
# Install Apex from the upstream default branch (unpinned) with the C++ and CUDA extensions enabled
RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" --resume-retries 999 git+https://github.com/NVIDIA/apex.git
# Profiling tools: Nsight Systems 2025.3.1
# Download, install and delete the .deb inside ONE layer. Removing the file in a
# later RUN would not shrink the image, because the archive would already be
# stored in the earlier layer.
# Afterwards, repoint the CUDA toolkit's nsys / nsys-ui at the newly installed binaries.
RUN aria2c --always-resume=true --max-tries=99999 https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
    apt-get update && apt-get install -y libxcb-cursor0 && \
    apt-get install -y ./nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
    rm -rf /usr/local/cuda/bin/nsys && \
    ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys /usr/local/cuda/bin/nsys && \
    rm -rf /usr/local/cuda/bin/nsys-ui && \
    ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys-ui /usr/local/cuda/bin/nsys-ui && \
    rm nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb
# Common Python dependencies (training utilities, data tooling, dev tools)
RUN pip install --resume-retries 999 --no-cache-dir "tensordict==0.6.2" torchdata "transformers[hf_xet]>=4.52.3" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas cuda-bindings \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
# Install DeepEP
## the dependency of IBGDA: expose libmlx5 under its unversioned name
RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
## Clone and build deepep and deepep-nvshmem
## gdrcopy v2.3.1 is only cloned here (no build step for it appears in this file);
## DeepEP is pinned to commit a84a248
RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
git clone https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && git checkout a84a248
# Prepare nvshmem: unpack the 3.2.5-1 sources as deepep-nvshmem and apply DeepEP's patch
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch
ENV CUDA_HOME=/usr/local/cuda
### Set MPI environment variables. Having errors when not set.
ENV CPATH=/usr/local/mpi/include:$CPATH
ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
# NOTE(review): /usr/local/x86_64-linux-gnu differs from /usr/lib/x86_64-linux-gnu used for libmlx5 above -- confirm the path
ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH
# assumes the clones above landed in /workspace (the working directory is not set in this file) -- TODO confirm
ENV GDRCOPY_HOME=/workspace/gdrcopy
## Build deepep-nvshmem
## IBGDA and GDRCopy are switched on; SHMEM, UCX, NCCL, MPI and PMIx support are switched off.
## The result is installed into /workspace/deepep-nvshmem/install.
RUN cd deepep-nvshmem && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install
# Make the freshly installed NVSHMEM visible to the DeepEP build and at runtime
ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install
ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH
ENV PATH=$NVSHMEM_DIR/bin:$PATH
## Build deepep
RUN cd DeepEP && \
python setup.py install
# Reset pip config: drop the mirror settings applied in the "Change pip source" step
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
================================================
FILE: docker/verl0.5-cu126-torch2.7-fa2.7.4/README.md
================================================
# verl image with verl v0.5
## Important packages version
```txt
cuda==12.6
cudnn==9.8.0
torch==2.7.1
flash_attn==2.7.4.post1
sglang==0.4.9.post6
vllm==0.10.0
nvidia-cudnn-cu12==9.8.0.87
transformer_engine==2.2.1
megatron.core==core_v0.13.0
# Preview
transformer_engine==2.7
megatron.core==core_v0.15.0rc4
```
## Target
- Base image:
- `verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4`: We offer a base image with DeepEP built in, for vLLM/SGLang
- App image:
- `verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2`
- `verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2`
- `iseekyan/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.15.0-te2.7`
================================================
FILE: docker/verl0.5-cu126-torch2.7.1-fa2.8.0/Dockerfile.app.sglang.mcore0.12
================================================
# verl application image: SGLang 0.4.8 + TransformerEngine v2.3 + Megatron-LM core_v0.12.2.
# Layered on top of the verl base image (see Dockerfile.base).
FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0

# Environment shared by the build steps below and by the resulting container
ENV MAX_JOBS=8 \
    VLLM_WORKER_MULTIPROC_METHOD=spawn \
    DEBIAN_FRONTEND=noninteractive \
    NODE_OPTIONS="" \
    PIP_ROOT_USER_ACTION=ignore \
    HF_HUB_ENABLE_HF_TRANSFER="1"

# SGLang stack: refresh the packaging tools, then FlashInfer, then SGLang and torch-memory-saver
RUN pip install --upgrade \
        pip \
        setuptools \
        packaging
RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation \
        flashinfer-python==0.2.6.post1
RUN pip install --resume-retries 999 --no-cache-dir \
        "sglang[all]==0.4.8" && \
    pip install torch-memory-saver --no-cache-dir

# Pin / repair the Python dependencies used alongside SGLang
RUN pip install --no-cache-dir \
        "tensordict==0.6.2" \
        "transformers[hf_xet]>=4.51.0" \
        accelerate \
        datasets \
        peft \
        hf-transfer \
        "numpy<2.0.0" \
        "pyarrow>=19.0.1" \
        pandas \
        ray[default] \
        codetiming \
        hydra-core \
        pylatexenc \
        qwen-vl-utils \
        wandb \
        dill \
        pybind11 \
        liger-kernel \
        mathruler \
        blobfile \
        xgrammar \
        pytest \
        py-spy \
        pyext \
        pre-commit \
        ruff

# Replace pynvml with nvidia-ml-py and raise the minimum versions of a few shared libraries
RUN pip uninstall -y pynvml nvidia-ml-py && \
    pip install --resume-retries 999 --no-cache-dir --upgrade \
        "nvidia-ml-py>=12.560.30" \
        "fastapi[standard]>=0.115.0" \
        "optree>=0.13.0" \
        "pydantic>=2.9" \
        "grpcio>=1.62.1"

# cuDNN Python wheel
RUN pip install --resume-retries 999 --no-cache-dir \
        nvidia-cudnn-cu12==9.8.0.87

# TransformerEngine v2.3, built from source for the PyTorch framework only
RUN export NVTE_FRAMEWORK=pytorch && \
    pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation \
        git+https://github.com/NVIDIA/TransformerEngine.git@v2.3

# Megatron-LM core_v0.12.2 (installed without dependency resolution)
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation \
        git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2

# mbridge
RUN pip3 install --no-cache-dir \
        mbridge
================================================
FILE: docker/verl0.5-cu126-torch2.7.1-fa2.8.0/Dockerfile.app.sglang.mcore0.13.preview
================================================
# verl preview application image: SGLang 0.4.8 + TransformerEngine release_v2.5 + Megatron-LM core_r0.13.0.
# Start from the verl base image (Dockerfile.base).
FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0
# Define environments
ENV MAX_JOBS=8
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install sglang-0.4.8 and torch-memory-saver
# Install FlashInfer Python package
RUN pip install --upgrade pip setuptools packaging
RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation flashinfer-python==0.2.6.post1
RUN pip install --resume-retries 999 --no-cache-dir "sglang[all]==0.4.8" && pip install torch-memory-saver --no-cache-dir
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
    "numpy<2.0.0" "pyarrow>=19.0.1" pandas \
    ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
    pytest py-spy pyext pre-commit ruff
# Replace pynvml with nvidia-ml-py and raise the minimum versions of a few shared libraries
RUN pip uninstall -y pynvml nvidia-ml-py && \
    pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine (preview: release_v2.5, built for the PyTorch framework only)
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@release_v2.5
# Install Megatron-LM (preview: core_r0.13.0).
# This file previously installed core_v0.12.2, which made it identical to Dockerfile.app.sglang.mcore0.12
# apart from TransformerEngine and contradicted both this file's name (mcore0.13.preview) and the
# README's "Preview" section (megatron.core==core_r0.13.0).
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_r0.13.0
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
================================================
FILE: docker/verl0.5-cu126-torch2.7.1-fa2.8.0/Dockerfile.base
================================================
# Base Docker image of verl: CUDA, cuDNN, PyTorch, FlashAttention, Apex and DeepEP, without inference frameworks.
# transformer_engine is uninstalled below and not reinstalled in this file; the Dockerfile.app.* files
# in this directory install TransformerEngine on top of this image.
# Target: verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments (persist into the final image as well as the build steps)
ENV MAX_JOBS=16
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments: apt / pip mirrors, overridable with --build-arg
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source: back up the original sources.list, then replace it with the mirror's jammy suites
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemd (provides systemctl); --force-confdef keeps default config files without prompting
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini plus aria2 (used for the downloads below), FreeImage, zlib and htop
RUN apt-get update && \
apt-get install -y tini aria2 libfreeimage3 libfreeimage-dev zlib1g htop && \
apt-get clean
# Change pip source to the mirror for the duration of the build
# (undone by the "Reset pip config" step at the end of this file)
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork and the packages bundled with it, so the pinned versions below replace them
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
# Install the pinned PyTorch release
RUN pip install --resume-retries 999 --no-cache-dir torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1
# Install flash-attn-2.8.0.post2 (prebuilt cu12 / torch2.7 / cp310 wheel).
# ABI_FLAG is read from the installed torch so the wheel matching its C++11 ABI setting is downloaded.
# NOTE(review): the downloaded wheel file is not removed after installation.
RUN ABI_FLAG=$(python -c "import torch; print('TRUE' if torch._C._GLIBCXX_USE_CXX11_ABI else 'FALSE')") && \
URL="https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.0.post2/flash_attn-2.8.0.post2+cu12torch2.7cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \
FILE="flash_attn-2.8.0.post2+cu12torch2.7cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \
wget -nv "${URL}" && \
pip install --no-cache-dir "${FILE}"
# Fix packages: replace pynvml with nvidia-ml-py and raise the minimum versions of a few shared libraries
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install cudnn 9.8.0 from NVIDIA's local apt repository package:
# register the local repo, copy its signing key, install cudnn-cuda-12, then delete the downloaded .deb
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cudnn-cuda-12 && \
rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb
# Install Apex from the upstream default branch (unpinned) with the C++ and CUDA extensions enabled
RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" --resume-retries 999 git+https://github.com/NVIDIA/apex.git
# Profiling tools: Nsight Systems 2025.3.1
# Download, install and delete the .deb inside ONE layer. Removing the file in a
# later RUN would not shrink the image, because the archive would already be
# stored in the earlier layer.
# Afterwards, repoint the CUDA toolkit's nsys / nsys-ui at the newly installed binaries.
RUN aria2c --always-resume=true --max-tries=99999 https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
    apt-get update && apt-get install -y libxcb-cursor0 && \
    apt-get install -y ./nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
    rm -rf /usr/local/cuda/bin/nsys && \
    ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys /usr/local/cuda/bin/nsys && \
    rm -rf /usr/local/cuda/bin/nsys-ui && \
    ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys-ui /usr/local/cuda/bin/nsys-ui && \
    rm nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb
# Common Python dependencies (training utilities, data tooling, dev tools)
RUN pip install --resume-retries 999 --no-cache-dir "tensordict==0.6.2" torchdata "transformers[hf_xet]>=4.53" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas cuda-bindings \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
# Install DeepEP
## the dependency of IBGDA: expose libmlx5 under its unversioned name
RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
## Clone and build deepep and deepep-nvshmem
## gdrcopy v2.3.1 is only cloned here (no build step for it appears in this file);
## DeepEP is pinned to commit a84a248
RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
git clone https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && git checkout a84a248
# Prepare nvshmem: unpack the 3.2.5-1 sources as deepep-nvshmem and apply DeepEP's patch
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch
ENV CUDA_HOME=/usr/local/cuda
### Set MPI environment variables. Having errors when not set.
ENV CPATH=/usr/local/mpi/include:$CPATH
ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
# NOTE(review): /usr/local/x86_64-linux-gnu differs from /usr/lib/x86_64-linux-gnu used for libmlx5 above -- confirm the path
ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH
# assumes the clones above landed in /workspace (the working directory is not set in this file) -- TODO confirm
ENV GDRCOPY_HOME=/workspace/gdrcopy
## Build deepep-nvshmem
## IBGDA and GDRCopy are switched on; SHMEM, UCX, NCCL, MPI and PMIx support are switched off.
## The result is installed into /workspace/deepep-nvshmem/install.
RUN cd deepep-nvshmem && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install
# Make the freshly installed NVSHMEM visible to the DeepEP build and at runtime
ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install
ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH
ENV PATH=$NVSHMEM_DIR/bin:$PATH
## Build deepep
RUN cd DeepEP && \
python setup.py install
# Reset pip config: drop the mirror settings applied in the "Change pip source" step
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
================================================
FILE: docker/verl0.5-cu126-torch2.7.1-fa2.8.0/README.md
================================================
# verl image with verl v0.5
## Important packages version
```txt
cuda==12.6
cudnn==9.8.0
torch==2.7.1
flash_attn==2.8.0.post2
sglang==0.4.8
vllm==0.8.5.post1
nvidia-cudnn-cu12==9.8.0.87
transformer_engine==2.3
megatron.core==core_v0.12.2
# Preview
transformer_engine==2.5
megatron.core==core_r0.13.0
```
## Target
- Base image:
- `verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0`: We offer a base image with DeepEP built in
- App image:
- `verlai/verl:app-verl0.5-sglang0.4.9-mcore0.12.2`
- `verlai/verl:app-verl0.5-sglang0.4.9-mcore0.13.0-preview`
- vLLM is temporarily not supported on the latest version
================================================
FILE: docker/verl0.5-preview-cu128-torch2.7.1-fa2.8.0/Dockerfile.app.sglang.megatron
================================================
# verl preview application image (CUDA 12.8): SGLang 0.4.8 + TransformerEngine release_v2.5 + Megatron-LM core_r0.13.0.
# Layered on top of the verl preview base image (see Dockerfile.base).
FROM verlai/verl:base-verl0.5-preview-cu128-cudnn9.8-torch2.7.1-fa2.8.0-fi0.2.6

# Environment shared by the build steps below and by the resulting container
ENV MAX_JOBS=8 \
    VLLM_WORKER_MULTIPROC_METHOD=spawn \
    DEBIAN_FRONTEND=noninteractive \
    NODE_OPTIONS="" \
    PIP_ROOT_USER_ACTION=ignore \
    HF_HUB_ENABLE_HF_TRANSFER="1"

# SGLang stack: FlashInfer first, then SGLang and torch-memory-saver
RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation \
        flashinfer-python==0.2.6.post1
RUN pip install --resume-retries 999 --no-cache-dir \
        "sglang[all]==0.4.8" && \
    pip install torch-memory-saver --no-cache-dir

# Pin / repair the Python dependencies used alongside SGLang
# (pyext is not in this list, unlike the cu126 app images)
RUN pip install --no-cache-dir \
        "tensordict==0.6.2" \
        "transformers[hf_xet]>=4.51.0" \
        accelerate \
        datasets \
        peft \
        hf-transfer \
        "numpy<2.0.0" \
        "pyarrow>=19.0.1" \
        pandas \
        ray[default] \
        codetiming \
        hydra-core \
        pylatexenc \
        qwen-vl-utils \
        wandb \
        dill \
        pybind11 \
        liger-kernel \
        mathruler \
        blobfile \
        xgrammar \
        pytest \
        py-spy \
        pre-commit \
        ruff

# Replace pynvml with nvidia-ml-py and raise the minimum versions of a few shared libraries
RUN pip uninstall -y pynvml nvidia-ml-py && \
    pip install --resume-retries 999 --no-cache-dir --upgrade \
        "nvidia-ml-py>=12.560.30" \
        "fastapi[standard]>=0.115.0" \
        "optree>=0.13.0" \
        "pydantic>=2.9" \
        "grpcio>=1.62.1"

# cuDNN Python wheel
RUN pip install --resume-retries 999 --no-cache-dir \
        nvidia-cudnn-cu12==9.8.0.87

# TransformerEngine release_v2.5, built from source for the PyTorch framework only
RUN export NVTE_FRAMEWORK=pytorch && \
    pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation \
        git+https://github.com/NVIDIA/TransformerEngine.git@release_v2.5

# Megatron-LM core_r0.13.0 (installed without dependency resolution)
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation \
        git+https://github.com/NVIDIA/Megatron-LM.git@core_r0.13.0

# mbridge
RUN pip3 install --no-cache-dir \
        mbridge
================================================
FILE: docker/verl0.5-preview-cu128-torch2.7.1-fa2.8.0/Dockerfile.base
================================================
# Base Docker image of verl (preview, CUDA 12.8): cuDNN, PyTorch, FlashAttention and Apex, without inference frameworks.
# TransformerEngine is not installed by this file; the app image adds it.
# Target: verlai/verl:base-verl0.5-preview-cu128-cudnn9.8-torch2.7.1-fa2.8.0-fi0.2.6
# Start from the NVIDIA official image 25.02 (ubuntu-24.04 + cuda-12.8 + python-3.12)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-02.html
FROM nvcr.io/nvidia/pytorch:25.02-py3
# Define environments (persist into the final image as well as the build steps)
ENV MAX_JOBS=16
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments: apt / pip mirrors, overridable with --build-arg
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source.
# The suite name is taken from the image's own /etc/os-release instead of being
# hard-coded to "jammy", so the mirror entries match the Ubuntu release the base
# image actually ships and no packages from a different release get mixed in.
# Falls back to "jammy" (the previous hard-coded value) when VERSION_CODENAME is absent.
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
    . /etc/os-release && \
    CODENAME="${VERSION_CODENAME:-jammy}" && \
    { \
    echo "deb ${APT_SOURCE} ${CODENAME} main restricted universe multiverse"; \
    echo "deb ${APT_SOURCE} ${CODENAME}-updates main restricted universe multiverse"; \
    echo "deb ${APT_SOURCE} ${CODENAME}-backports main restricted universe multiverse"; \
    echo "deb ${APT_SOURCE} ${CODENAME}-security main restricted universe multiverse"; \
    } > /etc/apt/sources.list
# Install systemd (provides systemctl); --force-confdef keeps default config files without prompting
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini plus aria2 (used for the downloads below), FreeImage, zlib and htop
RUN apt-get update && \
apt-get install -y tini aria2 libfreeimage3 libfreeimage-dev zlib1g htop && \
apt-get clean
# Change pip source to the mirror for the duration of the build
# (undone by the "Reset pip config" step at the end of this file)
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork and the packages bundled with it, so the pinned versions below replace them
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
# Install the pinned PyTorch release from the cu128 wheel index
RUN pip install --resume-retries 999 --no-cache-dir torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu128
# Install flash-attn-2.8.0.post2 (prebuilt cu12 / torch2.7 / cp312 wheel).
# ABI_FLAG is read from the installed torch so the wheel matching its C++11 ABI setting is downloaded.
# NOTE(review): the downloaded wheel file is not removed after installation.
RUN ABI_FLAG=$(python -c "import torch; print('TRUE' if torch._C._GLIBCXX_USE_CXX11_ABI else 'FALSE')") && \
URL="https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.0.post2/flash_attn-2.8.0.post2+cu12torch2.7cxx11abi${ABI_FLAG}-cp312-cp312-linux_x86_64.whl" && \
FILE="flash_attn-2.8.0.post2+cu12torch2.7cxx11abi${ABI_FLAG}-cp312-cp312-linux_x86_64.whl" && \
wget -nv "${URL}" && \
pip install --no-cache-dir "${FILE}"
# Fix packages: replace pynvml with nvidia-ml-py and raise the minimum versions of a few shared libraries
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install cudnn 9.8.0 from NVIDIA's local apt repository package:
# register the local repo, copy its signing key, install cudnn-cuda-12, then delete the downloaded .deb
# NOTE(review): this is the ubuntu2204 repo package; the cp312 wheels above suggest a newer Ubuntu base -- confirm it is the intended variant
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cudnn-cuda-12 && \
rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb
# Install Apex from the upstream default branch (unpinned) with the C++ and CUDA extensions enabled
RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" --resume-retries 999 git+https://github.com/NVIDIA/apex.git
# Profiling tools: Nsight Systems 2025.3.1
# Download, install and delete the .deb inside ONE layer. Removing the file in a
# later RUN would not shrink the image, because the archive would already be
# stored in the earlier layer.
# Afterwards, repoint the CUDA toolkit's nsys / nsys-ui at the newly installed binaries.
RUN aria2c --always-resume=true --max-tries=99999 https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
    apt-get update && apt-get install -y libxcb-cursor0 && \
    apt-get install -y ./nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
    rm -rf /usr/local/cuda/bin/nsys && \
    ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys /usr/local/cuda/bin/nsys && \
    rm -rf /usr/local/cuda/bin/nsys-ui && \
    ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys-ui /usr/local/cuda/bin/nsys-ui && \
    rm nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb
RUN pip install --resume-retries 999 --no-cache-dir "tensordict==0.6.2" torchdata "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas cuda-bindings \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pre-commit ruff
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
================================================
FILE: docker/verl0.5-preview-cu128-torch2.7.1-fa2.8.0/README.md
================================================
# verl image with verl v0.5
## Important packages version
```txt
cuda==12.8
cudnn==9.8.0
torch==2.7.1
flash_attn==2.8.0.post2
sglang==0.4.8
transformer_engine==2.5
megatron.core==core_r0.13.0
nvidia-cudnn-cu12==9.8.0.87
```
## Target
- Base image:
- `verlai/verl:base-verl0.5-preview-cu128-cudnn9.8-torch2.7.1-fa2.8.0`: We offer a base image with flash infer 0.2.6.post1 built in
- App image:
- `verlai/verl:app-verl0.5-preview-sglang0.4.8-mcore0.13.0-preview`
- vllm temporarily not support latest version
## !!!Notice!!!
- pyext is no longer maintained and does not work with Python 3.12; consider using a replacement and deprecating this package.
================================================
FILE: docker/verl0.6-cu128-torch2.8.0-fa2.7.4/Dockerfile.app.sglang
================================================
FROM verlai/verl:base-verl0.6-cu128-cudnn9.8-torch2.8.0-fa2.7.4
RUN pip install --no-cache-dir "sglang[all]==0.5.2"
RUN pip install --no-cache-dir "torch-memory-saver==0.0.9rc1"
================================================
FILE: docker/verl0.6-cu128-torch2.8.0-fa2.7.4/Dockerfile.base
================================================
# Start from the NVIDIA official image (ubuntu-24.04 + cuda-12.8 + python-3.12)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-03.html
FROM nvcr.io/nvidia/pytorch:25.03-py3
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
ENV PIP_CONSTRAINT=""
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
pip config set global.no-cache-dir "true" && \
python -m pip install --upgrade pip
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install libxml2
RUN apt-get update && \
apt-get install -y libxml2 aria2 && \
apt-get clean
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
transformer_engine flash_attn apex megatron-core \
xgboost opencv grpcio
# Fix packages
RUN pip install --no-cache-dir tensordict torchdata "transformers[hf_xet]==4.55.4" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pre-commit ruff
# Fix cv2
# Remove the cv2 package directory shipped with the base image. The interpreter
# version is matched with a glob because the base image is documented above as
# python-3.12, so a hard-coded python3.11 path would silently remove nothing.
RUN rm -rf /usr/local/lib/python3*/dist-packages/cv2
# Install torch
RUN pip install --no-cache-dir torch==2.8.0 --index-url https://download.pytorch.org/whl/cu128
# Install flash-attn
RUN pip install --no-cache-dir --no-build-isolation flash_attn==2.7.4.post1
# Install DeepEP
# the dependency of IBGDA
RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
# Clone and build deepep and deepep-nvshmem
RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
git clone https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && git checkout a84a248
# Prepare nvshmem
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch
## Build deepep-nvshmem
# Refresh the package lists in the same layer as the install so this step does not
# depend on (possibly stale or cached) lists left behind by an earlier layer.
RUN apt-get update && \
    apt-get install -y ninja-build cmake
ENV CUDA_HOME=/usr/local/cuda
### Set MPI environment variables. Having errors when not set.
ENV CPATH=/usr/local/mpi/include:$CPATH
ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH
ENV GDRCOPY_HOME=/workspace/gdrcopy
ENV GDRCOPY_INCLUDE=/workspace/gdrcopy/include
RUN cd deepep-nvshmem && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install
ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install
ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH
ENV PATH=$NVSHMEM_DIR/bin:$PATH
## Build deepep
RUN cd DeepEP && \
python setup.py install
# Install Apex
RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1
# Install Megatron-LM
RUN git clone -b core_v0.13.0 https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && pip3 install --no-deps -e .
# Install mbridge
RUN pip3 install --no-cache-dir git+https://github.com/ISEEKYAN/mbridge.git
================================================
FILE: docker/verl0.6-cu128-torch2.8.0-fa2.7.4/Dockerfile.vllm011.mcore_gpt-oss
================================================
FROM nvcr.io/nvidia/nemo:25.07.gpt_oss
RUN git clone -b v0.11.0 --depth 1 https://github.com/vllm-project/vllm.git /opt/vllm
RUN pip install setuptools_scm
RUN cd /opt/vllm && pip install --no-deps --no-build-isolation --no-cache-dir -e .
RUN pip install cbor2 setproctitle blake3 openai_harmony pybase64 msgspec partial_json_parser py-cpuinfo diskcache gguf
RUN pip install --upgrade transformers tokenizers
RUN pip install codetiming tensordict mathruler pylatexenc
RUN pip3 install --no-cache-dir mbridge
================================================
FILE: docker/verl0.6.1-experimental/Dockerfile.sglang056exp
================================================
# Dockerfile for verlai/verl:sgl056.exp
FROM lmsysorg/sglang:v0.5.6.post1
RUN pip install pybind11
RUN pip install nvidia-mathdx
RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
RUN export NVTE_FRAMEWORK=pytorch && MAX_JOBS=128 NVTE_BUILD_THREADS_PER_JOB=4 pip3 install --resume-retries 999 --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@release_v2.11
RUN pip install --upgrade --no-cache-dir transformers tokenizers
RUN pip install codetiming tensordict mathruler pylatexenc qwen_vl_utils
RUN pip install --no-cache-dir --no-build-isolation flash_attn==2.8.1
RUN NSIGHT_VERSION=2025.6.1_2025.6.1.190-1_$(if [ "$(uname -m)" = "aarch64" ]; then echo "arm64"; else echo "amd64"; fi) && \
wget https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_6/nsight-systems-${NSIGHT_VERSION}.deb && \
apt-get update && apt-get install -y libxcb-cursor0 && \
apt-get install -y ./nsight-systems-${NSIGHT_VERSION}.deb && \
rm -rf /usr/local/cuda/bin/nsys && \
ln -s /opt/nvidia/nsight-systems/2025.6.1/nsys /usr/local/cuda/bin/nsys && \
rm -rf /usr/local/cuda/bin/nsys-ui && \
ln -s /opt/nvidia/nsight-systems/2025.6.1/nsys-ui /usr/local/cuda/bin/nsys-ui && \
rm nsight-systems-${NSIGHT_VERSION}.deb
# =========================
# Install HybridEP
# =========================
WORKDIR /home/
RUN git clone --branch hybrid-ep https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && git checkout 3f601f7ac1c062c46502646ff04c535013bfca00 && \
TORCH_CUDA_ARCH_LIST="9.0;10.0" pip install --no-build-isolation .
# =========================
# Install Qwen3-Next dependencies
# =========================
WORKDIR /home/
# Install causal-conv1d and flash-linear-attention
RUN cd /tmp && \
git clone https://github.com/Dao-AILab/causal-conv1d.git && \
cd causal-conv1d && \
unset PIP_CONSTRAINT && \
CAUSAL_CONV1D_FORCE_BUILD=TRUE pip install --no-build-isolation . && \
cd .. && \
rm -rf causal-conv1d && \
pip install flash-linear-attention
RUN pip install --no-cache-dir torch-memory-saver
RUN pip3 install --no-cache-dir --no-deps trl
RUN pip3 install nvtx matplotlib liger_kernel
RUN pip install -U git+https://github.com/ISEEKYAN/mbridge.git
RUN pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@1d462bd37dac21cfa14177405d4921eedb987052 # latest dev branch on 20251209
# Install verl v0.6.1 together with its dependencies, then uninstall only the verl
# package itself; the dependencies pulled in by the first step stay in the image.
# NOTE(review): presumably verl is meant to be supplied separately at run time
# (e.g. from a source checkout) — confirm.
RUN pip install git+https://github.com/volcengine/verl.git@v0.6.1
RUN pip uninstall -y verl
================================================
FILE: docker/verl0.6.1-experimental/Dockerfile.vllm012exp
================================================
# dockerfile for verlai/verl:vll012.exp
FROM nvcr.io/nvidia/pytorch:25.11-py3
RUN git clone -b v0.12.0 --depth 1 https://github.com/vllm-project/vllm.git /opt/vllm
RUN pip install setuptools_scm
RUN cd /opt/vllm && pip install --no-deps --no-build-isolation --no-cache-dir -e .
RUN pip install -r /opt/vllm/requirements/common.txt
RUN pip install pybind11
RUN export NVTE_FRAMEWORK=pytorch && MAX_JOBS=128 NVTE_BUILD_THREADS_PER_JOB=4 pip3 install --resume-retries 999 --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@release_v2.11
RUN pip install --upgrade --no-cache-dir transformers tokenizers
RUN pip install codetiming tensordict mathruler pylatexenc qwen_vl_utils
RUN pip install flash_attn
#==2.8.1
# Install numactl. Use apt-get with -y: the build is non-interactive, so a
# confirmation prompt from `apt install` would abort the step.
RUN apt-get update && apt-get install -y numactl
RUN NSIGHT_VERSION=2025.6.1_2025.6.1.190-1_$(if [ "$(uname -m)" = "aarch64" ]; then echo "arm64"; else echo "amd64"; fi) && \
wget https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_6/nsight-systems-${NSIGHT_VERSION}.deb && \
apt-get update && apt-get install -y libxcb-cursor0 && \
apt-get install -y ./nsight-systems-${NSIGHT_VERSION}.deb && \
rm -rf /usr/local/cuda/bin/nsys && \
ln -s /opt/nvidia/nsight-systems/2025.6.1/nsys /usr/local/cuda/bin/nsys && \
rm -rf /usr/local/cuda/bin/nsys-ui && \
ln -s /opt/nvidia/nsight-systems/2025.6.1/nsys-ui /usr/local/cuda/bin/nsys-ui && \
rm nsight-systems-${NSIGHT_VERSION}.deb
# =========================
# Install HybridEP
# =========================
WORKDIR /home/
RUN git clone --branch hybrid-ep https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && git checkout 3f601f7ac1c062c46502646ff04c535013bfca00 && \
TORCH_CUDA_ARCH_LIST="9.0;10.0" pip install --no-build-isolation .
# =========================
# Install Qwen3-Next dependencies
# =========================
WORKDIR /home/
# Install causal-conv1d and flash-linear-attention
RUN cd /tmp && \
git clone https://github.com/Dao-AILab/causal-conv1d.git && \
cd causal-conv1d && \
unset PIP_CONSTRAINT && \
CAUSAL_CONV1D_FORCE_BUILD=TRUE pip install --no-build-isolation . && \
cd .. && \
rm -rf causal-conv1d && \
pip install flash-linear-attention
RUN pip3 install --no-cache-dir --no-deps trl
RUN pip3 install nvtx matplotlib liger_kernel
RUN pip install -U git+https://github.com/ISEEKYAN/mbridge.git
RUN pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@1d462bd37dac21cfa14177405d4921eedb987052 # latest dev branch on 20251209
# Install verl v0.6.1 together with its dependencies, then uninstall only the verl
# package itself; the dependencies pulled in by the first step stay in the image.
# NOTE(review): presumably verl is meant to be supplied separately at run time
# (e.g. from a source checkout) — confirm.
RUN pip install git+https://github.com/volcengine/verl.git@v0.6.1
RUN pip uninstall -y verl
================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SPHINXPROJ = verl
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
================================================
FILE: docs/README.md
================================================
# verl documentation
## Build the docs
```bash
# If you want to view the auto-generated API docstrings, make sure verl is importable (on the Python path). For instance, install verl in editable mode from this directory via:
# pip install -e "..[test]"
# Install dependencies needed for building docs.
pip install -r requirements-docs.txt
# Build the docs.
make clean
make html
```
## Open the docs with your browser
```bash
python -m http.server -d _build/html/
```
Launch your browser and navigate to http://localhost:8000 to view the documentation. Alternatively you could drag the file `_build/html/index.html` to your local browser and view directly.
================================================
FILE: docs/README_vllm0.7.md
================================================
# Upgrading to vllm >= 0.7
Note: verl+vllm 0.8.3 is now stable. Please see ``docs/README_vllm0.8.md`` for upgrade guide.
## Installation
Note: At time of writing, verl+vllm 0.7.x supports **FSDP** for training and **vLLM** for rollout.
```
# Create the conda environment
conda create -n verl python==3.10
conda activate verl
# Install verl
git clone https://github.com/volcengine/verl.git
cd verl
pip3 install -e .
# Install the latest stable version of vLLM
pip3 install vllm==0.7.3
# Install flash-attn
pip3 install flash-attn --no-build-isolation
```
Note that if you are installing lower versions of vLLM (0.7.0, 0.7.1, 0.7.2), you need to make some tiny patches manually on vllm (/path/to/site-packages/vllm after installation) after the above steps:
- vllm/distributed/parallel_state.py: Remove the assertion below:
```
if (world_size
!= tensor_model_parallel_size * pipeline_model_parallel_size):
raise RuntimeError(
f"world_size ({world_size}) is not equal to "
f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
f"pipeline_model_parallel_size ({pipeline_model_parallel_size})")
```
- vllm/executor/uniproc_executor.py: change `local_rank = rank` to `local_rank = int(os.environ["LOCAL_RANK"])`
- vllm/model_executor/model_loader/weight_utils.py: remove the `torch.cuda.empty_cache()` in `pt_weights_iterator`
## Features
### Use cuda graph
After installation, examples using FSDP as training backends can be used. By default, the `enforce_eager` is set to True, which disables the cuda graph. To enjoy cuda graphs and the sleep mode of vLLM>=0.7, add the following lines to the bash script:
```
actor_rollout_ref.rollout.enforce_eager=False \
actor_rollout_ref.rollout.free_cache_engine=True \
```
For a typical job like examples/ppo_trainer/run_qwen2-7b_seq_balance.sh, the rollout generation time is 85 seconds with vLLM0.7.0. By enabling the cudagraph, the generation duration is further reduced to 62 seconds.
**Note:** Currently, if the `n` is greater than 1 in `SamplingParams` in vLLM>=0.7, there is a potential performance issue on the stability of rollout generation time (Some iterations would see generation time bursts) using vLLM's V0 Engine.
### Use vLLM V1 Engine
Using the vLLM V1 engine can avoid instability issues and achieve additional performance improvements. To use the V1 engine, you can first uninstall the previously installed vLLM and then follow the steps below to install the newer version.
```
git clone https://github.com/vllm-project/vllm.git
cd vllm
git checkout 2275784
sed -i "903a\ data_parallel_size = world_size // pipeline_model_parallel_size // tensor_model_parallel_size" ./vllm/distributed/parallel_state.py
VLLM_USE_PRECOMPILED=1 pip install --editable .
```
Then you can enable the V1 engine by setting `export VLLM_USE_V1=1`. In some benchmark tests, the V1 engine demonstrates a 1.5x speed improvement over the vLLM V0 engine.
The stable support of the vLLM V1 engine is available on verl main.
================================================
FILE: docs/README_vllm0.8.md
================================================
# Upgrading to vLLM >= 0.8
Last updated: 05/04/2025.
## Installation
Note: This version of verl+vLLM 0.8+ supports **FSDP** for training and **vLLM** for rollout.
```bash
# Create the conda environment
conda create -n verl python==3.10
conda activate verl
# Install verl
git clone https://github.com/volcengine/verl.git
cd verl
pip3 install -e .
# Install the latest stable version of vLLM
pip3 install vllm==0.8.3
# Install flash-attn
pip3 install flash-attn --no-build-isolation
```
We provide a pre-built Docker image for verl+vLLM 0.8.3. You can pull it directly with the following command:
```bash
docker pull hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0
```
## Features
vLLM 0.8+ supports cuda graph and V1 engine by default in verl. To enable these features, remember to add the following lines to the bash script:
```bash
actor_rollout_ref.rollout.enforce_eager=False \
actor_rollout_ref.rollout.free_cache_engine=True \
```
and also **remove** the `VLLM_ATTENTION_BACKEND=XFORMERS` environment variable setting from the script if it exists.
## Notes
When you just directly upgrade vllm>=0.8, some dependency packages may undergo version changes. If you encounter the following problems:
```bash
in from torch.multiprocessing.reductions import ForkingPickler ImportError: cannot import name 'ForkingPickler' from 'torch.multiprocessing.reductions' (/opt/conda/lib/python3.11/site-packages/torch/multiprocessing/reductions.py)
```
You need to upgrade `tensordict` to version 0.6.2 using the command `pip install tensordict==0.6.2`.
================================================
FILE: docs/_static/custom.css
================================================
/* Make the documentation use full screen width */
.wy-nav-content {
max-width: none !important;
width: 100% !important;
padding: 1.618em 3.236em !important;
}
/* Adjust the content wrapper - will be set by JavaScript */
.wy-nav-content-wrap {
margin-left: 300px;
transition: margin-left 0.2s ease;
width: auto !important;
position: relative !important;
background: white !important;
min-height: 100vh !important;
}
/* Make the main content area responsive */
.rst-content {
max-width: none !important;
width: 100% !important;
}
/* Optional: Adjust table widths to prevent overflow */
.rst-content table.docutils {
width: 100% !important;
table-layout: auto !important;
}
/* Optional: Better code block width handling */
.rst-content .highlight {
width: 100% !important;
}
/* Content area positioning already handled above */
/* Optional: Improve readability with some margin on very wide screens */
@media (min-width: 1400px) {
.wy-nav-content {
max-width: none !important;
margin: 0 auto !important;
}
}
/* Resizable sidebar styles */
.wy-nav-side {
position: fixed !important;
top: 0 !important;
bottom: 0 !important;
left: 0 !important;
width: 300px;
min-width: 200px;
max-width: 600px;
display: flex;
flex-direction: column;
z-index: 200 !important;
}
/* Ensure sidebar header (logo, search) adapts to width */
.wy-side-nav-search {
width: 100% !important;
box-sizing: border-box !important;
padding: 0.809em 0.809em !important;
}
.wy-side-nav-search input[type="text"] {
width: 100% !important;
box-sizing: border-box !important;
}
/* Make logo/title area responsive */
.wy-side-nav-search > div.version {
width: 100% !important;
}
.wy-side-nav-search > a {
width: 100% !important;
display: block !important;
white-space: nowrap !important;
overflow: hidden !important;
text-overflow: ellipsis !important;
}
/* Responsive adjustments for very narrow viewports (the media query matches the viewport width, not the sidebar width) */
@media (max-width: 300px) {
.wy-side-nav-search > a {
font-size: 0.9em !important;
}
.wy-side-nav-search input[type="text"] {
font-size: 0.8em !important;
}
}
/* Ensure search input doesn't overflow */
.wy-side-nav-search form {
width: 100% !important;
margin: 0 !important;
}
/* Make search icon responsive */
.wy-side-nav-search .wy-dropdown {
width: 100% !important;
}
/* Adjust search results dropdown width */
.wy-side-nav-search .wy-dropdown-menu {
width: 100% !important;
max-width: none !important;
left: 0 !important;
right: 0 !important;
}
/* Resize handle is created by JavaScript */
/* Make sure the sidebar content doesn't overflow */
.wy-side-scroll {
width: 100% !important;
flex: 1 !important;
overflow-y: auto !important;
overflow-x: hidden !important;
padding-right: 10px !important;
box-sizing: border-box !important;
scroll-behavior: auto !important; /* Prevent smooth scrolling on sidebar itself */
}
/* Ensure proper scroll behavior for main content area */
html {
scroll-behavior: smooth !important;
}
/* Ensure anchor links work properly in main content */
.wy-nav-content-wrap {
scroll-behavior: smooth !important;
}
/* Fix scroll to target for anchor links */
.rst-content {
scroll-behavior: smooth !important;
}
/* Fix anchor scroll offset to account for fixed header */
.rst-content .section {
scroll-margin-top: 60px;
}
/* Fix anchor scroll offset for headers */
.rst-content h1, .rst-content h2, .rst-content h3, .rst-content h4, .rst-content h5, .rst-content h6 {
scroll-margin-top: 60px;
}
/* Fix anchor scroll offset for specific scroll targets */
.rst-content .headerlink {
scroll-margin-top: 60px;
}
/* Fix sidebar navigation styling */
.wy-menu-vertical {
width: 100% !important;
}
.wy-menu-vertical li {
width: 100% !important;
}
.wy-menu-vertical a {
width: 100% !important;
word-wrap: break-word !important;
white-space: normal !important;
}
/* Content area margin is handled by JavaScript */
/* Custom drag handle (more visible) */
.resize-handle {
position: absolute;
top: 0;
right: 0;
width: 8px;
height: 100%;
background: #ccc;
cursor: col-resize;
z-index: 1001;
opacity: 0.3;
transition: opacity 0.2s ease;
}
.resize-handle:hover {
opacity: 0.8;
background: #999;
}
.resize-handle::before {
content: '';
position: absolute;
top: 50%;
left: 50%;
width: 2px;
height: 20px;
background: #666;
transform: translate(-50%, -50%);
border-radius: 1px;
}
.resize-handle:hover::before {
background: #333;
}
/* Ensure smooth resizing */
/* The `resizing` class is toggled by resizable-sidebar.js: it is added on
   mousedown on the resize handle and removed again on mouseup. While it is set,
   text selection and pointer events on the sidebar are disabled. */
.wy-nav-side.resizing {
    user-select: none;
    pointer-events: none;
}

/* NOTE(review): `.wy-side-scroll` declares `overflow-y: auto !important` and
   `overflow-x: hidden !important` earlier in this file, and those take precedence
   over this non-important declaration — confirm whether this rule is meant to
   have an effect. */
.wy-nav-side.resizing .wy-side-scroll {
    overflow: hidden;
}
================================================
FILE: docs/_static/js/resizable-sidebar.js
================================================
// Resizable sidebar functionality.
//
// Appends a drag handle to the theme sidebar (.wy-nav-side), keeps the left margin
// of the content wrapper (.wy-nav-content-wrap) equal to the sidebar width, and
// remembers the chosen width in localStorage.
document.addEventListener('DOMContentLoaded', function() {
    const sidebar = document.querySelector('.wy-nav-side');
    const content = document.querySelector('.wy-nav-content-wrap');

    // Nothing to do on pages that do not have the expected layout elements.
    if (!sidebar || !content) return;

    // Allowed width range and default width, in pixels.
    const MIN_WIDTH = 200;
    const MAX_WIDTH = 600;
    const DEFAULT_WIDTH = 300;
    // localStorage key under which the chosen width is persisted.
    const STORAGE_KEY = 'sidebar-width';

    // Create resize handle
    const resizeHandle = document.createElement('div');
    resizeHandle.className = 'resize-handle';
    sidebar.appendChild(resizeHandle);

    let isResizing = false;
    let startX = 0;
    let startWidth = 0;

    // Get initial width
    const getInitialWidth = () => DEFAULT_WIDTH;

    // Save width to localStorage.
    // Accessing localStorage can throw (for example when the browser blocks site
    // storage, or when the quota is exceeded). Persistence is best-effort: a failure
    // here must not break resizing itself.
    const saveWidth = (width) => {
        try {
            localStorage.setItem(STORAGE_KEY, width);
        } catch (err) {
            // Ignore: the width simply will not be remembered across page loads.
        }
    };

    // Load width from localStorage, falling back to the default when nothing valid
    // is stored or storage is unavailable.
    const loadWidth = () => {
        let savedWidth = null;
        try {
            savedWidth = localStorage.getItem(STORAGE_KEY);
        } catch (err) {
            savedWidth = null;
        }
        if (savedWidth) {
            const width = parseInt(savedWidth, 10);
            // Also rejects NaN, since every comparison with NaN is false.
            if (width >= MIN_WIDTH && width <= MAX_WIDTH) {
                return width;
            }
        }
        return getInitialWidth();
    };

    // Apply width to sidebar and content
    const applyWidth = (width) => {
        // Update sidebar width
        sidebar.style.width = width + 'px';

        // Update content margin with !important to override any CSS
        content.style.setProperty('margin-left', width + 'px', 'important');

        // Also update any other content wrapper that might exist
        const contentInner = document.querySelector('.wy-nav-content');
        if (contentInner) {
            contentInner.style.setProperty('margin-left', '0px', 'important');
        }

        // Force reflow and repaint
        sidebar.offsetHeight;
        content.offsetHeight;

        // Trigger window resize event to notify other components
        window.dispatchEvent(new Event('resize'));
    };

    // Initialize with saved width
    const initialWidth = loadWidth();
    applyWidth(initialWidth);

    // Mouse down on resize handle: start a drag
    resizeHandle.addEventListener('mousedown', (e) => {
        isResizing = true;
        startX = e.clientX;
        startWidth = parseInt(window.getComputedStyle(sidebar).width, 10);

        sidebar.classList.add('resizing');
        document.body.style.cursor = 'col-resize';
        document.body.style.userSelect = 'none';

        // Add overlay to prevent iframe issues
        const overlay = document.createElement('div');
        overlay.style.cssText = `
            position: fixed;
            top: 0;
            left: 0;
            width: 100%;
            height: 100%;
            z-index: 9999;
            cursor: col-resize;
        `;
        overlay.id = 'resize-overlay';
        document.body.appendChild(overlay);

        e.preventDefault();
    });

    // Mouse move: while dragging, follow the pointer within the allowed range
    document.addEventListener('mousemove', (e) => {
        if (!isResizing) return;

        const width = startWidth + e.clientX - startX;
        const clampedWidth = Math.max(MIN_WIDTH, Math.min(MAX_WIDTH, width));
        applyWidth(clampedWidth);
    });

    // Mouse up: finish the drag and persist the result
    document.addEventListener('mouseup', () => {
        if (!isResizing) return;

        isResizing = false;
        sidebar.classList.remove('resizing');
        document.body.style.cursor = '';
        document.body.style.userSelect = '';

        // Remove overlay
        const overlay = document.getElementById('resize-overlay');
        if (overlay) {
            overlay.remove();
        }

        // Save the current width
        const currentWidth = parseInt(window.getComputedStyle(sidebar).width, 10);
        saveWidth(currentWidth);
    });

    // Handle window resize - removed to prevent infinite loop
    // The sidebar width is fixed and managed by drag functionality, no need to recalculate on window resize

    // Double-click to reset to default width
    resizeHandle.addEventListener('dblclick', () => {
        applyWidth(DEFAULT_WIDTH);
        saveWidth(DEFAULT_WIDTH);
    });
});
// Fix navigation issues - Using MutationObserver for reliable initialization
//
// Replaces every sidebar link (.wy-menu-vertical a) with a clone that carries only
// the click handler defined here: same-page anchors are scrolled to with a 60px
// offset, all other links are followed by assigning window.location.href.
document.addEventListener('DOMContentLoaded', function() {
    let navigationFixed = false;

    function setupNavigationFix() {
        if (navigationFixed) return;

        // Find all links in the sidebar
        const sidebarLinks = document.querySelectorAll('.wy-menu-vertical a');

        // Only proceed if we have sidebar links
        if (sidebarLinks.length === 0) return;

        console.log('Setting up navigation fix...');

        sidebarLinks.forEach(function(link) {
            const href = link.getAttribute('href');

            // Clone the link to remove all existing event listeners
            const newLink = link.cloneNode(true);

            // Add our own click handler
            newLink.addEventListener('click', function(e) {
                console.log('Link clicked:', href);

                // Leave modified clicks (Ctrl/Cmd/Shift/Alt, a non-primary button) and
                // links that target another browsing context to the browser. Handling
                // them here would also navigate or scroll the current tab, on top of
                // the "open in new tab/window" the user asked for.
                const opensElsewhere = Boolean(newLink.target) && newLink.target !== '_self';
                if (e.button !== 0 || e.ctrlKey || e.metaKey || e.shiftKey || e.altKey || opensElsewhere) {
                    return;
                }

                // If it's an anchor link within the same page
                if (href && href.startsWith('#') && href !== '#') {
                    e.preventDefault();
                    e.stopPropagation();

                    const targetId = href.substring(1);
                    const targetElement = document.getElementById(targetId);

                    if (targetElement) {
                        // Calculate offset for fixed header
                        const headerHeight = 60;
                        const elementPosition = targetElement.getBoundingClientRect().top;
                        const offsetPosition = elementPosition + window.pageYOffset - headerHeight;

                        window.scrollTo({
                            top: offsetPosition,
                            behavior: 'smooth'
                        });

                        // Update URL hash
                        if (history.pushState) {
                            history.pushState(null, null, '#' + targetId);
                        } else {
                            location.hash = '#' + targetId;
                        }
                    }
                }
                // For external links, navigate normally
                else if (href && !href.startsWith('#') && !href.startsWith('javascript:')) {
                    console.log('Navigating to external link:', href);
                    window.location.href = href;
                }
            });

            // Replace the old link with the new one
            link.parentNode.replaceChild(newLink, link);
        });

        navigationFixed = true;

        // Handle initial page load with hash
        if (window.location.hash) {
            // Use requestAnimationFrame for better timing
            requestAnimationFrame(() => {
                const targetId = window.location.hash.substring(1);
                const targetElement = document.getElementById(targetId);
                if (targetElement) {
                    const headerHeight = 60;
                    const elementPosition = targetElement.getBoundingClientRect().top;
                    const offsetPosition = elementPosition + window.pageYOffset - headerHeight;

                    window.scrollTo({
                        top: offsetPosition,
                        behavior: 'smooth'
                    });
                }
            });
        }
    }

    // Try to set up navigation fix immediately
    setupNavigationFix();

    // If it didn't work, use MutationObserver to watch for when sidebar links are added
    if (!navigationFixed) {
        const observer = new MutationObserver(function(mutations) {
            mutations.forEach(function(mutation) {
                if (mutation.type === 'childList' && mutation.addedNodes.length > 0) {
                    // Check if sidebar links were added
                    const sidebarLinks = document.querySelectorAll('.wy-menu-vertical a');
                    if (sidebarLinks.length > 0) {
                        setupNavigationFix();
                        if (navigationFixed) {
                            observer.disconnect();
                        }
                    }
                }
            });
        });

        // Start observing the document for changes
        observer.observe(document.body, {
            childList: true,
            subtree: true
        });

        // Fallback timeout in case MutationObserver doesn't work
        setTimeout(function() {
            if (!navigationFixed) {
                setupNavigationFix();
            }
            observer.disconnect();
        }, 5000);
    }
});
================================================
FILE: docs/_static/js/runllm-widget.js
================================================
// Inject the RunLLM widget loader script into <head> once the DOM has been parsed.
document.addEventListener("DOMContentLoaded", function () {
  // Attributes set on the <script> element, applied in the listed order.
  var widgetAttributes = [
    ["version", "stable"],
    ["crossorigin", "true"],
    ["runllm-keyboard-shortcut", "Mod+j"],
    ["runllm-name", "verl Chatbot"],
    ["runllm-position", "TOP_RIGHT"],
    ["runllm-assistant-id", "679"],
  ];

  var loader = document.createElement("script");
  loader.type = "module";
  loader.id = "runllm-widget-script";
  loader.src = "https://widget.runllm.com";

  widgetAttributes.forEach(function (pair) {
    loader.setAttribute(pair[0], pair[1]);
  });

  loader.async = true;
  document.head.appendChild(loader);
});
================================================
FILE: docs/advance/agent_loop.rst
================================================
Agent Loop
==========
Last updated: 07/17/2025.
.. versionadded:: 0.4.2
[status: alpha]
.. warning::
   Agent Loop is ready for use, but the API may change in future releases.
Agent Loop is designed as a general interface for multi-turn rollout and agentic reinforcement learning.
**Design goal**:
- Pluggable user-defined agent loop
- Provide standard request generate api with different inference frameworks
- Provide request level load balance between multiple inference servers
**Non-goal**:
- How tool is defined and how to call tool
At a high level, the agent loop is given a prompt and runs a user-defined loop: call the LLM generate API, call tools, ...
and return the final output. A reward is then computed for the final output, which is used as a trajectory for RL training.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop_overview.svg?raw=true
API Design
----------
The ``AgentLoopBase`` class is the abstraction of an agent loop, and the ``run`` method is the only interface that users need to implement.
The run method is given prompt messages in the format [{"role": "user", "content": "..."}] and additional sampling params,
and can do whatever the user wants, such as
- call LLM generate api
- call tools: web search, database query, code sandbox, ...
- environment interaction
- reflection
- ...
.. code:: python
class AgentLoopBase(ABC):
@abstractmethod
async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput:
"""Run agent loop to interact with LLM server and environment.
Args:
sampling_params (Dict[str, Any]): LLM sampling params.
**kwargs: dataset fields from `verl.utils.dataset.RLHFDataset`.
Returns:
AgentLoopOutput: Agent loop output.
"""
raise NotImplementedError
After running user defined loop, run method should return ``AgentLoopOutput``, including prompt token ids,
response token ids, and response mask.
.. code:: python
class AgentLoopOutput(BaseModel):
"""Agent loop output."""
prompt_ids: list[int]
"""Prompt token ids."""
response_ids: list[int]
"""Response token ids including LLM generated token, tool response token."""
response_mask: list[int]
"""Response mask, 1 for LLM generated token, 0 for tool response token."""
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop_output.svg?raw=true
.. note:: AgentLoopOutput only output one trajectory for a given prompt, multiple trajectories output is still under discussion.
Architecture Design
-------------------
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop_architecture.png?raw=true
A single PPO step contains two phases: rollout and train. In the rollout phase:
1. PPOTrainer sample a batch from dataset and call ``AgentLoopManager.generate_sequences``.
2. AgentLoopManager ``wake_up`` all async LLM server instances, which will sync weights between inference engine(vLLM/SGLang) and training engine(FSDP/Megatron-LM).
3. AgentLoopManager split batch into chunks and send each chunk to ``AgentLoopWorker``.
4. AgentLoopWorker receive chunk and for each prompt, spawn a user defined ``AgentLoopBase`` instance, run ``run`` coroutine until end and get ``AgentLoopOutput``.
.. tip::
AgentLoopWorker schedules multiple coroutines concurrently. If the number of AgentLoopWorkers equals batch_size, then each worker is responsible for one prompt.
In agent loop, when user need LLM generate response:
5. Call ``AsyncLLMServerManager.generate`` with prompt_ids.
6. AsyncLLMServerManager select a server instance with least request in first turn and send request to it. (In following turns, the request will be sent to the same server instance).
7. AsyncLLMServer receives a request, issues ipc/rpc with model_runner, and generates a response. (There are slight differences between vLLM and SGLang, see below).
When all prompts in all AgentLoopWorker finish, AgentLoopManager gather results and return to PPOTrainer.
8. AgentLoopManager ``sleep`` all server instances, which will free kv cache and offload weights to CPU memory.
AsyncLLMServer
~~~~~~~~~~~~~~
AsyncLLMServer is the abstraction of LLM server with two types of generation api:
- `OpenAI chat completion <https://platform.openai.com/docs/api-reference/chat/create>`_: generate response for the given chat conversation.
- Token in token out: generate response ids for the given token ids.
We have officially supported vLLM and SGLang AsyncLLMServer, both of them implement the two api and are well tested.
Other inference engines should be easy to plug in by implementing the ``AsyncServerBase`` class.
.. code:: python
class AsyncServerBase(ABC):
@abstractmethod
async def chat_completion(self, raw_request: Request) -> JSONResponse:
"""OpenAI chat completion API.
Args:
raw_request (Request): raw json request
Returns:
JSONResponse: json response
API reference: https://platform.openai.com/docs/api-reference/chat/create
"""
raise NotImplementedError
@abstractmethod
async def generate(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str) -> list[int]:
"""Generate response ids given prompt ids.
Args:
prompt_ids (List[int]): prompt ids
sampling_params (Dict[str, Any]): sampling params
request_id (str): request id
Returns:
List[int]: response ids
"""
raise NotImplementedError
Chat completion vs Token in token out
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. warning::
The following conclusion is based on our recent experience and is still open to investigation and discussion.
Almost all agent frameworks (LangGraph, CrewAI, LlamaIndex, etc) call LLM with OpenAI chat completion api, and
keep chat history as messages. So user may expect that we should use the chat completion api in multi-turn rollout.
But based on our recent experience on single-turn training on DAPO and multi-turn training on `retool `_,
we found that the token_ids obtained by applying the chat template to the final messages may not equal the token_ids obtained by concatenating the prompt_ids and response_ids of each turn.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/multi_turn.png?raw=true
**Where does this inconsistency come from?**
First, the tool parser may alter the content. For example
.. code:: json
{"role": "assistant", "content": "Let me call a ... and get the result"}
After tool_calls extraction, the messages is like this:
.. code:: json
{"role": "assistant", "content": "Let me call a and get the result", "tool_calls": [{"name": "foo", "arguments": "{}"}]}
Encoding the extracted message back does not reproduce the original LLM generated response_ids.
Second, the `decode-encode` may also lead to inconsistency: `Agent-R1 issue#30 `_.
**What is the impact of this inconsistency?**
This inconsistency is not a big problem for serving/agent system, but is critical to RL training.
It causes the trajectory to deviate from the policy model distribution. We have observed that applying apply_chat_template
to the final chat history messages can prevent PPO training from converging, even in the single-turn setting.
vLLM
^^^^
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/async_vllm.png?raw=true
For vLLM, the Async LLM Engine is running in same process as the server, and ModelRunner is running in same process as FSDP/Megatron-LM workers.
The Async LLM Engine communicates with ModelRunner through ZeroMQ. When the server receives a request, it directly calls the engine to generate response_ids.
SGLang
^^^^^^
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/async_sglang.png?raw=true
For SGLang, the Async LLM Engine is running in the same process as FSDP/Megatron-LM worker-0, and it spawns multiple subprocesses as ModelRunner.
Also, the Async LLM Engine communicates with ModelRunner through ZeroMQ. When the server receives a request, it makes a remote call to worker-0 and gets response_ids.
AsyncLLMServerManager
~~~~~~~~~~~~~~~~~~~~~
AsyncLLMServerManager serves as a proxy to multiple AsyncLLMServer instances and provides:
- load balance: select a server instance with least request in first turn and send request to it.
- sticky session: bind request_id to server instance, so that the same request_id will be sent to the same server instance in following turns.
AsyncLLMServerManager is passed to ``AgentLoopBase.__init__``; whenever users want to interact with the LLM in the agent loop,
they can call ``AsyncLLMServerManager.generate`` to generate response_ids.
.. code:: python
class AsyncLLMServerManager:
async def generate(
self,
request_id,
*,
prompt_ids: list[int],
sampling_params: dict[str, Any],
) -> list[int]:
"""Generate tokens from prompt ids.
Args:
request_id (str): request id for sticky session.
prompt_ids (List[int]): List of prompt token ids.
sampling_params (Dict[str, Any]): Sampling parameters for the chat completion.
Returns:
List[int]: List of generated token ids.
"""
...
Next
----
- :doc:`Agentic RL Training<../start/agentic_rl>`: Quick start agentic RL training with gsm8k dataset.
- `LangGraph MathExpression `_: Demonstrate how to use LangGraph to build agent loop.
- `Retool `_: End-to-end retool paper reproduction using tool agent.
================================================
FILE: docs/advance/async-on-policy-distill.md
================================================
# Recipe: Async On-Policy Knowledge Distillation Trainer
**Authors:** Brilliant Hanabi, furunding
**Last updated:** 2025-11-08
## 1. Background
On-policy knowledge distillation (KD) trains a student policy to imitate a stronger teacher using samples drawn from the student's current policy. For each on-policy rollout the teacher returns soft, top-k token distributions and the student is optimized with a token-wise sparse KL objective that focuses learning on the teacher's high-probability modes. Because training examples come from the student's own state distribution, KD reduces distributional mismatch relative to off-policy distillation or supervised fine-tuning (SFT), improving stability and sample efficiency. Compared with reinforcement learning, KD avoids high-variance reward-based optimization and complex reward design by providing dense, informative per-token targets, which typically yields faster convergence and simpler scaling. Recent empirical and implementation-focused writeups (e.g., [ThinkingMachines' blog on on-policy distillation](https://thinkingmachines.ai/blog/on-policy-distillation/)) also demonstrate that on-policy distillation can deliver high-quality behavior with substantially lower compute and data requirements than many alternative approaches.
Built on verl’s Ray-based single-controller components, we initially assembled a strictly on-policy KD pipeline where rollout generation, teacher knowledge acquisition, and policy optimization ran in lockstep. In practice, this synchronous design proved highly inefficient: the three stages had to wait for one another, creating pipeline bubbles and underutilized GPUs. To address this, we extend the asynchronous schedulers introduced by the One-Step-Off Policy pipeline to overlap these phases. This overlap preserves the same distillation objective while trading some strict on-policy guarantees for substantial gains in end-to-end throughput and hardware utilization.
## 2. Distillation Overview and Objective
This recipe centers on on-policy knowledge distillation: the student policy learns from a stronger teacher on samples generated by the current policy (on-policy). For each input prompt, the student (actor) generates responses; the teacher provides top-k token distributions, and the student is trained to match them token-wise.
Core components:
1. Teacher signal: top-k log-probabilities and token indices per valid token position.
2. Student objective: sparse, token-level KL divergence between student logits and teacher top-k distribution.
Objective: encourage student probabilities $Q$ to cover teacher modes $P$ using token-wise $\mathrm{KL}(P\,\|\,Q)$ computed on the teacher's top-k support.
## 3. Efficient System Design
### 3.1 Schedulers (One-Step / Two-Step Off-Policy)
The native (serial) on-policy distillation process is shown in the figure below.

This recipe supports optional schedulers that overlap generation, teacher querying, and updates to improve throughput without changing the distillation objective.
#### 3.1.1 One-Step-Off-Policy

- Warm-up: 2 steps.
- Overlap pattern: rollout while actor update; weight sync while teacher retrieving.
- Timing keys: `sync_rollout_weights`, `wait_prev_gen`, `wait_prev_teacher`.
#### 3.1.2 Two-Step-Off-Policy

- Warm-up: 3 steps.
- Overlap pattern: rollout, actor update while teacher retrieving; interleave weight sync.
- Timing keys: `sync_rollout_weights`, `max(wait_prev_gen, wait_prev_prev_teacher)`.
Tip: Use `two_step_off` when teacher takes much more time than sync; `one_step_off` for simpler overlapping.
Practical details:
- Inputs per batch: `teacher_topk_logps`, `teacher_topk_indices`, `attention_mask` (to select valid token positions).
- Loss injection: last pipeline stage computes KL via a logits processor; earlier stages remain unchanged.
- Optional dynamic micro-batching groups sequences by density to reduce padding overhead.
The pipeline:
1. Actor parameters are synchronized to a rollout worker group (NCCL broadcast), which adds a small amount of latency.
2. Rollout workers (vLLM-backed) generate sequences asynchronously (`async_generate_sequences`).
3. Teacher client service (ZeroMQ based) returns top-k log-probabilities + token indices for each sequence (batched micro-requests), enabling KL-based guidance.
4. Megatron actor performs a KL divergence computation between student logits and teacher top-k distributions (custom TP-aware kernel in `megatron_kl_loss.py`).
5. Scheduling strategies (`one_step_off_scheduler`, `two_step_off_scheduler`) can overlap these phases (optional, for throughput).
### 3.2 Weights sync between actor and rollout
We initially followed the weight synchronization path from the One-Step-Off-Policy recipe (Ray collective broadcast across all actor and rollout ranks, plus Megatron-side allgather of parameter shards). In practice this became the dominant bottleneck, so we made three changes:
1. Batch-and-bulk load on the rollout side: instead of streaming tensors one-by-one (in one-step-off-policy recipe), we stage a bundle of parameter tensors and issue a single batched load into the rollout engine. In our setup this reduced the weight-loading time by roughly 3×.
2. Batch-and-bulk broadcast between the actor and rollout: instead of streaming tensors one-by-one (in one-step-off-policy recipe), we stage a bundle of parameter tensors and issue a single batched broadcast between the actor and rollout workers.
3. Replace allgather with gather-to-root in Megatron: parameter shards are gathered to actor rank 0 (rather than allgathered to everyone), and that root then serves as the single source for broadcasting to rollout ranks. On top of change 1, changes 2 and 3 together delivered an additional ~4× speedup in the synchronization phase.
## 4. High-Level Data & Control Flow
```
Driver (TaskRunner)
├─ Initialize Ray, tokenizer, datasets, worker groups
├─ Build ResourcePoolManager (actor vs rollout GPU layouts)
├─ Trainer.fit()
├─ init_workers(): build actor + rollout groups, broadcast weight metadata, create nccl collective group
├─ continuous_iterator(): epochs → batches
├─ scheduler (see Section 6)
• _async_gen_next_batch(): optional weight sync + non-blocking rollout
• _async_get_teacher_knowledge(): submit teacher requests, store future
├─ For each step:
• Sync rollout weights
• Retrieve (batch, gen_output, teacher_output) from futures
• Merge gen + teacher outputs → DataProto
• Compute metrics (response length stats, timing, throughput)
• Update actor (forward_backward_batch + KL loss + optimizer step)
• (Optional) save checkpoint
```
> Note: Schedulers are optional and are described in Section 3.1; the distillation objective is independent of how phases are overlapped.
## 5. Key Components
### 5.1 `OnPolicyDistillTrainer` (`ray_trainer.py`)
- Creates `GenerationBatchFuture` objects holding rollout and (later) teacher futures.
- Adds scheduling + teacher integration + modified metric emission (KL, timing, MFU).
### 5.2 Actor Worker (Megatron)
- `OnPolicyDistillActor.update_policy()` orchestrates micro-batch forward/backward.
- KL Loss injection via `logits_processor` during forward on pipeline last stage.
### 5.3 Rollout Worker (vLLM / SGLang)
- Pure inference mode (`init_model` builds model; no optimizer).
- `async_generate_sequences` returns a Ray future for overlapping.
### 5.4 Teacher Service (`teacher/`)
- Proxy + worker architecture (ZMQ REQ/REP) for batched top-k retrieval.
- `TeacherClient.submit()` returns a `Future`; aggregator composes micro-batches.
- Configurable temperature, max tokens, only-response mode.
### 5.5 KL Loss (`megatron_kl_loss.py`)
- Performs normalization & stable per-token probability construction across TP shards.
- Gradient is (student_probs - teacher_sparse_probs) scaled by upstream grad.
## 6. Configuration Highlights (`on_policy_distill_trainer.yaml`)
| Section | Purpose | Notable Keys |
|---------|---------|-------------|
| actor_rollout_ref.teacher | Teacher server | server_ip, server_port, n_server_workers |
| trainer | Global training control | total_epochs, save_freq, scheduler (one_step_off \| two_step_off), n_gpus_per_node, nnodes |
| rollout | Resource split for rollout | n_gpus_per_node, nnodes |
**Remember to set `trainer.n_gpus_per_node`, `trainer.nnodes`, `rollout.n_gpus_per_node` and `rollout.nnodes` to allocate GPU resources.**
### Dynamic Batch Size
Enable by:
```
actor_rollout_ref.actor.use_dynamic_bsz=True
actor_rollout_ref.actor.max_token_len=6000 # cap post-group token length
```
Improves utilization under variable sequence lengths.
### Resource Guidelines
- Actor pool: `trainer.nnodes * trainer.n_gpus_per_node` GPUs.
- Rollout pool: `rollout.nnodes * rollout.n_gpus_per_node` GPUs.
- Ensure teacher server capacity ≈ `n_server_workers` to avoid stalls (monitor `wait_prev_teacher`).
## 7. Usage Examples
### 7.1 Launch Teacher Server
Before training process, you should have a teacher server to provide logp information.
We provide a toy teacher server example with vLLM. It needs `telnet` to check proxy status and the `python` command to run. If you have not installed `telnet`, you can simply delete the corresponding code in `start_server.sh`. Some operating systems use `python3` rather than `python`, in which case you also need to modify the script accordingly. You can also change the teacher port if you encounter a port conflict.
Three arguments can be set for the vLLM backend in `start_server.sh` / `worker.py`: `--tp-size`, `--n-logprobs` and `--ckpt-path`. Set them before you start the server.
We also provide a toy multi-node teacher server. You can start the main node using `start_server.sh` and start the slave nodes using `join_server.sh`. Still remember to set args in `join_server.sh`, especially the `$PROXY_IP` and `$PROXY_BACKEND_PORT` of main node.
When training, student will automatically use the teacher's topk (n-logprobs) to set its own topk argument at line 83 of `recipe/gkd/megatron_kl_loss.py`, so you don't need to set student's topk argument.
```bash
cd recipe/gkd/teacher
bash start_server.sh
# Exports ports and launches proxy + worker (default vLLM backend)
```
Verify with:
```bash
telnet localhost 15555
```
### 7.2 Minimal Local (Megatron + vLLM) Run
```bash
python3 -m recipe.gkd.main_gkd \
--config-path=recipe/gkd/config \
--config-name=on_policy_distill_trainer \
actor_rollout_ref.model.path=/path/to/MODEL \
data.train_files=/path/to/train.parquet \
trainer.total_epochs=2 \
trainer.n_gpus_per_node=4 rollout.n_gpus_per_node=2 \
actor_rollout_ref.teacher.server_ip=127.0.0.1 \
actor_rollout_ref.teacher.server_port=15555 \
trainer.scheduler=one_step_off
```
(Requires a running teacher server).
### 7.3 Ray Job Submission (Distilled 16B Example)
See `run_moonlight_dsv3_training.sh` for a full script including:
- Dist ckpt path setup (`dist_checkpointing_path`)
- Expert parallel sizing (EP / ETP)
- Dynamic batch sizing
- Two-step-off scheduling for deeper overlap.
Submit (after adjusting paths):
```bash
bash recipe/gkd/run_moonlight_dsv3_training.sh
```
## 8. Metrics & Monitoring
Emitted metrics include (prefixes may vary):
- Timing: `timing/wait_prev_gen`, `timing/sync_rollout_weights`, `timing/get_teacher_knowledge`, `timing/update_actor`.
- Sequence stats: `response_seq_len/*` (avg, max, min, counts).
- Performance: `perf/mfu/actor`, `perf/max_memory_allocated_gb`, `perf/cpu_memory_used_gb`.
- Distillation: `actor/kl_loss`, `actor/grad_norm`, `actor/lr`.
Interpretation Tips:
- High `wait_prev_teacher` → scale `n_server_workers` and allocate more teacher GPUs or reduce per-request batch size, or just use `two_step_off`.
- High `wait_prev_gen` with uniform lengths → allocate more rollout GPUs.
- High `sync_rollout_weights` → check NCCL env / network congestion and try to modify `actor_rollout_ref.rollout.update_weights_bucket_megabytes`.
## 9. Extensibility Notes
- Add new schedulers by following interface returning `(epoch, batch, gen_output, teacher_output, timing_dict)`.
- Integrate different distillation signals (e.g., hidden states, intermediate reasoning tokens) by extending `teacher_utils.get_teacher_knowledge` and modifying `logits_processor`.
## 10. Functional Support Summary
| Category | Supported |
|----------|-----------|
| Train engine | Megatron |
| Rollout engine | vLLM |
| Distillation signal | Teacher top-k logprobs & indices |
| Scheduling | one_step_off, two_step_off |
## 11. Quick Checklist Before Running
- Teacher server reachable (`telnet <server_ip> <server_port>`).
- `actor_rollout_ref.model.path` contains the correct Megatron/HF config artifacts.
- `train_files` points to a parquet dataset compatible with this recipe's dataset loader.
- NCCL environment vars set (see `config/runtime_env.yaml`).
---
Feel free to open issues or PRs to extend scheduler variants, add new distillation objectives, broaden engine support, or make other improvements.
================================================
FILE: docs/advance/attention_implementation.rst
================================================
.. _attention-implementation-override:
Attention Implementation Override
==================================
Last updated: 10/31/2025.
By default, VERL's FSDP workers use ``flash_attention_2`` as the attention implementation for improved performance.
However, you can now override this setting to use different attention implementations based on your needs.
Supported Attention Implementations
-----------------------------------
The following attention implementations are supported (subject to model and hardware compatibility):
- ``flash_attention_2``: High-performance attention implementation (default)
- ``eager``: Standard PyTorch attention implementation
- ``sdpa``: Scaled Dot-Product Attention (PyTorch native)
When to Override
----------------
You might want to override the attention implementation in the following scenarios:
- **Debugging**: Use ``eager`` for easier debugging and better error messages
- **Compatibility**: Some models or hardware configurations may not support ``flash_attention_2``
- **Memory constraints**: Different implementations have different memory characteristics
- **Performance tuning**: Testing different implementations for optimal performance
Configuration Examples
-----------------------
PPO Training with Eager Attention
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To override the attention implementation for the actor, rollout, and reference models:
.. code:: bash
python3 ppo_trainer.py \
+actor_rollout_ref.model.override_config.attn_implementation=eager \
[other parameters...]
PPO Training with SDPA Attention
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code:: bash
python3 ppo_trainer.py \
+actor_rollout_ref.model.override_config.attn_implementation=sdpa \
[other parameters...]
Critic Model Override
~~~~~~~~~~~~~~~~~~~~~
For training configurations that include a critic model, you can also override its attention implementation:
.. code:: bash
python3 ppo_trainer.py \
+actor_rollout_ref.model.override_config.attn_implementation=eager \
+critic.model.override_config.attn_implementation=eager \
[other parameters...]
YAML Configuration
~~~~~~~~~~~~~~~~~~
You can also specify the attention implementation in your YAML configuration file:
.. code:: yaml
actor_rollout_ref:
model:
override_config:
attn_implementation: eager
# other overrides...
critic: # if using a critic model
model:
override_config:
attn_implementation: eager
# other overrides...
Important Notes
---------------
**Backward Compatibility**: If you don't specify ``attn_implementation`` in the override config,
VERL will continue to use ``flash_attention_2`` by default, ensuring backward compatibility with existing configurations.
**Model Support**: Not all models support all attention implementations. Ensure your model is compatible
with the chosen attention implementation before training.
**Performance Impact**: Different attention implementations have varying performance characteristics.
``flash_attention_2`` typically offers the best performance, while ``eager`` provides better debugging capabilities.
**Hardware Dependencies**: Some attention implementations (like ``flash_attention_2``) may require
specific hardware or CUDA versions. If you encounter compatibility issues, try using ``eager`` or ``sdpa``.
Troubleshooting
---------------
If you encounter errors when using a specific attention implementation:
1. **Check model compatibility**: Verify that your model supports the chosen attention implementation
2. **Try eager attention**: Use ``attn_implementation=eager`` as a fallback for debugging
3. **Check hardware requirements**: Ensure your hardware supports the attention implementation
4. **Review error messages**: Attention implementation errors often provide clear guidance on supported options
Example Error Resolution
~~~~~~~~~~~~~~~~~~~~~~~~
If you see an error like "flash_attention_2 is not supported", you can resolve it by switching to eager attention:
.. code:: bash
# Instead of the default flash_attention_2
python3 ppo_trainer.py +actor_rollout_ref.model.override_config.attn_implementation=eager
This override ensures your training can proceed while you investigate the flash attention compatibility issue.
================================================
FILE: docs/advance/checkpoint.rst
================================================
.. _checkpoint-page:
Using Checkpoints to Support Fault Tolerance Training
=====================================================
Last updated: 06/25/2025.
There could be training errors or machine failure during the whole RLHF training process,
so it is recommended to enable checkpoints to minimize your loss.
The API Interface has already been listed in :ref:`config-explain-page`,
and we will not repeat them. But there are still some technical details
we hope to clarify.
.. note::
Notice that the ``checkpoint.contents`` field has no effect on FSDP checkpoints except for ``hf_model``;
the other 3 fields are bound together when saving and loading. We recommend including all of ``model``, ``optimizer`` and ``extra``.
Checkpoint Saving Directory Structure
-------------------------------------
Commonly, we use the ``default_local_dir`` declared in ``ppo_trainer.yaml`` or ``ppo_megatron_trainer.yml``
as the prefix when saving checkpoints, which is ``checkpoints/${trainer.project_name}/${trainer.experiment_name}``.
So the inner checkpoint structure of **FSDP** is like:
.. code::
checkpoints/${trainer.project_name}/${trainer.experiment_name}
├── global_step_${i}
│ ├── actor
│ │ ├── huggingface # default save config and tokenizer, save huggingface model if include ``hf_model`` in checkpoint.contents
│ │ └── fsdp_config.json # FSDP config file, including world_size and fsdp version
│ │ ├── model_world_size_{self.world_size}_rank_{self.rank}.pt
│ │ ├── optim_world_size_{self.world_size}_rank_{self.rank}.pt
│ │ └── extra_state_world_size_{self.world_size}_rank_{self.rank}.pt
│ ├── critic
│ │ ├── huggingface
│ │ └── fsdp_config.json
│ │ ├── model_world_size_{self.world_size}_rank_{self.rank}.pt
│ │ ├── optim_world_size_{self.world_size}_rank_{self.rank}.pt
│ │ └── extra_state_world_size_{self.world_size}_rank_{self.rank}.pt
└── latest_checkpointed_iteration.txt
All model shards, optimizers and extra states are stored together, in a sharded and distributed way.
While **Megatron** current checkpoint structure is:
.. code::
checkpoints/${trainer.project_name}/${trainer.experiment_name}
├── global_step_${i}
│ ├── actor
│ │ ├── huggingface # default save config and tokenizer, save huggingface model if include ``hf_model`` in checkpoint.contents
│ │ └── dist_ckpt # save sharded model/optimizer/rng_states, naming the same as Megatron
│ └── critic
│ │ ├── huggingface
│ │ └── dist_ckpt
└── latest_checkpointed_iteration.txt
Convert FSDP and Megatron Checkpoints to HuggingFace Format Model
-----------------------------------------------------------------
We provide a tool to convert the FSDP and Megatron checkpoints to HuggingFace format model.
The tool is located in ``verl/model_merger``. For older versions of verl that don't include fsdp_config.json in checkpoints, you can use the legacy model merger located at ``verl/scripts/legacy_model_merger.py``.
The script supports two main sub-commands: `merge` (to convert and save checkpoints) and `test` (to validate merged checkpoints against a reference model).
The arguments for the `merge` sub-command are as follows:
.. code:: bash
usage: python -m verl.model_merger merge [-h] --backend {fsdp,megatron} [--local_dir LOCAL_DIR] [--tie-word-embedding] [--is-value-model] [--use_cpu_initialization] [--target_dir TARGET_DIR]
[--hf_upload_path HF_UPLOAD_PATH] [--private]
options:
-h, --help show this help message and exit
--backend {fsdp,megatron}
The backend of the model
--local_dir LOCAL_DIR
Path to the saved model checkpoints
--tie-word-embedding Whether to tie word embedding weights (currently only Megatron supported)
--is-value-model Whether the model is a value model (currently only Megatron supported)
--use_cpu_initialization
Whether to use CPU initialization for the model. This is useful for large models that cannot fit into GPU memory during initialization.
--target_dir TARGET_DIR
Directory to save the merged huggingface model
--hf_upload_path HF_UPLOAD_PATH
Hugging Face repository ID to upload the model
--private Whether to upload the model to a private Hugging Face repository
Example usage for merging Megatron checkpoints:
.. code:: bash
python -m verl.model_merger merge \
--backend megatron \
--tie-word-embedding \
--local_dir checkpoints/verl_megatron_gsm8k_examples/qwen2_5_0b5_megatron_saveload/global_step_1/actor \
--target_dir /path/to/merged_hf_model
Example usage for distributed merging Megatron checkpoints:
.. code:: bash
torchrun --nproc_per_node 1 --nnodes 8 --node_rank ${RANK} -m verl.model_merger merge \
--backend megatron \
--tie-word-embedding \
--local_dir checkpoints/verl_megatron_gsm8k_examples/qwen2_5_0b5_megatron_saveload/global_step_1/actor \
--target_dir /path/to/merged_hf_model
Example usage for merging FSDP checkpoints:
.. code:: bash
python -m verl.model_merger merge \
--backend fsdp \
--local_dir checkpoints/verl_fsdp_gsm8k_examples/qwen2_5_0b5_fsdp_saveload/global_step_1/actor \
--target_dir /path/to/merged_hf_model
Megatron Merger details
-----------------------
The current implementation of decoder layers uses ``nn.ModuleList`` to store the layers,
and thus the model layers on every PP rank and VPP rank start their indices from 0.
There are 3 ways to correct this behavior:
1. Modify the decoder layer's state_dict, add ``offset`` to each layer's index, thus rewrite ``nn.ModuleList`` implementation.
2. Modify the layer index when saving checkpoint and recover them when loading checkpoint.
3. Let the checkpoint merger do this work and calculate the actual ``offset`` from the ``state_dict`` only, which is a little complex.
The current implementation uses solution 2.
HuggingFace to Megatron DistCheckpoint details
----------------------------------------------
Through ``mbridge``, we can directly save the mcore model to huggingface format during training.
No need to convert the model to Megatron dist-checkpoint format.
Original Checkpoint Utils
-------------------------
Original Checkpoint Utils refer to original checkpoint implementation in ``verl/models/[model]/megatron/checkpoint_utils``.
We only need ``[model]_loader.py`` in original checkpoint utils now, since we get rid of storing ``hf_model`` every time (which is not recommended for large model training, try only saving sharded models if you can).
.. note::
Note that ``[model]_loader`` only supports environments where **the storage cluster is reachable from every compute node**.
Because it utilizes **sharded load way to minimize the loading checkpoint overhead**.
Every rank loads its own data from ``state_dict`` which can be accessed by all of them.
While there is also no need to broadcast among DP ranks, since the saved state_dict is only produced by DP rank 0.
For users who can **only place the huggingface model on one device**, we keep the original costly implementation in ``[model]_loader_deprecated``. In this implementation, rank 0 broadcasts all weights to each tp and pp rank, and then dp rank 0 broadcasts to all dp ranks. There may be a risk of OOM.
To use deprecated loader, change the import package of ``load_state_dict_to_megatron_llama``.
================================================
FILE: docs/advance/dpo_extension.rst
================================================
Extend to other RL(HF) algorithms
=================================
Last updated: 02/25/2025.
We already implemented the complete training pipeline of the PPO
algorithms. To extend to other algorithms, we analyze the high-level
principle to use verl and provide a tutorial to implement the DPO
algorithm. Users can follow the similar paradigm to extend to other RL algorithms.
.. note:: **Key ideas**: Single process drives multi-process computation and data communication.
Overall Approach
----------------
Step 1: Consider what multi-machine multi-GPU computations are needed
for each model, such as ``generate_sequences``, ``compute_log_prob`` and
``update_policy`` in the actor_rollout model. Implement distributed
single-process-multiple-data (SPMD) computation and encapsulate them
into APIs
Step 2: Based on different distributed scenarios, including FSDP and 3D
parallelism in Megatron-LM, implement single-process control of data
interaction among multi-process computations.
Step 3: Utilize the encapsulated APIs to implement the control flow
Example: Online DPO
-------------------
We use verl to implement a simple online DPO algorithm. The algorithm
flow of Online DPO is as follows:
1. There is a prompt (rollout) generator which has the same weight as
the actor model. After a batch of prompts are fed into the generator,
it generates N responses for each prompt.
2. Send all the prompts + responses to a verifier for scoring, which can
be reward model or a rule-based function. Then sort them in pairs to
form a training batch.
3. Use this training batch to train the actor model using DPO. During
the process, a reference policy is needed.
Step 1: What are the multi-machine multi-GPU computations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
**Sample Generator**
Implementation details:
.. code:: python
from verl.single_controller.base import Worker
from verl.single_controller.ray import RayWorkerGroup, RayClassWithInitArgs, RayResourcePool
import ray
@ray.remote
class SampleGenerator(Worker):
def __init__(self, config):
super().__init__()
self.config = config
def generate_sequences(self, data):
pass
Here, ``SampleGenerator`` can be viewed as a group of processes launched by
``torchrun``, with each process running the same code (SPMD).
``SampleGenerator`` needs to implement a ``generate_sequences`` API for
the control flow to call. The implementation details inside can use any
inference engine including vllm, sglang and huggingface. Users can
largely reuse the code in
verl/verl/workers/rollout/vllm_rollout/vllm_rollout.py and we won't
go into details here.
**ReferencePolicy inference**
API: compute reference log probability
.. code:: python
from verl.single_controller.base import Worker
import ray
@ray.remote
class ReferencePolicy(Worker):
def __init__(self):
super().__init__()
self.model = Model()
def infer(self, data):
return self.model(data)
**Actor update**
API: Update actor model parameters
.. code:: python
from verl.single_controller.base import Worker
import ray
@ray.remote
class DPOActor(Worker):
def __init__(self):
super().__init__()
self.model = Model()
self.model = FSDP(self.model) # or other distributed strategy
self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
self.loss_fn = xxx
def update(self, data):
self.optimizer.zero_grad()
logits = self.model(data)
loss = self.loss_fn(logits)
loss.backward()
self.optimizer.step()
**Notes: How to distinguish between control processes and distributed computation processes**
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- Control processes are generally functions directly decorated with
``@ray.remote``
- Computation processes are all wrapped into a ``RayWorkerGroup``.
Users can reuse most of the distributed computation logic implemented
in the PPO algorithm, including the FSDP and Megatron-LM backends in
verl/verl/trainer/ppo.
Step 2: Based on different distributed scenarios, implement single-process control of multi-process data interaction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
**The core problem to solve here is how a single process sends data to
multiple processes, drives multi-process computation, and how the
control process obtains the results of multi-process computation.**
First, we initialize the multi-process ``WorkerGroup`` in the control
process.
.. code:: python
@ray.remote(num_cpus=1)
def main_task(config):
# construct SampleGenerator
resource_pool = RayResourcePool(process_on_nodes=[8] * 2) # 16 GPUs
ray_cls = RayClassWithInitArgs(SampleGenerator, config=config)
# put SampleGenerator onto resource pool
worker_group = RayWorkerGroup(resource_pool, ray_cls)
# construct reference policy
As we can see, in the control process, multiple processes are wrapped
into a ``RayWorkerGroup``. Inside this ``WorkerGroup``, there is a
``self._workers`` member, where each worker is a RayActor
(https://docs.ray.io/en/latest/ray-core/actors.html) of SampleGenerator.
ray_trainer.md also provides an implementation of
``MegatronRayWorkerGroup``.
Assuming the model is distributed using FSDP, and there is a batch of
data on the control process, for data parallelism, the underlying
calling process is:
.. code:: python
data = xxx
data_list = data.chunk(dp_size)
output = []
for i, d in enumerate(data_list):
# worker_group._workers[i] is a SampleGenerator
output.append(worker_group._workers[i].generate_sequences.remote(d))
output = ray.get(output)
output = torch.cat(output)
Single process calling multiple processes involves the following 3
steps:
1. Split the data into DP parts on the control process.
2. Send the data to remote, call the remote computation through RPC, and
utilize multi-process computation.
3. Obtain the computation results of each worker on the control process
and merge them.
Frequently calling these 3 steps on the controller process greatly hurts
code readability. **In verl, we have abstracted and encapsulated these 3
steps, so that the worker's method + dispatch + collect can be
registered into the worker_group**
.. code:: python
from verl.single_controller.base.decorator import register
def dispatch_data(worker_group, data):
return data.chunk(worker_group.world_size)
def collect_data(worker_group, data):
return torch.cat(data)
dispatch_mode = {
'dispatch_fn': dispatch_data,
'collect_fn': collect_data
}
@register(dispatch_mode=dispatch_mode)
def generate_sequences(self, data):
pass
In this way, we can directly call the method inside the worker through
the ``worker_group`` on the control (driver) process (which is a single
process):
.. code:: python
output = worker_group.generate_sequences(data)
This single line includes data splitting, data distribution and
computation, and data collection.
Furthermore, the model parallelism size of each model is usually fixed,
including dp, tp, pp. So for these common distributed scenarios, we have
pre-implemented specific dispatch and collect methods in ``decorator.py`` (``verl/single_controller/base/decorator.py``), which can be directly used to wrap the computations.
.. code:: python
from verl.single_controller.base.decorator import register, Dispatch
@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
def generate_sequences(self, data: DataProto) -> DataProto:
pass
Here it requires the data interface to be ``DataProto``. The definition of
``DataProto`` is in ``protocol.py``.
Step 3: Main training loop
~~~~~~~~~~~~~~~~~~~~~~~~~~
With the above training flows, we can implement the algorithm's control
flow. It is recommended that ``main_task`` is also a ray remote process.
.. code:: python
@ray.remote(num_cpus=1)
def main_task(config):
# construct SampleGenerator
resource_pool = RayResourcePool(process_on_nodes=[8] * 2) # 16 GPUs
ray_cls = RayClassWithInitArgs(SampleGenerator, config=config)
# put SampleGenerator onto resource pool
sample_gen = RayWorkerGroup(resource_pool, ray_cls)
# construct reference policy
ray_cls = RayClassWithInitArgs(ReferencePolicy)
ref_policy = RayWorkerGroup(resource_pool, ray_cls)
# construct actor
ray_cls = RayClassWithInitArgs(DPOActor)
dpo_policy = RayWorkerGroup(resource_pool, ray_cls)
dataloader = DataLoader()
for data in dataloader:
# generate data
data = sample_gen.generate_sequences(data)
# generate scores for each data
data = generate_scores(data)
# generate pairwise data using scores
data = generate_pairwise_data(data)
# generate ref_log_prob
data.batch['ref_log_prob'] = ref_policy.infer(data)
# update using dpo
dpo_policy.update(data)
# logging
Here, different ``WorkerGroups`` can be placed in the same resource pool or
in different resource pools using ``create_colocated_worker_cls``,
similar to what is done in ``ray_trainer.py``.
================================================
FILE: docs/advance/fp8.md
================================================
# FP8 RL in verl
Last updated: 03/05/2026
verl supports two FP8 modes for accelerating RL training:
| Mode | Training Precision | Rollout Precision |
|------|-------------------|-------------------|
| **FP8 Rollout Only** | BF16 | FP8 |
| **FP8 End-to-End** | FP8 (Megatron) | FP8 (vLLM) |
> [!TIP]
> For ready-to-run scripts, see the [low-precision recipe directory](https://github.com/verl-project/verl-recipe/low_precision).
---
## FP8 Rollout Only
FP8 rollout-only mode keeps training in BF16 and quantizes rollout inference to FP8. This reduces GPU memory during generation and speeds up rollout without affecting training precision.
### Implementation
We monkey patch several vLLM functions to enable FP8 rollout for reinforcement learning:
1. **Quantize weights**: Quantize model weights on-the-fly from higher-precision formats to FP8.
2. **Process weights after loading**: For vLLM, we replace the `vllm.model_executor.layers.quantization.fp8.Fp8LinearMethod.process_weights_after_loading` function to handle weight processing after quantization. For SGLang, this patch is not needed as it natively supports loading quantized weights.
### Support Matrix
- FP8 blockwise quantization for rollout
- Used in Deepseek, which is 1x128 quantization for activations and 128x128 quantization for model weights
- Dense models and MoE models
- Async rollout interfaces
- vLLM 0.10.x & vLLM 0.11 & vLLM 0.12 & SGLang 0.5.5
- FSDP and Megatron training backends
### Usage
Enable in config file:
```yaml
rollout:
quantization: "fp8"
```
Or via command line:
```bash
actor_rollout_ref.rollout.quantization=fp8
```
### Experiments and Outcomes
#### Qwen3-8B-Base Dense Model
**Configuration**
- DAPO recipe. AIME24 online validation.
- vLLM(FP8 spmd rollout) + FSDP
- Note that SPMD rollout has been deprecated, so we removed the FP8 SPMD rollout.
- Prompt batch size 32, n=16.
- Rollout batch size: 32\*3*16
- Train_batch_size & ppo_mini_batch_size 32
- Max response length 20K
- Token-level TIS, C=2
- 8*H100
- vLLM 0.10.0+CUDA 12.6 vs vLLM 0.11.0+CUDA 12.9
**Accuracy**

*dark green: BF16, orange: FP8 rollout + token-level TIS, light green: FP8 rollout without TIS*
Results and observations:
- With TIS, FP8 rollout aligns with BF16
- Obvious accuracy drop when TIS is not enabled
- Higher mismatch KL, but within an acceptable range throughout training
**Performance**

*green: BF16, orange: FP8 rollout + CUDA12.6 + DeepGemm, purple: FP8 rollout + CUDA 12.9 + DeepGemm*
Results and observations:
- FP8 rollout leads to a ~12% rollout speedup with CUDA 12.6 + DeepGemm
- When upgrading to CUDA 12.9, the speedup can be up to ~18%
#### Qwen3-30B-A3B-Base MoE Model
**Configuration**
- DAPO recipe. AIME24 online validation.
- FP8 async rollout, vLLM+FSDP
- Prompt batch size 32
- Rollout batch size: 32\*3*16
- Train_batch_size & ppo_mini_batch_size 32
- Max response length 20K
- Token-level TIS, C=2
- 2\*8*H100
- vLLM 0.10.0+CUDA 12.6
**Accuracy**

*grey: BF16 + token-level TIS, red: FP8 rollout + token-level TIS*
Results and observations:
- Rollout & training distribution mismatch is in general higher for MoE
- Rollout correction required even for BF16
- FP8 rollout with token-level TIS aligns with BF16
**Performance**

*grey: BF16 + token-level TIS, red: FP8 rollout + token-level TIS*
Results and observations:
- FP8 rollout : over 35% rollout speedup
- Expecting more perf gain with CUDA 12.9
---
## FP8 End-to-End (Training + Rollout)
FP8 E2E applies FP8 to the entire RL pipeline: forward/backward passes via Transformer Engine, FP8 optimizer states, and FP8 rollout inference via vLLM. This maximizes memory savings and throughput.
### Requirements
- **CUDA 12.9+** (required for block-wise FP8 scaling)
- **Transformer Engine** with block-wise FP8 support
- Environment variable: `NVTE_FP8_BLOCK_SCALING_FP32_SCALES=1`
### Key Configuration
```yaml
# FP8 training via Transformer Engine
actor_rollout_ref.actor.megatron.override_transformer_config:
fp8: "hybrid" # FP8 forward + backward; also supports "e4m3"
fp8_recipe: "blockwise" # block-wise scaling
# FP8 optimizer
actor_rollout_ref.actor.optim.override_optimizer_config:
fp8_recipe: "blockwise"
# FP8 rollout inference (vLLM)
actor_rollout_ref.rollout:
quantization: fp8
```
### Support Matrix
- Megatron training backend (via Megatron-Bridge)
- Verified on Qwen3-30B-A3B and Qwen3-8B
- Block-wise FP8 scaling (`fp8_recipe: "blockwise"`)
### Experiments and Results
#### Qwen3-30B-A3B MoE Model
**Configuration**
- DAPO recipe. AIME24 online validation.
- Megatron + Megatron-Bridge, FP8 async rollout with vLLM
- MoE router in BF16 for both vLLM and Megatron-Core
- Prompt batch size 128, n=16
- Max response length 20K
- Token-level TIS, C=2
- 2\*8*H100, CUDA 12.9

*Orange: BF16, Green: FP8 E2E, Red: FP8 rollout + BF16 training*
Results and observations:
- FP8 E2E achieves comparable accuracy to the BF16 baseline, with the two curves closely aligned throughout training.
- The training/inference precision mismatch (measured by KL divergence) follows the ordering: FP8 rollout-only > FP8 E2E > BF16 E2E. This is expected, as FP8 E2E maintains consistent precision across both training and inference, resulting in lower distribution mismatch than the FP8 rollout-only setting where training remains in BF16.
---
## Citation
For more extensive experiments, ablation studies, and analysis on FP8 reinforcement learning, please refer to our technical report:
```bibtex
@article{qiu2026fp8rl,
title={FP8-RL: A Practical and Stable Low-Precision Stack for LLM Reinforcement Learning},
author={Qiu, Zhaopeng and Yu, Shuang and Zhang, Jingqi and Zhang, Shuai and Huang, Xue and Yang, Jingyi and Lai, Junjie},
journal={arXiv preprint arXiv:2601.18150},
year={2026},
url={https://arxiv.org/abs/2601.18150}
}
```
================================================
FILE: docs/advance/fsdp_extension.rst
================================================
Add models with the FSDP backend
==================================
Last updated: 02/09/2025.
Model
--------------------------
In principle, our FSDP backend can support any HF model and we can
synchronize the actor model weights with vLLM using `hf_weight_loader.py` under `third_party/vllm`.
However, ``hf_weight_loader`` will gather the full state_dict of a
model during synchronization, which may cause OOM. We suggest using
``dtensor_weight_loader``, which gathers the full model parameters layer by
layer to reduce the peak memory usage. We already support the dtensor weight
loader for the models below in `dtensor_weight_loaders.py` under `third_party/vllm`:
- ``GPT2LMHeadModel``
- ``LlamaForCausalLM``
- ``LLaMAForCausalLM``
- ``MistralForCausalLM``
- ``InternLMForCausalLM``
- ``AquilaModel``
- ``AquilaForCausalLM``
- ``Phi3ForCausalLM``
- ``GemmaForCausalLM``
- ``Gemma2ForCausalLM``
- ``GPTBigCodeForCausalLM``
- ``Starcoder2ForCausalLM``
- ``Qwen2ForCausalLM``
- ``DeepseekV2ForCausalLM``
To implement ``dtensor_weight_loader`` for a model that's supported in
vLLM, follow the guide for the gemma model below:
1. Copy the
``load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]])`` from the vllm model class
to ``dtensor_weight_loaders.py``
2. Modify the arguments to
``(actor_weights: Dict, vllm_model: nn.Module)``
3. Replace ``self`` with ``vllm_model``
4. Add the
``local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)``
before each ``param = params_dict[name]`` and modify the following
weight loading using ``local_loaded_weight``.
5. Register the implemented dtensor weight loader to ``__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__``.
.. code-block:: diff
- def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+ def gemma_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
("qkv_proj", "k_proj", "k"),
("qkv_proj", "v_proj", "v"),
("gate_up_proj", "gate_proj", 0),
("gate_up_proj", "up_proj", 1),
]
- params_dict = dict(self.named_parameters())
+ params_dict = dict(vllm_model.named_parameters())
loaded_params = set()
- for name, loaded_weight in weights:
+ for name, loaded_weight in actor_weights.items():
for (param_name, shard_name, shard_id) in stacked_params_mapping:
if shard_name not in name:
continue
name = name.replace(shard_name, param_name)
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
param = params_dict[name]
weight_loader = param.weight_loader
- weight_loader(param, loaded_weight, shard_id)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
break
else:
# lm_head is not used in vllm as it is tied with embed_token.
# To prevent errors, skip loading lm_head.weight.
if "lm_head.weight" in name:
continue
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
param = params_dict[name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
- weight_loader(param, loaded_weight)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
loaded_params.add(name)
unloaded_params = params_dict.keys() - loaded_params
if unloaded_params:
raise RuntimeError(
"Some weights are not initialized from checkpoints: "
f"{unloaded_params}")
================================================
FILE: docs/advance/fully_async.md
================================================
# Recipe: Fully Async Policy Trainer
**Author:** `https://github.com/meituan-search`
Last updated: 02/05/2026.
This document introduces a fully asynchronous PPO training system that completely decouples the Trainer and Rollouter,
supporting asynchronous sample generation and training.
Under this system, we achieved a 2.35x-2.67x performance improvement when training the Qwen2.5-7B model with 128 GPUs,
without significantly affecting the results.
## Introduction
### Background
The separated rollout and train architecture, compared to the colocate architecture, can allocate resources more
flexibly and design more flexible training logic, thereby addressing issues such as low GPU utilization and training
efficiency caused by long-tail problems.
The one_step_off_policy alleviates the problem of long rollout times and achieves some gains in training efficiency by
designing a separated architecture and performing asynchronous training between rollout and train for one round.
However, it forcibly uses data from one round of asynchronous training, which is not flexible enough and cannot
completely eliminate the impact of long-tail on training efficiency.
In other frameworks such as AReaL, Magistral, StreamRL, and AsyncFlow, asynchronous training and streaming training have
been implemented based on the separated architecture and have achieved gains.
We borrowed from their methods and implemented them in verl. The fully_async_policy supports asynchronous, streaming, and
partial
rollout training.
By reasonably setting parameters such as resource allocation and parameter synchronization frequency, fully_async_policy
can significantly improve training efficiency.
> Magistral https://arxiv.org/abs/2506.10910
>
> AReaL: A Large-Scale Asynchronous Reinforcement Learning System for Language
> Reasoning https://arxiv.org/abs/2505.24298
>
> StreamRL: Scalable, Heterogeneous, and Elastic RL for LLMs with Disaggregated Stream
> Generation https://arxiv.org/abs/2504.15930
>
> AsyncFlow: An Asynchronous Streaming RL Framework for Efficient LLM Post-Training https://arxiv.org/abs/2507.01663
>
### Core Contributions
* **Resource Isolation**: Unlike using hybrid_engine, Rollouter and Trainer use separate computing resources and need to
specify the resources they occupy separately.
* **Parallel Generation and Training**: While the Trainer is training, the Rollouter is generating new samples.
* **Multi-step Asynchronous**: Compared to one step off policy, it supports asynchronous settings from 0.x steps to
multiple steps, making the asynchronous solution more flexible.
* **NCCL Parameter Synchronization**: Built on NCCL communication primitives and modeled
on [checkpoint-engine](https://github.com/MoonshotAI/checkpoint-engine), it
achieves efficient parameter synchronization between Rollouter and Trainer.
* **Stream Inference and Training**: Rollouter generates data sample by sample, and data transmission uses a single
sample as the minimum transmission unit.
* **Asynchronous Training and Freshness Control**: By setting the parameter async_training.staleness_threshold, it
supports training with samples generated by old parameters.
* **PartialRollout**: The Rollouter's inference process supports partial rollout logic. During parameter
synchronization, by adding `sleep()` and `resume()` logic, it
saves samples from ongoing rollouts and continues using them in the next rollout, reducing the time spent waiting for
ongoing tasks to finish during parameter synchronization.
Currently, the supported usage mode is megatron/fsdp+vllm. vllm must use the server mode based on AgentLoop.
## Design
The overall architecture of fully_async_policy is shown in the figure below. fully_async_policy mainly consists of four
parts: Rollouter, MessageQueue, Trainer, and ParameterSynchronizer.

1. Rollouter generates sequences sample by sample and puts the generated samples into the MessageQueue, with the
production speed controlled by freshness.
2. MessageQueue is used to temporarily store samples generated by Rollouter.
3. Trainer fetches samples from MessageQueue sample by sample. After fetching `require_batches*ppo_mini_batch_size`
samples, it will perform training. After training for async_training.trigger_parameter_sync_step rounds, it triggers
a parameter synchronization with Rollouter.
4. ParameterSynchronizer implements the NCCL synchronous parameter synchronization capability.
The source of benefits compared to the base scheme lies in the fact that in the colocate case, using more resources for
rollout cannot solve the idleness caused by long-tail samples.
After we perform resource isolation, the time for rollout and train may be longer than before (because fewer resources
are used),
but the overlap in their time consumption reduces the end-to-end time consumption.

## Usage
### Parameter Description
| super params | implication |
|------------------------------------------------------------------|------------------------------------------------------------------------------------------------|
| `trainer.nnodes` | Number of nodes for Trainer |
| `trainer.n_gpus_per_node` | Number of GPUs per node for Trainer |
| `rollout.nnodes` | Number of nodes for Rollouter |
| `rollout.n_gpus_per_node` | Number of GPUs per node for Rollouter |
| `data.train_batch_size` | In the fully async strategy, this value is not effective (default is 0) |
| `data.gen_batch_size` | In the fully async strategy, uses streaming sample production logic (default is 1) |
| `rollout.total_rollout_steps` | Total number of rollout samples |
| `rollout.test_freq` | How many times Rollouter updates parameters before performing a validation |
| `actor_rollout_ref.actor.ppo_mini_batch_size` | The ppo_mini_batch_size is a global num across all workers/gpus |
| `actor_rollout_ref.actor.use_rollout_log_probs=True` | Use log_probs generated by rollout |
| `algorithm.rollout_correction.bypass_mode` | Whether to compute log_prob using the training model's parameters during the training phase. |
| `async_training.require_batches` | Number of ppo_mini_batch_size that FullyAsyncTrainer fetches at once |
| `async_training.trigger_parameter_sync_step` | Indicates how many local updates FullyAsyncTrainer performs before a parameter synchronization |
| `async_training.staleness_threshold` | Freshness control |
| `async_training.partial_rollout` | Whether to perform partial_rollout |
| `async_training.use_trainer_do_validate`                         | Whether to use the trainer node for the validation process, default `False`                    |
**Further Explanation:**
* `rollout.total_rollout_steps`
Compared to colocate, the quantity can be aligned by multiplying train_batch_size and step:
`rollout.total_rollout_steps = data.train_batch_size * step`.
* `async_training.trigger_parameter_sync_step`
In the fully async strategy, it indicates how many local updates the Trainer performs (i.e., how many times it fetches
`require_batches * ppo_mini_batch_size` samples) before a parameter synchronization with Rollouter.
Between every two parameter synchronizations between Rollouter and Trainer, the Trainer will process
`trigger_parameter_sync_step* require_batches*ppo_mini_batch_size` samples.
To fairly compare speed with colocate, `trigger_parameter_sync_step` should be set to
`data.train_batch_size / (require_batches * ppo_mini_batch_size)`.
* `async_training.staleness_threshold`
In the fully async strategy, it indicates the maximum proportion of stale samples allowed to be used.
* `staleness_threshold`=0, indicates synchronous training.
Rollouter will generate a fixed number of samples between two parameter updates, the sample count is:
`rollout_num = (trigger_parameter_sync_step*require_batches*ppo_mini_batch_size)`
* `staleness_threshold`>0, indicates asynchronous training, can be set to a decimal for more flexible asynchronous
calls.
Rollouter will generate at most the following number of samples between two parameter updates:
`rollout_num = (1+staleness_threshold)*(trigger_parameter_sync_step*require_batches*ppo_mini_batch_size) - num_staleness_sample`
`num_staleness_sample` represents the number of stale samples generated in excess during the last rollout.
Since it's a streaming system, rollout continues to generate and trainer continues to consume. If rollouter is slower,
trainer will trigger parameter synchronization earlier, and rollouter will not actually produce rollout_num samples.
When rollout is fast enough, setting `staleness_threshold` to 1 is basically equivalent to one_step_off policy.
To avoid too many expired samples affecting training accuracy, it is recommended to set this value to less than 1.
* `async_training.partial_rollout`
partial_rollout only actually takes effect when staleness_threshold>0.
* `async_training.require_batches`
In streaming training, require_batches should be set to 1, indicating that training is performed after producing
enough ppo_mini_batch_size samples.
In actual testing, we found that if fewer samples are issued at once, due to the order of data distribution, it can
cause training instability and longer response lengths.
Here, we additionally provide require_batches for streaming distribution and control the number of samples
participating in training at once.
* `actor_rollout_ref.actor.use_rollout_log_probs=True`
In reinforcement learning algorithms, log_probs have implicit correlations with parameter versions and tokens. Due to
the settings of algorithms like PPO/GRPO/DAPO, when calculating importance sampling,
old_log_prob must use the log_probs corresponding to the rollout parameters and tokens to ensure algorithm
correctness. In the fully
async strategy, we default to old_log_prob being calculated by rollout rather than by trainer.
* `algorithm.rollout_correction.bypass_mode`
> algorithm.rollout_correction.bypass_mode default is True, using rollout log prob.
During the training process, we observed that metrics and response lengths may become unstable in the later
stages of training. To mitigate this issue, we can use
the [Rollout Importance Sampling](https://verl.readthedocs.io/en/latest/advance/rollout_is.html)
technique for importance sampling. To utilize Rollout Importance Sampling, we need to compute log_prob using
the training engine, which requires setting `algorithm.rollout_correction.bypass_mode=False`.
Additionally, when `algorithm.rollout_correction.bypass_mode=False` and Rollout Importance Sampling is enabled under
mode d
(async stream pipeline with partial rollout), our implementation approximates AReaL's Decoupled PPO.
* `async_training.use_trainer_do_validate`
It controls whether to use the trainer's `do_validate` method for validation.
If set to True, the trainer will perform validation after each parameter update. It can reduce the validation time
overhead and trainer node idle time.
If set to False, the trainer will not perform validation.
### Supported Modes
1. on policy pipeline:
1. **trigger_parameter_sync_step=1, staleness_threshold=0**
2. Rollouter produces `require_batches*ppo_mini_batch_size` samples at once, Trainer fetches these samples for
training, and after training completes, Trainer and Rollouter perform a parameter synchronization;
3. During the rollout phase, if there are long-tail samples but few rollout samples, shorter samples cannot fill
idle resources, causing some resource waste.
4. As shown in figure a;
2. stream off policy pipeline:
1. **trigger_parameter_sync_step>1, staleness_threshold=0**
2. Synchronous streaming training will be performed. Rollouter produces
`require_batches*ppo_mini_batch_size*trigger_parameter_sync_step` samples at once, Trainer performs a local
training every time it fetches `require_batches*ppo_mini_batch_size` samples, and after training
trigger_parameter_sync_step times, Trainer and Rollouter perform a parameter synchronization;
3. Compared to a, since more samples are generated at once, resource idleness will be lower.
4. In one step training, there will be two periods of resource idleness: when fetching the first batch of samples,
train waits for `require_batches*ppo_mini_batch_size` samples to be produced, and during the last parameter
update, rollout waits for training to complete.
5. As shown in figure b;
3. async stream pipeline with stale samples:
1. **trigger_parameter_sync_step>=1, staleness_threshold>0, partial_rollout=False**
2. After each parameter update, Rollouter will plan to produce at most rollout_num samples (in practice, the number
of samples generated may be less than this value depending on rollout speed).
3. If the rollout process is relatively fast, Rollouter will generate some additional samples num_stale_samples
before parameter synchronization for immediate use by Trainer after synchronization.
When triggering parameter synchronization, if Rollouter has ongoing tasks, it will wait for the tasks to complete
and not add new tasks;
4. Compared to b, except for the first step training, subsequent training will not have the time to wait for the
first batch rollout to finish, but will have the time to wait for active tasks to finish.
5. As shown in figure c;
4. async stream pipeline with partial rollout:
1. **trigger_parameter_sync_step>=1, staleness_threshold>0, partial_rollout=True**
2. Compared to c, when triggering parameter synchronization, if Rollouter has samples being produced, it will
interrupt the rollout process and perform parameter synchronization. The interrupted samples will continue to be
generated after synchronization. This reduces the time to wait for active tasks to finish.
3. As shown in figure d;

### Key Metrics
| metrics | implication |
|------------------------------------------------|--------------------------------------------------------------------------------------------------------|
| `trainer/idle_ratio` | Trainer idle rate |
| `rollouter/idle_ratio` | Rollouter idle rate |
| `fully_async/count/stale_samples_processed` | Total number of old samples used in training |
| `fully_async/count/stale_trajectory_processed` | Total number of old trajectories used in training (one sample produces rollout.n trajectories) |
| `fully_async/partial/total_partial_num` | Number of partial samples processed by Trainer between two trigger_parameter_sync_step |
| `fully_async/partial/partial_ratio` | Ratio of partial samples processed by Trainer between two trigger_parameter_sync_step |
| `fully_async/partial/max_partial_span` | Maximum parameter span of partial samples processed by Trainer between two trigger_parameter_sync_step |
### Parameter Tuning Recommendations
* Resource Allocation and Adjustment:
* Reasonable resource allocation is the prerequisite for achieving good training efficiency. The ideal resource
allocation should make the rollout time and train time close, thereby minimizing pipeline bubbles in the entire
training process,
avoiding resource idleness, and ensuring Trainer does not use old samples. In real training scenarios, resource
allocation can be adjusted based on the idle time of rollout and train during actual training,
which can be obtained from rollouter/idle_ratio and trainer/idle_ratio. If rollouter/idle_ratio is high and
trainer/idle_ratio is low,
Trainer resources should be increased and Rollouter resources should be reduced, and vice versa.
* Key Parameters:
* staleness_threshold: Setting it too high will cause more old samples to be used, affecting model performance. It
is recommended to set it to less than 1.
* require_batches: The closer to 1, the closer to a pure streaming process, the smaller the training bubbles, and
the greater the achievable speedup, but it will affect the order of sample
processing;
* trigger_parameter_sync_step: The smaller the setting, the closer to on policy, but it will cause frequent
parameter synchronization. Long-tail samples waste resources that cannot be filled by short samples, resulting in
low resource utilization.
The larger the setting, the higher the computational efficiency, but the accuracy will be affected by off policy.
* rollout.test_freq: It will occupy Rollouter resources and is not recommended to be set too small.
* Mode Selection: By adjusting different parameters, the Fully Async architecture supports optimization acceleration at
different levels, suitable for tasks in different scenarios.
* For small-scale tasks that need to ensure training stability and on-policy nature, and have low speed
requirements, the on policy pipeline mode (Mode 1) can be tried.
* For scenarios that need to improve training throughput but are sensitive to staleness, the stream off policy
pipeline mode can be tried. That is, by
setting trigger_parameter_sync_step>1 to improve training efficiency, but still maintaining the synchronization
mechanism (staleness_threshold=0) (Mode 2).
* For large-scale tasks that have high training speed requirements and can tolerate a certain degree of off-policy and
staleness, setting staleness_threshold>0
and partial_rollout=True can improve training efficiency, using the async stream pipeline mode (Mode 3 or 4).
### Quick Start
```shell
rollout_mode="async"
rollout_name="vllm" # sglang or vllm
if [ "$rollout_mode" = "async" ]; then
export VLLM_USE_V1=1
return_raw_chat="True"
fi
train_prompt_bsz=0
gen_prompt_bsz=1
n_resp_per_prompt=16
train_prompt_mini_bsz=32
total_rollout_steps=$(((512*400)))
test_freq=10
staleness_threshold=0
trigger_parameter_sync_step=16
partial_rollout=False
python -m recipe.fully_async_policy.fully_async_main \
data.train_batch_size=${train_prompt_bsz} \
data.gen_batch_size=${gen_prompt_bsz} \
data.return_raw_chat=${return_raw_chat} \
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
actor_rollout_ref.actor.strategy=fsdp2 \
critic.strategy=fsdp2 \
actor_rollout_ref.hybrid_engine=False \
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
actor_rollout_ref.rollout.name=${rollout_name} \
actor_rollout_ref.rollout.mode=${rollout_mode} \
trainer.nnodes="${NNODES_TRAIN}" \
trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
rollout.nnodes="${NNODES_ROLLOUT}" \
rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
rollout.total_rollout_steps="${total_rollout_steps}" \
rollout.test_freq="${test_freq}" \
async_training.staleness_threshold="${staleness_threshold}" \
async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
async_training.partial_rollout="${partial_rollout}"
```
## Experiments
### Asynchronous Training on 7B Model
We used Qwen2.5-Math-7B to verify the benefits of the fully async strategy under long candidates and multiple resources.
Using the `async stream pipeline with stale samples` strategy, we achieved about 2x performance improvement on 32 cards,
64 cards, and 128 cards without significantly affecting experimental results.
* Machine: H20
* Model: Qwen2.5-Math-7B
* Rollout length: max_response_length FSDP2: 28K tokens;
* Algorithm: DAPO
* Dataset: TRAIN_FILE: dapo-math-17k.parquet TEST_FILE: aime-2024.parquet
* Engine: vllm+FSDP2
* rollout.n: 16
* ppo_mini_batch_size: 32
* test_freq: 20
* colocate sync:
* step: 400
* train_batch_size: 512
* fully_async_policy
* total_rollout_steps: 512*400
* require_batches: 4
* trigger_parameter_sync_step: 4
* staleness_threshold: 0.5
* partial_rollout: True
| training mode | resource allocation | step | gen | old_log_prob | update_actor | total time<br>100 step | total time<br>200 step | total time<br>300 step | total time<br>400 step | acc/mean@1 |
|:--------------------:|:---------------------:|:--------:|:--------:|:--------------:|:---------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:-------------------------------:|
| colocate sync | 32 | 790.10 | 357.41 | 107.71 | 269.80 | 13h 44m | 1d 3h 43m | 2d 9h 22m | 3d 17h 5m | max: 0.3313<br>last: 0.2448 |
| fully_async_policy | 16:16 | 294.77 | 21.26 | \ | 313.81 | 7h 58m<br>(1.72x) | 16h 21m<br>(1.70x) | 1d 0h 53m<br>(2.31x) | 1d 9h 26m<br>(2.66x) | max: 0.3302<br>last: 0.2333 |
| colocate sync | 64 | 365.28 | 150.72 | 70.26 | 133.41 | 10h 22m | 20h 45m | 1d 7h 6m | 1d 17h 32m | max: 0.3365<br>last: 0.2333 |
| fully_async_policy | 32:32 | 189.26 | 28.46 | \ | 156.98 | 4h 57m<br>(2.09x) | 10h 14m<br>(2.03x) | 16h 58m<br>(1.83x) | 21h 40m<br>(1.92x) | max: 0.3677<br>last: 0.3406 |
| colocate sync | 128 | 356.30 | 177.85 | 53.92 | 113.81 | 8h 36m | 17h 56m | 1d 5h 6m | 1d 16h 48m | max: 0.3573<br>last: 0.2958 |
| fully_async_policy | 64:64 | 150.63 | 33.14 | \ | 113.16 | 3h 13m<br>(2.67x) | 6h 46m<br>(2.65x) | 10h 53m<br>(2.67x) | 17h 22m<br>(2.35x) | max: 0.3521<br>last: 0.3094 |
> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-colocate_async?nw=nwuserhouzg
### 128-card 7B Asynchronous Mode Experiment
We used Qwen2.5-Math-7B to verify the effects of various modes supported by fully async.
We can see that the benefit brought by streaming is approximately 1.6x, and after combining staleness and
partial_rollout, the benefit reaches 2.35x.
| mode | step | gen | old_log_prob | update_actor | total time<br>100 step | total time<br>200 step | total time<br>300 step | total time<br>400 step | acc/mean@1 |
|:-------------------------------------------------------------------------------------------------------:|:--------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------------:|
| colocate sync | 356.30 | 177.85 | 53.92 | 113.81 | 8h 36m | 17h 56m | 1d 5h 6m | 1d 16h 48m | max: 0.3573<br>last: 0.2958 |
| `stream off policy pipeline`<br>(+fully async: trigger_parameter_sync_step= 4,<br>require_batches= 4) | 231.34 | 128.47 | \ | 98.77 | 4h 25m | 9h 41m | 15h 2m | 1d 1h 53m | max: 0.2844<br>last: 0.2604 |
| `async stream pipeline with stale samples`<br>(+staleness_threshold=0.5) | | | | | | | | | |
| `async stream pipeline with partial rollout`<br>(+partial_rollout=True) | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521<br>last: 0.3094 |
> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-stream_stale_partial?nw=nwuserhouzg
### 128-card Stale Ablation Experiment
Under the `async stream pipeline with partial rollout` mode, we verified the impact of staleness settings on training
efficiency.
We found that the larger the staleness, the more obvious the final gains.
We also noticed that the times for staleness values of 0.3 and 0.5 are quite close, because as the training steps
increase, the response length changes significantly, causing training instability.
Further analysis and optimization are needed for this issue.
| staleness_threshold | step | gen | old_log_prob | update_actor | total time<br>100 step | total time<br>200 step | total time<br>300 step | total time<br>400 step | acc/mean@1 |
|:---------------------:|:--------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:|
| 0 | 231.34 | 128.47 | \ | 98.77 | 4h 25m | 9h 41m | 15h 2m | 1d 1h 53m | max: 0.2844<br>last: 0.2604 |
| 0.1 | 171.30 | 58.17 | \ | 109.12 | 3h 53m | 8h 37m | 14h 25m | 19h 59m | max: 0.3542<br>last: 0.2979 |
| 0.3 | 146.11 | 38.88 | \ | 103.22 | 3h 18m | 6h 49m | 11h 40m | 17h 20m | max: 0.3469<br>last: 0.2865 |
| 0.5 | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521<br>last: 0.3094 |
> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-stream_stale_partial?nw=nwuserhouzg
### 128-card 7B require_batches Ablation Experiment
In multiple tests, we found that the number of samples issued each time in streaming affects the response length during
training, which in turn affects training time. We verified the impact on results by modifying
`async_training.require_batches`.
| require_batches | step | gen | old_log_prob | update_actor | total time<br>100 step | total time<br>200 step | total time<br>300 step | acc/mean@1 |
|:-----------------:|:--------:|:-------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:|
| 1 | 203.47 | 30.88 | \ | 181.08 | 3h 31m | 8h 29m | 17h 36m | max: 0.349<br>last: 0.326 |
| 2 | 158.72 | 26.32 | \ | 128.08 | 3h 35m | 7h 38m | 13h 57m | max: 0.351<br>last: 0.3406 |
| 4 | 124.64 | 25.62 | \ | 95.06 | 3h 13m | 6h 46m | 10h 53m | max: 0.3521<br>last: 0.3521 |
> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-ablation_require_batches?nw=nwuserhouzg
### 30B Model Mode Experiment
We achieved a 1.7x performance improvement with `async stream pipeline with stale samples` strategy on the
Qwen3-30B-A3B-Base model compared to the colocate setup. It is worth noting that this is far from the upper limit of
performance gains achievable through asynchrony. Firstly, the comparative experiments used a maximum response length of
only 8k, which is much shorter than the 20k sequence length in previous experiments, resulting in a less pronounced
rollout tail effect. Secondly, we adopted a highly skewed resource allocation, with rollout using 96 GPUs and trainer
using 32 GPUs, which is not an optimal configuration. During the experiments, we observed that the current verl
implementation imposes certain constraints, such as requiring data to be evenly divisible by the number of GPUs, making
resource adjustment less flexible. Additionally, as asynchronous training and deployment accelerate, the performance gap
is gradually narrowing. Therefore, enabling more flexible resource allocation and dynamic resource adjustment in the
future will be our next focus.
* Machine: H20
* Model: Qwen3-30B-A3B-Base
* Rollout length: max_response_length : 8K tokens;
* Algorithm: GRPO
* Dataset: TRAIN_FILE: dapo-math-17k.parquet TEST_FILE: aime-2024.parquet
* Engine: vllm+Megatron
* rollout.n: 16
* ppo_mini_batch_size: 128
* test_freq: 20
* colocate sync:
* step:400
* train_batch_size: 512
* fully_async_policy
* total_rollout_steps: 512*400
* trigger_parameter_sync_step: 512/128 = 4
* staleness_threshold: 0.5
* partial_rollout: True
| Training Mode | Resource Allocation | Step | Gen | Old Log Prob | Ref | Update Actor | Total Time 100 Step | Total Time 200 Step | Total Time 300 Step | Total Time 400 Step | Acc/Mean@1 |
|--------------------|---------------------|--------|--------|--------------|-------|--------------|---------------------|---------------------|---------------------|---------------------|-----------------------------|
| Colocate Sync | 128 | 497.89 | 348.05 | 28.73 | 20.86 | 86.27 | 13h 36m | 1d 3h 48m | 1d 19h 4m | 2d 11h 39m | max: 0.3500<br>last: 0.3208 |
| Fully Async Policy | 96:32 | 282.75 | 22.06 | \ | 50.05 | 206.63 | 6h 45m (2.01x) | 14h 48m (1.88x) | 1d 0h 9m (1.78x) | 1d 10h 41m (1.72x) | max: 0.3813<br>last: 0.3448 |
> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-30B?nw=nwuserhouzg
### checkpoint-engine Ablation Experiment
We tested the single-step parameter synchronization time of the checkpoint-engine on three models: Qwen2.5-Math-7B, Qwen3-30B-A3B, and Qwen3-235B-A22B, using default checkpoint-engine configurations. All experiments were performed on H20 machines, and the Megatron engine was used for training.
| model | trainer rank | rollout rank | checkpoint-engine | total sync time |
|:---------------:|:--------------:|:-------------:|:-------------------:|:-----------------:|
| Qwen2.5-Math-7B | 4 | 4 | False | 0.12s |
| Qwen2.5-Math-7B | 4 | 4 | True | 0.02s |
| Qwen3-30B-A3B | 16 | 16 | False | 15.76s |
| Qwen3-30B-A3B | 16 | 16 | True | 4.38s |
| Qwen3-235B-A22B | 64 | 64 | False | 58.57s |
| Qwen3-235B-A22B | 64 | 64 | True | 23.70s |
### use_trainer_do_validate Experiment
We tested the effect of setting `use_trainer_do_validate=True` on the training process. The results show that setting
this parameter to True can reduce the validation time overhead and trainer node idle time.
We used Qwen2.5-Math-7B to verify the benefits of `use_trainer_do_validate=True` on the training process: validation time improved by about 2x, and trainer node idle time was reduced by about 40%.
* Machine: H20
* Model: Qwen2.5-Math-7B
* Rollout length: max_response_length FSDP2: 10K tokens;
* Algorithm: DAPO
* Dataset: TRAIN_FILE: dapo-math-17k.parquet TEST_FILE: aime-2024.parquet
* Engine: vllm+FSDP2
* rollout.n: 16
* ppo_mini_batch_size: 32
* test_freq: 10
* fully_async_policy
* total_rollout_steps: 512*400
* require_batches: 4
* trigger_parameter_sync_step: 4
* staleness_threshold: 0.5
* partial_rollout: True
| training mode | resource allocation | step | gen | old_log_prob | update_actor | validate time | total time<br>50 step | acc/mean@2 |
|:------------------:|:-------------------:|:-------:|:-------:|:------------:|:------------:|:-------------:|:---------------------:|:----------:|
| colocate sync | 16 | 484.623 | 52.939 | 0 | 430.263 | 205.080 | 7h9m | 22.6 |
| fully_async_policy | 8:8 | 489.953 | 52.622 | 0 | 435.874 | 95.699 | 7h2m | 21.0 |
## Multi-Turn Tool Calling
Referencing **recipe/retool** and **ToolAgentLoop**, we implemented **AsyncPartialToolAgentLoop**, a multi-turn
tool-calling loop that supports partial_rollout for **fully_async_policy**.
### Core Design
`AsyncPartialToolAgentLoop` inherits from `ToolAgentLoop` and is adapted for the asynchronous training mode of
`fully_async_policy`. When `partial_rollout=True`, the Rollouter interrupts ongoing generation tasks before
synchronizing parameters with the Trainer. `AsyncPartialToolAgentLoop` is capable of:
1. **Interrupting Tasks**: Responding to an interrupt signal to save the current state. Currently, interruptions occur
during the `GENERATING` process or after other states have completed.
2. **Resuming Tasks**: Resuming execution from the saved state after parameter synchronization is complete, rather than
starting over.
### How to Use
RL training with multi-turn tool calling in `fully_async_policy` is similar to `recipe/retool`. It is enabled by
specifying `multi_turn` configurations in the config file.
1. **SFT Stage**: First, the model should undergo SFT to learn how to follow tool-calling format instructions.
2. **Multi-turn Configuration**: In the `fully_async_policy` training configuration, set the following parameters:
```yaml
actor_rollout_ref:
rollout:
multi_turn:
enable: True # AsyncPartialToolAgentLoop will be used by default in fully_async_policy mode
# Other multi_turn related configurations
```
3. **Async Parameters**: To improve efficiency, enable `partial_rollout` and `staleness_threshold` when using multi-turn
tool calling:
```yaml
async_training:
partial_rollout: True
staleness_threshold: 0.5
# Other async parameters
```
4. **Example**: See `recipe/fully_async_policy/shell/dapo_7b_async_retool.sh`.
### Experimental Results
To validate the performance of `fully_async_policy` on multi-turn tool-calling tasks, we compared it with the standard
`colocate` synchronous mode. Key parameter settings are as follows.
* **SFT Model**: Based on `Qwen2.5-7B-Instruct`, trained for 6 epochs on the `ReTool-SFT` dataset
* **RL Algorithm**: DAPO
* **Dataset**:
* Train: `DAPO-Math-17k`
* Test: `aime_2025`
* **Resource and Mode Comparison**:
* `colocate sync`: 32 H20 gpus
* `fully_async_policy`: 16 gpus for Trainer + 16 gpus for Rollouter
* **Key Configurations**:
1. **Tool Calling Configuration**:
* `multi_turn.enable: True`
* `multi_turn.max_user_turns: 16`
* `multi_turn.max_assistant_turns: 16`
* `multi_turn.tool_config_path: recipe/retool/sandbox_fusion_tool_config.yaml`
2. **`colocate sync` Configuration**:
* `ppo_mini_batch_size: 16`
* `train_batch_size: 64`
3. **`fully_async_policy` Configuration**:
* `ppo_mini_batch_size: 16`
* `trigger_parameter_sync_step: 4`
* `require_batches: 1`
* `staleness_threshold: 1`
* `partial_rollout: True`
| training mode | Resource allocation | step | gen | old_log_prob | update_actor | total time<br>100 step | total time<br>200 step | aime_2025<br>acc/mean@30 |
|:--------------------:|:---------------------:|:---------:|:---------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:-------------------------------:|
| colocate | 32 | 375.47 | 228.03 | 35.19 | 111.84 | 9h 46m | 22h 28m | start:0.1078<br>last:0.2056 |
| fully_async_policy | 16: 16 | 221.36 | 40.59 | \ | 179.58 | 6h 19m<br>(1.55x) | 14h 4m<br>(1.60x) | start:0.11<br>last:0.2044 |
> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-multiturn-tool?nw=nwuserhouzg
## Future Plans
* GRPO experiments
* Megatron adaptation
* SGLang integration
* Transfer queue integration
* Asynchronous parameter synchronization
* AReaL asynchronous algorithm implementation
* TPPO algorithm implementation
* Multi-turn and Tool support
================================================
FILE: docs/advance/grafana_prometheus.md
================================================
# Use Prometheus and Grafana to Monitor Rollout
**Author:** `https://github.com/meituan-search`
Last updated: 12/05/2025.
Monitor the rollout computation process using Prometheus and Grafana when using verl to enhance system observability and facilitate further performance optimization.
We provide an additional training monitoring capability, leveraging Prometheus and Grafana to display rollout information during training and enhance system observability to facilitate further performance optimization.
The system automatically configures Prometheus to scrape metrics from rollout servers, eliminating manual configuration steps.
## Overview
The figures below show the performance of Qwen235B on the AIME2024 dataset with a response length of 20k, where the emergence of a long-tail problem is clearly observable.

The following figure presents the fully asynchronous training of the Qwen235B model. Here, resource idleness is distinctly noticeable, indicating that rollout resources can be reduced.

Through the above two examples, we also illustrate the necessity of system observability.
## Architecture Overview
The overall workflow consists of the following steps:
1. **Multi-node Ray Cluster Setup**: Start Ray cluster across multiple nodes with Grafana and Prometheus information configured in environment variables on the master node
2. **Start Grafana Service**: Launch Grafana on the master node for visualization of monitoring dashboards
3. **Start Prometheus Service**: Launch Prometheus on the master node for metrics collection and storage
4. **verl Async Rollout Mode**: verl uses async rollout mode to obtain rollout server ports and IP addresses
5. **Automatic Prometheus Configuration**: verl automatically rewrites the Prometheus configuration to add monitoring for rollout servers and notifies Prometheus to reload the configuration
6. **Metrics Collection**: After program execution, metrics can be viewed in Prometheus
7. **Dashboard Visualization**: Upload and view monitoring metrics in Grafana dashboards
## Detailed Setup Steps
### Step 1: Environment Variables and Start Ray Cluster
First, set the necessary environment variables and start the Ray service.
> Reference: [configure-manage-dashboard](https://docs.ray.io/en/latest/cluster/configure-manage-dashboard.html)
```bash
# Master node environment variables
export GF_SERVER_HTTP_PORT=3000 # Grafana service default port (customizable)
export PROMETHEUS_PORT=9090 # Prometheus service default port (customizable)
export RAY_HEAD_PORT=6379 # Ray master node port (customizable)
export RAY_DASHBOARD_PORT=8265 # Ray dashboard default port (customizable)
export GRAFANA_PATHS_DATA=/tmp/grafana # Grafana data storage directory (customizable)
export RAY_GRAFANA_HOST="http://${master_ip}:${GF_SERVER_HTTP_PORT}" # Ray-associated Grafana address
export RAY_PROMETHEUS_HOST="http://${master_ip}:${PROMETHEUS_PORT}" # Ray-associated Prometheus address
# Start Ray on master node
ray start --head --port=${RAY_HEAD_PORT} --dashboard-port=${RAY_DASHBOARD_PORT}
# Start Ray on worker nodes
ray start --address=${master_ip}:${RAY_HEAD_PORT}
```
**Verification:** Visit `http://master_ip:8265` to confirm Ray has started successfully.
### Step 2: Start Grafana (Visualization Dashboard)
Grafana is used to display metrics collected by Prometheus (such as cache hit rate, throughput, etc.):
```bash
# Master node
nohup grafana-server \
--config /tmp/ray/session_latest/metrics/grafana/grafana.ini \
--homepath /usr/share/grafana \
web > grafana.log 2>&1 &
```
**Verification:** Visit `http://master_ip:3000` to confirm Grafana has started successfully (default credentials: `admin/admin`).
If you need to change the port, modify the `GF_SERVER_HTTP_PORT` environment variable, and grafana-server will automatically recognize it.
### Step 3: Start Prometheus (Metrics Collection)
Prometheus is responsible for scraping metrics from vLLM services and storing them as time-series data:
```bash
# Master node
nohup prometheus \
--config.file /tmp/ray/session_latest/metrics/prometheus/prometheus.yml \
--web.enable-lifecycle \
--web.listen-address=:${PROMETHEUS_PORT} \
> prometheus.log 2>&1 &
```
**Verification:** Visit `http://master_ip:9090` to confirm Prometheus service has started successfully.
### Step 4 & 5: Start verl Training
Start verl training with the following parameters configured:
**Required Configuration:**
- `actor_rollout_ref.rollout.mode="async"`
- `actor_rollout_ref.rollout.disable_log_stats=False`
- `actor_rollout_ref.rollout.prometheus.enable=True`
If you use the default port, this parameter can be omitted.
- `actor_rollout_ref.rollout.prometheus.port=9090`
If you use the default path, this parameter can be omitted.
- `actor_rollout_ref.rollout.prometheus.file="/tmp/ray/session_latest/metrics/prometheus/prometheus.yml"`
served_model_name uses `model_path.split("/")[-1]` for data statistics by default.
Users can also customize other aliases:
- `actor_rollout_ref.rollout.prometheus.served_model_name="Qwen3-235B"`
**Shell Script Example:**
```bash
WORKING_DIR=${WORKING_DIR:-"${PWD}"}
RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
rollout_mode="async"
rollout_name="vllm" # Options: sglang or vllm
if [ "$rollout_mode" = "async" ]; then
export VLLM_USE_V1=1
return_raw_chat="True"
fi
# Synchronous training
ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
--working-dir "${WORKING_DIR}" \
-- python3 -m verl.trainer.main_ppo \
data.return_raw_chat=${return_raw_chat} \
actor_rollout_ref.rollout.name=${rollout_name} \
actor_rollout_ref.rollout.mode=${rollout_mode} \
actor_rollout_ref.rollout.disable_log_stats=False \
actor_rollout_ref.rollout.prometheus.enable=True
...
# Asynchronous training
ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
--working-dir "${WORKING_DIR}" \
-- python3 -m verl.experimental.fully_async_policy.fully_async_main \
data.return_raw_chat=${return_raw_chat} \
actor_rollout_ref.rollout.name=${rollout_name} \
actor_rollout_ref.rollout.mode=${rollout_mode} \
actor_rollout_ref.rollout.disable_log_stats=False \
actor_rollout_ref.rollout.prometheus.enable=True
...
```
### Step 6: View Metrics in Prometheus
After task execution, verify that Prometheus is correctly collecting metrics.
**Verification:** Visit the Prometheus interface at `http://master_ip:9090` and search for `vllm:` or `sglang:` to
confirm metrics are being reported correctly.
**Troubleshooting:**
If no metrics appear:
1. Check logs for `AgentLoopManager` to find the server port
2. Visit `http://master_ip:server_port/metrics` to verify server metrics are available
3. Confirm that `actor_rollout_ref.rollout.disable_log_stats=False` is set
### Step 7: View Metrics in Grafana
After task execution, log in to Grafana to view and customize monitoring dashboards.
**Login:** Visit `http://master_ip:3000` (default credentials: `admin/admin`)
**Import Dashboard:**
1. Select `Dashboards` → `New` → `Import` → `Upload dashboard JSON file`
2. Upload a pre-built dashboard JSON file
**Available Dashboards:**
- [vLLM Grafana Dashboard style 1](https://github.com/ArronHZG/verl-community/blob/main/docs/grafana/vllm_grafana.json)
- [vLLM Grafana Dashboard style 2](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/dashboards/grafana/performance_statistics.json)
- [vLLM Grafana Dashboard style 3](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/dashboards/grafana/query_statistics.json)
- [SGLang Grafana Dashboard](https://github.com/sgl-project/sglang/blob/main/examples/monitoring/grafana/dashboards/json/sglang-dashboard.json)
## Additional Resources
- [Ray Monitoring Documentation](https://docs.ray.io/en/latest/cluster/configure-manage-dashboard.html)
- [Prometheus Documentation](https://prometheus.io/docs/)
- [Grafana Documentation](https://grafana.com/docs/)
- [vLLM GitHub Repository](https://github.com/vllm-project/vllm)
- [SGLang GitHub Repository](https://github.com/sgl-project/sglang)
================================================
FILE: docs/advance/megatron_extension.rst
================================================
Add models with the Megatron-LM backend
=========================================
Last updated: 04/25/2025.
Model
-----------
If you use the latest verl, ``GPTModel`` is directly supported for the Megatron backend.
You can use the similar way of using Megatron to pretrain custom models.
We list the steps here:
1. Find ``model_initializer.py`` (``verl/models/mcore/model_initializer.py``)
2. If your model is configurable by ``TransformerLayerSpec`` , you can
directly use ``GPTModel``. Otherwise, please implement a new
``ModelLayerSpec`` and ``ModelLayer`` here.
3. Use the right ``LayerSpec`` , ``TransformerConfig`` and ``HuggingfaceConfig``
as arguments to initialize the GPTModel.
4. Return the model at last.
================================================
FILE: docs/advance/mtp.md
================================================
# Guide to Using MTP in SFT/RL Training and Inference
**Author**: `https://github.com/meituan-search`
Last updated: 02/15/2026
# 1. Scope of Support
Currently, RL training can be performed on mimo-7B-RL, Qwen-next, and Deepseek series models based on the MTP architecture. The support rules for training and inference engines are as follows:
- **Training Engine**: Only supports the `mbridge/Megatron-Bridge + megatron` combination; other training engines are not compatible at this time;
- **Inference Engine**: Compatible with all engines, but the model must be in the corresponding engine's compatibility list;
- **Dependency Versions**:
- mbridge: Apply the patches and review suggestions from PR: [#62](https://github.com/ISEEKYAN/mbridge/pull/62) (Already merged into the main branch);
- Megatron-Bridge: Apply the patches and review suggestions from PR if you want to try out mimo-7B-RL: [#2387](https://github.com/NVIDIA-NeMo/Megatron-Bridge/pull/2387) (will be merged into the main branch in the future);
- megatron: Use the latest dev version (commit: [23e092f41ec8bc659020e401ddac9576c1cfed7e](https://github.com/NVIDIA/Megatron-LM/tree/23e092f41ec8bc659020e401ddac9576c1cfed7e)), which supports MTP + CP training methods.
- sglang: Use the specified branch: [https://github.com/ArronHZG/sglang/tree/fix_mtp_update_weights_from_tensor](https://github.com/ArronHZG/sglang/tree/fix_mtp_update_weights_from_tensor), [PR](https://github.com/sgl-project/sglang/pull/17870), which fixes the OOM issue when updating MTP weights from tensors.
# 2. MTP Training Configuration (Core Parameters)
The MTP training process can be flexibly controlled through the following configurations. All configurations are based on the `actor_rollout_ref.model.mtp` prefix:
| Configuration Scenario | Core Parameters | Description |
|------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------|
| Load MTP Parameters Only | `enable=True` | VRAM usage will increase, but the exported parameters include the MTP module and can be directly used for online deployment |
| Full-Parameter MTP Training | `enable=True`<br>`enable_train=True`<br>`mtp_loss_scaling_factor=0.1` | MTP Loss will apply to all model parameters |
| MTP Parameter-Only Training | `enable=True`<br>`enable_train=True`<br>`detach_encoder=True` | Freeze the Encoder layer, update only MTP module parameters, MTP Loss applies only to MTP parameters |
| MTP Accelerated Rollout | 1. vLLM configuration:<br>`enable=True`<br>`enable_rollout=True`<br>`method="mtp"`<br>`num_speculative_tokens=1`<br>2. SGLang configuration:<br>`enable=True`<br>`enable_rollout=True`<br>`speculative_algorithm="EAGLE"`<br>`speculative_num_steps=2`<br>`speculative_eagle_topk=2`<br>`speculative_num_draft_tokens=4` | Achieve inference acceleration during the Rollout phase based on MTP |
# 3. Experimental Results
The experiment was conducted as follows:
* model = mimo-7B-math
* max_response_length = 8k
Experiment chart:

The wandb link for the graph: [wandb](https://wandb.ai/hou-zg-meituan/mimo-7b-sft-mtp?nw=nwuserhouzg)
**Scenarios with No Significant Effect**
The following configurations will not have a noticeable impact on training results:
1. The base model does not carry MTP parameters;
2. The base model carries MTP parameters, but the MTP module is not trained;
3. The base model carries MTP parameters and trains MTP, with `mtp_loss_scaling_factor=0`;
4. The base model carries MTP parameters, trains MTP and detaches the encoder, with `mtp_loss_scaling_factor=0.1`.
**Scenarios with Significant Effect**
Only the following configuration will have a noticeable impact on training results:
- The base model carries MTP parameters, MTP Loss applies to all model parameters, and `mtp_loss_scaling_factor=0.1`.
**Recommended Training Method**
It is recommended to adopt the `detach_encoder=True` approach for MTP training.
# 4. Performance Notes for MTP in Rollout Inference
Enabling MTP improves the rollout acceptance rate by around 14%. However, on H20 GPUs, overall throughput does not increase and even decreases slightly.

The effectiveness of MTP-accelerated Rollout is significantly affected by **model size** and **inference hardware**. Key reference information is as follows:
**Hardware Tensor Core Performance**
| Hardware Model | FP16 Performance (TFLOPS) |
|----------------|---------------------------|
| H20 | 148 |
| H800 | 1,671 |
| H200 | 1,979 |
**Measured Performance and Recommendations**
Taking the mimo-7B model deployed separately on H20 hardware using SGLang as an example: After enabling MTP speculative decoding, the Rollout throughput decreases by approximately 50%.
- Current priority recommendation: Do not enable MTP acceleration during the inference phase for now;
- Future planning: Further optimization of the speculative logic in the Rollout phase will be conducted to improve throughput performance.
# 5. SFT training
The SFT training with MTP is supported, using the same MTP training configuration as RL training.
An example configuration for running SFT can be found in `examples/sft/gsm8k/run_mimo_megatron_mtp.sh`
**SFT result**
The experiment was conducted using following data:
- model = mimo-7B-math
- dataset = gsm8k
The result: [wandb link](https://wandb.ai/hou-zg-meituan/mimo-7b-sft-mtp?nw=nwuserhouzg)
The presence of mtp layer has limited effect on main loss. However, when MTP layer is detached, the mtp_loss converges to a higher value.
================================================
FILE: docs/advance/one_step_off.md
================================================
# Recipe: One Step Off Policy Async Trainer
**Author:** `https://github.com/meituan-search`
Last updated: 07/17/2025.
## Introduction
### Background
The current reinforcement learning training process implemented by verl is synchronous, adhering to the algorithmic
workflows of established methods like PPO, GRPO, and DAPO. In each step, training samples are generated by the latest
model, and the model is updated after training completes. While this approach aligns with on-policy reinforcement
learning and stabilizes RL training, it suffers from severe efficiency issues.
Model updates must wait for the longest output in the generation phase to complete.
During the generation of long-tail samples, GPUs remain idle, resulting in significant underutilization.
The more severe the long-tail problem in sample generation, the lower the overall training efficiency.
For example, in DAPO 32B training, the Rollout phase accounts for approximately 70% of the total time,
and increasing resources does not reduce the Rollout duration.

> source data: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=nwusertongyuxuan361
### Solution
We have implemented the **One Step Off Async Trainer** to help alleviate this issue. This approach parallelizes the
generation and training processes, utilizing samples generated in the previous step for current training.
It also involves appropriately partitioning resources, allocating dedicated resources for generation while automatically
assigning the remainder to training. By reducing resources allocated to the generation phase, we mitigate GPU idle time
during long-tail sample generation. Throughout this process, generation and training parameters maintain a one-step off
policy.

> reference: [AReaL: A Large-Scale Asynchronous Reinforcement Learning System for Language Reasoning](https://arxiv.org/abs/2505.24298)
Our core contributions include:
1. **Parallel Generation and Training**:
Samples for the next batch are asynchronously generated while the current batch is being trained.
2. **Resource Isolation**:
Unlike `hybrid_engine`, this method requires explicit resource allocation for rollout, with remaining resources
automatically assigned to training.
3. **NCCL Parameter Synchronization**:
Employs NCCL communication primitives for seamless parameter transfer between generation and training modules.
### Experimental Results
- **Machine Configuration**: 2 nodes with 16 H20 GPUs each
- Generation: 4 GPUs
- Training: 12 GPUs
- **Model**: Qwen2.5-Math-7B
- **Rollout Configuration**:
- **Max Response Length**: FSDP2: 20,480 tokens; Megatron: 8,192 tokens
- **Algorithm**: DAPO
- **Rollout Engine**: vLLM
| training mode | engine | step | gen | wait_prev_gen | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean | acc/maj@32/mean |
| ---------------------- | ------------- | ---- | --- | ------------- | ------------------ | ------------ | ------------ | -------------- | ---------------- | --------------- |
| colocate sync | VLLM+FSDP2 | 749 | 321 | - | 247 | 88 | 286 | 19h18m | 0.5948 | 0.417 |
| one-step-overlap async | VLLM+FSDP2 | 520 | - | 45 | 458 | 108 | 337 | 15h34m(+23%) | 0.6165 | 0.494 |
| colocate sync | VLLM+Megatron | 699 | 207 | - | 162 | 119 | 344 | 18h21m | 0.605 | 0.4217 |
| one-step-overlap async | VLLM+Megatron | 566 | - | 59 | 501 | 120 | 347 | 13h06m (+40%) | 0.6569 | 0.4038 |
- colocate sync: step ≈ gen + old_log_prob + update_actor
- one-step-overlap async: step ≈ wait_prev_gen + old_log_prob + update_actor

> source data: https://wandb.ai/hou-zg-meituan/one-step-off-policy?nw=nwuserhouzg
## Implementation
### One Step Off Policy Async Pipeline
Our implemented **One Step Off Policy Async Pipeline** integrates seamlessly into existing training logic at minimal
cost,
eliminating the need for additional sample storage management. The core mechanism uses `async_gen_next_batch`
for asynchronous rollout generation while maintaining continuous operation during epoch transitions
via `create_continuous_iterator`.
```python
# iterator generator, simplify one-step integration of the training process
def _create_continuous_iterator(self):
for epoch in range(self.config.trainer.total_epochs):
iterator = iter(self.train_dataloader)
for batch_dict in iterator:
yield epoch, batch_dict
# read the next batch of samples, sync parameters, and launch async gen_seq
def _async_gen_next_batch(self, continuous_iterator):
# read train_data
try:
epoch, batch_dict = next(continuous_iterator)
except StopIteration:
return None
batch = DataProto.from_single_dict(batch_dict)
gen_batch = batch_pocess(batch)
# sync weights from actor to rollout
self.sync_rollout_weights()
# async generation
gen_batch_output = self.rollout_wg.async_generate_sequences(gen_batch)
# future encapsulated
return GenerationBatchFuture(epoch, batch, gen_batch_output)
continuous_iterator = self._create_continuous_iterator()
# run rollout first to achieve one-step-off
batch_data_future = self._async_gen_next_batch(continuous_iterator)
while batch_data_future is not None:
# wait for the gen_seq result from the previous step
batch = batch_data_future.get()
# launch the next async call to generate sequences
batch_data_future = self._async_gen_next_batch(continuous_iterator)
# compute advantages
batch = critic.compute_values(batch)
batch = reference.compute_log_prob(batch)
batch = reward.compute_reward(batch)
batch = compute_advantages(batch)
# model update
critic_metrics = critic.update_critic(batch)
actor_metrics = actor.update_actor(batch)
```
### Parameter Synchronization
Our NCCL-based weight update for the rollout model performs very well.
Most of the time, the latency is under 300 ms, which is negligible for RLHF.
> **sync_rollout_weights**: Synchronizing parameters from the actor to the rollout is extremely fast and can almost
> be ignored because it is implemented with NCCL.
```python
class ActorRolloutRefWorker:
# actor acquires the meta-info of model parameters for parameter sync
@register(dispatch_mode=Dispatch.ONE_TO_ALL)
def get_actor_weights_info(self):
params = self._get_actor_params()
ret = []
for key, tensor in params.items():
ret.append((key, tensor.size(), tensor.dtype))
self._weights_info = ret
return ret
# rollout sets the meta-info of model parameters for parameter sync
@register(dispatch_mode=Dispatch.ONE_TO_ALL)
def set_actor_weights_info(self, weights_info):
self._weights_info = weights_info
class AsyncRayPPOTrainer(RayPPOTrainer):
def init_workers(self):
...
# rollout obtains the meta-info of model parameters from the actor for parameter sync
weights_info = self.actor_wg.get_actor_weights_info()[0]
self.rollout_wg.set_actor_weights_info(weights_info)
# Create an actor-rollout communication group for parameter sync
self.create_weight_sync_group
```
```python
# The driving process invokes the actor and rollout respectively to create a weight synchronization group based on nccl/hccl.
def create_weight_sync_group(self):
master_address = ray.get(self.actor_wg.workers[0]._get_node_ip.remote())
master_port = ray.get(self.actor_wg.workers[0]._get_free_port.remote())
world_size = len(self.actor_wg.workers + self.rollout_wg.workers)
self.actor_wg.create_weight_sync_group(
master_address,
master_port,
0,
world_size,
)
ray.get(
self.rollout_wg.create_weight_sync_group(
master_address,
master_port,
len(self.actor_wg.workers),
world_size,
)
)
# The driver process calls the actor and rollout respectively to sync parameters via NCCL
def sync_rollout_weights(self):
self.actor_wg.sync_rollout_weights()
ray.get(self.rollout_wg.sync_rollout_weights())
# fsdp model parameter sync
@register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False)
def sync_rollout_weights(self):
params = self._get_actor_params() if self._is_actor else None
if self._is_rollout:
inference_model = (
self.rollout.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model
)
from verl.utils.vllm.patch import patch_vllm_moe_model_weight_loader
patch_vllm_moe_model_weight_loader(inference_model)
# Model parameters are broadcast tensor-by-tensor from actor to rollout
for key, shape, dtype in self._weights_info:
tensor = torch.empty(shape, dtype=dtype, device=get_torch_device().current_device())
if self._is_actor:
assert key in params
origin_data = params[key]
if hasattr(origin_data, "full_tensor"):
origin_data = origin_data.full_tensor()
if torch.distributed.get_rank() == 0:
tensor.copy_(origin_data)
from ray.util.collective import collective
collective.broadcast(tensor, src_rank=0, group_name="actor_rollout")
if self._is_rollout:
inference_model.load_weights([(key, tensor)])
```
### PPO Correctness
To ensure the correctness of the PPO algorithm, we use rollout log_probs for PPO importance sampling.
For the related algorithm details, please refer to: https://verl.readthedocs.io/en/latest/algo/rollout_corr_math.html
The default mode is `bypass_ppo_clip`, but other modification strategies can also be explored.
### AgentLoop
In the current implementation, we no longer provide SPMD model rollout mode.
Instead, we have switched to AgentLoop mode, which also supports multi-turn tool calling.
## Usage
### FSDP2 Configuration Example
```shell
python3 -m verl.experimental.one_step_off_policy.async_main_ppo \
--config-path=config \
--config-name='one_step_off_ppo_trainer.yaml' \
actor_rollout_ref.actor.strategy=fsdp2 \
# actor and rollout are placed separately
actor_rollout_ref.hybrid_engine=False \
# actor and rollout resource
trainer.nnodes=1 \
trainer.n_gpus_per_node=6 \
rollout.nnodes=1 \
rollout.n_gpus_per_node=2
```
### Megatron Configuration Example
```shell
python3 -m verl.experimental.one_step_off_policy.async_main_ppo \
--config-path=config \
--config-name='one_step_off_ppo_megatron_trainer.yaml' \
actor_rollout_ref.actor.strategy=megatron \
# actor and rollout are placed separately
actor_rollout_ref.hybrid_engine=False \
# actor and rollout resource
trainer.nnodes=1 \
trainer.n_gpus_per_node=6 \
rollout.nnodes=1 \
rollout.n_gpus_per_node=2
```
### Configuration Guidelines
1. **Card Number Relationships**
Maintain either of these relationships for optimal batch distribution:
- `actor_rollout_ref.rollout.n` should be an integer divisor of:
`trainer.n_gpus_per_node * trainer.nnodes`
- `actor_rollout_ref.rollout.n * data.train_batch_size` should be evenly divisible by:
`trainer.n_gpus_per_node * trainer.nnodes`
> Rationale: Ensures training samples can be evenly distributed across training GPUs when using partial resources for
> generation.
2. **Dynamic Resource Tuning**
Adjust `trainer.nnodes`, `trainer.n_gpus_per_node`, `rollout.nnodes`, and `rollout.n_gpus_per_node` based on phase
durations:
- **Ideal state**: Rollout and training phases have comparable durations
- **Diagnostic metrics**:
- Monitor `wait_prev_gen` duration
- Analyze `sequence_length` distribution
- **Adjustment strategy**:
- High `wait_prev_gen` + uniform sequence lengths → Increase rollout resources
- High `wait_prev_gen` + long-tail sequences → Optimize stopping criteria (resource increase won't help)
> **wait_prev_gen**: The time spent waiting for the previous rollout to finish (the part that is not fully
> overlapped).
> **Resource Configuration Strategies:**
- **Resource-constrained scenario**: Optimize resource utilization by adjusting GPU allocation ratios,
keeping the number of nodes equal to allow training and rollout to share nodes;
- Configure `trainer.nnodes = rollout.nnodes` with
`trainer.n_gpus_per_node + rollout.n_gpus_per_node = physical_gpus_per_node`. Control rollout resource
allocation by adjusting `n_gpus_per_node`.
- **Resource-abundant scenario**: Optimize performance by adjusting the number of nodes,
keeping the number of GPUs per node equal to enable independent scaling of training and rollout
parallelism.
- Configure `trainer.n_gpus_per_node = rollout.n_gpus_per_node` and control rollout resource allocation by
adjusting `trainer.nnodes` and `rollout.nnodes` to achieve optimal performance.
> **Note**: The total number of nodes required by the system is not simply `trainer.nnodes + rollout.nnodes`. The
> actual calculation depends on GPU capacity:
>
> - When `trainer.n_gpus_per_node + rollout.n_gpus_per_node <= physical_gpus_per_node`,
> the required node count is `max(trainer.nnodes, rollout.nnodes)`
> - When `trainer.n_gpus_per_node + rollout.n_gpus_per_node > physical_gpus_per_node`,
> the required node count is `trainer.nnodes + rollout.nnodes`
## Functional Support
| Category | Support Situation |
| ------------------ | --------------------------------------------------------------------------------------------------------------- |
| train engine       | FSDP2 <br/> Megatron |
| rollout engine     | vLLM |
| AdvantageEstimator | GRPO <br/> GRPO_PASSK <br/> REINFORCE_PLUS_PLUS <br/> RLOO <br/> OPO <br/> REINFORCE_PLUS_PLUS_BASELINE <br/> GPG |
| Reward | all |
================================================
FILE: docs/advance/placement.rst
================================================
Ray API Design Tutorial
=======================================
Last updated: 10/30/2024.
We provide a tutorial for our Ray API design, including:
- Ray basic concepts
- Resource Pool and RayWorkerGroup
- Data Dispatch, Execution and Collection
- Initialize the RayWorkerGroup and execute the distributed computation in the given Resource Pool
See details in `tutorial.ipynb `_.
================================================
FILE: docs/advance/ppo_lora.rst
================================================
RL(HF) algorithms with LoRA Support
===========================================
Last updated: 02/03/2026.
We support LoRA (Low-Rank Adaptation) for reinforcement learning algorithms such as PPO, GRPO, and others.
LoRA is a parameter-efficient fine-tuning technique that injects trainable low-rank matrices into pre-trained weights (typically linear layers). This reduces memory footprint and compute cost, making it possible to fine-tune large models with limited hardware.
The benefits this brings include:
- reinforcement learning with very large models (e.g. 70B+) with modest hardware (e.g. 8x80G GPUs),
- enable larger batch sizes due to reduced memory usage,
- simplify model transfer and deployment, as only LoRA adapters need to be saved,
- Combine with techniques like `SLoRA `_ or `CCoE `_ to serve multiple LoRA adapters efficiently
This guide explains how to enable LoRA in RL training and configure related parameters.
FSDP Backend Usage Guide
------------------------
.. note::
This section applies to **FSDP/FSDP2 backend only**. For Megatron backend, see the :ref:`megatron-lora` section below.
1. LoRA is available in the `verl.trainer.ppo.ray_trainer.RayPPOTrainer`. Examples are provided via the `verl.trainer.main_ppo` entry point.
2. Currently, LoRA is supported via huggingface peft, only with fsdp/fsdp2 and vllm backend (sglang support coming soon).
- `strategy=fsdp` or `strategy=fsdp2`
- `rollout.name=vllm`
3. Required configurations for LoRA:
- `actor_rollout_ref.model.lora_rank`: int, set to a reasonable value greater than 0 (e.g., 8, 16, 32, 64)
- `actor_rollout_ref.model.lora_alpha`: float, the alpha term in LoRA
- `actor_rollout_ref.rollout.load_format="safetensors"`: required. This enables vLLM to load the base model.
- `actor_rollout_ref.model.target_modules`: the target modules for LoRA. Typically set to "all-linear".
4. Optional configurations for LoRA:
- `actor_rollout_ref.model.lora_adapter_path`: string, path to a pretrained LoRA adapter directory.
If provided, loads existing adapter instead of creating new one. Enables multi-stage training from previously saved adapters.
The directory must contain `adapter_model.safetensors` and `adapter_config.json`.
- `actor_rollout_ref.model.lora.merge`: bool, whether to merge LoRA adapters into the base model weights before transferring to vLLM.
If True, it will merge LoRA adapters into the base model weights before transferring to vLLM. If False, it will transfer only adapters to vLLM. This option is currently supported **only for engine-based rollout workers** (i.e. vLLM engine workers using the new worker implementation with ``trainer.use_legacy_worker_impl`` disabled) and is not available when using the legacy worker implementation.
5. Recommended options:
- `actor_rollout_ref.model.use_shm=True`: preload the model into `/dev/shm` to improve model loading speed.
- `actor_rollout_ref.rollout.layered_summon=True`: this enables the actor model to gather the FSDP shards layer by layer when synchronizing the LoRA adapter to vLLM, thereby reducing peak GPU memory. Recommended if the model is very large (70B+) or the GPU memory is limited (< 48GB).
.. _megatron-lora:
Megatron Backend Usage Guide
----------------------------
.. warning::
The FSDP-specific config options are **NOT applicable** to Megatron backend, and they will be ignored if set. Only options listed under ``lora`` key are applicable:
- ``actor_rollout_ref.model.lora.*``
- ``critic.model.lora.*``
You need to install and enable Megatron-Bridge for Megatron LoRA support.
Make sure you use a Megatron-Bridge version later than 0.2.0; we recommend using `this commit `_ or later for proper support. Use the following settings to enable Megatron-Bridge:
- ``actor_rollout_ref.actor.megatron.use_mbridge=True``
- ``actor_rollout_ref.actor.megatron.vanilla_mbridge=False``
**Key Differences from FSDP LoRA:**
1. **LoRA Implementation**: Verl Megatron backend uses Megatron-Bridge's native LoRA implementation, which differs from HuggingFace PEFT.
2. **Weight Sync / Refit Mechanism**: Currently, Megatron-Bridge supports syncing weights either by merging LoRA adapters into the base model weights before transferring to vLLM (better inference speed, but more refit time and potential precision loss) or by loading separate adapters.
**Configuration for Megatron LoRA:**
.. code-block:: yaml
actor_rollout_ref:
model:
lora:
# LoRA type: "lora", "vlm_lora", "canonical_lora", or "dora"
type: lora
# Whether to sync weights / refit by merging LoRA adapters into the base model weights before transferring to vLLM (better inference speed, but more refit time and potential precision loss). If False, separate adapters are loaded instead.
merge: False
# LoRA rank (Dimension of the low-rank projection space.). Set to 0 to disable LoRA
rank: 0
# Weighting factor for the low-rank projection. Defaults to 32
alpha: 32
# Dropout rate for the low-rank projection. Defaults to 0.0
dropout: 0.0
# A list of module names to apply LoRA to.
# For fused LoRA, Defaults to all linear layers ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2'].
# For canonical LoRA: ["linear_q", "linear_k", "linear_v", "linear_proj", "linear_fc1_up", "linear_fc1_gate", "linear_fc2"]
# - 'linear_qkv': Apply LoRA to the fused linear layer used for query, key, and value projections in self-attention
# - 'linear_proj': Apply LoRA to the linear layer used for projecting the output of self-attention
# - 'linear_fc1': Apply LoRA to the first fully-connected layer in MLP
# - 'linear_fc2': Apply LoRA to the second fully-connected layer in MLP
# Target modules can also contain wildcards. For example, you can specify
# target_modules=['*.layers.0.*.linear_qkv', '*.layers.1.*.linear_qkv'] to add LoRA to only linear_qkv on the first two layers
#
# Note:
# For MLA (e.g., DeepSeek), you should use ["linear_kv_down_proj","linear_kv_up_proj","linear_q_down_proj","linear_q_up_proj","linear_q_proj"]
# Instead of "linear_qkv" or ["linear_q","linear_k","linear_v"]
# By default, MoE routers are excluded from LoRA adaptation, and you will need to specify "router" in target_modules to include them.
target_modules:
- linear_qkv
- linear_proj
- linear_fc1
- linear_fc2
# A list of module names not to apply LoRA to. It will match all nn.Linear & nn.Linear-adjacent modules whose name
# does not match any string in exclude_modules. If used, target_modules must be an empty list or None
exclude_modules: []
# Position for applying dropout, can be 'pre' (before the low-rank projection) or 'post' (after). Defaults to 'pre'
dropout_position: pre
# Initialization method for the low-rank matrix A. Defaults to "xavier".
lora_A_init_method: xavier
# Initialization method for the low-rank matrix B. Defaults to "zero".
lora_B_init_method: zero
# Enables the experimental All-to-All (A2A) communication strategy. Defaults to False
a2a_experimental: False
# Parameter data type for LoRA weights. Default to null, which will use model's dtype.
dtype: null
# Path to pre-trained LoRA adapter weights (null to train from scratch)
adapter_path: null
# Whether to fully shard LoRA adapters. Defaults to False
# https://docs.vllm.ai/en/latest/api/vllm/config/lora/#vllm.config.lora.LoRAConfig.fully_sharded_loras
fully_sharded_loras: False
# VLMLoRA additionally allows the user to specify whether the language or vision models should be frozen.
# For example, a common finetuning workload for multimodal models is to apply adapters to language model and fully
# finetune the vision model.
freeze_vision_model: True
freeze_vision_projection: True
freeze_language_model: True
LoRA training experiment with Qwen3-8B on 8 * H200 single node comparing FSDP and Megatron backend (script adapted from examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh):
.. image:: https://github.com/user-attachments/assets/0482f423-01a3-4e52-a7ee-8b9cd79b7b1a
.. image:: https://github.com/user-attachments/assets/6ce10400-8164-47d8-90a6-c1bf002fb9e8
.. image:: https://github.com/user-attachments/assets/092d3a43-4eba-425e-a584-8d83c1f02de4
Best Practices and Notes
-------------------------
1. **Learning rate**: it is recommended to increase the value of learning rate by an order of magnitude.
2. **LoRA Rank**:
- Too small a rank can hurt convergence.
- LoRA rank recommendation from @thelongestusernameofall:
- A very small lora_rank can lead to slower convergence or worse training performance. It is recommended to set lora_rank >= 32. Tests have shown that for a 0.5B model with lora_rank=32, the training convergence speed and final performance are almost identical to non-LoRA training.
- For a 32B model with lora_rank=128, the training convergence speed and final performance are also almost identical to non-LoRA training.
- More comprehensive reference results are coming soon.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/f2b80b8b26829124dd393b7a795a0640eff11644/docs/lora.jpg?raw=true
3. **FSDP-Specific:** Reference configuration for RL training with the Qwen2.5-72B model using 8 x 80GB GPUs (increase lora_rank if needed):
.. code-block::
data.train_batch_size=64 \
actor_rollout_ref.model.use_shm=True \
actor_rollout_ref.model.lora_rank=32 \
actor_rollout_ref.model.lora_alpha=32 \
actor_rollout_ref.model.target_modules=all-linear \
actor_rollout_ref.actor.optim.lr=3e-5 \
actor_rollout_ref.actor.fsdp_config.fsdp_size=8 \
actor_rollout_ref.actor.fsdp_config.param_offload=True \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
actor_rollout_ref.rollout.tensor_model_parallel_size=8 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.rollout.max_num_seqs=64 \
actor_rollout_ref.rollout.max_model_len=1536 \
actor_rollout_ref.rollout.max_num_batched_tokens=1536 \
actor_rollout_ref.rollout.load_format=safetensors \
actor_rollout_ref.rollout.layered_summon=True \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
Example Scripts
-------------------
For end-to-end examples, refer to the scripts below:
**FSDP Examples:**
- LoRA training from scratch: examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora.sh
- LoRA training from adapter path: examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora_from_adapter.sh
**Megatron Examples:**
- LoRA training with Dense: examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh
- LoRA training with MoE: examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh
================================================
FILE: docs/advance/reward_loop.rst
================================================
Reward Loop
===========
.. _yyding: https://yyding1.github.io
Author: `Yuyang Ding `_
Last updated: 2/10/2026.
Introduction
------------
Reward Loop is the default reward computation implementation in verl.
It is designed to support efficient, flexible, and easy-to-use reward computation.
This document introduces the usage and architectural design.
Key features include:
1. **Distributed reward manager**, enabling scalable and efficient reward computation.
2. **Support for hybrid reward settings**, including both generative and discriminative reward models, as well as more complex reward scenarios.
3. **Simple and extensible interface**, for easily defining customized reward functions.
Distributed Reward manager
--------------------------
.. image:: https://github.com/yyDing1/verl-materials/blob/main/distributed_reward_manager.svg?raw=true
How distributed
~~~~~~~~~~~~~~~
Under the single_controller setup, actor rollout and reward computation can be abstracted as:
.. code:: python
# initialize rollout manager and async reward loop manager
async_rollout_manager = AgentLoopManager(config)
async_reward_manager = RewardLoopManager(config)
# actor rollout using `async_rollout_manager`
gen_batch = async_rollout_manager.generate_sequences(batch)
# compute reward using `async_reward_manager`
reward_batch = async_reward_manager.compute_rm_score(gen_batch)
Within the ``RewardLoopManager``, multiple ``RewardWorker`` are launched across all nodes to enable distributed reward computation.
The number of parallel workers can be configured via ``config.reward.num_workers``.
Upon receiving a batch reward request, the batch is partitioned into smaller chunks and distributed to each reward worker for parallel execution.
Users only need to invoke ``compute_rm_score``.
.. code:: python
class RewardLoopManager:
"""
RewardLoopManager run in single controller.
This class will create reward loop workers and manage them.
"""
def _init_reward_loop_workers(self):
self.reward_loop_workers = [...]
def compute_rm_score(self, data):
chunks = data.chunk(len(self.reward_loop_workers))
outputs = ray.get(
[
worker.compute_score_batch.remote(chunk)
for worker, chunk in zip(self.reward_loop_workers, chunks, strict=True)
]
)
outputs_flat = [item for sublist in outputs for item in sublist]
...
This is how the reward manager is parallelized and distributed across all nodes.
Streaming Reward with Rollout
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Furthermore, we check whether actor rollout and reward computation can be performed in a streaming manner,
where the reward is calculated as soon as each sample is rolled out.
.. code:: python
# agent_reward_loop: streaming reward computation with actor rollout
# two conditions satisfied: (1) rule-based reward, or (2) reward model with extra resource pool
enable_agent_reward_loop = not use_rm or config.reward.reward_model.enable_resource_pool
# if enable_agent_reward_loop, we directly pass reward_loop_workers to agent loop manager
# to stream reward computation with actor rollout
reward_loop_worker_handles = async_reward_manager.reward_loop_workers if enable_agent_reward_loop else None
async_rollout_manager = AgentLoopManager(
config=config,
worker_group=actor_rollout_wg,
rollout_resource_pool=actor_rollout_resource_pool,
reward_loop_worker_handles=reward_loop_worker_handles,
)
Hybrid Reward Scenarios Usage
-----------------------------
As described above, each ``reward_loop_worker`` is responsible for handling reward requests.
The rewards can be categorized as follows:
- **Rule-based Reward**: The reward is determined by predefined rules, e.g., checking whether the predicted answer matches the ground truth via string matching.
- **Discriminative Reward Model (DisRM)**: The reward is produced by a specified discriminative reward model, such as ``Skywork/Skywork-Reward-Llama-3.1-8B-v0.2``.
- **Generative Reward Model (GenRM)**: The reward is obtained using a generative reward model, for example ``dyyyyyyyy/FAPO-GenRM-4B``.
- **Hybrid Reward Scenarios**: A combination of the above reward types, e.g., rule + GenRM.
.. code:: python
class RewardLoopWorker:
async def compute_score_batch(self, data: DataProto) -> list[dict]:
tasks = []
for i in range(len(data)):
tasks.append(asyncio.create_task(self.compute_score(data[i : i + 1])))
outputs = await asyncio.gather(*tasks)
return outputs
async def compute_score(self, data: DataProto) -> dict:
assert len(data) == 1, "RewardLoopWorker only support single data item"
if self.config.reward.custom_reward_function.path is not None:
# directly use user-customized reward function
return await self.reward_manager.run_single(data)
else:
if self.config.reward.reward_model.enable:
# we assume the rm is disrm
# genrm must set custom_reward_function
return await self.compute_score_disrm(data)
else:
return await self.reward_manager.run_single(data)
Each ``RewardLoopWorker`` initializes one ``RewardManager``, splits the batch into individual data items, and processes them in parallel using asynchronous tasks.
Reward Manager
~~~~~~~~~~~~~~
The ``RewardManager`` maintains a reward function and defines its computation logic, including:
- **naive**: The simplest implementation.
- **dapo**: DAPO implementation with an overlong reward penalty.
- **limit**: Restricts the concurrency of the reward function, useful when external API calls are rate-limited.
- **remote**: Runs in a separate process, effective for CPU-intensive tasks such as ``Math-Verify``.
Users can also customize their own ``RewardManager``, inheriting from ``RewardManagerBase``, and implementing the ``run_single`` function.
.. code:: python
@register("user_costomized")
class UserCostomizedRewardManager(RewardManagerBase):
async def run_single(self, data: DataProto) -> dict:
assert len(data) == 1, "Only support single data item"
# your own reward manager
...
After defining it, users can specify their custom reward manager by setting ``reward.reward_manager.name=user_costomized``.
Rule-Based Reward
~~~~~~~~~~~~~~~~~
If ``reward.custom_reward_function`` is provided, the user-defined reward function will be used. Otherwise, it falls back to the default reward function.
Note that the custom function can be either synchronous or asynchronous; the system automatically detects its type and loads it accordingly.
We recommend **using asynchronous functions** when reward computation needs to involve external model API calls or sandboxed execution, as they are significantly more efficient.
.. code:: python
async def compute_score(data_source, solution_str, ground_truth, extra_info):
"""Compute a score by sending an async request to a remote service."""
# prepare request payload
payload = {"messages": [{"role": "user", "content": "check the correcness of the question and response ..."}], ...}
# send async HTTP request
async with aiohttp.ClientSession() as session:
async with session.post("https://api.openai.com/v1/chat/completions", json=payload) as resp:
result = await resp.json()
# parse and return score
score = int(result["choices"][0]["message"]["content"].strip().split("\n")[-1])
return {"score": score}
Model-Based Reward
~~~~~~~~~~~~~~~~~~
**For discriminative reward model (DisRM)**, we provide a simple implementation:
.. code:: python
class RewardLoopWorker:
async def compute_score_disrm(self, data) -> dict:
disrm_prompt = await self._preprocess_reward_inputs(data)
payloads = {
"model": model_name,
"input": disrm_prompt,
"activation": False,
}
output = await self._post_request(payloads, "classify")
rm_score = output["data"][-1]["probs"][-1]
return {"reward_score": rm_score}
We pass the question and the model rollout as inputs to the reward model and obtain a reward score. This is also the standard practice for most DisRMs.
Users should provide ``reward.reward_model.model_path`` to specify the reward model.
**For generative reward model (GenRM)**
For generative reward model scenarios, users need to specify both ``reward.reward_model.model_path`` and ``reward.custom_reward_function``.
The custom reward function should implement the following components:
- Convert the question and the model rollout into a GenRM input prompt using a custom prompt template.
- Invoke the GenRM to perform generation with custom sampling parameters. For this purpose, the Reward Loop provides an HTTP interface (i.e., ``reward_router_address``) for interacting with GenRM.
- Parse the GenRM output using a custom parser and extract the reward score.
As these steps are highly customizable and task-dependent, we offer this flexibility entirely to the user-defined reward function.
Below we provide an example of a custom reward function using GenRM.
.. code:: python
async def compute_score_gsm8k(
data_source: str,
solution_str: str,
ground_truth: str,
extra_info: dict,
reward_router_address: str, # an HTTP router endpoint provided by Reward Loop
reward_model_tokenizer: PreTrainedTokenizer,
):
"""Compute the reward score."""
# Step 1: Prepare prompt and request payload
grm_prompt = GRM_PROMPT_TEMPLATE.format(problem=extra_info["question"], solution=solution_str)
messages = [{"role": "user", "content": grm_prompt}]
sampling_params = {"temperature": 0.7, "top_p": 0.8, "max_tokens": 4096}
chat_complete_request = {"messages": messages, **sampling_params}
# Step 2: Send async request to the reward model
# here, chat_complete sends async http request to the router address
result = await chat_complete(
router_address=reward_router_address,
chat_complete_request=chat_complete_request,
)
# Step 3: Parse model response and extract score
grm_response = result.choices[0].message.content.strip()
try:
score_str = grm_response.split("\n\n")[-1].strip()
score = int(score_str)
except Exception:
score = 0
return {"score": score}
**For hybrid reward scenarios**, such as combining rule-based rewards with a GenRM, the custom reward function can be written in the same way as above.
.. _recipe/fapo: https://github.com/verl-project/verl-recipe/tree/main/fapo
A runnable and reproducible example that demonstrates how to use a rule-based reward function together with a GenRM is provided in the `recipe/fapo`_ directory for reference. Feel free to use and cite it.
Reward Model Arch Design
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
We support multiple execution modes for reward models:
- **Colocate Mode**: The reward model shares the same resource pool as the actor/rollout/reference models. In this setup, all rollouts must complete first, after which the reward model is awakened to perform inference.
- **Standalone Mode**: The reward model runs on a separate resource pool, independent from the actor/rollout/reference models. In this setup, each sample is evaluated by the reward model immediately after its rollout finishes.
The standalone mode can enable the streaming manner stated above.
By default, the system runs in colocate mode. Users can enable standalone mode by setting ``reward.reward_model.enable_resource_pool=True`` and allocating the corresponding resources via ``reward.reward_model.nnodes`` and ``reward.reward_model.n_gpus_per_node``.
.. image:: https://github.com/yyDing1/verl-materials/blob/main/reward_loop.svg?raw=true
To support flexible and scalable reward model computation, we implement a reward router that coordinates requests among multiple reward model servers.
Each reward model runs as an independent server and is registered with the router.
This router will forward the requests to the registered reward servers with load balancing and return the results.
This design allows us to expose a single unified router address to user-defined reward functions, enabling them to access various reward models seamlessly through the same interface.
.. image:: https://github.com/yyDing1/verl-materials/blob/main/reward_loop_full.svg?raw=true
.. code:: python
class RewardModelManager:
"""Reward model manager."""
def __init__(
self,
config: RewardModelConfig,
resource_pool: RayResourcePool = None,
):
"""
Initialize the reward model manager.
Args:
config (RewardModelConfig): Reward model configuration.
resource_pool (RayResourcePool, optional): Resource pool. Defaults to None.
"""
self.config = config
self.resource_pool = resource_pool
self._initialize_llm_servers()
self._initialize_router()
================================================
FILE: docs/advance/rollout_skip.rst
================================================
RolloutSkip Function Usage Documentation
========================================
Last updated: 08/01/2025.
Applicable Scenarios
--------------------
The RolloutSkip functionality is designed to accelerate the rollout process in reinforcement learning training by caching and reusing previously generated sequences. This feature is particularly useful when:
1. You need to repeatedly run experiments with the same configuration
2. You want to save time by avoiding redundant sequence generation to come close to the optimal policy
API and Usage Example
----------------------
2.1 Trainer Adaptation
~~~~~~~~~~~~~~~~~~~~~~
Both `RayDAPOTrainer()` (in `verl/recipe/dapo/dapo_ray_trainer.py`) and `RayPPOTrainer()` (in `verl/trainer/ppo/ray_trainer.py`) have already been adapted.
This is an example of how to patch rollout_skip in RayPPOTrainer.
.. code-block:: python
#* Import the RolloutSkip class
from verl.utils.rollout_skip import RolloutSkip
...
class RayPPOTrainer:
...
def fit(self):
...
#* Add code as follows:
rollout_skip = RolloutSkip(self.config, self.actor_rollout_wg)
rollout_skip.wrap_generate_sequences()
...
for epoch in range(self.config.trainer.total_epochs):
for batch_dict in self.train_dataloader:
...
2.2 Basic Configuration
~~~~~~~~~~~~~~~~~~~~~~~
Then, you should add the following parameters to your config to enable the RolloutSkip feature:
.. code-block:: bash
actor_rollout_ref.rollout.skip_rollout=True \
actor_rollout_ref.rollout.skip_dump_dir="/tmp/rollout_dump" \
Note:
1. The `skip_dump_dir` is the directory where the cached sequences will be stored. Ensure that this directory is writable and accessible by your training process. Also make sure that `skip_dump_dir` is not a relative path, because ray will store the data in `/tmp/ray/session_/` and a relative path will not be found in the worker.
2. The dumped data path follows this naming pattern `{experiment_name}_{project_name}_TrainGBS{train_gbs}__InferGBS{gen_gbs}__N{n}`, once you change the `experiment_name`, `project_name`, `train_gbs`, `gen_gbs`, or `n`, the cached data will be stored in a new directory.
================================================
FILE: docs/advance/rollout_trace.rst
================================================
Trace Function Usage Instructions
========================================
Last updated: 07/10/2025.
Applicable Scenarios
--------------------
Agentic RL involves multiple turns of conversations, tool invocations, and user interactions during the rollout process. During the Model Training process, it is necessary to track function calls, inputs, and outputs to understand the flow path of data within the application. The Trace feature helps, in complex multi-round conversations, to view the transformation of data during each interaction and the entire process leading to the final output by recording the inputs, outputs, and corresponding timestamps of functions, which is conducive to understanding the details of how the model processes data and optimizing the training results.
The Trace feature integrates commonly used Agent trace tools, including wandb weave and mlflow, which are already supported. Users can choose the appropriate trace tool according to their own needs and preferences. Here, we introduce the usage of each tool.
Trace Parameter Configuration
-----------------------------
- ``actor_rollout_ref.rollout.trace.backend=mlflow|weave`` # the trace backend type
- ``actor_rollout_ref.rollout.trace.token2text=True`` # To show decoded text in trace view
- ``actor_rollout_ref.rollout.trace.max_samples_per_step_per_worker=N`` # Limit traces per worker (optional)
Limiting Trace Volume
~~~~~~~~~~~~~~~~~~~~~~
By default, all samples are traced, which can generate large amounts of data and incur significant costs with trace backends like Weave or MLflow. To limit trace volume while maintaining representative coverage, use ``max_samples_per_step_per_worker``.
Example configuration:
.. code-block:: yaml
actor_rollout_ref:
rollout:
trace:
backend: weave
token2text: False
max_samples_per_step_per_worker: 5 # Each worker traces 5 random samples
Each agent loop worker independently selects up to N unique samples to trace per training step. For GRPO (``n > 1``), all rollouts for selected samples are traced. Total traces per step = max_samples_per_step_per_worker * num_workers * n.
Example: With 4 workers, max_samples_per_step_per_worker=5, and GRPO n=4, you get 4 * 5 * 4 = 80 traces per step instead of tracing all samples. Set to null (default) to trace all samples.
Glossary
--------
+----------------+------------------------------------------------------------------------------------------------------+
| Object | Explanation |
+================+======================================================================================================+
| trajectory | A complete multi-turn conversation includes: |
| | 1. LLM output at least once |
| | 2. Tool Call |
+----------------+------------------------------------------------------------------------------------------------------+
| step | The training step corresponds to the global_steps variable in the trainer |
+----------------+------------------------------------------------------------------------------------------------------+
| sample_index | The identifier of the sample, defined in the extra_info.index of the dataset. It is usually a number,|
| | but may also be a uuid in some cases. |
+----------------+------------------------------------------------------------------------------------------------------+
| rollout_n | In the GRPO algorithm, each sample is rolled out n times. rollout_n represents the serial number of |
| | the rollout. |
+----------------+------------------------------------------------------------------------------------------------------+
| validate | Whether the test dataset is used for evaluation? |
+----------------+------------------------------------------------------------------------------------------------------+
Rollout trace functions
-----------------------
There are 2 functions used for tracing:
1. ``rollout_trace_op``: This is a decorator function used to mark the functions to trace. By default, only a few methods have it; you can add it to more functions to trace more information.
2. ``rollout_trace_attr``: This function is used to mark the entry of a trajectory and input some info to trace. If you add new type of agent, you may need to add it to enable trace.
Usage of wandb weave
--------------------
1.1 Basic Configuration
~~~~~~~~~~~~~~~~~~~~~~~
1. Set the ``WANDB_API_KEY`` environment variable
2. Configuration Parameters
1. ``actor_rollout_ref.rollout.trace.backend=weave``
2. ``trainer.logger=['console', 'wandb']``: This item is optional. Trace and logger are independent functions. When using Weave, it is recommended to also enable the wandb logger to implement both functions in one system.
3. ``trainer.project_name=$project_name``
4. ``trainer.experiment_name=$experiment_name``
5. ``actor_rollout_ref.rollout.mode=async``: Since trace is mainly used for agentic RL, you need to enable the agent loop using async mode for either vllm or sglang.
Note:
The Weave Free Plan comes with a default monthly network traffic allowance of 1GB. During the training process, the amount of trace data generated is substantial, reaching dozens of gigabytes per day, so it is necessary to select an appropriate wandb plan.
1.2 View Trace Logs
~~~~~~~~~~~~~~~~~~~
After executing the training, on the project page, you can see the WEAVE sidebar. Click Traces to view it.
Each Trace project corresponds to a trajectory. You can filter and select the trajectories you need to view by step, sample_index, rollout_n, and experiment_name.
After enabling token2text, prompt_text and response_text will be automatically added to the output of ToolAgentLoop.run, making it convenient to view the input and output content.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/weave_trace_list.png?raw=true
1.3 Compare Trace Logs
~~~~~~~~~~~~~~~~~~~~~~
Weave can select multiple trace items and then compare the differences among them.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/weave_trace_compare.png?raw=true
Usage of mlflow
---------------
1. Basic Configuration
~~~~~~~~~~~~~~~~~~~~~~
1. Set the ``MLFLOW_TRACKING_URI`` environment variable, which can be:
1. Http and https URLs corresponding to online services
2. Local files or directories, such as ``sqlite:////tmp/mlruns.db``, indicate that data is stored in ``/tmp/mlruns.db``. When using local files, it is necessary to initialize the file first (e.g., start the UI: ``mlflow ui --backend-store-uri sqlite:////tmp/mlruns.db``) to avoid conflicts when multiple workers create files simultaneously.
2. Configuration Parameters
1. ``actor_rollout_ref.rollout.trace.backend=mlflow``
2. ``trainer.logger=['console', 'mlflow']``. This item is optional. Trace and logger are independent functions. When using mlflow, it is recommended to also enable the mlflow logger to implement both functions in one system.
3. ``trainer.project_name=$project_name``
4. ``trainer.experiment_name=$experiment_name``
2. View Log
~~~~~~~~~~~
Since ``trainer.project_name`` corresponds to Experiments in mlflow, in the mlflow view, you need to select the corresponding project name, then click the "Traces" tab to view traces. Among them, ``trainer.experiment_name`` corresponds to the experiment_name of tags, and tags corresponding to step, sample_index, rollout_n, etc., are used for filtering and viewing.
For example, searching for ``"tags.step = '1'"`` can display all trajectories of step 1.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/mlflow_trace_list.png?raw=true
Opening one of the trajectories allows you to view each function call process within it.
After enabling token2text, prompt_text and response_text will be automatically added to the output of ToolAgentLoop.run, making it convenient to view the content.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/mlflow_trace_view.png?raw=true
Note:
1. mlflow does not support comparing multiple traces
2. rollout_trace can not associate the mlflow trace with the run, so the trace content cannot be seen in the mlflow run logs.
================================================
FILE: docs/advance/rope.rst
================================================
RoPE Scaling override
=======================================
Last updated: 05/14/2025.
Some models such as `Qwen/Qwen2.5-7B-Instruct <https://huggingface.co/Qwen/Qwen2.5-7B-Instruct>`_ support RoPE Scaling but don't have it defined in their config.json file.
For example, this model supports this configuration:
.. code:: python
{
...,
"rope_scaling": {
"factor": 4.0,
"original_max_position_embeddings": 32768,
"type": "yarn"
}
}
In order to support a longer context for such models, you must override the model configs when starting the trainer.
PPO example:
.. code:: bash
+actor_rollout_ref.model.override_config.rope_scaling.type=yarn \
+actor_rollout_ref.model.override_config.rope_scaling.factor=4.0 \
+actor_rollout_ref.model.override_config.rope_scaling.original_max_position_embeddings=32768 \
And for the critic model
.. code:: bash
+critic.model.override_config.rope_scaling.type=yarn \
+critic.model.override_config.rope_scaling.factor=4.0 \
+critic.model.override_config.rope_scaling.original_max_position_embeddings=32768 \
================================================
FILE: docs/algo/baseline.md
================================================
# Algorithm Baselines
Last updated: 06/18/2025.
## Math related datasets
### GSM8k
Assuming GSM8k/math dataset is preprocessed via:
```bash
python3 examples/data_preprocess/*.py
```
Refer to the table below to reproduce RL training from different pre-trained checkpoints. Below is the performance on the GSM8k dataset if not specified otherwise. More comprehensive benchmark results are available in the recipe folder.
| Hardware | Model | Method | Test score | Details |
| ---------- | -------------------------------- | --------------- | ------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| NVIDIA GPU | google/gemma-2-2b-it | hf checkpoint | 23.9 | [Huggingface](https://huggingface.co/google/gemma-2-2b-it#benchmark-results) |
| NVIDIA GPU | google/gemma-2-2b-it | SFT | 52.06 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/gemma-2-2b-it-sft-0.411.log) |
| NVIDIA GPU | google/gemma-2-2b-it | SFT + PPO | 64.02 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/gemma-2-2b-it-ppo-bsz512_4-prompt1024-resp-512-0.640.log), [wandb](https://api.wandb.ai/links/verl-team/h7ux8602) |
| NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | hf checkpoint | 49.6 | [Qwen blog](https://qwen.ai/blog?id=qwen2.5-llm) |
| NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | PPO | 56.7 | [command and log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log) |
| NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | PRIME | 58.7 | [script](https://github.com/verl-project/verl-recipe/blob/main//prime/run_prime_qwen.sh), [wandb](https://api.wandb.ai/links/zefan-wang-thu-tsinghua-university/rxd1btvb) |
| NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | GRPO-LoRA | 54.3 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz64_2-prompt512-resp1024-lorarank32-score0.543.log) |
| NVIDIA GPU | Qwen/Qwen2.5-1.5B-Instruct | GRPO-LoRA | 77.9 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-1.5B-bsz64_2-prompt512-resp1024-lorarank32-score0.779.log) |
| NVIDIA GPU | Qwen/Qwen2.5-3B-Instruct | GRPO-LoRA | 86.1 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-3B-bsz64_2-prompt512-resp1024-lorarank32-score0.861.log) |
| NVIDIA GPU | deepseek-ai/deepseek-llm-7b-chat | PPO (Megatron) | 69.5 [1] | [log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/deepseek-llm-7b-chat-megatron-bsz256_4-prompt512-resp512-0.695.log), [wandb](https://wandb.ai/verl-team/verl_megatron_gsm8k_examples/runs/10fetyr3) |
| NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GRPO | 89 | [script](https://github.com/volcengine/verl/blob/a65c9157bc0b85b64cd753de19f94e80a11bd871/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh) |
| NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GRPO (FSDP2) | 89.8 | [log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/qwen2-7b-fsdp2.log) |
| NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GRPO (Megatron) | 89.6 | [log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/qwen2-7b_math_megatron.log) |
| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | ReMax | 97 | [script](https://github.com/eric-haibin-lin/verl/blob/main/examples/remax_trainer/run_qwen2.5-3b_seq_balance.sh), [wandb](https://wandb.ai/liziniu1997/verl_remax_example_gsm8k/runs/vxl10pln) |
| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | SPPO | 65.6 (MATH) | [SPPO script](https://github.com/verl-project/verl-recipe/tree/main/sppo/README.md) |
| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | GRPO-LoRA | 93.4 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-7B-bsz64_8-prompt512-resp1024-lorarank32-score0.934.log) |
| NVIDIA GPU | Mixtral-8x22B-Instruct-v0.1 | Instruct model | 83.7 | [Qwen Blog](https://qwen.ai/blog?id=qwen2.5-llm) |
| NVIDIA GPU | Mixtral-8x22B-Instruct-v0.1 | RLOO (Megatron) | 92.3 | [wandb](https://api.wandb.ai/links/ppo_dev/sbuiuf2d) |
| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | SPIN | 92 | [script](https://github.com/verl-project/verl-recipe/tree/main/spin/README.md) |
| NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GPG | 88 | [log](https://github.com/diqiuzhuanzhuan/verldata/blob/main/run_logs/qwen2-7b_math.log), [wandb](https://wandb.ai/diqiuzhuanzhuan/verl_gpg_example_gsm8k_math/runs/ab86c4va) |
| NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GPG (Megatron) | 88 | [log](https://github.com/diqiuzhuanzhuan/verldata/blob/main/run_logs/qwen2-7b_math_megatron.log), [wandb](https://wandb.ai/diqiuzhuanzhuan/verl_gpg_example_gsm8k_math/runs/yy8bheu8) |
| NVIDIA GPU | Qwen/Qwen2.5-VL-7B-Instruct | GRPO (Megatron) | 65.4 (GEO3k) | [script](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh), [wandb](https://api.wandb.ai/links/megatron-core-moe-dev/1yngvkek) |
| AMD MI300 | deepseek-ai/deepseek-llm-7b-chat | PPO | 70.5 [1] | [log](https://github.com/yushengsu-thu/verl_training_log/blob/main/gsm8k/ppo_run_deepseek7b_llm.log) |
| AMD MI300 | deepseek-ai/deepseek-llm-7b-chat | GRPO | 71.4 [1] | [log](https://github.com/yushengsu-thu/verl_training_log/blob/main/gsm8k/grpo_run_deepseek7b_llm.log) |
| NVIDIA GPU | Qwen/Qwen2.5-14B-Instruct | GRPO-LoRA | 94.6 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-14B-bsz64_8-prompt512-resp1024-lorarank32-score0.946.log) |
| NVIDIA GPU | Qwen/Qwen2.5-32B-Instruct | GRPO-LoRA | 95.8 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-32B-bsz64_8-prompt512-resp1024-lorarank32-score0.958.log) |
| NVIDIA GPU | Qwen/Qwen2.5-72B-Instruct | GRPO-LoRA | 96.0 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-72B-bs64_8-prompt512-resp1024-lorarank32-score0.960.log) |
### DAPO math-17k
- Training DAPO math-17k dataset: https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k
- Testing: AIME'24: https://huggingface.co/datasets/BytedTsinghua-SIA/AIME-2024
Note:
- For Qwen/Qwen2.5-Math-7B, we directly modify the max_position_embeddings to 32768 without observing performance degradation in order to train longer response length.
| Hardware | Model | Method | Test score | Details |
| ---------- | -------------------------- | ----------------------- | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| NVIDIA GPU | Qwen/Qwen2.5-Math-7B (32k) | DAPO | 36.3 | [command](https://github.com/verl-project/verl-recipe/blob/main//dapo/test_dapo_7b_math.sh), [logs](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361) |
| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | DAPO + Code Interpreter | 40.0 | [command](https://github.com/verl-project/verl-recipe/blob/main//retool/run_qwen2_7b_dapo.sh) |
## Coding related datasets
Below is the result on leetcode if not specified otherwise.
| Hardware | Model | Method | Test score | Details |
| ---------- | ----------------------- | ------ | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| NVIDIA GPU | PRIME-RL/Eurus-2-7B-SFT | PRIME | 36.1 | [script](https://github.com/verl-project/verl-recipe/blob/main//prime/run_prime_qwen_code.sh), [swanlab](https://swanlab.cn/@wangzefan/prime_example/runs/7f541qhspgmy8nmhdlx35/chart) |
### Notes
[1] During evaluation, we have only extracted answers following the format `"####"`. A more flexible answer extraction, longer response length, and better prompt engineering may lead to a higher score.
[2] The default value of `actor_rollout_ref.actor.entropy_coeff` is set to `0.0` since verl 0.3.x on 2025-05-30, which is different from previous versions.
================================================
FILE: docs/algo/collabllm.md
================================================
# Recipe: CollabLLM
Last updated: 09/22/2025.
> Open-Source Algorithm Implementation & Experiment Running: [Haiquan Chen](https://github.com/chenhaiq), [Shirley Wu](https://github.com/Wuyxin)
🏠 [Homepage](https://aka.ms/CollabLLM) | 📝 [Paper](https://arxiv.org/pdf/2502.00640) | 🤗 [Datasets & Models](https://huggingface.co/collabllm) | ⭐️ [Original Implementation](https://github.com/Wuyxin/collabllm)
`verl` provides a recipe for the Outstanding Paper at ICML 2025, **"CollabLLM: From Passive Responders to Active Collaborators"**. [CollabLLM](https://aka.ms/CollabLLM) is a unified fine-tuning framework that optimizes LLMs for effective and efficient multiturn collaboration with users.
**Core Idea:** Models are rewarded based on how well their responses enable effective *future* collaboration with users.
Paper Authors: [Shirley Wu](https://cs.stanford.edu/~shirwu/), [Michel Galley](https://www.microsoft.com/en-us/research/people/mgalley/), Baolin Peng, Hao Cheng, Gavin Li, Yao Dou, Weixin Cai, [James Zou](https://www.james-zou.com/), [Jure Leskovec](https://cs.stanford.edu/people/jure/), [Jianfeng Gao](https://www.microsoft.com/en-us/research/people/jfgao/)
---
## Quick Start
### 0. Environment
Make sure the required packages for `verl` are installed. Additionally, install `litellm` and export the required API keys. The API model will be used for user simulators and, optionally, LLM Judges (see the Configuration section below).
### 1. Prepare Your Dataset
First, process your dataset using the provided script (see example commands and usage in `process_dataset.py`):
```bash
python process_dataset.py --dataset <> ... --dataset_type
```
**Requirements:**
- Input: A Hugging Face multiturn dataset. Existing datasets: `collabllm/collabllm-multiturn-$DATASET`, with `DATASET` in one of [`math-hard(-large)`, `medium(-large)`, `bigcodebench(-large)`] (*-large are the datasets used in the CollabLLM paper)
- Example format: See [collabllm-multiturn-math-hard](https://huggingface.co/datasets/collabllm/collabllm-multiturn-math-hard)
- To generate your own dataset: Use [build_dataset.py](https://github.com/Wuyxin/collabllm/blob/main/scripts/engine/build_dataset.py) from the original CollabLLM repository
### 2. Train Your Model
**(Optional) For Supervised Fine-Tuning (SFT):**
```bash
bash train_sft_collabllm.sh
```
**For Reinforcement Learning (RL):**
```bash
bash train_rl_collabllm.sh
```
The RL script shows an example to train CollabLLM on `math-hard-large`.
- The config to sample future conversations is in `recipe/collabllm/config/collabllm_interaction_config.yaml`.
- The Multiturn-aware Reward is aggregated from these three conversational-level rewards:
```
+reward_model.reward_kwargs.metric_weights.accuracy=1 \
+reward_model.reward_kwargs.metric_weights.interactivity=1 \
+reward_model.reward_kwargs.metric_weights.token_amount=-0.0001 \
```
You can remove, add, or modify the weights depending on your task. A list of implemented metrics you can already add are under `recipe/collabllm/metrics`. For example, on `medium-large`, you can replace `accuracy` with `bleu_score` via
```
+reward_model.reward_kwargs.metric_weights.bleu_score=1
```
which will instead apply bleu score on the sampled future conversations.
## Algorithm
| Step | Name | Description |
|------|-------------------------------|-----------------------------------------------------------------------------|
| 1 | Model response generation | The model generates multiple responses for each prompt in a batch. |
| 2 | Collaborative simulation | A user simulator (e.g., GPT or Claude) samples `num_repeat_rollouts` conversations for up to `max_user_turns` additional turns. |
| 3 | Compute Multiturn-aware Reward | Customized conversational reward functions are applied to the sampled conversations. Rewards are aggregated, then averaged across rollouts. |
| 4 | Update model | The model weights are updated using the computed multiturn-aware rewards. |
---
## Configuration
The primary configuration is managed through the launch script `train_rl_collabllm.sh` and the YAML file `recipe/collabllm/config/collabllm_interaction_config.yaml`. Key configuration sections:
| Section | Key Parameters / Notes |
|----------------------|-----------------------------------------------------------------------------------------|
| `data` | Paths to training/validation files, batch sizes, sequence lengths. |
| `actor_rollout_ref` (common) | Base model path (used for actor + initial reference), FSDP settings, optimization (LR, scheduler). |
| `actor_rollout_ref` (CollabLLM-specific) | Hyperparameters under `actor_rollout_ref.rollout.multi_turn`: `max_user_turns`, `max_assistant_turns`, `num_repeat_rollouts`. |
| `interaction` | Defined in `collabllm_interaction_config.yaml`. Specifies user simulator and hyperparameters. Requires exported API keys. |
| `reward_model` | Manager set to `collabllm` by default. Modify `reward_model.reward_kwargs.metric_weights` for conversational rewards and weights. LLM Judge hyperparameters (e.g., `model`, `temperature`) go under `reward_model.reward_kwargs.llm_judge_kwargs`. |
| `algorithm` | GRPO-specific hyperparameters such as `actor_rollout_ref.rollout.n`. |
| `trainer` | Distributed training (nodes, GPUs per node), logging (WandB), checkpointing frequency. |
---
## Key Files
| File Path | Purpose |
|-----------|---------|
| `recipe/collabllm/collabllm_agent_loop.py` | Main logic to sample future conversations, using `CollabLLMInteraction` from `verl/interactions/collabllm_interaction.py`. |
| `verl/workers/reward_manager/collabllm.py` | Computes rewards for future conversations, leveraging `recipe/collabllm/reward_function.py` to apply each metric. |
---
## Acknowledgement
We sincerely thank the `verl` community and advisors for their contributions and guidance!
================================================
FILE: docs/algo/dapo.md
================================================
# Recipe: Decoupled Clip and Dynamic Sampling Policy Optimization (DAPO)
Last updated: 06/19/2025.
> Open-Source Algorithm Implementation & Experiment Running: [Yuxuan Tong](https://tongyx361.github.io/), [Guangming Sheng](https://hk.linkedin.com/in/guangming-sheng-b50640211)
🏠 [Homepage](https://dapo-sia.github.io/) | 📝 [Paper@arXiv](https://arxiv.org/abs/2503.14476) | 🤗 [Datasets&Models@HF](https://huggingface.co/collections/BytedTsinghua-SIA/dapo-67d7f1517ee33c8aed059da0) | 🐱 [Code@GitHub](https://github.com/verl-project/verl-recipe/tree/main/dapo/recipe/dapo) | 🐱 [Repo@GitHub](https://github.com/BytedTsinghua-SIA/DAPO)
> We propose the **D**ecoupled Clip and Dynamic s**A**mpling **P**olicy **O**ptimization (DAPO) algorithm. By making our work publicly available, we provide the broader research community and society with practical access to scalable reinforcement learning, enabling all to benefit from these advancements. Our system is based on the awesome [verl](https://github.com/volcengine/verl) framework. Thanks for their great work! Applying DAPO training to Qwen2.5-32B base model proves to outperform the previous state-of-the-art DeepSeek-R1-Zero-Qwen-32B on AIME 2024, achieving **50%** accuracy with **50%** less training steps.
>
> 
## Quickstart
1. Prepare the datasets **on the Ray cluster**:
```bash
bash prepare_dapo_data.sh # This downloads the datasets to ${HOME}/verl/data by default
```
2. Submit the job to the Ray cluster **from any machine**:
```bash
cd verl # Repo root
export RAY_ADDRESS="http://${RAY_IP:-localhost}:8265" # The Ray cluster address to connect to
export WORKING_DIR="${PWD}" # The local directory to package to the Ray cluster
# Set the runtime environment like env vars and pip packages for the Ray cluster in yaml
export RUNTIME_ENV="./recipe/dapo/runtime_env.yaml" # This sets environment variables for the Ray cluster
bash recipe/dapo/run_dapo_qwen2.5_32b.sh # or other scripts
```
## Reproduction Runs
| Setup | AIME 2024 Acc. | Hardware | Image | Commit | Environment Variables | Training Script | Training Record |
| -------------------------------------------- | -------------- | --------- | -------------------------------------------------------------------- | -------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------- |
| DAPO | 52% | 16x8xH800 | `hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0` | [`4f80e4`](https://github.com/volcengine/verl/tree/4f80e465c2ec79ab9c3c30ec74b9745de61d0490) | [runtime_env.yaml](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/runtime_env.yaml) | [run_dapo_qwen2.5_32b.sh](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/run_dapo_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) |
| DAPO w/o Dynamic Sampling | 50% | 16x8xH800 | `hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0` | [`4f80e4`](https://github.com/volcengine/verl/tree/4f80e465c2ec79ab9c3c30ec74b9745de61d0490) | [runtime_env.yaml](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/runtime_env.yaml) | [run_dapo_wo_ds_qwen2.5_32b.sh](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/run_dapo_wo_ds_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) |
| DAPO w/o Token-level Loss & Dynamic Sampling | 44% | 16x8xH20 | `hiyouga/verl:ngc-th2.5.1-cu120-vllm0.7.4-hotfix` | [`4f80e4`](https://github.com/volcengine/verl/tree/4f80e465c2ec79ab9c3c30ec74b9745de61d0490) | [runtime_env.yaml](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/runtime_env.yaml) | [run_dapo_early_qwen2.5_32b.sh](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/run_dapo_early_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) |
> [!IMPORTANT]
>
> **📢 Call for Contribution!**
>
> Welcome to submit your reproduction runs and setups!
## Configuration
### Separated Clip Epsilons (-> Clip-Higher)
An example configuration:
```yaml
actor_rollout_ref:
actor:
clip_ratio_low: 0.2
clip_ratio_high: 0.28
```
`clip_ratio_low` and `clip_ratio_high` specify the $\varepsilon_{\text {low }}$ and $\varepsilon_{\text {high }}$ in the DAPO objective.
Core relevant code:
```python
pg_losses1 = -advantages * ratio
pg_losses2 = -advantages * torch.clamp(ratio, 1 - cliprange_low, 1 + cliprange_high)
pg_losses = torch.maximum(pg_losses1, pg_losses2)
```
### Dynamic Sampling (with Group Filtering)
An example configuration:
```yaml
data:
gen_batch_size: 1536
train_batch_size: 512
algorithm:
filter_groups:
enable: True
metric: acc # score / seq_reward / seq_final_reward / ...
max_num_gen_batches: 10 # Non-positive values mean no upper limit
```
Setting `filter_groups.enable` to `True` will filter out groups whose outputs' `metric` are all the same, e.g., for `acc`, groups whose outputs' accuracies are all 1 or 0.
The trainer will repeat sampling with `gen_batch_size` until there are enough qualified groups for `train_batch_size` or until the upper limit specified by `max_num_gen_batches` is reached.
Core relevant code:
```python
prompt_bsz = self.config.data.train_batch_size
if num_prompt_in_batch < prompt_bsz:
print(f'{num_prompt_in_batch=} < {prompt_bsz=}')
num_gen_batches += 1
max_num_gen_batches = self.config.algorithm.filter_groups.max_num_gen_batches
if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
print(f'{num_gen_batches=} < {max_num_gen_batches=}. Keep generating...')
continue
else:
raise ValueError(
f'{num_gen_batches=} >= {max_num_gen_batches=}. Generated too many. Please check your data.'
)
else:
# Align the batch
traj_bsz = self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n
batch = batch[:traj_bsz]
```
### Flexible Loss Aggregation Mode (-> Token-level Loss)
An example configuration:
```yaml
actor_rollout_ref:
actor:
loss_agg_mode: "token-mean" # / "seq-mean-token-sum" / "seq-mean-token-mean"
# NOTE: "token-mean" is the default behavior
```
Setting `loss_agg_mode` to `token-mean` will average the (policy gradient) loss across all the tokens in all the sequences in a mini-batch.
Core relevant code:
```python
if loss_agg_mode == "token-mean":
loss = verl_F.masked_mean(loss_mat, loss_mask)
elif loss_agg_mode == "seq-mean-token-sum":
seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) # token-sum
loss = torch.mean(seq_losses) # seq-mean
elif loss_agg_mode == "seq-mean-token-mean":
seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) / torch.sum(loss_mask, dim=-1) # token-mean
loss = torch.mean(seq_losses) # seq-mean
else:
raise ValueError(f"Invalid loss_agg_mode: {loss_agg_mode}")
```
### Overlong Reward Shaping
An example configuration:
```yaml
data:
max_response_length: 20480 # 16384 + 4096
reward_model:
overlong_buffer:
enable: True
len: 4096
penalty_factor: 1.0
```
Setting `overlong_buffer.enable` to `True` will penalize the outputs whose lengths are overlong but still within the hard context limit.
Specifically, the penalty increases linearly from `0` to `overlong_buffer.penalty_factor` when the length of the output exceeds the `max_response_length - overlong_buffer.len` by `0` to `overlong_buffer.len` tokens.
Core relevant code:
```python
if self.overlong_buffer_cfg.enable:
overlong_buffer_len = self.overlong_buffer_cfg.len
expected_len = self.max_resp_len - overlong_buffer_len
exceed_len = valid_response_length - expected_len
overlong_penalty_factor = self.overlong_buffer_cfg.penalty_factor
overlong_reward = min(-exceed_len / overlong_buffer_len * overlong_penalty_factor, 0)
reward += overlong_reward
```
## FAQ
### Where is the "Overlong Filtering" in the paper?
Most experiments in the paper, including the best-performing one, are run without Overlong Filtering because it somewhat overlaps with Overlong Reward Shaping in terms of properly learning from the longest outputs. So we don't implement it here.
### What's the difference between [the `recipe/dapo` directory in the `main` branch](https://github.com/verl-project/verl-recipe/tree/main/dapo) and the [`recipe/dapo` branch](https://github.com/verl-project/verl-recipe/tree/main/dapo/recipe/dapo)?
[The `recipe/dapo` branch](https://github.com/verl-project/verl-recipe/tree/main/dapo/recipe/dapo) is for **as-is reproduction** and thus won't be updated with new features.
[The `recipe/dapo` directory in the `main` branch](https://github.com/verl-project/verl-recipe/tree/main/dapo) works as an example of how to extend the latest `verl` to implement an algorithm recipe, which will be maintained with new features.
### Why can't I produce similar results after modifications?
RL infrastructure today is still inherently not robust, and we are working hard to improve it.
We strongly recommend modifying only one thing at a time.
We also list some known problems here:
1. Enabling CUDA graph (`enforce_eager=False`) might cause model performance degradation, whose cause is still under investigation.
================================================
FILE: docs/algo/dppo.md
================================================
# Divergence Proximal Policy Optimization (DPPO)
Last updated: 02/25/2026.
## Rethinking the Trust Region in LLM Reinforcement Learning
[Paper](https://arxiv.org/pdf/2602.04879)
[GitHub](https://github.com/sail-sg/Stable-RL)
[Twitter (@QPHutu)](https://x.com/QPHutu/status/2019435642539897303)
## ✨Getting started
1. Prepare the datasets by running [prepare_dapo_data.sh](https://github.com/verl-project/verl-recipe/blob/3490a22a0a3adeb7e4787fe70b1060b642efbae4/dapo/prepare_dapo_data.sh):
```bash
bash prepare_dapo_data.sh # This downloads the datasets to ${HOME}/verl/data by default
```
2. Prepare the model:
```bash
hf download Qwen/Qwen3-30B-A3B-Base --local-dir ${HOME}/verl/models/Qwen3-30B-A3B-Base
```
3. Run the script:
```bash
# run DPPO-Binary-KL
LOSS_MODE=dppo_kl bash examples/dppo_trainer/run_qwen30b_dppo.sh
# run DPPO-Binary-TV
LOSS_MODE=dppo_tv bash examples/dppo_trainer/run_qwen30b_dppo.sh
# run GRPO baseline
LOSS_MODE=vanilla CLIP_LOW=0.2 CLIP_HIGH=0.2 bash examples/dppo_trainer/run_qwen30b_dppo.sh
# or GRPO with clip higher
LOSS_MODE=vanilla CLIP_LOW=0.2 CLIP_HIGH=0.28 bash examples/dppo_trainer/run_qwen30b_dppo.sh
```
## 📖Introduction
Comparison of **PPO** and the proposed **DPPO** (the Binary-TV variant). **(Left)** The surrogate objective and corresponding masks for PPO and DPPO. PPO (and variants like GRPO) employs a heuristic mask based on the probability ratio. In contrast, DPPO utilizes a more principled mask based on a direct approximation of policy divergence (e.g., Total Variation), ensuring updates stay within a theoretically grounded trust region. **(Right)** Experimental results on the AIME24 using Qwen3-30B-A3B-Base. DPPO significantly outperforms GRPO baselines, achieving superior training stability and final performance even without rollout routing replay (R3).
DPPO variants achieve stable training while controlling the training-inference mismatch at a low level. In contrast, methods without a trust region (PG-IS, CISPO) or with a misspecified one (MiniRL) suffer from growing mismatch and eventual collapse.
The plots show numerical differences between a training and an inference engine for Qwen3-30B-A3B-Base with identical parameters. **(Left)** The probability ratio (used in PPO) is highly volatile for low-probability tokens. **(Right)** In contrast, the TV divergence is more stable. This highlights a key flaw of PPO's clipping mechanism: it **over-penalizes low-probability tokens**, which can slow down learning; and **under-penalizes high-probability tokens**, which can permit large, destabilizing updates.
The most frequently clipped tokens (by GRPO) are important to the reasoning task!
They are dominated by:
- numbers, like 1, 4
- mathematical symbols, like +, -, =
- reasoning and structural words: Wait, Thus, Next
## Top-K divergence approximation
We only implement the DPPO-Binary-TV/DPPO-Binary-KL here due to their simplicity.
For the Top-K divergence approximation, please refer to [the original repo](https://github.com/sail-sg/Stable-RL) for a complete implementation.
## Citation
If you find our work useful for your research, please consider citing:
```bibtex
@article{qi2026dppo,
title={Rethinking the Trust Region in LLM Reinforcement Learning},
author={Qi, Penghui and Zhou, Xiangxin and Liu, Zichen and Pang, Tianyu and Du, Chao and Lin, Min and Lee, Wee Sun},
journal={arXiv preprint arXiv:2602.04879},
year={2026}
}
```
## 🌻Acknowledgement
We implement our reinforcement learning algorithm extending from [verl](https://github.com/volcengine/verl). We utilize [vLLM](https://github.com/vllm-project/vllm) and [sglang](https://github.com/sgl-project/sglang) for inference. Our models are trained primarily on [Qwen3 family](https://huggingface.co/collections/Qwen/qwen3). Our training data is built from [DAPO-MATH](https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k). Thanks for their great contributions!
================================================
FILE: docs/algo/entropy.md
================================================
# Recipe: Entropy Mechanism
Last updated: 06/27/2025.
The Entropy Mechanism of Reinforcement Learning for Large Language Model Reasoning.
[Paper](https://arxiv.org/pdf/2505.22617) [GitHub](https://github.com/PRIME-RL/Entropy-Mechanism-of-RL) [alphaXiv](https://www.alphaxiv.org/abs/2505.22617) [Twitter (@stingning)](https://x.com/stingning/status/1928088554166505667) [Twitter (@charlesfornlp)](https://x.com/charlesfornlp/status/1928089451080585283) [Twitter (@_akhaliq)](https://x.com/_akhaliq/status/1928077929105268861)
## 🎉News
- **[2025/05/29]** 🎉 Ranked **#1** of the day on [Huggingface Daily Papers](https://huggingface.co/papers?date=2025-05-29).
- **[2025/05/29]** Released our Paper on arXiv. See [here](https://arxiv.org/pdf/2505.22617). We provide insights into the entropy mechanism of RL for LLMs and propose two simple yet effective strategies to alleviate the entropy collapse.
## ✨Getting started
After preparing the training data, for training Qwen2.5-7B on a single node, taking the KL-Cov approach as an example, you can simply run:
```
cd verl
conda activate your_env
bash recipe/dapo/7b_kl_cov.sh
```
To train Qwen2.5-32B on multiple nodes, you can run the following commands:
```
cd verl
conda activate your_env
bash recipe/dapo/32b_kl_cov.sh
```
## 📖Introduction
This paper addresses the entropy collapse issue in scaling reinforcement learning (RL) for large language models (LLMs), where policy entropy drops sharply during training, leading to overconfidence and performance saturation. We empirically establish a relationship between entropy ($H$) and performance ($R$): $R = -a\exp(H) + b$, showing performance is bottlenecked by entropy exhaustion.
Theoretically, we find entropy changes are driven by the covariance between action probability and logit updates, which correlates with advantage in Policy Gradient methods. High-probability, high-advantage actions reduce entropy, while rare, high-advantage actions increase it. Empirically, the covariance term remains positive, explaining entropy’s monotonic decline. To mitigate this, we propose Clip-Cov and KL-Cov, which restrict updates for high-covariance tokens. These methods effectively prevent entropy collapse, and improve performance.
## 📃Evaluation
Our method is able to maintain a considerably higher level of entropy throughout training. For example, when the baseline's entropy reaches a plateau and can no longer be consumed, the KL-Cov method still sustains an entropy level over 10 times higher. Meanwhile, the response length of the policy model steadily increases, and its performance on the test set consistently surpasses that of the baseline. This indicates that our model is able to explore more freely during training, learning better policy through RL.
| **Method** | **AIME24** | **AIME25** | **AMC** | **MATH-500** | **OMNI-MATH** | **OlympiadBench** | **Minerva** | **Avg.** |
| ----------------- | ---------: | ---------: | -------: | -----------: | ------------: | ----------------: | ----------: | -------: |
| *Qwen2.5-7B* | | | | | | | | |
| GRPO | 21.2 | 9.6 | 58.7 | 78.8 | 27.9 | 40.7 | 36.7 | 38.6 |
| w. Clip-higher | 18.1 | 11.5 | 56.6 | 79.2 | 29.8 | 43.3 | 40.4 | 38.8 |
| w. **`CLIP-Cov`** | 22.1 | **15.8** | 58.2 | 80.4 | **30.5** | **44.1** | **41.1** | 40.4 |
| w. **`KL-Cov`** | **22.6** | 12.9 | **61.4** | **80.8** | 29.1 | 42.6 | 38.2 | **40.6** |
| *Qwen2.5-32B* | | | | | | | | |
| GRPO | 21.8 | 16.2 | 69.7 | 84.2 | 35.2 | 43.6 | 45.5 | 45.8 |
| w. Clip-higher | 35.6 | 22.3 | 69.5 | 77.2 | 35.1 | 42.5 | 43.0 | 47.2 |
| w. **`CLIP-Cov`** | 32.3 | 22.7 | 67.2 | **87.0** | **42.0** | **57.2** | 46.0 | 50.3 |
| w. **`KL-Cov`** | **36.8** | **30.8** | **74.5** | 84.6 | 39.1 | 49.0 | **46.3** | **52.2** |
Our two approaches both achieve non-trivial improvements across all benchmarks. Compared to GRPO, our method outperforms it by 2.0% on average for the 7B model and by 6.4% for the 32B model. Moreover, we observe that our method yields more substantial gains on the larger Qwen2.5-32B. Specifically, our method achieves improvements of 15.0% and 14.6% compared to GRPO on the most challenging benchmarks, AIME24 and AIME25, respectively.
## 🎈Citation
If you find this paper or repo helpful, please cite us.
```bibtex
@article{cui2025entropy,
title={The Entropy Mechanism of Reinforcement Learning for Reasoning Language Models},
author={Cui, Ganqu and Zhang, Yuchen and Chen, Jiacheng and Yuan, Lifan and Wang, Zhi and Zuo, Yuxin and Li, Haozhan and Fan, Yuchen and Chen, Huayu and Chen, Weize and others},
journal={arXiv preprint arXiv:2505.22617},
year={2025}
}
```
## 🌻Acknowledgement
We implement our reinforcement learning algorithm extending from [verl](https://github.com/volcengine/verl). We utilize [vLLM](https://github.com/vllm-project/vllm) for inference. Our models are trained primarily on [Qwen2.5 family](https://github.com/QwenLM/Qwen2.5). Our training data is built from [DAPO-MATH](https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k). Thanks for their great contributions!
## 📬 Contact
For questions, discussion, or collaboration opportunities, feel free to contact:
- Ganqu Cui: cuiganqu@pjlab.org.cn
- Yuchen Zhang: yuchen.zhang2003@gmail.com
- Jiacheng Chen: jackchan9345@gmail.com
- Ning Ding: ningding.cs@gmail.com
================================================
FILE: docs/algo/gpg.md
================================================
# GPG: Group Policy Gradient
Last updated: 07/03/2025.
Group Policy Gradient (GPG) is a minimalist reinforcement learning (RL) method that enhances the reasoning ability of large language models without relying on supervised fine-tuning or complex tricks. GPG revisits traditional policy gradients and directly optimizes the RL objective—no surrogate losses, no KL penalties, no critic, and no reference model. Compared to GRPO, GPG is simpler, more efficient, and achieves better results on many tasks. For more details, please refer to the original paper [GPG: A Simple and Strong Reinforcement Learning Baseline for Model Reasoning](https://arxiv.org/abs/2504.02546).
## Key Components
- Use a corrected advantage function to improve policy gradient accuracy and training efficiency.
- Eliminates the critic and reference models and avoids KL divergence constraints, which significantly simplifies the training process compared to Group Relative Policy Optimization (GRPO).
## Configuration
To configure GPG within the framework, use the following YAML settings.
```yaml
algorithm:
adv_estimator: gpg
actor_rollout_ref:
actor:
policy_loss:
loss_mode: "gpg"
```
## Advanced Extensions
GPG is a simple and strong baseline for model reasoning. Although it avoids using KL loss in its original form, you can still use KL loss to further improve the performance.
```yaml
algorithm:
adv_estimator: gpg
actor_rollout_ref:
actor:
use_kl_loss: True # enable kl regularization
kl_loss_coef: 0.01
policy_loss:
loss_mode: "gpg"
```
================================================
FILE: docs/algo/grpo.md
================================================
# Group Relative Policy Optimization (GRPO)
Last updated: 05/31/2025.
In reinforcement learning, classic algorithms like PPO rely on a "critic" model to estimate the value of actions, guiding the learning process. However, training this critic model can be resource-intensive.
GRPO simplifies this process by eliminating the need for a separate critic model. Instead, it operates as follows:
- Group Sampling: For a given problem, the model generates multiple possible solutions, forming a "group" of outputs.
- Reward Assignment: Each solution is evaluated and assigned a reward based on its correctness or quality.
- Baseline Calculation: The average reward of the group serves as a baseline.
- Policy Update: The model updates its parameters by comparing each solution's reward to the group baseline, reinforcing better-than-average solutions and discouraging worse-than-average ones.
This approach reduces computational overhead by avoiding the training of a separate value estimation model, making the learning process more efficient. For more details, refer to the original paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://arxiv.org/pdf/2402.03300)
## Key Components
- No Value Function (Critic-less): unlike PPO, GRPO does not train a separate value network (critic)
- Group Sampling (Grouped Rollouts): instead of evaluating one rollout per input, GRPO generates multiple completions (responses) from the current policy for each prompt. This set of completions is referred to as a group.
- Relative Rewards: within each group, completions are scored (e.g., based on correctness), and rewards are normalized relative to the group.
## Configuration
Note that all configs containing `micro_batch_size` are used to configure the maximum sample or token count per forward or backward pass to avoid GPU OOMs, whose value should not change algorithmic/convergence behavior.
Despite that many configurations start with the `ppo_` prefix, they work across different RL algorithms in verl, as the GRPO training loop is similar to that of PPO (without critic).

- `actor_rollout_ref.rollout.n`: For each prompt, sample n times. Defaults to 1. For GRPO, please set it to a value larger than 1 for group sampling.
- `data.train_batch_size`: The global batch size of prompts used to generate a set of sampled trajectories/rollouts. The number of responses/trajectories is `data.train_batch_size * actor_rollout_ref.rollout.n`
- `actor_rollout_ref.actor.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO actor updates. The ppo_mini_batch_size is a global size across all workers.
- `actor_rollout_ref.actor.ppo_epochs`: Number of epochs for GRPO updates on one set of sampled trajectories for actor
- `actor_rollout_ref.actor.clip_ratio`: The GRPO clip range. Defaults to 0.2
- `algorithm.adv_estimator`: Default is gae. Please set it to grpo instead
- `actor_rollout_ref.actor.loss_agg_mode`: Default is "token-mean". Options include "token-mean", "seq-mean-token-sum", "seq-mean-token-mean". The original GRPO paper takes the sample-level loss (seq-mean-token-mean), which may be unstable in long-CoT scenarios. All GRPO example scripts provided in verl use the default configuration "token-mean" for loss aggregation instead.
Instead of adding KL penalty in the reward, GRPO regularizes by directly adding the KL divergence between the trained policy and the reference policy to the loss:
- `actor_rollout_ref.actor.use_kl_loss`: To use kl loss in the actor. When used, we are not applying KL in the reward function. Default is False. Please set it to True for GRPO.
- `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss. Default is 0.001.
- `actor_rollout_ref.actor.kl_loss_type`: How to calculate the KL divergence between the actor and the reference policy. Supports kl(k1), abs, mse(k2), low_var_kl(k3) and full. Appending "+" at the end (e.g., 'k1+' and 'k3+') applies a straight-through estimator that employs k2 for unbiased gradient estimation, regardless of the KL value estimation (see https://github.com/volcengine/verl/pull/2953#issuecomment-3162113848 for more details). See this blog post for detailed analysis: http://joschu.net/blog/kl-approx.html
## Advanced Extensions
### DrGRPO
[Understanding R1-Zero-Like Training: A Critical Perspective](https://arxiv.org/pdf/2503.20783) claims there's optimization bias in GRPO, which leads to artificially longer responses, especially for incorrect outputs. This inefficiency stems from the way GRPO calculates advantages using group-based reward normalization. Instead, DrGRPO aggregates token-level losses by normalizing with a global constant to eliminate length bias.
Configure the following to enable DrGRPO, with all other parameters the same as GRPO's:
- `actor_rollout_ref.actor.loss_agg_mode`: "seq-mean-token-sum-norm", which turns off seq-dim averaging
- `actor_rollout_ref.actor.loss_scale_factor`: (Optional) Set to a constant integer (e.g., max response length) to ensure consistent normalization throughout training. If not set, uses the current batch's response length.
- `actor_rollout_ref.actor.use_kl_loss`: Please set it to False for DrGRPO
- `algorithm.norm_adv_by_std_in_grpo`: False, which turns off standard deviation norm
## Reference Example
Qwen2.5 GRPO training log and commands: [link](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/qwen2-7b-fsdp2.log)
```bash
bash examples/grpo_trainer/run_qwen3-8b.sh
```
For more reference performance, please see https://verl.readthedocs.io/en/latest/algo/baseline.html
================================================
FILE: docs/algo/opo.md
================================================
# On-Policy RL with Optimal Reward Baseline (OPO)
Last updated: 06/02/2025.
Loose on-policy constraints and suboptimal baselines in reinforcement learning often lead to training instability such as large policy shifts and entropy collapse. OPO addresses these challenges by using exact on-policy training with the theoretically optimal reward baseline for advantage estimation. It achieves lower policy shifts and higher output entropy, encouraging more diverse and less repetitive responses.
OPO uses group sampling to generate multiple outputs for each input like GRPO. Unlike group-based algorithms which typically use the mean reward of a group as its baseline, OPO employs a theoretically optimal baseline: the length-weighted reward of the group. It also omits the standard deviation normalization. By adopting these two key components, OPO enables the training of a single policy model with the objective of maximizing only the expected reward. For more details, refer to the original paper [On-Policy RL with Optimal Reward Baseline](https://arxiv.org/pdf/2505.23585).
## Key Components
- Exact On-Policy Training: always generates responses from the current policy, without using any pre-generated data or off-policy data.
- Optimal Reward Baseline: uses a length-weighted reward of the group as the baseline for normalizing the rewards.
## Configuration
To configure OPO within the framework, use the following YAML settings. These parameters are crucial for enabling exact on-policy training and activating the optimal reward baseline.
```yaml
algorithm:
adv_estimator: opo # Use OPO for optimal reward baseline
data:
train_batch_size: 1024
actor_rollout_ref:
actor:
ppo_mini_batch_size: 1024 # ppo_mini_batch_size should equal to train_batch_size to enable exact on-policy training
entropy_coeff: 0 # disable entropy regularization
use_kl_loss: False # disable kl regularization
kl_loss_coef: 0
```
## Advanced Extensions
OPO can also be extended to other algorithms like RLOO and Reinforce++. You only need to adjust their configurations to enable exact on-policy training and incorporate the optimal length-weighted reward baseline, with minimal modifications to their advantage estimation functions.
================================================
FILE: docs/algo/otb.md
================================================
# Optimal Token Baseline (OTB)
Last updated: 02/23/2026.
📝 [ArXiv](https://www.arxiv.org/abs/2602.07078) | 📒 [Blog](https://richardli.xyz/optimal-token-baseline) | 🤗 [Datasets](https://huggingface.co/datasets/Jiawei415/DPAO_filter)
Optimal Token Baseline (OTB) is a dynamic token-level baseline for gradient variance reduction in policy-gradient reinforcement learning. It weights updates with the "Realized Energy" statistic that tracks how much uncertainty has accumulated up to each token, so noisy regions get downweighted while confident regions carry more weight.
## Key properties
- _Token-level baselines:_ OTB adapts per token by tracking realized energy, avoiding the padding artifacts that appear when group means dilute the signal with `EOS` tokens.
- _Forward-only overhead:_ The realized-energy statistic is computed via the **Logit-Gradient Proxy**, so OTB requires no extra backward passes or gradient-norm kernels.
## Logit-Gradient Proxy
Computing true uncertainty per token would normally mandate per-token backward passes. OTB sidesteps this by estimating realized energy entirely from forward probabilities, so it introduces negligible runtime overhead in practice.
## Mechanics at a glance
For each prompt group of size `N`, OTB computes rewards-to-go `G_t` and cumulative variance weights `W_t`. The optimal baseline per token is
```
B*_t = (Σ_i G_t^{(i)} · W_t^{(i)}) / (Σ_i W_t^{(i)} + ε),
W_t = Σ_{j=1}^t (1 - 2π_j + Σπ_j²),
Σπ_j² = exp(logsumexp(2·logits_j) - 2·logsumexp(logits_j)).
```
The final advantage is `(G_t - B*_t) · mask_t`, so padding tokens stay at zero.
## Integration in VERL
- `AdvantageEstimator.OPTIMAL_TOKEN_BASELINE` registers `compute_optimal_token_baseline_advantage`, invoked whenever `algorithm.adv_estimator` is set to `optimal_token_baseline`.
- `ActorRolloutRefWorker.compute_log_prob` emits an additional tensor `sum_pi_squared` (Σπ² per token) when `actor.calculate_sum_pi_squared=True`. This requires disabling fused log-prob kernels, because they do not surface logits.
- Trainers assert `sum_pi_squared` exists, regroup trajectories by `non_tensor_batch["uid"]`, and run the OTB calculation. If rollout IS is active, they rescale the weights by `rollout_is_weights**2` before aggregating.
- In Ulysses sequence-parallel setups, the actor gathers, unpads, and returns Σπ² in the same way it handles log-probabilities, so OTB supports sharded sequence-parallel models out of the box.
- `sum_pi_squared_checkpointing` is available to trade compute for memory when Σπ² tensors become large (e.g., lengthy chain-of-thought reasoning).
## Configuration checklist
- `actor_rollout_ref.actor.calculate_sum_pi_squared: true` (mandatory).
- `actor_rollout_ref.model.use_fused_kernels: false` (required until fused kernels emit logits).
- `algorithm.adv_estimator: optimal_token_baseline` for single-turn RL and `tir_optimal_token_baseline` for multi-turn RL.
- Group sampling (`actor_rollout_ref.rollout.n > 1`) to unlock OTB’s variance reduction; with `n=1` the baseline collapses to returns.
Example OmegaConf overlay:
```yaml
algorithm:
adv_estimator: optimal_token_baseline
actor_rollout_ref:
actor:
calculate_sum_pi_squared: true
sum_pi_squared_checkpointing: false # optional memory saver
rollout:
n: 8
```
## Example script
See `examples/otb_trainer/run_qwen2_5-7b.sh` for a reference training loop.
## Gradient Variance Proxy Metrics
All gradient-variance analysis in the Optimal Token Baseline work starts from the variance identity
```
Var(ĝ) = E[||ĝ||²] - ||E[ĝ]||²,
```
which states that the variance of any stochastic gradient equals the mean squared magnitude minus the squared norm of its expectation.
For a trajectory `τ`, the policy-gradient estimator is
```
ĝ(τ) = ∇ log π_θ(τ) · A(τ), A(τ) = R(τ) - B.
```
The logit-gradient proxy approximates the squared gradient norm without an extra backward pass:
```
||ĝ(τ)||² ≈ Ŵ(τ) · A(τ)²,
```
where `Ŵ(τ)` is the realized energy accumulated along the trajectory. Given a mini-batch `{τ_i}` of size `N`, we decompose its statistics into three diagnostics:
- **Signal strength (squared norm of the mean gradient)**
```
S = || (1/N) · Σ ĝ(τ_i) ||²
```
- **Total power (signal + noise)**
```
P_total = (1/N) · Σ Ŵ(τ_i) · A(τ_i)²
```
- **Pure noise (estimated variance of the batch mean)**
```
Var_proxy = (1/(N-1)) · (P_total - S)
```
`verl/trainer/ppo/metric_utils.py#L306` implements these diagnostics via `compute_variance_proxy_metrics`, emitting `variance_proxy/proxy1_signal_strength`, `variance_proxy/proxy2_total_power`, and `variance_proxy/proxy3_pure_noise`.
Tracking these metrics provides a forward-only, low-overhead view of gradient health for any advantage estimator that supplies `sum_pi_squared`.
================================================
FILE: docs/algo/ppo.md
================================================
# Proximal Policy Optimization (PPO)
Last updated: 06/19/2025.
Proximal Policy Optimization (PPO) is a family of policy gradient methods for reinforcement learning, proposed by OpenAI in 2017. PPO strikes a balance between simplicity, stability, and performance, making it one of the most widely used algorithms in modern RL applications, including large-scale language model fine-tuning.
Traditional policy gradient methods like REINFORCE or Vanilla Policy Gradient suffer from:
- High variance and sample inefficiency.
- Instability due to large policy updates.
PPO addresses these problems using a clipped surrogate objective that avoids overly large updates without requiring second-order derivatives.
For more technical details regarding PPO, we suggest reading the introduction in the [OpenAI spinning up tutorial](https://spinningup.openai.com/en/latest/algorithms/ppo.html), and the paper [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347).
## Key Components
- Actor-Critic Architecture: PPO requires both an actor model (policy) and a critic model (value function). This differs from other algorithms like GRPO and RLOO that don't require a critic model.
- Generalized Advantage Estimation (GAE): PPO uses GAE for computing advantage values, which helps reduce variance in policy gradient estimates while maintaining low bias.
- Clipped Surrogate Objective: The core of PPO is implemented through the clipped surrogate objective function that limits policy updates.
## Configuration
Note that all configs containing `micro_batch_size` are used to configure the maximum sample or token count per forward or backward pass to avoid GPU OOMs, whose value should not change algorithmic/convergence behavior.
Most critic configs are similar to those of actors. Note that the critic model is omitted from the figure below.

- `data.train_batch_size`: The global batch size of prompts used to generate a set of sampled trajectories/rollouts. The number of responses/trajectories is `data.train_batch_size * actor_rollout_ref.rollout.n`
- `actor_rollout_ref.actor.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO actor updates. The ppo_mini_batch_size is a global size across all workers
- `critic.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO critic updates. The ppo_mini_batch_size is a global size across all workers
- `actor_rollout_ref.actor.clip_ratio`: The PPO clip range. Default to 0.2
- `actor_rollout_ref.actor.ppo_epochs`: Number of epochs for PPO updates on one set of sampled trajectories for actor
- `critic.ppo_epochs`: Number of epochs for PPO updates on one set of sampled trajectories for critic. Defaults to `actor_rollout_ref.actor.ppo_epochs`
- `algorithm.gamma`: The discount factor
- `algorithm.lam`: The lambda term that trades off between bias and variance in the GAE estimator
- `algorithm.adv_estimator`: Support gae, grpo, reinforce_plus_plus, reinforce_plus_plus_baseline, rloo
## Advanced Extensions
### KL Divergence Control
Options to prevent the policy from diverging too far from a reference policy. Two mechanisms are available: KL reward penalty and KL loss. For more technical details, see [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
Options to use KL loss for KL divergence control:
- `actor_rollout_ref.actor.use_kl_loss`: to use kl loss in the actor. When used, we are not applying KL in the reward function. Default is False
- `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss. Default is 0.001.
- `actor_rollout_ref.actor.kl_loss_type`: How to calculate the KL divergence between the actor and the reference policy. Supports kl(k1), abs, mse(k2), low_var_kl(k3) and full. Appending "+" at the end (e.g., 'k1+' and 'k3+') applies a straight-through estimator that uses k2 for unbiased gradient estimation, regardless of which estimator is used for the KL value (see https://github.com/volcengine/verl/pull/2953#issuecomment-3162113848 for more details). See this blog post for a detailed analysis: http://joschu.net/blog/kl-approx.html
Options to use KL penalty in the reward:
- `algorithm.use_kl_in_reward`: Whether to enable in-reward kl penalty. Default is False.
- `algorithm.kl_penalty`: Support kl(k1), abs, mse(k2), low_var_kl(k3) and full. This defines the way to calculate the kl divergence between actor and reference policy. For specific options, refer to `kl_penalty` in core_algos.py. See this blog post for detailed analysis: http://joschu.net/blog/kl-approx.html
- `algorithm.kl_ctrl.kl_coef`: The (initial) coefficient of in-reward kl_penalty. Default is 0.001.
- `algorithm.kl_ctrl.type`: 'fixed' for FixedKLController and 'adaptive' for AdaptiveKLController.
- `algorithm.kl_ctrl.horizon`: See source code of AdaptiveKLController for details.
- `algorithm.kl_ctrl.target_kl`: See source code of AdaptiveKLController for details.
### Dual-clip PPO
Dual-Clip PPO adds a lower bound to the clipped objective when the advantage is less than zero, so that the objective, when the advantage is multiplied by a large ratio, does not fall below a specified lower bound.

- `actor_rollout_ref.actor.clip_ratio_c`: lower bound of the value for Dual-clip PPO, defaults to 3.0
## Reference Example
Qwen2.5 training log and commands: [link](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log)
```bash
bash run_gemma.sh \
trainer.n_gpus_per_node=1 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
trainer.logger=console \
critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \
actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
data.train_batch_size=256 \
actor_rollout_ref.actor.ppo_mini_batch_size=64 \
actor_rollout_ref.actor.ppo_micro_batch_size=2 \
critic.ppo_micro_batch_size=2
```
Reference performance with verl v0.2:
| Model | Method | Score | Link |
|-------------------------------|------------------|-------|------------------------------------------------------------------------------------------------|
| Qwen/Qwen2.5-0.5B-Instruct | pretrained model | 36.4 | [Qwen Blog](https://qwenlm.github.io/blog/qwen2.5-llm/) |
| Qwen/Qwen2.5-0.5B-Instruct | PPO | 56.7 | [PPO Command and Logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log) |
================================================
FILE: docs/algo/rollout_corr.md
================================================
# Rollout Correction
**Author:** [Yingru Li](https://richardli.xyz/)
Last updated: 10/30/2025.
---
> **📖 Documentation Structure**
>
> - **This document** - Practical usage guide: configurations, presets, troubleshooting
> - **[Mathematical Formulations](rollout_corr_math.md)** - Theoretical foundations, derivations, and algorithmic details
>
> Start here for implementation, refer to the math doc for theory and design rationale.
---
This document provides a comprehensive overview of the Rollout Correction implementation in verl.
**Note on Naming**: This feature is called "Rollout Correction" to reflect the complete functionality: importance sampling (IS) weights and rejection sampling (RS). The internal variable `rollout_is_weights` retains its name as it specifically refers to the IS weights component.
### BibTeX Citation
```bibtex
@online{liu-li-2025-rl-collapse,
title = {When Speed Kills Stability: Demystifying {RL} Collapse from the Training-Inference Mismatch},
author = {Liu, Jiacai and Li, Yingru and Fu, Yuqian and Wang, Jiawei and Liu, Qian and Shen, Yu},
year = {2025},
month = sep,
url = {https://richardli.xyz/rl-collapse}
}
@article{li2025trust,
title={Trust Region Masking for Long-Horizon LLM Reinforcement Learning},
author={Li, Yingru and Liu, Jiacai and Xu, Jiawei and Tong, Yuxuan and Li, Ziniu and Liu, Qian and Wang, Baoxiang},
journal={arXiv preprint arXiv:2512.23075},
year={2025}
}
```
### Blog Series
- Main blog post: https://richardli.xyz/rl-collapse
- [Part 1: Why Mismatch Breaks LLM-RL](https://richardli.xyz/rl-collapse-1) (analytical framework using TV distance for bias and χ²-divergence for variance)
- [Part 2: The Gradient Estimator Trials](https://richardli.xyz/rl-collapse-2) (token-level vs sequence-level correction bias-variance tradeoff)
- [Part 3: When Math Meets Reality—Toxic Tails and Length Traps](https://richardli.xyz/rl-collapse-3) (why rejection over clipping, and geometric-level RS)
- Latest Paper: https://arxiv.org/abs/2512.23075
## Overview
Rollout Correction provides a unified framework to handle **general off-policy problems** in RL training. Any scenario where the data collection distribution differs from the training distribution can benefit from these methods.
**Common off-policy scenarios:**
1. **Policy Mismatch** (Implementation Differences)
- Different precision: FP8 vs FP16 vs BF16 vs FP32
- Different backends: vLLM vs SGLang vs FSDP vs Megatron
- Different implementations even with identical weights
2. **Temporal Lag** (Model Staleness)
- Rollout uses older checkpoint while training has progressed
- Asynchronous rollout workers with stale parameters
- Common in distributed/async RL systems
3. **Replay Buffers**
- Training on historical trajectories from earlier iterations
- Experience replay from different policy versions
- Data augmentation or resampling strategies
4. **Off-Policy Algorithms**
- Behavioral cloning from expert demonstrations
- DAPO (data from auxiliary policies)
- Any algorithm using trajectories from a different policy
5. **Data Quality Filtering**
- Reweighting or filtering collected data
- Preference learning with modified distributions
- Curriculum learning with distribution shifts
These off-policy gaps can cause training instability and policy collapse. Rollout Correction uses importance sampling (IS) weights and rejection sampling (RS) to correct for any distribution shift between data collection and training.
**Important Note on Common Implementation Mistakes:**
Many LLM-RL implementations incorrectly apply PPO by **ignoring the actual rollout policy** π_rollout and assuming the training reference policy π_old is the behavior policy. This is mathematically incorrect when π_rollout ≠ π_old (which is typical in LLM-RL due to precision/backend differences between rollout and training).
**This is not PPO's fault** - PPO itself is mathematically correct. The issue is the incorrect assumption that π_old = π_rollout in naive implementations.
This critical implementation mistake that leads to RL training collapse was identified in the blog post ["When Speed Kills Stability: Demystifying RL Collapse from the Training-Inference Mismatch"](https://richardli.xyz/rl-collapse) and motivated the development of this rollout correction framework.
**Mathematically correct approaches:**
- **Decoupled mode**: Three policies (π_rollout, π_old, π_θ) with IS correction from π_rollout to π_old
- **Bypass mode**: Two policies (π_rollout = π_old, π_θ) using actual rollout policy as PPO anchor
- **Bypass + Policy Gradient mode**: Two policies (π_rollout, π_θ) with IS/RS correction and no PPO clipping
See [Mathematical Formulations](rollout_corr_math.md#37-common-implementation-mistake) for detailed explanation.
### Key Design Principle: Separation of IS Weights and Rejection Sampling
The implementation cleanly separates two orthogonal mechanisms:
1. **IS Weights** (`rollout_is_weights`): Continuous reweighting for gradient correction
- Policy ratio: π_old/π_rollout (decoupled) or π_θ/π_rollout (bypass)
- **Safety-bounded**: Clamped to [exp(-20), exp(20)] ≈ [2e-9, 5e8] to prevent overflow
- Token level: Bounds per-token ratios
- Sequence level: Bounds product of ratios (broadcast to all tokens)
- **Truncated**: Upper clamped via `.clamp(max=rollout_is_threshold)` (TIS: Truncated Importance Sampling)
- **Zeroed at padding**: Multiplied by response_mask to zero out padding positions
- Used to weight policy gradients (variance reduction)
2. **Rejection Sampling** (`modified_response_mask`): Binary filtering for outlier exclusion
- Creates binary mask: 1 = keep, 0 = reject
- Rejects tokens/sequences with IS ratios outside [lower_threshold, upper_threshold]
- Modifies response_mask to exclude rejected samples from training
This separation ensures:
- ✅ IS weights provide continuous reweighting (reduce variance)
- ✅ Rejection sampling provides hard filtering (remove extreme outliers)
- ✅ Both mechanisms can be enabled independently or together
- ✅ Safety bounds prevent numerical overflow in all cases
## Quick Start: Using Verified Presets
**NEW**: We now provide typed configuration with verified presets for common scenarios. These presets have been validated with tens of thousands of GPU hours across various models and training scenarios.
### Python API
```python
from verl.trainer.config.algorithm import RolloutCorrectionConfig
# === Decoupled PPO mode (3 policies: π_rollout, π_old, π_θ) ===
# IS weights correct for gap between π_old and π_rollout
config = RolloutCorrectionConfig.decoupled_token_is() # Token-TIS
config = RolloutCorrectionConfig.decoupled_seq_is() # Seq-TIS
config = RolloutCorrectionConfig.decoupled_seq_is_rs() # Seq-MIS
config = RolloutCorrectionConfig.decoupled_geo_rs() # Geo-RS (ratio mode)
config = RolloutCorrectionConfig.decoupled_geo_rs_token_tis() # Geo-RS + Token-TIS
# === K3 KL Estimator presets (more stable for small KL) ===
config = RolloutCorrectionConfig.decoupled_k3_rs() # K3-RS only
config = RolloutCorrectionConfig.decoupled_k3_rs_token_tis() # K3-RS + Token-TIS
# === Bypass PPO mode (2 policies: π_rollout = π_old, π_θ) - fast ===
# PPO ratio handles IS, so no explicit IS weights needed
config = RolloutCorrectionConfig.bypass_ppo_clip() # PPO-clip only
config = RolloutCorrectionConfig.bypass_ppo_clip_geo_rs() # PPO-clip + Geo-RS
config = RolloutCorrectionConfig.bypass_ppo_clip_k3_rs() # PPO-clip + K3-RS
# === Bypass PG mode (2 policies, no PPO clipping) - fast ===
# IS weights computed on-the-fly as π_θ / π_rollout
config = RolloutCorrectionConfig.bypass_pg_is() # Seq-TIS + PG
config = RolloutCorrectionConfig.bypass_pg_geo_rs() # Geo-RS + PG
config = RolloutCorrectionConfig.bypass_pg_geo_rs_token_tis() # Geo-RS + Token-TIS + PG
# === Other ===
config = RolloutCorrectionConfig.disabled() # Metrics only (no correction)
```
### YAML Configuration (Advanced)
For advanced customization or YAML-based configs:
```yaml
algorithm:
rollout_correction:
rollout_is: token # IS weights: "token", "sequence", or null
rollout_is_threshold: 2.0 # Upper threshold for IS weights
rollout_is_batch_normalize: false # Batch normalize IS weights to mean=1.0
rollout_rs: null # Rejection sampling: comma-separated canonical options (e.g. "token_k1,seq_max_k2")
rollout_rs_threshold: null # Threshold spec: float(s) or "lower_upper" string(s)
bypass_mode: false # Skip old_log_prob computation (sets π_old = π_rollout)
loss_type: ppo_clip # Loss type in bypass mode: "ppo_clip" (default) or "reinforce"
# REQUIRED: Enable log prob calculation
actor_rollout_ref:
rollout:
calculate_log_probs: true
```
## Files
### **Core Implementation**
- `verl/trainer/ppo/rollout_corr_helper.py` - Contains `compute_rollout_correction_and_rejection_mask()` and `compute_offpolicy_metrics()`
- `verl/trainer/ppo/core_algos.py` - Rollout Correction integration with PPO and REINFORCE modes (`compute_policy_loss_bypass_mode()`, `compute_policy_loss_reinforce()`)
- `verl/trainer/ppo/ray_trainer.py` - Bypass mode implementation (skips `old_log_prob` computation)
- `verl/workers/actor/dp_actor.py` - Mode selection logic and metrics collection
### **Configuration Files**
- `verl/trainer/config/algorithm.py` - Rollout Correction parameters in `RolloutCorrectionConfig`
- `verl/workers/config/actor.py` - Rollout Correction parameters in `PolicyLossConfig`
- `verl/trainer/config/actor/actor.yaml` - Rollout Correction configuration section
- `verl/trainer/config/ppo_trainer.yaml` - Algorithm config with Rollout Correction
### **Documentation**
- `docs/examples/config.rst` - Configuration parameter descriptions
### **Example Scripts**
- `recipe/dapo/run_dapo_qwen2.5_32b_rollout_corr.sh` - DAPO example with Rollout Correction
- `examples/rollout_correction/run_with_rollout_corr.sh` - Basic example
- `examples/rollout_correction/run_with_rollout_corr_multi_rs.sh` - Multi-RS example
### **Tests**
- `tests/trainer/ppo/test_rollout_corr.py` - Unit tests for IS/RS mechanisms
- `tests/trainer/ppo/test_rollout_corr_integration.py` - Integration tests
## Configuration Parameters
All parameters are under `algorithm.rollout_correction`:
### `rollout_is` (str or null)
Importance sampling weights aggregation level:
- `null` = No IS weights computed (metrics-only mode)
- `"token"`: Per-token IS weights
- **Decoupled mode**: ρ_t = π_old(t)/π_rollout(t)
- **Bypass/Pure IS mode**: ρ_t = π_θ(t)/π_rollout(t)
- Independent truncation per token
- Typical threshold: 1.5 - 5.0
- `"sequence"`: Per-sequence weight ρ_seq = ∏_t ρ_t
- Multiplicative aggregation across sequence
- Typical threshold: 2.0 - 10.0
All IS weights are safety-bounded to [exp(-20), exp(20)] ≈ [2e-9, 5e8]
### `rollout_is_threshold` (float)
Upper threshold for IS weight truncation. Default: `2.0`
- Truncates IS weights via `.clamp(max=rollout_is_threshold)` (TIS: Truncated Importance Sampling)
- Applied to IS weights for variance reduction
- Separate from rejection sampling (controlled by `rollout_rs` parameters)
### `rollout_is_batch_normalize` (bool)
Apply batch normalization to IS weights. Default: `False`
- `True`: Normalize IS weights to have mean=1.0 within each batch
- **Token-level IS**: Normalizes over all token weights
- **Sequence-level IS**: Normalizes over sequence means (one weight per sequence)
- `False`: Use raw (truncated) IS weights
- Reduces variance by ensuring average weight is 1.0 per batch
- Applied AFTER truncation to preserve truncation semantics
- Only affects IS weight values, not rejection sampling
### `rollout_rs` (str or null)
Rejection sampling aggregation modes. Supply a comma-separated string (spaces optional) using the canonical options implemented in `rollout_corr_helper`:
- `token_k1`: Token-level rejection with `-log r` bounds (ratio thresholds supplied as `lower_upper`). Example: `"0.6_1.4"`
- `token_k2`: Token-level rejection with `0.5 * (log r)^2` (upper bound only)
- `token_k3`: Token-level rejection with `exp(log r) - 1 - log r` (upper bound only)
- `seq_sum_k1`: Sequence-level rejection with sum of `-log r` (ratio bounds)
- `seq_sum_k2`: Sequence-level rejection with sum of `0.5 * (log r)^2` (upper bound only)
- `seq_sum_k3`: Sequence-level rejection with sum of `exp(log r) - 1 - log r` (upper bound only)
- `seq_mean_k1`: Sequence-level rejection with mean of `-log r` (ratio bounds)
- `seq_mean_k2`: Sequence-level rejection with mean of `0.5 * (log r)^2` (upper bound only)
- `seq_mean_k3`: Sequence-level rejection with mean of `exp(log r) - 1 - log r` (upper bound only)
- `seq_max_k2`: Sequence-level rejection with max of `0.5 * (log r)^2` (upper bound only)
- `seq_max_k3`: Sequence-level rejection with max of `exp(log r) - 1 - log r` (upper bound only)
### `rollout_rs_threshold` (str, float, or null)
Threshold specification for rejection sampling.
- Provide **one entry per option**, separated by commas. A single entry is broadcast to every option.
- **K1 KL modes (`*k1`)**: Use `"lower_upper"` strings (e.g. `"0.7_1.3"`). Supplying a float implies only the upper bound; the lower bound defaults to its reciprocal.
- **K2/K3 KL modes (`*k2`/`*k3`)**: Supply positive upper bounds (float or numeric string).
- Set to `null` to disable thresholds entirely (only valid when `rollout_rs` is null).
## Understanding the Framework: Components and Combinations
The rollout correction framework is built from **orthogonal components** that can be combined flexibly. Understanding these components helps you choose the right configuration for your scenario.
### Key Components
1. **Operating Mode** (Section: [Operation Modes](#operation-modes))
- **Decoupled**: Three policies (π_rollout, π_old, π_θ) with separate π_old computation
- **Bypass**: Two policies (π_rollout = π_old, π_θ), skips π_old computation
2. **Loss Function** (in bypass mode, controlled by `loss_type`)
- **PPO-clip** (`loss_type="ppo_clip"`, default): PPO clipped objective (IS handled by ratio)
- **REINFORCE** (`loss_type="reinforce"`): Policy gradient with explicit IS weights (no clipping)
3. **IS/RS Aggregation Level**
- **Token**: Per-token IS weights/rejection
- **Sequence**: Sequence-level IS weights/rejection
See [Mathematical Formulations](rollout_corr_math.md#3-algorithmic-components-and-combinations) for detailed theory.
---
## Preset Configuration Guide
This section provides detailed guidance on choosing and using the verified presets. Each preset is a specific combination of components optimized for common scenarios.
### Understanding the Presets
#### Available Preset Methods
| Preset Method | Estimator | Mode | IS Level | RS Level | Properties |
| ------------------------------------------------------------------------------ | ---------------- | ------------------ | -------- | -------- | --------------------------------------- |
| **Decoupled PPO Mode** (3 policies: π_rollout, π_old, π_θ) |
| `decoupled_token_is()` | Token-TIS | Decoupled | token | - | Token-level IS weights |
| `decoupled_seq_is()` | Seq-TIS | Decoupled | sequence | - | Sequence-level IS weights |
| `decoupled_seq_is_rs()` | Seq-MIS | Decoupled | sequence | sequence | Sequence IS + seq_sum_k1 RS |
| `decoupled_geo_rs()` | Geo-RS | Decoupled | - | sequence | Geometric RS (seq_mean_k1) |
| `decoupled_geo_rs_token_tis()` | Geo-RS-Token-TIS | Decoupled | token | sequence | Geometric RS + token IS |
| **K3 KL Estimator** (more stable for small KL values) |
| `decoupled_k3_rs()` | K3-RS | Decoupled | - | sequence | seq_mean_k3 RS |
| `decoupled_k3_rs_token_tis()` | K3-RS-Token-TIS | Decoupled | token | sequence | seq_mean_k3 RS + token IS |
| **Bypass Mode (PPO-clip)** (2 policies; ratio handles IS, RS masks outliers) |
| `bypass_ppo_clip()` | - | Bypass (PPO-clip) | - | - | PPO-clip only |
| `bypass_ppo_clip_geo_rs()` | Geo-RS | Bypass (PPO-clip) | - | sequence | PPO-clip + Geo-RS |
| `bypass_ppo_clip_k3_rs()` | K3-RS | Bypass (PPO-clip) | - | sequence | PPO-clip + K3-RS |
| **Bypass Mode (REINFORCE)** (2 policies; explicit IS weights, no PPO clipping) |
| `bypass_pg_is()` | Seq-TIS | Bypass (REINFORCE) | sequence | - | REINFORCE with explicit IS |
| `bypass_pg_geo_rs()` | Geo-RS | Bypass (REINFORCE) | - | sequence | REINFORCE with Geo-RS |
| `bypass_pg_geo_rs_token_tis()` | Geo-RS-Token-TIS | Bypass (REINFORCE) | token | sequence | REINFORCE + Geo-RS + token IS |
| **Other** |
| `disabled()` | - | - | - | - | Metrics only, no correction |
**Note:**
- **Bypass mode** sets π_old = π_rollout and uses `loss_type` to select the loss function:
- `"ppo_clip"` (default): PPO clipped objective where ratio = π_θ/π_rollout already handles IS
- `"reinforce"`: REINFORCE with explicit IS weights as π_θ/π_rollout
- Both loss types benefit from rejection sampling (RS) which masks out-of-distribution samples.
- All estimators (Token-TIS, Seq-TIS, Seq-MIS, Geo-RS, ...) are compatible with Decoupled and Bypass modes.
#### Other Supported Combinations (Manual Configuration Required)
**Other supported combinations without preset methods:**
- Token IS + Token RS: Token-level IS weights + Token-level RS mask
- Pure token RS: Token-level RS only, no IS weights
- Pure sequence RS: Sequence-level RS only, no IS weights
See [detailed configuration examples below](#additional-useful-configurations-not-exposed-as-presets) for manual configurations.
**Key properties:**
- Any aggregation level (token/sequence) works in either decoupled or bypass mode
- All combinations are fully supported by the implementation
- Rejection sampling is independent of IS weighting
- Pure RS (`bypass_pg_geo_rs`) uses bypass + geometric RS with `loss_type="reinforce"` (no IS weights)
---
### 1. Decoupled Mode with Token-level Importance Sampling (`decoupled_token_is`)
**Configuration:**
```python
config = RolloutCorrectionConfig.decoupled_token_is(threshold=2.0)
```
**Components:**
- **Operating Mode**: Decoupled (3 policies)
- **Loss**: PPO with clipping (only for the second drift correction)
- **IS Aggregation**: Token-level
- **RS**: None (can be added separately)
**Equivalent YAML:**
```yaml
algorithm:
rollout_correction:
rollout_is: token
rollout_is_threshold: 2.0
rollout_rs: null
bypass_mode: false # Decoupled mode
```
**Properties:**
- Independent truncation per token
- Lower variance than sequence-level (product of ratios bounded individually)
- Typical threshold: 1.5 - 5.0
**Theory:** See [rollout_corr_math.md §3.3.1](rollout_corr_math.md#331-token-level-aggregation)
---
### 2. Decoupled Mode with Sequence-level Importance Sampling (`decoupled_seq_is`)
**Also known as: Seq-TIS (Sequence-Level Truncated IS)**
**Configuration:**
```python
config = RolloutCorrectionConfig.decoupled_seq_is(threshold=2.0)
```
**Components:**
- **Operating Mode**: Decoupled (3 policies)
- **Loss**: PPO with clipping (only for the second drift correction)
- **IS Aggregation**: Sequence-level (Seq-TIS)
- **RS**: None (can be added separately)
**Equivalent YAML:**
```yaml
algorithm:
rollout_correction:
rollout_is: sequence
rollout_is_threshold: 2.0
rollout_rs: null
bypass_mode: false # Decoupled mode
```
**Properties:**
- Multiplicative aggregation across sequence
- More sensitive to outliers than token-level
- Typical threshold: 2.0 - 10.0 (higher than token-level)
**Theory:** See [rollout_corr_math.md §3.3.2](rollout_corr_math.md#332-sequence-level-aggregation)
---
### 3. Decoupled Mode with Sequence-level IS + Rejection Sampling (`decoupled_seq_is_rs`)
**Also known as: Seq-MIS (Sequence-Level Masked IS)**
**Configuration:**
```python
config = RolloutCorrectionConfig.decoupled_seq_is_rs(is_threshold=2.0, rs_threshold="0.5_2.0")
```
**Components:**
- **Operating Mode**: Decoupled (3 policies)
- **Loss**: PPO with clipping (only for the second drift correction)
- **IS Aggregation**: Sequence-level (Seq-TIS)
- **RS**: Sequence-level rejection (Seq-MIS)
**Equivalent YAML:**
```yaml
algorithm:
rollout_correction:
rollout_is: sequence
rollout_is_threshold: 2.0
rollout_rs: seq_sum_k1
rollout_rs_threshold: 0.5_2.0
bypass_mode: false # Decoupled mode
```
**Properties:**
- Double mechanism: IS reweighting (Seq-TIS) + rejection filtering (Seq-MIS)
- Lower effective sample size (rejects outliers)
- For severe off-policy gaps or when the distribution tail is "toxic" (garbage/adversarial samples)
**When to use Seq-MIS over Seq-TIS:**
- **Seq-TIS (clipping only)**: Maximizes information efficiency; extracts signal from all samples. Use when data is clean and mismatch is moderate.
- **Seq-MIS (rejection)**: Maximizes safety; acts as a hard trust region filter. Use when mismatch is severe or when high-weight samples are likely garbage rather than signal.
**Theory:** See [rollout_corr_math.md §3.5](rollout_corr_math.md#35-rejection-sampling-rs)
---
### 6. Bypass Mode with PPO-clip (`bypass_ppo_clip`)
**Configuration:**
```python
config = RolloutCorrectionConfig.bypass_ppo_clip()
```
**Components:**
- **Operating Mode**: Bypass (2 policies: π_rollout = π_old, π_θ)
- **Loss**: PPO-clip (IS handled by ratio, no explicit IS weights)
- **IS Aggregation**: None (PPO ratio handles it)
- **RS**: None
**Equivalent YAML:**
```yaml
rollout_correction:
rollout_is: null
rollout_rs: null
bypass_mode: true
loss_type: ppo_clip
```
**Properties:**
- PPO clipped objective in bypass mode
- The PPO ratio = π_θ/π_rollout already handles IS (no explicit IS weights needed)
- Skips `actor.compute_log_prob()` forward pass (2 policies instead of 3)
- No rejection sampling - use `bypass_ppo_clip_geo_rs()` for RS
**Configuration requirement:**
- Set `actor_rollout_ref.rollout.calculate_log_probs: true`
**Additional requirements for bypass mode:**
- Set `actor_rollout_ref.actor.use_rollout_log_probs: true`
- Set `actor_rollout_ref.actor.policy_loss.loss_mode: bypass_mode`
- Set rollout correction config via `actor_rollout_ref.actor.policy_loss.rollout_correction`
**Theory:** See [rollout_corr_math.md §3.1.2](rollout_corr_math.md#312-bypass-mode-two-policies)
---
### 7. REINFORCE with IS (`bypass_pg_is`)
**Configuration:**
```python
config = RolloutCorrectionConfig.bypass_pg_is(threshold=2.0)
```
**Components:**
- **Operating Mode**: Bypass (2 policies: π_rollout, π_θ)
- **Loss**: REINFORCE (policy gradient with explicit IS weights, no PPO clipping)
- **IS Aggregation**: Sequence-level
- **RS**: None
**Equivalent YAML:**
```yaml
rollout_correction:
rollout_is: sequence
rollout_is_threshold: 2.0
rollout_rs: null
bypass_mode: true
loss_type: reinforce # REINFORCE with explicit IS weights
```
**Properties:**
- REINFORCE loss with explicit IS weights (no PPO clipping)
- Single forward pass (skips old_log_prob computation)
- IS weights computed on-the-fly in loss function
**Theory:** See [rollout_corr_math.md §3.2.2](rollout_corr_math.md#322-policy-gradient-loss-with-isrs-correction)
---
## Additional Useful Configurations (Not Exposed as Presets)
These configurations are **fully supported** but don't have convenience preset methods yet.
### 1. Token IS + Token RS (`token_is_rs`)
Token-level IS weights with token-level RS mask.
**Python:**
```python
config = RolloutCorrectionConfig(
rollout_is="token",
rollout_is_threshold=2.0,
rollout_rs="token_k1",
rollout_rs_threshold=2.0,
)
```
**Properties:** Per-token IS weights + per-token RS mask.
### 2. Pure Token RS (`token_rs`)
Token-level RS only, no IS weights.
**Python:**
```python
config = RolloutCorrectionConfig(
rollout_is=None,
rollout_rs="token_k1",
rollout_rs_threshold=2.0,
)
```
**Properties:** Token-level RS mask, no IS reweighting.
### 3. Pure Sequence RS (`seq_rs`)
Sequence-level RS only, no IS weights.
**Python:**
```python
config = RolloutCorrectionConfig(
rollout_is=None,
rollout_rs="seq_sum_k1",
rollout_rs_threshold="0.5_2.0",
)
```
**Properties:** Sequence-level RS mask, no IS reweighting.
---
### Summary: How IS Weights are Processed
IS weights (`rollout_is_weights`) go through a fixed processing pipeline:
**Stage 1: Safety Bound (Prevent Overflow)**
- Token level: `exp(clamp(log_ratio, -20, 20))` per token → bounds each token to [2e-9, 5e8]
- Sequence level: `exp(clamp(sum(log_ratio), -20, 20))` → bounds product to [2e-9, 5e8], broadcast to all tokens
**Stage 2: Truncation (Reduce Variance)**
- `.clamp(max=rollout_is_threshold)` → caps weights at upper threshold (TIS: Truncated Importance Sampling)
- No lower truncation (preserves unbiasedness for small weights)
**Stage 3: Padding Zeroing (Correct Aggregation)**
- `weights * response_mask` → zeros out padding positions
**Stage 4: Optional Batch Normalization**
- If `rollout_is_batch_normalize=True`: Normalize weights to mean=1.0 within batch
- Applied after truncation to preserve truncation semantics
**Rejection Sampling (Separate Mechanism)**
Rejection sampling modifies `response_mask` (NOT weights) through `compute_rollout_rejection_mask()`:
- Computes safety-bounded ratios independently
- Creates binary mask: tokens/sequences outside [lower_threshold, upper_threshold] → 0 (rejected)
- Modified mask used for loss aggregation
## Operation Modes
The framework provides **two operating modes** for computing π_old, which can be combined with different loss functions.
### Operating Modes and Configuration
| Configuration | `bypass_mode` | `loss_type` | Operating Mode | Loss Function | Description |
| ---------------------- | ------------- | ---------------------- | -------------- | ------------- | ----------------------------------------------------------------- |
| **Decoupled** | `false` | N/A | Decoupled | PPO | Computes `old_log_prob` separately via `actor.compute_log_prob()` |
| **Bypass + PPO-clip** | `true` | `"ppo_clip"` (default) | Bypass | PPO-clip | PPO clipped objective (IS handled by ratio) |
| **Bypass + REINFORCE** | `true` | `"reinforce"` | Bypass | REINFORCE | Policy gradient with explicit IS weights (no PPO clipping) |
### Operating Mode Details
#### Decoupled Mode (Three Policies)
**Policy setup:**
- π_rollout: Behavior policy (data collection)
- π_old: Proximal policy (computed via `actor.compute_log_prob()` at start of training epoch)
- π_θ: Current policy (being updated)
**Configuration:** `bypass_mode = false`
**Properties:**
- ✅ Achieves batch size invariance
- ✅ Separately corrects Drift 1 (rollout→old) and Drift 2 (old→current)
- ✅ Efficient stale data utilization
- ❌ Extra forward pass needed (`actor.compute_log_prob()`)
**Theory:** See [rollout_corr_math.md §3.1.1](rollout_corr_math.md#311-decoupled-mode-three-policies)
#### Bypass Mode (Two Policies)
**Policy setup:**
- π_rollout: Behavior policy (data collection)
- π_old = π_rollout: Proximal policy equals behavior policy
- π_θ: Current policy (being updated)
**Configuration:** `bypass_mode = true`
**Properties:**
- ✅ Skips `actor.compute_log_prob()` call (faster)
- ✅ Handles off-policy correction via IS/RS (REINFORCE applies explicit IS weights; PPO-clip relies on its ratio π_θ/π_rollout plus the RS mask)
- ✅ Uses two policies instead of three (π_rollout = π_old)
- ⚠️ Does not separate proximal policy from behavior policy (unlike decoupled mode)
**Theory:** See [rollout_corr_math.md §3.1.2](rollout_corr_math.md#312-bypass-mode-two-policies)
---
### IS/RS Aggregation Levels (Orthogonal to Operating Mode)
The aggregation level can be chosen **independently** of the operating mode. Any aggregation level works in either decoupled or bypass mode.
| `rollout_is` | `rollout_rs` | Behavior |
| ------------------------- | ------------------------------------------------------------------ | --------------------------------------------------------------------------------- |
| `null` | `null` | **Disabled**: No weight correction, no rejection (off-policy diagnostic metrics can still be monitored when rollout_log_probs are provided) |
| `null` | `"token_k1"`, `"seq_sum_k1"`, `"seq_mean_k1"`, `"seq_max_k2"`, etc | **Rejection only**: Compute metrics, NO weight correction, YES rejection sampling |
| `"token"` or `"sequence"` | `null` | **IS weights only**: Weight correction enabled, NO rejection sampling |
| `"token"` or `"sequence"` | `"token_k1"`, `"seq_sum_k1"`, `"seq_mean_k1"`, `"seq_max_k2"`, etc | **Full correction**: Both weight correction and rejection sampling enabled |
### Key Insights
- ✅ Any IS/RS aggregation level (token/sequence/geometric) can be used in **either** decoupled or bypass mode
- ✅ You can use **rejection sampling alone** without IS weight correction (`rollout_is=null, rollout_rs="token_k1"`)
- ✅ You can use **IS weights alone** without outlier rejection (`rollout_is="token", rollout_rs=null`)
- ✅ You can use **both together** (`rollout_is="token", rollout_rs="token_k1"`)
- ✅ You can **monitor metrics only** without any correction by setting both to `null` but still providing rollout_log_probs
**Theory:** See [rollout_corr_math.md §3.3](rollout_corr_math.md#33-isrs-aggregation-levels) for details on aggregation levels.
### Example Workflow
**Recommended: Bypass Mode**
This workflow uses bypass mode for efficiency.
1. **Start with metrics only** to understand the off-policy gap:
```yaml
rollout_correction:
rollout_is: null
rollout_rs: null
bypass_mode: true # Bypass mode (recommended)
loss_type: ppo_clip # Default: PPO clipped objective
```
Monitor `rollout_corr/kl`, `rollout_corr/log_ppl_abs_diff`, `rollout_corr/chi2_token` to assess off-policy gap.
2. **Enable rejection sampling** if you see high outlier fractions:
```yaml
rollout_correction:
rollout_is: null
rollout_rs: sequence # or "geometric" for higher sensitivity
rollout_rs_threshold: 2.0
bypass_mode: true # Bypass mode
loss_type: ppo_clip # or "reinforce" for explicit IS weights
```
This excludes outliers from training without modifying gradients.
3. **Enable full IS correction** (with REINFORCE loss) once comfortable with metrics:
```yaml
rollout_correction:
rollout_is: sequence # Recommended: unbiased, suitable for most cases
rollout_is_threshold: 2.0
rollout_rs: sequence # or "geometric" for more aggressive filtering
rollout_rs_threshold: 2.0
bypass_mode: true # Bypass mode
loss_type: reinforce # REINFORCE with explicit IS weights
```
**Benefits of bypass mode:**
- ✅ Skips expensive `actor.compute_log_prob()` forward pass (faster)
- ✅ `loss_type` controls the loss function: "ppo_clip" (default) or "reinforce"
- ✅ PPO-clip: IS handled by ratio (no explicit weights), RS mask applied
- ✅ REINFORCE: Explicit IS weights computed on-the-fly (π_θ / π_rollout)
- ✅ Both loss types work with all IS/RS combinations
## Usage
### Basic Setup
```yaml
algorithm:
rollout_correction:
rollout_is: token # Enable IS weights at token level
rollout_is_threshold: 2.0 # Threshold for IS weights
rollout_rs: null # No rejection sampling
actor_rollout_ref:
rollout:
calculate_log_probs: true # Required!
```
### Additional Configurations for Bypass Mode
- Set `actor_rollout_ref.actor.use_rollout_log_probs: true`
- Set `actor_rollout_ref.actor.policy_loss.loss_mode: bypass_mode`
- Set rollout correction config via `actor_rollout_ref.actor.policy_loss.rollout_correction`
### Metrics
All metrics are prefixed with `rollout_corr/` in logs. For example, `rollout_is_mean` appears as `rollout_corr/rollout_is_mean`.
These metrics cover both:
- **Diagnostic metrics**: KL divergence, perplexity differences (measuring off-policy gap)
- **Correction statistics**: IS weights, rejection rates (measuring correction applied)
#### **Core IS Weight Metrics**
- **`rollout_is_mean`**: Mean importance sampling weight across all valid tokens
- Value close to 1.0 indicates minimal off-policy gap
- **`rollout_is_std`**: Standard deviation of IS weights
- Higher values indicate greater variance in IS weights
- **`rollout_is_min`**: Minimum IS weight observed
- Shows the most underweighted token/sequence
- For sequence/geometric: computed from unclamped log-space ratios (true minimum)
- For token: computed from safety-bounded weights
- **`rollout_is_max`**: Maximum IS weight observed
- Shows the most overweighted token/sequence
- For sequence/geometric: computed from unclamped log-space ratios (true maximum before safety bound)
- For token: computed from safety-bounded weights (before threshold clamping)
- Compare with `rollout_is_threshold` to see truncation impact
#### **Effective Sample Size**
- **`rollout_is_eff_sample_size`**: Effective sample size after IS weighting
- **Formula**: `1 / mean(weights²)` where weights are normalized
- **Range**: 0.0 to 1.0 (as fraction of original batch)
- Lower values indicate weight concentration on fewer samples
#### **Threshold Exceedance Metrics**
- **`rollout_is_ratio_fraction_high`**: Fraction of weights exceeding upper threshold
- Shows how often truncation/masking occurs on high end
- For sequence/geometric: computed from unclamped log-space ratios (true exceedance)
- For token: computed from safety-bounded weights (before threshold clamping)
- **`rollout_is_ratio_fraction_low`**: Fraction of weights below lower threshold (1/upper_threshold)
- Diagnostic metric showing how many weights are below the reciprocal threshold
- For sequence/geometric: computed from unclamped log-space ratios (true exceedance)
- For token: computed from safety-bounded weights (before truncation)
#### **Sequence-Level Metrics** (for sequence aggregation)
- **`rollout_is_seq_mean`**: Mean IS weight at sequence level
- Should match `rollout_is_mean` for sequence-level aggregation
- **`rollout_is_seq_std`**: Standard deviation of sequence-level IS weights
- **`rollout_is_seq_min`**: Minimum sequence-level IS weight
- **`rollout_is_seq_max`**: Maximum sequence-level IS weight
- **`rollout_is_seq_max_deviation`**: Maximum absolute deviation from 1.0 at sequence level
- Shows worst-case sequence off-policy gap
- **`rollout_is_seq_fraction_high`**: Fraction of sequences exceeding upper threshold
- **`rollout_is_seq_fraction_low`**: Fraction of sequences below lower threshold
#### **Rejection Sampling Metrics** (when `rollout_rs` is enabled)
- **`rollout_rs_masked_fraction`**: Fraction of tokens rejected via rejection sampling
- **Important**: Rejection sampling modifies `response_mask` (sets rejected tokens to 0)
- **Separate from IS weights**: IS weights are still truncated; rejection is an independent filtering step
- Only present when `rollout_rs` is enabled (e.g. `"token_k1"`, `"seq_sum_k1"`, `"seq_mean_k1"`, `"seq_max_k2"`)
- **`rollout_rs_seq_masked_fraction`**: Fraction of sequences with at least one rejected token
- Shows sequence-level impact of rejection sampling
- Token-level RS: sequence rejected if ANY token is outside [lower, upper]
- Sequence-level RS: entire sequence rejected or accepted based on sequence-level ratio
- Geometric RS: entire sequence rejected or accepted based on geometric mean
#### **Off-Policy Diagnostic Metrics** (Training vs Rollout Policy)
**Note on terminology:** These metrics use "training" to refer to the training reference policy and "rollout" to refer to π_rollout (the behavior policy used for data collection).
- **Decoupled mode**: "training" = π_old (computed at start of training epoch)
- **Bypass/Pure IS mode**: "training" = π_θ (current policy being trained)
In bypass/pure IS mode, metrics measure the drift between π_θ and π_rollout directly.
- **`training_ppl`**: Perplexity of training reference policy (π_old in decoupled mode, π_θ in bypass/pure IS mode)
- **Formula**: `exp(-mean(log_probs))`
- Lower values indicate higher model confidence
- **`rollout_ppl`**: Perplexity of rollout policy π_rollout (e.g., vLLM BF16)
- **`ppl_ratio`**: Ratio of training PPL to rollout PPL
- **Formula**: `exp(mean(log(training_ppl / rollout_ppl)))`
- **Meaning**: > 1.0 means training is less confident than rollout
- **`training_log_ppl`**: Log perplexity of training policy
- Useful for identifying trends (linear scale)
- **`rollout_log_ppl`**: Log perplexity of rollout policy
- **`log_ppl_diff`**: Mean difference in log perplexities
- **Formula**: `mean(log_ppl_rollout - log_ppl_training)`
- Sign indicates which policy is more confident
- **`log_ppl_abs_diff`**: Mean absolute log perplexity difference
- Magnitude of off-policy gap regardless of direction
- **`log_ppl_diff_max`**: Maximum log perplexity difference across sequences
- Identifies worst-case sequence
- **`log_ppl_diff_min`**: Minimum log perplexity difference across sequences
- **`kl`**: KL divergence KL(π_rollout || π_training)
- **Formula**: `mean(log_prob_rollout - log_prob_training)`
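- **Note**: The true KL is non-negative, but this per-batch estimate can be negative (when π_rollout assigns lower log-probability to the sampled tokens than π_training, i.e. rollout is less confident)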
- **`k3_kl`**: K3 divergence (equals KL(π_rollout || π_training) in expectation)
- **Formula**: `mean(exp(log_ratio) - log_ratio - 1)`
- More stable than direct KL (non-negative per token)
- Always >= 0
- **`chi2_token`**: Chi-squared divergence at token level
- **Formula**: `mean(ratio²) - 1` where ratio = π_training/π_rollout
- Measures second moment of IS weight distribution
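- Non-negative in expectation (0 when the two policies match); a finite-batch estimate of `mean(ratio²) - 1` can fall below 0 when sampled ratios are mostly < 1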
- **`chi2_seq`**: Chi-squared divergence at sequence level
- **Formula**: `mean((∏_t ratio_t)²) - 1`
- Sequence-level second moment of IS weights
- More sensitive than token-level chi-squared
#### **Example: Accessing Metrics in Code**
```python
# Metrics are returned from compute_rollout_correction_and_rejection_mask
from verl.trainer.ppo.rollout_corr_helper import compute_rollout_correction_and_rejection_mask
# Returns 3 values (weights, modified_response_mask, metrics)
weights_proto, modified_response_mask, metrics = compute_rollout_correction_and_rejection_mask(
old_log_prob=training_log_probs, # from training policy
rollout_log_prob=rollout_log_probs, # from rollout policy
response_mask=response_mask,
rollout_is="token", # Enable IS weights at token level
rollout_is_threshold=2.0,
rollout_rs="token_k1",
rollout_rs_threshold="0.5_2.0",
)
# Extract IS weights (processed, zeroed at padding)
is_weights = weights_proto.batch["rollout_is_weights"]
# IS weights processing (with IS enabled at token level):
# 1. Safety-bounded: exp(clamp(log_ratio, -20, 20)) per token
# 2. Truncated: .clamp(max=2.0) to cap extreme weights
# 3. Zeroed at padding positions
# Note: Truncation is ALWAYS applied to IS weights (TIS: Truncated Importance Sampling)
# modified_response_mask has rejection applied (since rollout_rs="token_k1"):
# 1. RS rejection: tokens outside [0.5, 2.0] masked to 0 via response_mask
# Note: RS and IS are separate mechanisms - both can be enabled independently
# All metrics have 'rollout_corr/' prefix
print(f"Mean IS weight: {metrics['rollout_corr/rollout_is_mean']:.3f}")
print(f"Effective sample size: {metrics['rollout_corr/rollout_is_eff_sample_size']:.3f}")
print(f"RS masked fraction: {metrics['rollout_corr/rollout_rs_masked_fraction']:.3f}")
print(f"KL divergence: {metrics['rollout_corr/kl']:.3f}")
# Check IS weights for valid tokens (non-padding)
valid_weights = is_weights[response_mask.bool()]
print(f"\n✓ IS weights min (valid tokens): {valid_weights.min():.4f}")
print(f"✓ IS weights max (valid tokens): {valid_weights.max():.4f}")
print(f"✓ All valid IS weights > 0: {(valid_weights > 0).all()}")
print(f"✓ IS weights are capped at threshold: {(valid_weights <= 2.0).all()}")
# Check rejection via response_mask
rejected_tokens = (response_mask == 1) & (modified_response_mask == 0)
print(f"\n✓ Rejected {rejected_tokens.sum()} tokens via response_mask")
print(f"✓ Rejection sampling modifies response_mask (separate from IS weight truncation)")
print(f"✓ IS weights are always truncated to [0, threshold] after safety bounding")
# Check for warning conditions
if metrics['rollout_corr/rollout_is_mean'] < 0.5 or metrics['rollout_corr/rollout_is_mean'] > 2.0:
print("⚠️ Warning: Mean IS weight far from 1.0, significant off-policy gap detected")
if metrics['rollout_corr/rollout_is_eff_sample_size'] < 0.3:
print("⚠️ Warning: Low effective sample size, high weight concentration")
```
#### **Example: Monitoring Metrics During Training**
```python
# In your training loop
for epoch in range(num_epochs):
for batch_idx, batch in enumerate(dataloader):
# ... rollout phase ...
# Compute IS weights and get metrics
rollout_corr_config = config.algorithm.get("rollout_correction", None)
if rollout_corr_config is not None:
weights_proto, modified_response_mask, metrics = compute_rollout_correction_and_rejection_mask(
old_log_prob=batch.old_log_prob,
rollout_log_prob=batch.rollout_log_prob,
response_mask=batch.response_mask,
rollout_is=rollout_corr_config.get("rollout_is", None),
rollout_is_threshold=rollout_corr_config.get("rollout_is_threshold", 2.0),
rollout_rs=rollout_corr_config.get("rollout_rs", None),
rollout_rs_threshold=rollout_corr_config.get("rollout_rs_threshold", None),
)
# Log to tensorboard/wandb
for metric_name, metric_value in metrics.items():
logger.log_scalar(metric_name, metric_value, step=global_step)
# IMPORTANT: Update batch response_mask with rejection applied
batch.response_mask = modified_response_mask
# Use IS weights in training (always safety-bounded, zeroed at padding)
is_weights = weights_proto.batch["rollout_is_weights"]
# ... apply weights to policy gradient ...
```
#### **Example: Conditional Alerting Based on Metrics**
```python
def check_rollout_correction_health(metrics, config):
"""Check if Rollout Correction metrics indicate healthy training."""
warnings = []
# Check mean IS weight
mean_weight = metrics['rollout_corr/rollout_is_mean']
if mean_weight < 0.5 or mean_weight > 2.0:
warnings.append(f"Mean IS weight {mean_weight:.3f} is far from 1.0")
# Check effective sample size
ess = metrics['rollout_corr/rollout_is_eff_sample_size']
if ess < 0.3:
warnings.append(f"Effective sample size {ess:.3f} is too low")
# Check standard deviation
std = metrics['rollout_corr/rollout_is_std']
if std > 1.0:
warnings.append(f"IS weight std {std:.3f} is too high")
# Check KL divergence
kl = metrics['rollout_corr/kl']
if abs(kl) > 0.1:
warnings.append(f"KL divergence {kl:.3f} indicates significant off-policy gap")
# Check chi-squared divergence
if 'rollout_corr/chi2_token' in metrics:
chi2_token = metrics['rollout_corr/chi2_token']
if chi2_token > 1.0:
warnings.append(f"Chi-squared divergence (token) {chi2_token:.3f} indicates severe distribution shift")
if warnings:
print("⚠️ Rollout Correction Health Warnings:")
for warning in warnings:
print(f" - {warning}")
return False
else:
print("✅ Rollout Correction metrics look healthy")
return True
# Use in training
_, _, metrics = compute_rollout_correction_and_rejection_mask(...)
is_healthy = check_rollout_correction_health(metrics, config)
if not is_healthy:
# Consider adjusting config or investigating issues
print("Consider:")
print(" - Tightening rollout_is_threshold")
print(" - Switching to geometric aggregation level")
print(" - Checking if rollout and training policies are too different")
```
### Running Examples
Start with the basic token-level truncation configuration:
```bash
bash examples/rollout_correction/run_with_rollout_corr.sh
```
Monitor metrics for 1-2 epochs before adjusting parameters.
## Configuration Examples
### Example 1: IS Weights Only (Token Level)
```yaml
algorithm:
rollout_correction:
rollout_is: token
rollout_is_threshold: 2.0
rollout_rs: null # No rejection sampling
```
### Example 2: Rejection Sampling Only (No IS Weights)
```yaml
algorithm:
rollout_correction:
rollout_is: null # No IS weights
rollout_rs: token_k1
rollout_rs_threshold: "0.5_2.0"
```
### Example 3: Both IS and RS (Token RS)
```yaml
algorithm:
rollout_correction:
rollout_is: token
rollout_is_threshold: 2.0
rollout_rs: token_k1
rollout_rs_threshold: "0.5_2.0"
```
### Example 5: Bypass Mode with PPO-clip (Default)
```yaml
algorithm:
rollout_correction:
rollout_is: token
rollout_is_threshold: 2.0
rollout_rs: token_k1
rollout_rs_threshold: "0.5_2.0"
bypass_mode: true # Skip old_log_prob computation
loss_type: ppo_clip # PPO clipped objective (default)
```
**Skips expensive `actor.compute_log_prob()` forward pass. PPO ratio = π_θ/π_rollout handles IS.**
### Example 6: Bypass Mode with REINFORCE
```yaml
rollout_correction:
rollout_is: sequence # Explicit IS correction in loss
rollout_is_threshold: 2.0
rollout_rs: null # Optional: can add rejection sampling
bypass_mode: true
loss_type: reinforce # REINFORCE with explicit IS weights
```
**No PPO clipping, pure policy gradient with IS correction**
### Example 7: Bypass Mode with PPO-clip + Rejection Sampling
```yaml
rollout_correction:
rollout_is: sequence # Computed for metrics
rollout_is_threshold: 2.0
rollout_rs: seq_max_k2 # Sequence max χ²/2 guard
rollout_rs_threshold: 2.5
bypass_mode: true
loss_type: ppo_clip # PPO clipped objective (IS handled by ratio)
```
**PPO clipping with rejection sampling. IS handled by PPO ratio (no explicit IS weights).**
## Troubleshooting
### Issue: High spread in IS weights
**Symptoms:** `rollout_is_std` > 1.0, `rollout_is_eff_sample_size` < 0.3
**Solutions:**
1. Switch from `sequence` to `geometric` level
2. Tighten thresholds
3. Verify rollout and training aren't too different
### Issue: Mean IS weight far from 1.0
**Symptoms:** `rollout_is_mean` < 0.5 or > 2.0
**Solutions:**
1. Verify `calculate_log_probs=True` is set
2. Check rollout_log_probs are correctly passed
3. Check for systematic distribution shift
### Debugging: Visualizing Metrics
**Example: Plot IS weight distribution**
```python
import matplotlib.pyplot as plt
import numpy as np
def plot_is_metrics(metrics_history):
"""Plot rollout IS metrics over training steps."""
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
# Plot 1: Mean IS weight over time
axes[0, 0].plot(metrics_history['rollout_corr/rollout_is_mean'])
axes[0, 0].axhline(y=1.0, color='r', linestyle='--', label='Ideal')
axes[0, 0].set_title('Mean IS Weight')
axes[0, 0].set_xlabel('Step')
axes[0, 0].legend()
# Plot 2: Effective sample size
axes[0, 1].plot(metrics_history['rollout_corr/rollout_is_eff_sample_size'])
axes[0, 1].axhline(y=0.5, color='g', linestyle='--', label='Good')
axes[0, 1].axhline(y=0.3, color='r', linestyle='--', label='Warning')
axes[0, 1].set_title('Effective Sample Size')
axes[0, 1].set_xlabel('Step')
axes[0, 1].legend()
# Plot 3: KL divergence over time
axes[1, 0].plot(metrics_history['rollout_corr/kl'], label='KL')
axes[1, 0].plot(metrics_history['rollout_corr/k3_kl'], label='K3 KL')
axes[1, 0].axhline(y=0, color='g', linestyle='--', alpha=0.3)
axes[1, 0].set_title('KL Divergence')
axes[1, 0].set_xlabel('Step')
axes[1, 0].legend()
# Plot 4: PPL ratio over time
axes[1, 1].plot(metrics_history['rollout_corr/ppl_ratio'])
axes[1, 1].axhline(y=1.0, color='r', linestyle='--', label='Ideal')
axes[1, 1].set_title('PPL Ratio (Training/Rollout)')
axes[1, 1].set_xlabel('Step')
axes[1, 1].legend()
# Plot 5: Chi-squared divergence
if 'rollout_corr/chi2_token' in metrics_history:
axes[1, 2].plot(metrics_history['rollout_corr/chi2_token'], label='Token-level')
if 'rollout_corr/chi2_seq' in metrics_history:
axes[1, 2].plot(metrics_history['rollout_corr/chi2_seq'], label='Seq-level')
axes[1, 2].axhline(y=1.0, color='r', linestyle='--', label='Warning')
axes[1, 2].set_title('Chi-squared Divergence')
axes[1, 2].set_xlabel('Step')
axes[1, 2].legend()
else:
axes[1, 2].axis('off')
plt.tight_layout()
plt.savefig('rollout_is_metrics.png', dpi=150)
print("Saved plot to rollout_is_metrics.png")
```
**Example: Metric collection during training**
```python
# Collect metrics over time
metrics_history = {
'rollout_corr/rollout_is_mean': [],
'rollout_corr/rollout_is_eff_sample_size': [],
'rollout_corr/kl': [],
'rollout_corr/k3_kl': [],
'rollout_corr/ppl_ratio': [],
'rollout_corr/chi2_token': [],
'rollout_corr/chi2_seq': [],
}
# In training loop
for step in range(num_steps):
# ... compute IS weights and rejection mask ...
_, _, metrics = compute_rollout_correction_and_rejection_mask(...)
# Store metrics
for key in metrics_history.keys():
if key in metrics:
metrics_history[key].append(metrics[key])
# Plot every 100 steps
if step % 100 == 0:
plot_is_metrics(metrics_history)
```
## Performance Impact
- **Memory overhead**: ~1% of model memory
- **Computational overhead**: 1-3% depending on level
- **Training stability**: Significantly improved when off-policy gap exists
## Testing
Run the test suite to verify everything works:
```bash
# Basic unit tests
python tests/trainer/ppo/test_rollout_corr.py
# Integration tests (if pytest is available)
pytest tests/trainer/ppo/test_rollout_corr_integration.py -v
```
Expected output: All tests pass ✓
## Additional Resources
- **Implementation**: `verl/trainer/ppo/rollout_corr_helper.py`
- **Examples**: `examples/rollout_correction/`
- **DAPO Example**: `recipe/dapo/run_dapo_qwen2.5_32b_rollout_corr.sh`
## Summary
Rollout Correction provides a unified framework for handling general off-policy problems in RL:
- ✅ Corrects ANY distribution shift between data collection and training
- ✅ Supports diverse scenarios: policy mismatch, staleness, replay buffers, off-policy algorithms
- ✅ Numerical stability with safety bounds and rejection mechanisms
- ✅ Comprehensive diagnostics: KL, perplexity, χ² divergence
- ✅ Flexible methods from token-level to sequence-level aggregation
- ✅ Memory-efficient implementation
## References
- **[Mathematical Formulations](rollout_corr_math.md)** - Detailed mathematical theory and derivations for all rollout correction methods
- [Your Efficient RL Framework Secretly Brings You Off-Policy RL Training](https://fengyao.notion.site/off-policy-rl)
================================================
FILE: docs/algo/rollout_corr_math.md
================================================
# Mathematical Formulations of Rollout Correction Methods in `verl`
**Author:** [Yingru Li](https://richardli.xyz)
**Last updated:** 2025-11-04
---
> **📖 Documentation Structure**
> - **This document** - Mathematical theory: formulations, derivations, and algorithmic foundations
> - **[Rollout Correction Usage Guide](rollout_corr.md)** - Practical implementation: configurations, presets, troubleshooting
>
> Start here for theory and design rationale; refer to the usage guide for implementation.
---
### BibTeX Citation
```bibtex
@online{liu-li-2025-rl-collapse,
title = {When Speed Kills Stability: Demystifying {RL} Collapse from the Training-Inference Mismatch},
author = {Liu, Jiacai and Li, Yingru and Fu, Yuqian and Wang, Jiawei and Liu, Qian and Shen, Yu},
year = {2025},
month = sep,
url = {https://richardli.xyz/rl-collapse}
}
@article{li2025trust,
title={Trust Region Masking for Long-Horizon LLM Reinforcement Learning},
author={Li, Yingru and Liu, Jiacai and Xu, Jiawei and Tong, Yuxuan and Li, Ziniu and Liu, Qian and Wang, Baoxiang},
journal={arXiv preprint arXiv:2512.23075},
year={2025}
}
```
### Blog Series
- Main blog post: https://richardli.xyz/rl-collapse
- [Part 1: Why Mismatch Breaks LLM-RL](https://richardli.xyz/rl-collapse-1) (analytical framework using TV distance for bias and χ²-divergence for variance)
- [Part 2: The Gradient Estimator Trials](https://richardli.xyz/rl-collapse-2) (token-level vs sequence-level correction bias-variance tradeoff)
- [Part 3: When Math Meets Reality—Toxic Tails and Length Traps](https://richardli.xyz/rl-collapse-3) (why rejection over clipping, and geometric-level RS)
- Latest Paper: https://arxiv.org/abs/2512.23075
## Abstract
This document provides the definitive mathematical formulations for rollout correction methods in `verl`, following the natural progression from **REINFORCE** to **PPO** to **Decoupled PPO**.
Rollout correction provides a unified framework to handle **general off-policy problems** in RL training - any scenario where the data collection distribution differs from the training distribution.
**Applicable scenarios include:**
- **Policy mismatch**: Different precision (FP8 vs FP16 vs BF16 vs FP32), different backends (vLLM vs SGLang vs FSDP vs Megatron)
- **Temporal lag**: Model staleness, asynchronous rollout workers
- **Replay buffers**: Training on historical trajectories from earlier policy versions
- **Off-policy algorithms**: Behavioral cloning, DAPO, expert demonstrations
- **Data filtering**: Reweighting, preference learning, curriculum learning
---
## Table of Contents
1. [Theoretical Foundation: From REINFORCE to Decoupled PPO](#1-theoretical-foundation-from-reinforce-to-decoupled-ppo)
2. [Implementation in verl: The Three-Policy Framework](#2-implementation-in-verl-the-three-policy-framework)
3. [Algorithmic Components and Combinations](#3-algorithmic-components-and-combinations)
4. [Off-Policy Diagnostic Metrics](#4-off-policy-diagnostic-metrics)
5. [Summary and Decision Guide](#5-summary-and-decision-guide)
6. [Implementation References](#6-implementation-references)
---
## 1. Theoretical Foundation: From REINFORCE to Decoupled PPO
This section establishes the theoretical progression that `verl` implements.
### 1.1 REINFORCE: Policy Gradient Baseline
The REINFORCE algorithm ([Williams, 1992](https://doi.org/10.1007/BF00992696)) is the foundation of policy gradient methods.
**Vanilla REINFORCE (On-Policy)**
For trajectories $\tau = (s_0, a_0, s_1, a_1, \ldots, s_T, a_T)$ sampled from the current policy $\pi_\theta$, the policy gradient is:
$$
\nabla_\theta J(\theta) = \mathbb{E}_{\tau \sim \pi_\theta} \left[ \sum_{t=0}^T \nabla_\theta \log \pi_\theta(a_t|s_t) \cdot A_t \right]
$$
where $A_t$ is the advantage function at timestep $t$.
**Off-Policy REINFORCE**
When trajectories are sampled from a different behavior policy $\mu$, we apply importance sampling over the **joint trajectory distribution**:
$$
\nabla_\theta J(\theta) = \mathbb{E}_{\tau \sim \mu} \left[ \frac{P_{\pi_\theta}(\tau)}{P_\mu(\tau)} \sum_{t=0}^T \nabla_\theta \log \pi_\theta(a_t|s_t) \cdot A_t \right]
$$
where the trajectory-level importance weight is:
$$
\frac{P_{\pi_\theta}(\tau)}{P_\mu(\tau)} = \frac{p(s_0) \prod_{t=0}^T \pi_\theta(a_t|s_t) p(s_{t+1}|s_t, a_t)}{p(s_0) \prod_{t=0}^T \mu(a_t|s_t) p(s_{t+1}|s_t, a_t)} = \prod_{t=0}^T \frac{\pi_\theta(a_t|s_t)}{\mu(a_t|s_t)}
$$
The transition dynamics $p(s_{t+1}|s_t, a_t)$ and initial state $p(s_0)$ cancel out, leaving only the product of per-step action probability ratios.
**Key properties:**
- **Off-policy capable**: Can learn from any behavior policy via importance sampling
- **No trust region**: Policy updates not constrained
**Implementation in verl:** The `bypass_pg_is` preset implements off-policy REINFORCE with truncated importance sampling.
### 1.2 PPO: Adding Trust Region Control
Proximal Policy Optimization ([Schulman et al., 2017](https://arxiv.org/abs/1707.06347)) adds a clipped surrogate objective:
$$
L_{\text{PPO}}(\theta) = -\mathbb{E}_{(s,a) \sim \mu} \left[ \min\left( r_t(\theta) A_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) A_t \right) \right]
$$
where $r_t(\theta) = \frac{\pi_\theta(a_t|s_t)}{\mu(a_t|s_t)}$ and $\epsilon$ is the clip range (typically 0.2).
**Key properties:**
- **Two policies**: $\mu$ (reference for clipping) and $\pi_\theta$ (being updated)
- **Trust region via clipping**: Limits policy update magnitude via ratio $r_t(\theta) = \frac{\pi_\theta}{\mu}$
### 1.3 Decoupled PPO: Achieving Batch Size Invariance
Decoupled PPO ([Hilton et al., 2021](https://arxiv.org/abs/2110.00641)) solves PPO's batch size sensitivity by **decoupling two roles**:
1. **Proximal policy** $\pi_{\text{prox}}$: The anchor policy for PPO clipping (controls policy update size)
2. **Behavior policy** $\mu$: The policy that collected the data (for off-policy correction via importance sampling)
**The problem**: Standard PPO controls policy update size via the ratio $\frac{\pi_\theta}{\pi_{\text{old}}}$, where $\pi_{\text{old}}$ is assumed to be both the proximal policy *and* the behavior policy. This coupling makes the algorithm sensitive to batch size because aggregating data from multiple workers or using replay buffers changes the effective behavior policy.
**The solution**: Decouple these two roles, leading to a **three-policy formulation**:
$$
L_{\text{DecoupledPPO}}(\theta) = -\mathbb{E}_{(s,a) \sim \mu} \left[ w_t \cdot \min\left( r_t(\theta) A_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) A_t \right) \right]
$$
where:
- $w_t = \frac{\pi_{\text{prox}}(a_t|s_t)}{\mu(a_t|s_t)}$: Importance sampling weight (corrects for behavior policy $\mu$). Here $\pi_{\text{prox}}$ is frozen during training, so $w_t$ is constant (no stopgrad operator needed).
- $r_t(\theta) = \frac{\pi_\theta(a_t|s_t)}{\pi_{\text{prox}}(a_t|s_t)}$: PPO ratio (controls policy update size against proximal policy $\pi_{\text{prox}}$)
**Key properties**: By decoupling:
- **Batch size invariance**: Policy update control (via $\pi_{\text{prox}}$) is independent of data aggregation
- **Flexible behavior policy**: Any $\mu$ can be used (different workers, replay buffers, or stale checkpoints)
- **Stale data utilization**: Older trajectories can be corrected via importance sampling
- **Clipping preserved**: Clipping against $\pi_{\text{prox}}$ limits update magnitude
**This is the algorithm that `verl` implements via its three-policy framework.**
---
## 2. Implementation in verl: The Three-Policy Framework
The `verl` library implements decoupled PPO using three distinct policies, each serving a specific role.
### 2.1 Policy Roles and Notation
**$\pi_{\text{rollout}}$ (Behavior Policy $\mu$)**
The policy used for data collection. This is the behavior distribution $\mu$ from theory.
- **When created**: During rollout/data collection phase
- **Purpose**: Generate trajectories for training
- **Common sources**:
- Policy mismatch: Same weights, different implementation (precision, backend)
- Temporal lag: Stale checkpoint from async workers
- Replay buffer: Historical data from earlier iterations
- Off-policy algorithms: Expert demonstrations, auxiliary policies (DAPO)
- Data filtering: Reweighted or filtered data
- **Fixed**: Frozen during training on a batch
**$\pi_{\text{old}}$ (Proximal Policy $\pi_{\text{prox}}$)**
The reference policy for PPO clipping. This is the "proximal policy" from decoupled PPO theory.
- **When created**:
- **Decoupled mode**: Computed at start of training epoch via `actor.compute_log_prob()`
- **Bypass mode**: Set equal to $\pi_{\text{rollout}}$ (skips separate computation)
- **Purpose**:
- Anchor point for PPO clipping (controls policy update size)
- When separate from $\pi_{\text{rollout}}$: Enables batch size invariance and efficient use of stale data
- **Fixed**: Frozen during all PPO update epochs on the same batch
**$\pi_{\theta}$ (Current Policy)**
The policy being actively optimized during training.
- **Updated**: Every gradient step
- **Purpose**: The policy we're improving
### 2.2 Operating Modes
The three-policy framework can operate in two modes:
**Decoupled Mode (Three Policies)**
- Computes $\pi_{\text{old}}$ separately at the start of each training epoch
- **Algorithm**: Full decoupled PPO with three policies (mathematically correct)
- **Properties**: Achieves batch size invariance; separately corrects Drift 1 (rollout→old) and Drift 2 (old→current)
**Bypass Mode (Two Policies)**
- Sets $\pi_{\text{old}} = \pi_{\text{rollout}}$ (skips separate computation)
- **Algorithm**: Uses $\pi_{\text{rollout}}$ as both behavior policy and proximal policy (mathematically correct)
- **Key difference**: Proximal policy equals behavior policy, so no IS correction needed between them
- **Properties**: Faster (skips `actor.compute_log_prob()` call); does not achieve batch size invariance
### 2.3 Two Distribution Shifts
The three-policy framework handles two types of distribution drift:
**Drift 1: $\pi_{\text{rollout}} \to \pi_{\text{old}}$ (Off-Policy Gap)**
This is the distribution shift between the data collection policy and the training reference policy.
- **Nature**: Ranges from negligible (same checkpoint, minor differences) to severe (replay buffers, expert data)
- **Correction**: Importance sampling weight $w_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$
- **Optional**: Can be ignored (bypass mode) when negligible
**Drift 2: $\pi_{\text{old}} \to \pi_{\theta}$ (Policy Update Drift)**
This is the drift from policy parameter updates during training.
- **Nature**: Occurs as $\pi_\theta$ is updated via gradient descent
- **Correction**: PPO clipping on ratio $r_t(\theta) = \frac{\pi_\theta(a_t|s_t)}{\pi_{\text{old}}(a_t|s_t)}$
- **Universal**: Applies to both on-policy and off-policy training
### 2.4 Notation Summary
- $\pi_{\text{rollout}}$: Behavior policy (data collection)
- $\pi_{\text{old}}$: Proximal policy (PPO anchor)
- $\pi_{\theta}$: Current policy (being updated)
- $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$: Per-token IS ratio (corrects Drift 1)
- $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{old}}(a_t|s_t)}$: PPO ratio (corrects Drift 2)
- $A_t$: Advantage at token $t$
- $T$: Set of valid tokens in a sequence
- $C_{\text{IS}}$: Upper threshold for IS weights (e.g., 2.0)
- $C_{\text{RS-upper}}$: Upper threshold for RS mask (e.g., 2.0)
- $C_{\text{RS-lower}}$: Lower threshold for RS mask (typically $1/C_{\text{RS-upper}}$)
- $\epsilon$: PPO clip range (typically 0.2)
---
## 3. Algorithmic Components and Combinations
The rollout correction framework in `verl` is built from **orthogonal components** that can be combined flexibly:
1. **Operating Mode**: How $\pi_{\text{old}}$ is computed (Decoupled vs Bypass)
2. **Loss Function**: PPO (with clipping) vs Pure IS (policy gradient only)
3. **IS/RS Aggregation Level**: Token, Sequence, or Geometric
This section explains each component and their valid combinations.
### 3.1 Operating Modes: Decoupled vs Bypass
The operating mode determines how the proximal policy $\pi_{\text{old}}$ is computed.
#### 3.1.1 Decoupled Mode (Three Policies)
**Configuration:** `bypass_mode = false`
**Policy setup:**
- $\pi_{\text{rollout}}$: Behavior policy (data collection)
- $\pi_{\text{old}}$: Proximal policy (computed via `actor.compute_log_prob()` at start of training epoch)
- $\pi_{\theta}$: Current policy (being updated)
**IS ratio:** $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ (corrects Drift 1: rollout→old)
**PPO ratio:** $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{old}}(a_t|s_t)}$ (corrects Drift 2: old→current)
**Properties:**
- ✅ Achieves batch size invariance
- ✅ Separately corrects two distribution drifts
- ✅ Efficient stale data utilization
- ❌ Extra forward pass needed (`actor.compute_log_prob()`)
#### 3.1.2 Bypass Mode (Two Policies)
**Configuration:** `bypass_mode = true`
**Policy setup:**
- $\pi_{\text{rollout}}$: Behavior policy (data collection)
- $\pi_{\text{old}} = \pi_{\text{rollout}}$: Proximal policy equals behavior policy
- $\pi_{\theta}$: Current policy (being updated)
**Ratios:**
- **With PPO-clip loss** (`loss_type = "ppo_clip"`, default): PPO ratio $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ clips against rollout policy (IS handled by ratio)
- **With REINFORCE loss** (`loss_type = "reinforce"`): IS ratio $\rho_t = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ computed on-the-fly in loss function
**Properties:**
- ✅ Skips `actor.compute_log_prob()` call (faster)
- ✅ Handles off-policy correction via IS/RS (when using policy gradient with IS/RS)
- ✅ Uses two policies instead of three (π_rollout = π_old)
- ⚠️ Does not separate proximal policy from behavior policy (unlike decoupled mode)
---
### 3.2 Loss Functions: PPO vs Policy Gradient
#### 3.2.1 PPO Loss (with Clipping)
**Configuration:** `loss_type = "ppo_clip"` (default in bypass mode)
**Loss function:**
$$
L_{\text{PPO}}(\theta) = -\mathbb{E}_t \left[ w_t \cdot \min\left( r_t(\theta) A_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) A_t \right) \right]
$$
where:
- $w_t$: IS weight (depends on aggregation level, see Section 3.3). In decoupled mode, $w_t = \frac{\pi_{\text{old}}}{\pi_{\text{rollout}}}$ where $\pi_{\text{old}}$ is frozen, so $w_t$ is constant (no stopgrad needed). In bypass mode with PPO loss, no separate IS weights are typically computed.
- $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{old}}(a_t|s_t)}$: PPO ratio
- $\epsilon$: Clip range (typically 0.2)
**Properties:**
- Trust region control via clipping
- Limits policy update magnitude
- Standard in RL training
#### 3.2.2 Policy Gradient Loss (with IS/RS Correction)
**Configuration:** `loss_type = "reinforce"` (requires `bypass_mode = true`)
**Loss function** (example with sequence-level IS):
$$
L_{\text{PG}}(\theta) = -\mathbb{E}_{(s,a) \sim \pi_{\text{rollout}}} \left[ \text{stopgrad}(w_{\text{seq}}(\theta)) \cdot \sum_{t \in T} \log \pi_{\theta}(a_t|s_t) \cdot A_t \right]
$$
where:
- $w_{\text{seq}}(\theta)$: Sample weight (IS or RS, see §3.3-3.4 for details)
- For IS: $w_{\text{seq}}(\theta) = \min\left( \prod_{t \in T} \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}, C_{\text{IS}} \right)$
- For RS: $w_{\text{seq}}(\theta) \in \{0, 1\}$ (binary rejection mask)
- **stopgrad operator**: The weight $w_{\text{seq}}(\theta)$ is computed using $\pi_\theta$ but treated as a **constant coefficient** when computing $\nabla_\theta L$. This is essential for importance sampling correctness (see theoretical justification below).
**Effective gradient:**
$$
\nabla_\theta L_{\text{PG}} = -\mathbb{E}_{(s,a) \sim \pi_{\text{rollout}}} \left[ \text{stopgrad}(w_{\text{seq}}(\theta)) \cdot \sum_{t \in T} \nabla_\theta \log \pi_{\theta}(a_t|s_t) \cdot A_t \right]
$$
**Theoretical Justification for stopgrad:**
The stopgrad operator is **mathematically required** by importance sampling theory, not an implementation detail. Here's why:
**The fundamental principle**: Importance sampling is a technique to **change the measure** (reweight samples from one distribution to estimate expectations under another), not to optimize the reweighting function itself.
**Formal derivation**:
1. **Original objective**: We want to optimize $J(\theta) = \mathbb{E}_{\tau \sim \pi_\theta}[\sum_t A_t]$.
2. **Off-policy setting**: We only have samples from $\pi_{\text{rollout}}$, so we use importance sampling:
$$
J(\theta) = \mathbb{E}_{\tau \sim \pi_{\text{rollout}}} \left[ \underbrace{\frac{P_{\pi_\theta}(\tau)}{P_{\pi_{\text{rollout}}}(\tau)}}_{w(\tau;\theta)} \sum_t A_t \right]
$$
3. **Computing the policy gradient**: The correct gradient uses the **policy gradient theorem BEFORE importance sampling**:
$$
\begin{aligned}
\nabla_\theta J(\theta) &= \nabla_\theta \mathbb{E}_{\tau \sim \pi_\theta}\left[\sum_t A_t\right] \\
&= \mathbb{E}_{\tau \sim \pi_\theta} \left[\sum_t A_t \nabla_\theta \log \pi_\theta(a_t|s_t) \right] \quad \text{(policy gradient theorem)} \\
&= \mathbb{E}_{\tau \sim \pi_{\text{rollout}}} \left[ w(\tau;\theta) \sum_t A_t \nabla_\theta \log \pi_\theta(a_t|s_t) \right] \quad \text{(change of measure)}
\end{aligned}
$$
In the final line, $w(\tau;\theta)$ appears as a **multiplicative coefficient** from the change of measure, not as something we differentiate.
4. **What goes wrong without stopgrad**: If we naively compute $\nabla_\theta \left[w(\theta) \log \pi_\theta \right]$ in the loss, we get:
$$
\nabla_\theta \left[w(\theta) \log \pi_\theta \right] = \underbrace{\log \pi_\theta \cdot \nabla_\theta w(\theta)}_{\text{WRONG: bias term}} + \underbrace{w(\theta) \cdot \nabla_\theta \log \pi_\theta}_{\text{CORRECT: IS-weighted gradient}}
$$
The first term $\log \pi_\theta \cdot \nabla_\theta w(\theta)$ is an artifact of differentiating through the weight in the surrogate loss (weight times log-prob), not part of the true policy gradient. It biases the gradient estimator and optimizes a different objective than $J(\theta)$.
5. **Implementation requirement**: In PyTorch, to compute only the second term, we must use:
```python
loss = -advantages * log_prob * rollout_is_weights.detach() # stopgrad on weights
```
Without `.detach()`, autograd computes both terms, giving an incorrect gradient.
**Intuition**: The IS weight $w(\theta)$ tells us "how much to trust this sample" for estimating the gradient under $\pi_\theta$. We update $\theta$ to maximize the reweighted objective, but we don't update $\theta$ to maximize the weight itself—that would be circular reasoning (optimizing the correction factor instead of the actual objective).
**Properties:**
- **Algorithm**: Off-policy policy gradient with IS/RS correction
- **Loss types** (`loss_type` config option in bypass mode):
- `"ppo_clip"` (default): PPO clipped objective
- $L = -\mathbb{E}[\min(r \cdot A, \text{clip}(r) \cdot A)]$ where $r = \pi_\theta / \pi_{\text{rollout}}$
- Note: IS weights NOT applied (PPO ratio already handles it; would be double-counting)
- `"reinforce"`: Pure policy gradient with explicit IS weights, no PPO clipping
- $L = -\mathbb{E}[w \cdot \log \pi_\theta(a|s) \cdot A]$ where $w = \pi_\theta / \pi_{\text{rollout}}$
- **Always uses bypass mode**: Direct $\pi_\theta$ to $\pi_{\text{rollout}}$ comparison
- **Fast**: Single forward pass
**Implementation:** `compute_policy_loss_bypass_mode()` and `compute_policy_loss_reinforce()` in [core_algos.py](../../verl/trainer/ppo/core_algos.py)
---
### 3.3 IS/RS Aggregation Levels
The aggregation level determines how per-token probability ratios are combined into IS weights and/or rejection masks. This choice is **orthogonal to the operating mode** - you can use any aggregation level in either decoupled or bypass mode.
#### 3.3.1 Token-Level Aggregation
**IS weights:** $w_t = \min(\rho_t, C_{\text{IS}})$ where $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ (decoupled) or $\rho_t = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ (bypass/pure IS)
**Configuration:**
```python
rollout_is = "token" # IS weights
rollout_rs = "token_k1" # Optional: rejection sampling (ratio bounds)
```
**Properties:**
- Independent truncation per token
- Lower variance than sequence-level (product of ratios bounded individually)
- **Bias-variance tradeoff**: Token-level correction has $O(T^2 \Delta_{\max})$ bias where $T$ is sequence length and $\Delta_{\max}$ is maximum per-token policy divergence. This bias becomes significant when the rollout policy deviates substantially from the training policy. Sequence-level correction is unbiased but has higher variance.
- Typical threshold: 1.5 - 5.0
- Optional batch normalization [§3.4](rollout_corr_math.md#34-batch-normalization): Normalizes over all token weights to ensure $\mathbb{E}[\tilde{w}_t] = 1$ (reduces variance)
- **When to use**: Token-level works well when rollout policy stays within the trust region of training policy. When mismatch is significant, the bias becomes intolerable and sequence-level correction is preferred.
**Loss function (REINFORCE + Token IS):**
$$
L_{\text{REINFORCE+TIS}}(\theta) = -\mathbb{E}_t \left[ \text{stopgrad}(w_t) \cdot \log \pi_\theta(a_t|s_t) \cdot A_t \right]
$$
where $w_t = \min(\rho_t, C_{\text{IS}})$ are the truncated token-level IS weights. The stopgrad operator ensures that when computing $\nabla_\theta L$, the weights are treated as constants (see §3.2.2 for theoretical justification). This formulation can also be combined with PPO clipping by replacing the REINFORCE gradient with the clipped surrogate objective.
**Implementation:**
- IS weights: `compute_rollout_correction_weights()` in [rollout_corr_helper.py](../../verl/trainer/ppo/rollout_corr_helper.py#L325-L402)
- Loss: `compute_policy_loss()` in [core_algos.py](../../verl/trainer/ppo/core_algos.py#L812-L884)
#### 3.3.2 Sequence-Level Aggregation
**IS weights:** $w_{\text{seq}} = \min\left( \prod_{t \in T} \rho_t, C_{\text{IS}} \right) = \min\left( \exp\left(\sum_{t \in T} \log \rho_t\right), C_{\text{IS}} \right)$ (broadcast to all tokens)
**Configuration:**
```python
rollout_is = "sequence" # IS weights
rollout_rs = "seq_sum_k1" # Optional: rejection sampling
```
**Properties:**
- Multiplicative aggregation across sequence
- More sensitive to outliers than token-level
- Typical threshold: 2.0 - 10.0
- Optional batch normalization [§3.4](rollout_corr_math.md#34-batch-normalization): Normalizes over sequence means (one weight per sequence)
**Terminology Note:**
- **Seq-TIS (Sequence-Level Truncated IS)**: Clips the sequence ratio $\rho(\tau) \to \min(\rho(\tau), C)$. Maximizes information efficiency by extracting signal from all samples. Best for clean data with moderate mismatch.
- **Seq-MIS (Sequence-Level Masked IS)**: Rejects (masks) sequences with $\rho(\tau) > C$ instead of clipping. Acts as a hard trust region filter. Best for severe mismatch or when the distribution tail is "toxic" (contains garbage/adversarial samples rather than signal).
**Loss function (REINFORCE + Sequence IS):**
$$
L_{\text{REINFORCE+SeqIS}}(\theta) = -\mathbb{E}_t \left[ \text{stopgrad}(w_{\text{seq}}) \cdot \log \pi_\theta(a_t|s_t) \cdot A_t \right]
$$
where $w_{\text{seq}}$ is broadcast to all tokens in the sequence. The stopgrad operator ensures correct IS gradient computation (see §3.2.2). This formulation can also be combined with PPO clipping.
#### 3.3.3 Geometric Mean Aggregation (Geo-RS)
**Geometric mean ratio:** $\rho_{\text{geo}} = \exp\left( \frac{1}{|T|} \sum_{t \in T} \log \rho_t \right) = \left(\prod_{t \in T} \rho_t\right)^{1/|T|}$ (broadcast to all tokens)
**Configuration:**
```python
rollout_is = null # No IS weights, pure rejection
rollout_rs = "seq_mean_k1" # Geometric mean rejection sampling (ratio bounds)
```
**Properties:**
- Length-invariant (normalizes by sequence length)
- Ideal ratio = 1.0 (policies match)
- Typical bounds: `"0.999_1.001"` (~±0.1%)
- **Used for rejection sampling only, not IS weighting**
**The Length Trap Problem:**
Standard IS estimators have a systematic **length bias** that penalizes long sequences. The importance ratio $\rho(y)$ is multiplicative:
$$
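\rho(y) = \prod_{t=1}^T \frac{\pi(y_t|y_{<t})}{\mu(y_t|y_{<t})}
$$
Because $\rho(y)$ is a product over $T$ tokens, even a small per-token mismatch compounds exponentially with sequence length, so long sequences hit the threshold far more often than short ones. The geometric mean ratio $\rho_{\text{geo}}$ removes this length dependence by taking the $|T|$-th root of the product.
#### 3.3.4 K3 Divergence Aggregation (K3-RS)
**K3 divergence (sequence mean):** $K3_{\text{seq}} = \frac{1}{|T|} \sum_{t \in T} \left( \rho_t - \log \rho_t - 1 \right)$
**Configuration:**
```python
rollout_is = null  # No IS weights, pure rejection
rollout_rs = "seq_mean_k3"  # Sequence mean of K3 divergence
```
**Properties:**
- K3 divergence is >= 0 per token (equals 0 when ρ = 1)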
- More stable than geometric ratio checks because each token term is non-negative
- Only upper threshold applies (no lower threshold since K3 >= 0)
- Typical threshold: 0.001 - 0.01
**Why K3 over geometric ratio?**
- Geometric ratio uses average log-ratio; small numerical bias can flip sign
- K3 = E[ρ - log ρ - 1] is non-negative per token, offering a smoother detector
- Both estimate the same quantity: KL(π_rollout || π_old)
- For small divergences, K3 ≈ 0.5 × Var(log_ratio)
**Combined Estimator (K3-RS-Token-TIS):**
For best results, combine K3 filter with token-level IS weights:
$$
\hat{g}_{\text{k3-rs-token-tis}}(y) = \underbrace{\mathbb{I}\left( K3_{\text{seq}} \le C_{\text{k3}} \right)}_{\text{K3 Filter}} \cdot \prod_t \min(\rho_t, C) \cdot f(y)
$$
This is implemented by combining `rollout_rs="seq_mean_k3"` with `rollout_is="token"`.
---
### 3.4 Batch Normalization
An optional variance reduction technique that normalizes IS weights to have mean 1.0 within each batch.
**Configuration:**
```python
rollout_is_batch_normalize = True # Default: False
```
**Normalization formula (aggregation-aware):**
For **token-level IS** (§3.3.1):
$$
\tilde{w}_t = \frac{w_t}{\frac{1}{\sum_{i,t} m_{i,t}} \sum_{i,t} w_{i,t} \cdot m_{i,t}}
$$
where $w_{i,t}$ are truncated token IS weights, $m_{i,t}$ is the response mask, and normalization is over **all tokens**.
For **sequence-level IS** (§3.3.2):
$$
\tilde{w}_i = \frac{w_i}{\frac{1}{B}\sum_{j=1}^B \bar{w}_j}
$$
where $\bar{w}_j = \frac{1}{T_j}\sum_{t=1}^{T_j} w_{j,t} \cdot m_{j,t}$ is the per-sequence mean (all tokens in a sequence have the same weight), and normalization is over **sequences**.
**Properties:**
- Applied **after** truncation to preserve truncation semantics
- Ensures $\mathbb{E}[\tilde{w}] = 1$ within each batch
- **Aggregation-aware**: Token-level normalizes over tokens; sequence-level normalizes over sequences
- Uses `masked_mean` to respect padding tokens
- Reduces gradient magnitude variance by removing random batch-level scale fluctuations
**Metrics:**
- `rollout_is_batch_norm_factor`: The normalization factor applied (batch mean before normalization)
**Implementation:** [rollout_corr_helper.py](../../verl/trainer/ppo/rollout_corr_helper.py#L401-L421)
---
### 3.5 Rejection Sampling (RS)
Rejection sampling can be added to **any combination** of operating mode and aggregation level. It modifies the `response_mask` to exclude outlier tokens/sequences.
**Configuration examples:**
```python
rollout_rs = "token_k1" # Token-level ratio bounds
rollout_rs_threshold = "0.6_1.6"
rollout_rs = "seq_sum_k1" # Sequence sum of log ratios
rollout_rs_threshold = "0.5_2.0"
rollout_rs = "seq_mean_k3" # Sequence mean of K3 divergence
rollout_rs_threshold = 0.01
```
**Acceptance set:**
- **Token-level**: $\mathcal{A}_{\text{token}} = \{ t : C_{\text{RS-lower}} \leq \rho_t \leq C_{\text{RS-upper}} \}$
- **Sequence-level**: $\mathcal{A}_{\text{seq}} = \{ \text{seq} : C_{\text{RS-lower}} \leq \prod_{t \in T} \rho_t \leq C_{\text{RS-upper}} \}$
- **Geometric**: $\mathcal{A}_{\text{geo}} = \{ \text{seq} : C_{\text{RS-lower}} \leq \rho_{\text{geo}} \leq C_{\text{RS-upper}} \}$
**Properties:**
- Separate from IS weighting (can use RS without IS)
- Reduces effective sample size
- Filters extreme outliers
**Implementation:** `compute_rollout_rejection_mask()` in [rollout_corr_helper.py](../../verl/trainer/ppo/rollout_corr_helper.py#L80-L188)
---
### 3.6 Combination Matrix
**Key insight:** Estimators (how IS/RS is computed) and operating modes (decoupled PPO vs bypass PG) are **orthogonal**. Any estimator can be combined with any operating mode.
#### Estimator × Operating Mode
| Estimator | Configuration | Compatible Modes |
|-----------|---------------|------------------|
| **Token-TIS** | `rollout_is="token"` | Decoupled PPO, Bypass PG |
| **Seq-TIS** | `rollout_is="sequence"` | Decoupled PPO, Bypass PG |
| **Seq-MIS** | `rollout_is="sequence"` + `rollout_rs="seq_sum_k1"` | Decoupled PPO, Bypass PG |
| **Geo-RS** | `rollout_rs="seq_mean_k1"` (geometric mean) | Decoupled PPO, Bypass PG |
| **Geo-RS-Token-TIS** | `rollout_is="token"` + `rollout_rs="seq_mean_k1"` | Decoupled PPO, Bypass PG |
| **K3-RS** | `rollout_rs="seq_mean_k3"` | Decoupled PPO, Bypass PG |
| **K3-RS-Token-TIS** | `rollout_is="token"` + `rollout_rs="seq_mean_k3"` | Decoupled PPO, Bypass PG |
**Note:** In bypass mode, `loss_type` controls the loss function. Use "ppo_clip" (default) or "reinforce".
#### Available Preset Methods
| Preset Method | Estimator | Mode | Properties |
|---------------|-----------|------|------------|
| **Decoupled PPO Mode** (3 policies: π_rollout, π_old, π_θ) |
| `decoupled_token_is()` | Token-TIS | Decoupled PPO | Per-token IS weights |
| `decoupled_seq_is()` | Seq-TIS | Decoupled PPO | Sequence-level IS weights |
| `decoupled_seq_is_rs()` | Seq-MIS | Decoupled PPO | Sequence IS + sequence RS |
| `decoupled_geo_rs()` | Geo-RS | Decoupled PPO | Geometric RS |
| `decoupled_geo_rs_token_tis()` | Geo-RS-Token-TIS | Decoupled PPO | Geometric filter + token IS |
| **K3 KL Estimator** (more stable for small KL values) |
| `decoupled_k3_rs()` | K3-RS | Decoupled PPO | K3 rejection, no IS weights |
| `decoupled_k3_rs_token_tis()` | K3-RS-Token-TIS | Decoupled PPO | K3 filter + token clipped weight |
| **Bypass Mode (PPO-clip)** (ratio handles IS, RS masks outliers) |
| `bypass_ppo_clip()` | - | Bypass (PPO-clip) | PPO-clip only |
| `bypass_ppo_clip_geo_rs()` | Geo-RS | Bypass (PPO-clip) | PPO-clip + Geo-RS (ratio) |
| `bypass_ppo_clip_k3_rs()` | K3-RS | Bypass (PPO-clip) | PPO-clip + K3-RS |
| **Bypass Mode (REINFORCE)** (explicit IS weights, no PPO clipping) |
| `bypass_pg_is()` | Seq-TIS | Bypass (REINFORCE) | REINFORCE + Seq IS |
| `bypass_pg_geo_rs()` | Geo-RS | Bypass (REINFORCE) | REINFORCE + Geo-RS (ratio) |
| `bypass_pg_geo_rs_token_tis()` | Geo-RS-Token-TIS | Bypass (REINFORCE) | REINFORCE + Geo filter + token IS |
| **Other** |
| `disabled()` | - | - | Metrics only |
**Note:** Bypass mode sets π_old = π_rollout and uses `loss_type` to select the loss function.
#### Additional Supported Combinations (Manual Configuration)
These combinations are **fully supported** but require manual configuration:
**1. Token IS + Token RS**
```python
config = RolloutCorrectionConfig(
rollout_is="token",
rollout_is_threshold=2.0,
rollout_rs="token_k1",
rollout_rs_threshold="0.5_2.0",
)
```
**Properties:** Token-level IS weights + token-level RS mask.
**2. Pure Token RS**
```python
config = RolloutCorrectionConfig(
rollout_is=None,
rollout_rs="token_k1",
rollout_rs_threshold="0.5_2.0",
)
```
**Properties:** Token-level RS mask only, no IS weights.
**3. Pure Sequence RS**
```python
config = RolloutCorrectionConfig(
rollout_is=None,
rollout_rs="seq_sum_k1",
rollout_rs_threshold="0.5_2.0",
)
```
**Properties:** Sequence-level RS mask only, no IS weights.
**Key properties:**
- Any IS aggregation level (token/sequence) can be used in either decoupled or bypass mode
- Rejection sampling can be added to any combination
- Geometric aggregation is typically used for RS only (not IS weighting)
- Pure RS (`bypass_pg_geo_rs`) uses bypass + geometric RS with `loss_type="reinforce"` for REINFORCE (no IS weights)
- All combinations in the table above are valid and supported by the implementation
---
### 3.7 Common Implementation Mistake
#### Incorrect LLM-RL Implementation (PPO Without Rollout Correction)
**Theory:** Naive LLM-RL implementation that incorrectly applies PPO by **ignoring the actual rollout policy** and assuming $\pi_{\text{old}} = \pi_{\text{rollout}}$.
**Note:** This incorrect implementation pattern was identified in [Liu, Li, et al. (2025)](https://richardli.xyz/rl-collapse) as a key cause of training instability in LLM-RL systems, motivating the development of this rollout correction framework.
**Loss Function:**
$$
L_{\text{PPO}}(\theta) = -\mathbb{E}_t \left[ \min\left( r_t(\theta) A_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) A_t \right) \right]
$$
where $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{old}}(a_t|s_t)}$ (ignores $\pi_{\text{rollout}}$).
**Why it's wrong:**
- **Ignores $\pi_{\text{rollout}}$**: Uses $\pi_{\text{old}}$ as behavior policy instead of actual $\pi_{\text{rollout}}$
- **Policy mismatch**: In LLM-RL, rollout typically uses different precision/backend/checkpoint than training, causing $\pi_{\text{rollout}} \neq \pi_{\text{old}}$ even with same model weights
- **Not PPO's fault**: PPO itself is correct; the issue is the incorrect assumption
**Correct alternatives:**
1. **Decoupled mode**: Three policies with IS correction from $\pi_{\text{rollout}}$ to $\pi_{\text{old}}$
2. **Bypass mode**: Two policies using $\pi_{\text{rollout}}$ as both behavior policy and proximal policy
3. **Bypass + Policy Gradient mode**: Two policies with IS/RS correction and no PPO clipping
**Implementation:** `compute_policy_loss()` in [core_algos.py](../../verl/trainer/ppo/core_algos.py#L812-L884)
---
## 4. Off-Policy Diagnostic Metrics
These metrics quantify the severity of off-policy drift.
**Note on notation:** Metrics use $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$. In bypass mode, $\pi_{\text{old}} = \pi_{\text{rollout}}$, so metrics measure rollout→current drift using $\rho_t = \frac{\pi_{\theta}}{\pi_{\text{rollout}}}$ instead.
### 4.1 KL Divergence
**Direct KL estimator:**
$$
\text{KL}(\pi_{\text{rollout}} \| \pi_{\text{old}}) = \mathbb{E}_{t \sim \pi_{\text{rollout}}} \left[ \log \pi_{\text{rollout}}(a_t|s_t) - \log \pi_{\text{old}}(a_t|s_t) \right]
$$
**K3 KL estimator** (alternative formulation):
$$
\text{KL}_{\text{K3}} = \mathbb{E}_{t \sim \pi_{\text{rollout}}} \left[ \rho_t - \log \rho_t - 1 \right]
$$
where $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$.
### 4.2 Perplexity
**Old policy perplexity:**
$$
\text{PPL}_{\text{old}} = \exp\left( -\frac{1}{|T|} \sum_{t \in T} \log \pi_{\text{old}}(a_t|s_t) \right)
$$
**Rollout policy perplexity:**
$$
\text{PPL}_{\text{rollout}} = \exp\left( -\frac{1}{|T|} \sum_{t \in T} \log \pi_{\text{rollout}}(a_t|s_t) \right)
$$
**PPL ratio** (inverse of geometric mean IS weight):
$$
\text{PPL}_{\text{ratio}} = \frac{\text{PPL}_{\text{old}}}{\text{PPL}_{\text{rollout}}} = \exp\left( -\frac{1}{|T|} \sum_{t \in T} \log \rho_t \right) = \left(\prod_{t \in T} \rho_t\right)^{-1/|T|}
$$
**Interpretation:** Values > 1 mean $\pi_{\text{old}}$ assigns lower probability than $\pi_{\text{rollout}}$ to the observed actions (distribution shift).
### 4.3 Chi-squared Divergence
Measures the second moment of the IS weight distribution.
**Token-level:**
$$
\chi^2_{\text{token}} = \mathbb{E}_{t \sim \pi_{\text{rollout}}} \left[ \rho_t^2 \right] - 1
$$
**Sequence-level:**
$$
\chi^2_{\text{seq}} = \mathbb{E}_{\text{seq} \sim \pi_{\text{rollout}}} \left[ \left(\prod_{t \in T} \rho_t\right)^2 \right] - 1
$$
**Interpretation:**
- $\chi^2 = 0$: Policies are identical
- $\chi^2 > 0$: Higher values indicate more severe off-policy distribution shift
**Implementation:** `compute_offpolicy_metrics()` in [rollout_corr_helper.py](../../verl/trainer/ppo/rollout_corr_helper.py#L670-L776)
---
## 5. Summary and Decision Guide
### 5.1 Method Summary Table
| Method | Theory | Policies | PPO Clip | IS Correction | Correctness | Speed |
|--------|--------|----------|----------|---------------|-------------|-------|
| **Bypass Mode** (π_old = π_rollout, `loss_type` selects algorithm) |
| `loss_type="ppo_clip"` (default) | PPO (ratio = π_θ/π_rollout) | 2 (rollout, θ) | ✅ | RS mask only (ratio handles IS) | ✅ Correct | **Fast** |
| `loss_type="reinforce"` | Off-policy REINFORCE | 2 (rollout, θ) | ❌ | ✅ (explicit IS weights) | ✅ Correct | **Fast** |
| **Bypass Mode Presets (PPO-clip)** |
| `bypass_ppo_clip` | PPO only | 2 (rollout, θ) | ✅ | - | ✅ Correct | **Fast** |
| `bypass_ppo_clip_geo_rs` | PPO + Geo-RS | 2 (rollout, θ) | ✅ | Geo-RS mask (ratio) | ✅ Correct | **Fast** |
| **Bypass Mode Presets (REINFORCE)** |
| `bypass_pg_is` | REINFORCE + Seq-TIS | 2 (rollout, θ) | ❌ | ✅ Seq-TIS | ✅ Correct | **Fast** |
| `bypass_pg_geo_rs` | REINFORCE + Geo-RS | 2 (rollout, θ) | ❌ | Geo-RS only (ratio) | ✅ Correct | **Fast** |
| `bypass_pg_geo_rs_token_tis` | REINFORCE + Geo RS + Token IS | 2 (rollout, θ) | ❌ | ✅ Geo-RS-Token-TIS | ✅ Correct | **Fast** |
| **Decoupled PPO Mode** (IS weights = π_old / π_rollout) |
| `decoupled_token_is` | Decoupled PPO | 3 (rollout, old, θ) | ✅ | ✅ Token-TIS | ✅ Correct | Standard |
| `decoupled_seq_is` | Decoupled PPO | 3 (rollout, old, θ) | ✅ | ✅ Seq-TIS | ✅ Correct | Standard |
| `decoupled_seq_is_rs` | Decoupled PPO + RS | 3 (rollout, old, θ) | ✅ | ✅ Seq-MIS | ✅ Correct | Standard |
| `decoupled_geo_rs` | Decoupled PPO + Geo-RS | 3 (rollout, old, θ) | ✅ | Geo-RS only (ratio) | ✅ Correct | Standard |
| `decoupled_geo_rs_token_tis` | Decoupled PPO + Geo RS + Token IS | 3 (rollout, old, θ) | ✅ | ✅ Geo-RS-Token-TIS | ✅ Correct | Standard |
| **Incorrect (for reference)** |
| Naive LLM-RL | Incorrect PPO usage | 2 (old, θ) | ✅ | ❌ | ⚠️ Incorrect | Standard |
**Notes:**
- **Bypass mode** sets π_old = π_rollout and uses `loss_type` to select the loss function:
- `"ppo_clip"` (default): PPO clipped ratio (IS handled by ratio = π_θ/π_rollout, no explicit IS weights to avoid double-counting)
- `"reinforce"`: Explicit IS weights applied as $w \cdot \log \pi \cdot A$
- Both loss types benefit from rejection sampling (RS) which masks out-of-distribution samples
### 5.2 Estimator Hierarchy
These estimators define **how IS weights and rejection masks are computed**. They are orthogonal to the operating mode (decoupled PPO vs bypass policy gradient) and can be combined with either.
| Estimator | Configuration | Mechanism | Best For |
|-----------|---------------|-----------|----------|
| **Token-TIS** | `rollout_is="token"` | Clips per-token ratios | Lower variance IS with acceptable bias |
| **Seq-TIS** | `rollout_is="sequence"` | Clips sequence ratio $\rho(\tau) \to \min(\rho(\tau), C)$ | Clean data with moderate mismatch; unbiased |
| **Seq-MIS** | `rollout_is="sequence"` + `rollout_rs="seq_sum_k1"` | Rejects sequences with $\rho(\tau) > C$ | Severe mismatch; filters "toxic tail" (garbage data) |
| **Geo-RS** | `rollout_rs="seq_mean_k1"` | Rejects on geometric mean ratio exp(E[log(r)]) | Length-invariant trust region |
| **Geo-RS-Token-TIS** | `rollout_is="token"` + `rollout_rs="seq_mean_k1"` | Geometric filter + token IS weights | Ratio-based length normalization + lower variance IS |
| **K3-RS** | `rollout_rs="seq_mean_k3"` | Rejects on K3 KL divergence | Small KL values; smooth detector |
| **K3-RS-Token-TIS** | `rollout_is="token"` + `rollout_rs="seq_mean_k3"` | K3 filter + token IS weights | Small KL + lower variance IS |
**Note:** Each estimator can be used with either:
- **Decoupled PPO** (`bypass_mode=false`): Three policies with PPO clipping
- **Bypass Mode** (`bypass_mode=true`): Two policies with configurable loss type
- `loss_type="ppo_clip"` (default): PPO clipped objective (IS via ratio, RS mask applied)
- `loss_type="reinforce"`: REINFORCE with explicit IS weights
### 5.3 Method Characteristics by Scenario
**Choosing estimator by off-policy severity:**
- **Negligible** (same checkpoint, minor differences): No IS correction needed; use bypass mode for efficiency
- **Moderate** (async workers, slight staleness): Token-TIS provides per-token IS correction with lower variance
- **Severe** (replay buffers, old data): Seq-TIS or Seq-MIS provides sequence-level IS correction; use Seq-MIS when high-weight samples are likely garbage
**Choosing estimator by sequence length:**
- **Short sequences** (standard chat): Seq-TIS is optimal
- **Long sequences** (CoT, agents): Geo-RS or Geo-RS-Token-TIS (`rollout_rs="seq_mean_k1"`) to avoid the Length Trap
**Choosing operating mode:**
- **Batch size invariance needed**: Use decoupled mode (`bypass_mode=false`)
- **Computational efficiency needed**: Use bypass mode (`bypass_mode=true`) to skip `old_log_prob` computation
- **No PPO clipping**: Use bypass mode with `loss_type="reinforce"`
### 5.4 Decoupled Mode vs Bypass Mode
**Decoupled mode** (computes `old_log_prob` separately):
- Implements full decoupled PPO with three policies (mathematically correct)
- Separately measures and corrects Drift 1 (rollout→old) and Drift 2 (old→current)
- Achieves batch size invariance and efficient stale data utilization
- Enables accurate off-policy metrics monitoring
**Bypass mode** (sets $\pi_{\text{old}} = \pi_{\text{rollout}}$):
- Uses $\pi_{\text{rollout}}$ as both behavior policy and proximal policy (mathematically correct)
- Computational efficiency: Skips separate `old_log_prob` computation
- Does not achieve batch size invariance (proximal policy depends on data collection)
---
## 6. Implementation References
- **[Rollout Correction Usage Guide](rollout_corr.md)** - Practical configuration and troubleshooting
- **Config:** [verl/trainer/config/algorithm.py](../../verl/trainer/config/algorithm.py)
- **IS/RS Helper:** [verl/trainer/ppo/rollout_corr_helper.py](../../verl/trainer/ppo/rollout_corr_helper.py)
- **PPO Loss:** [verl/trainer/ppo/core_algos.py](../../verl/trainer/ppo/core_algos.py)
- **Tests:** [tests/trainer/ppo/test_rollout_corr.py](../../tests/trainer/ppo/test_rollout_corr.py)
---
## References
- **Williams, R. J. (1992).** "Simple statistical gradient-following algorithms for connectionist reinforcement learning." *Machine Learning*, 8(3-4), 229-256. https://doi.org/10.1007/BF00992696
- **Schulman, J., Wolski, F., Dhariwal, P., Radford, A., & Klimov, O. (2017).** "Proximal policy optimization algorithms." *arXiv preprint arXiv:1707.06347.* https://arxiv.org/abs/1707.06347
- **Hilton, J., Cobbe, K., & Schulman, J. (2021).** "Batch size-invariance for policy optimization." *arXiv preprint arXiv:2110.00641.* https://arxiv.org/abs/2110.00641
- Introduced decoupled PPO: separating proximal policy (for controlling policy update size) from behavior policy (for off-policy correction) to achieve batch size invariance
================================================
FILE: docs/algo/spin.md
================================================
# Recipe: Self-Play Fine-Tuning (SPIN)
Last updated: 05/31/2025.
`verl` provides a recipe inspired by the paper **"Self-Play Fine-Tuning Converts Weak Language Models to Strong Language Models"** (SPIN). SPIN is a language model finetuning algorithm that enables iterative self-improvement through a self-play mechanism inspired by game theory.
**Core Idea:** Models learn by playing against themselves, reducing reliance on external preference datasets or stronger teacher models:
1. **Synthetic Data Generation:** The current model generates responses, creating its own training data from previous iterations.
2. **Two-Player Game Setup:** A game involving two players acted by a single LLM.
3. **Iterative Training:** The model progressively improves by refining its policy, with each iteration's model becoming the opponent for the next iteration.
Paper Authors: [Zixiang Chen](https://github.com/uclaml/SPIN)\*, [Yihe Deng](https://github.com/uclaml/SPIN)\*, [Huizhuo Yuan](https://scholar.google.com/citations?user=8foZzX4AAAAJ)\*, [Kaixuan Ji](https://scholar.google.com/citations?user=FOoKDukAAAAJ), [Quanquan Gu](https://web.cs.ucla.edu/~qgu/)
[[Webpage](https://uclaml.github.io/SPIN/)] [[Huggingface](https://huggingface.co/papers/2401.01335)] [[Paper](https://arxiv.org/abs/2401.01335)] [[Original Implementation](https://github.com/uclaml/SPIN)]
verl Implementation Authors: [Chendong Wang](https://cdwang96.github.io/), [Chenyang Zhao](https://github.com/zhaochenyang20)
---
## Key Function (compute_online_dpo_loss) and Related works
SPIN (Chen et al., 2024) proposes an iterative self-play mechanism to fine-tune language models. In each iteration, SPIN's training objective, when using a logistic loss function, is equivalent to Direct Preference Optimization (DPO) loss (Rafailov et al., 2023).
This `verl` recipe realizes SPIN's core concept by using DPO loss iteratively (Xu et al., 2023; Xiong et al., 2023; Snorkel AI, 2024). This means that in each iteration, we fine-tune the LLM using DPO loss for preference optimization. Notably, Xu et al. (2023) explored iterative preference optimization with pairwise cringe loss, while Xiong et al. (2023) discussed how to bridge theory and practice for RLHF under KL constraints using iterative training. The concept of iterative preference learning was also explored in online DPO (Guo et al., 2024), which focuses on direct alignment from online AI feedback. In online DPO, preference data is dynamically updated during training, allowing the model to learn from its own generated data.
Specifically, we developed the **`compute_online_dpo_loss`** function and built this SPIN recipe on top of it. By incorporating online preference generation, this approach enables continuously refining language models without relying on fixed external preference datasets.
**Reference Papers:**
* [Self-Play Fine-Tuning Converts Weak Language Models to Strong Language Models](https://arxiv.org/abs/2401.01335) (Chen et al., 2024)
* [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://arxiv.org/abs/2305.18290) (Rafailov et al., 2023)
* [Some things are more cringe than others: Preference optimization with the pairwise cringe loss](https://arxiv.org/abs/2312.16682) (Xu et al., 2023)
* [Iterative preference learning from human feedback: Bridging theory and practice for rlhf under kl-constraint](https://arxiv.org/abs/2312.11456) (Xiong et al., 2023)
* [Snorkel-Mistral-PairRM-DPO](https://huggingface.co/snorkelai/Snorkel-Mistral-PairRM-DPO) (Snorkel AI, 2024)
* [Direct language model alignment from online ai feedback](https://arxiv.org/abs/2402.04792) (Guo et al., 2024)
## Our Online DPO Implementation
Our `compute_online_dpo_loss` function adapts `verl`'s existing PPO infrastructure (based on `verl` v0.3.0.post1) for this iterative online DPO. Key aspects of our implementation include:
* **No Critic:** Unlike PPO, we omit the value function critic.
* **Dynamic Reference Model:** An explicit reference policy (`ref_policy_wg`) is used for DPO loss. This reference model's weights can be periodically updated from the actor (`ref_update_freq`), providing a dynamic baseline.
* **Online Preference Generation:** The `compute_onlineDPO_pref` function (in `core_algos.py`) dynamically creates chosen/rejected pairs based on a reward source (e.g., rule-based ranking for math problems).
* **DPO Loss Integration:** We replace PPO's policy loss with our `compute_online_dpo_loss` (in `core_algos.py`) within the actor update (`dp_actor.py`), directly optimizing the policy using the generated preferences.
* **Iterative Training Orchestration:** The `SpinTrainer` (in `spin_trainer.py`) manages the entire self-play loop: generation, preference labeling, optional reference model updates, and policy updates, enabling continuous self-improvement aligned with SPIN's principles.
---
## Algorithm
This recipe implements an Online algorithm adapted to the `verl` Reinforcement Learning framework, which provides an alternative to PPO for fine-tuning language models.
**Online Loop:** Instead of maximizing a scalar reward signal in PPO, this approach directly optimizes the policy model to align with preference data generated *online* during training:
1. **Generation:** The current model generates multiple responses for each prompt in a batch.
2. **Preference Labeling:** A function evaluates these generated responses to determine which one is preferred (chosen) and which is dispreferred (rejected). This can be done using a reward function or implicit ranking based on specific rules. (In this recipe, we use rule-based ranking on the math problem).
3. **Update:** This preference tuple (`prompt`, `chosen_response`, `rejected_response`) is used to update the actor model using `compute_online_dpo_loss`, comparing against a reference model.
**Connection with SPIN:**
Instead of only using a fixed target data distribution, the online generation loop in step 2 will dynamically change the target data distribution by using a certain Preference Labeling method (rule-based ranking on the math problem by selecting the better one in this recipe). This explores the direction mentioned in SPIN's paper Section 7 about "dynamically changing target data distribution" to potentially elevate LLM performance beyond the fixed human-annotated data ceiling.
---
## Reproduce the Experiment (Example Setup)
The following steps outline how to set up the environment and run the SPIN recipe, based on the provided test log using GSM8K and Qwen2.5-3B-Instruct.
1. **Setup Environment (Example using Docker):**
```bash
# Start a container with GPU access and shared memory
docker run -it --name spin_test --gpus all \
--shm-size=32g \
--ipc=host \
-v /path/to/host/.cache:/root/.cache \
-e HF_TOKEN=<your_hf_token> \
lmsysorg/sglang:latest \
/bin/bash
# Inside the container or on your host machine:
# Ensure /tmp is writable
mkdir -p /tmp
chmod 1777 /tmp
# Install Python 3.10 (if not present) and venv
sudo apt update
sudo apt install -y python3.10 python3.10-venv tmux
python3 -m ensurepip --upgrade
# Create and activate a virtual environment
python3 -m venv ~/.python/spin_env
source ~/.python/spin_env/bin/activate
# Install uv (fast package installer)
python3 -m pip install uv
```
2. **Install verl and Dependencies:**
```bash
# Clone the verl repository
cd ~
git clone git@github.com:volcengine/verl.git && cd verl
# Install flash-attn (handle potential build issues)
python3 -m uv pip install wheel packaging
python3 -m uv pip install flash-attn --no-build-isolation --no-deps
# Install verl with sglang extras
python3 -m uv pip install -e ".[sglang]"
```
*Note: If `flash-attn` installation fails, try the manual steps again or consult its documentation.*
3. **Login & Download Data/Model:**
```bash
# Login to Weights & Biases (optional, for logging)
export WANDB_API_KEY=<your_wandb_api_key>
# wandb login
# Download the GSM8K dataset
python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k # Adjusted path
# Download the base model (Example: Qwen2.5-3B-Instruct)
hf download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct
```
4. **Configure:**
* Modify the configuration file (e.g., `config/spin_trainer.yaml` or the one specified in the run script) with correct paths to your downloaded model, data, desired hyperparameters (`dpo_beta`, learning rate, etc.), and distributed training settings (nodes, GPUs per node).
* Pay attention to `actor_rollout_ref.model`, `data` paths, `reward_model` config (if using one), and `trainer.ref_update_freq`.
5. **Run Training:**
```bash
# Set CUDA visible devices (adjust based on your hardware and config)
export CUDA_VISIBLE_DEVICES=0,1,2,3
# Launch the training script (e.g., run_spin.sh or a custom script)
# Ensure run_spin.sh points to the correct config and main script
bash recipe/spin/run_spin.sh
```
---
## Configuration
* The primary configuration is typically managed through a YAML file specified in the launch script (e.g., `config/spin_trainer.yaml`).
* Key configuration sections:
* `data`: Paths to training/validation prompt files, batch sizes, sequence lengths.
* `actor_rollout_ref`: Paths to the base model (used for actor and initial reference), FSDP settings, optimization parameters (learning rate, scheduler).
* `reward_model`: Configuration for the reward model used for online preference labeling (path, batch size, etc.). Can be omitted if using a simpler reward function.
* `algorithm`: DPO-specific hyperparameters like `dpo_beta`, `dpo_loss_type`.
* `trainer`: Distributed training settings (nodes, GPUs per node), logging (WandB), checkpointing frequency, and `ref_update_freq` (set > 0 to enable periodic reference model updates from the actor).
---
## Key Files
* `main_spin.py`: Main entry point using Hydra to load the config and launch the `SpinTrainer`.
* `spin_trainer.py`: Defines the `SpinTrainer` class, orchestrating the Online DPO training loop.
* `fsdp_workers.py`: Implements Ray workers (Actor, Reference) potentially using FSDP.
* `dp_actor.py`: Contains the actor class, including the DPO policy update logic.
* `core_algos.py`: Includes helper functions for `compute_online_dpo_loss` and `compute_onlineDPO_pref`.
* `config/spin_trainer.yaml` (or similar): Main Hydra configuration file for the recipe.
* `run_spin.sh` (or similar): Example bash script for launching a training run.
* `README.md`: This file.
---
## Acknowledgement
We sincerely thank the `verl` community and advisors for their contributions and guidance, including (adapted from SPPO):
* [Zixiang Chen](https://sites.google.com/view/zxchen)
* [Yuhao Yang](https://github.com/yhyang201)
* [Yifan Zhang](https://github.com/yifanzhang-pro)
* [Yongan Xiang](https://github.com/BearBiscuit05)
* [Junrong Lin](https://github.com/ocss884)
* [Yuxuan Tong](https://github.com/tongyx361)
* [Guangming Shen](https://github.com/PeterSH6)
* [Biao He](https://www.linkedin.com/in/biao-he/)
* [Qingquan Song](https://qingquansong.github.io/)
* [Chenyang Zhao](https://zhaochenyang20.github.io/Chayenne/)
* [Quanquan Gu](https://web.cs.ucla.edu/~qgu/)
================================================
FILE: docs/algo/sppo.md
================================================
# Recipe: Self-Play Preference Optimization (SPPO)
Last updated: 05/28/2025.
verl provides a community recipe implementation for the paper [Self-Play Preference Optimization for Language Model Alignment](https://arxiv.org/abs/2405.00675). SPPO can significantly enhance the performance of an LLM without strong external signals such as responses or preferences from GPT-4. It can outperform the model trained with iterative direct preference optimization (DPO), among other methods. SPPO is theoretically grounded, ensuring that the LLM can converge to the von Neumann winner (i.e., Nash equilibrium) under general, potentially intransitive preference, and empirically validated through extensive evaluations on multiple datasets.
Paper Authors: [Yue Wu](https://yuewu.us/)\*, [Zhiqing Sun](https://www.cs.cmu.edu/~zhiqings/)\*, [Huizhuo Yuan](https://scholar.google.com/citations?user=8foZzX4AAAAJ)\*, [Kaixuan Ji](https://scholar.google.com/citations?user=FOoKDukAAAAJ), [Yiming Yang](https://www.cs.cmu.edu/~yiming/), [Quanquan Gu](https://web.cs.ucla.edu/~qgu/)
verl Implementation Authors: [Yuhao Yang](https://github.com/yhyang201), [Chenyang Zhao](https://github.com/zhaochenyang20)
[[Webpage](https://uclaml.github.io/SPPO/)] [[Huggingface](https://huggingface.co/papers/2405.00675)] [[Paper](https://arxiv.org/abs/2405.00675)][[Original Implementation](https://github.com/uclaml/SPPO)]
## Reproduce the Experiment
We evaluate the performance of SPPO on the MATH dataset. Starting from an initial score of 46.6 with Qwen2.5-7B-Instruct, we achieve a score of 65.6 after 20 epochs of training, placing our model approximately in the top 20 on the [MATH leaderboard](https://paperswithcode.com/sota/math-word-problem-solving-on-math). It's important to note that verl's internal evaluation metrics may not perfectly align with the official evaluation methodology for Qwen2.5-7B-Instruct. Therefore, for consistency and fair comparison, we report only the results based on verl's evaluation framework.
```
git clone git@github.com:volcengine/verl.git
cd verl
python3 -m uv pip install -e ".[sglang]"
export WANDB_API_KEY=<your_wandb_api_key>
python3 examples/data_preprocess/math_dataset.py --local_dir ~/data/math
hf download Qwen/Qwen2.5-7B-Instruct --local-dir $HOME/models/Qwen2.5-7B-Instruct
export CUDA_VISIBLE_DEVICES=0,1,2,3
bash recipe/sppo/run_qwen2.5-7b_rm.sh
```
Note that the installation may occasionally fail to install flash-attn. If this happens, you can install it manually by running:
```bash
python3 -m uv pip install wheel
python3 -m uv pip install packaging
python3 -m uv pip install flash-attn --no-build-isolation --no-deps
```
## Acknowledgement
We sincerely thank the following people for their contributions and guidance:
- [Yue Wu](https://yuewu.us/)
- [Chendong Wang](https://cdwang96.github.io/)
- [Yifan Zhang](https://github.com/yifanzhang-pro)
- [Yongan Xiang](https://github.com/BearBiscuit05)
- [Junrong Lin](https://github.com/ocss884)
- [Yuxuan Tong](https://github.com/tongyx361)
- [Guangming Shen](https://github.com/PeterSH6)
- [Biao He](https://www.linkedin.com/in/biao-he/)
- [Qingquan Song](https://qingquansong.github.io/)
- [Quanquan Gu](https://web.cs.ucla.edu/~qgu/)
================================================
FILE: docs/amd_tutorial/amd_build_dockerfile_page.rst
================================================
Getting started with AMD (ROCM Kernel)
=====================================================
Last updated: 07/06/2025.
Author: Yusheng Su
Setup
-----
If you run on AMD GPUs (MI300) with the ROCm platform, you cannot use the previous quickstart to run verl. Instead, follow the steps below to build a Docker image, and set ``RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES`` or ``RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES`` when starting Ray for verl's RLHF training.
docker/Dockerfile.rocm
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
FROM "rlsys/rocm-6.3.4-patch:rocm6.3.4-numa-patch_ubuntu-22.04"
SHELL ["/bin/bash", "-ceuxo", "pipefail"]
ENV MAX_JOBS=512
ENV PATH="/usr/local/python3.12/bin:$PATH"
RUN ln -sf /usr/bin/python3.12 /usr/bin/python && \
ln -sf /usr/bin/pip3.12 /usr/bin/pip
############################################
RUN apt-get update
RUN apt-get install -y pkg-config liblzma-dev
############################################
###########################################
##########Install TransformerEngine########
###########################################
WORKDIR /workspace/
# transformer-engine install
# https://github.com/ROCm/TransformerEngine
RUN rm -rf TransformerEngine
RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git
WORKDIR /workspace/TransformerEngine
RUN git checkout 236178e5
# git checkout bb061ade
# git checkout 864405c
ENV NVTE_FRAMEWORK=pytorch
ENV NVTE_ROCM_ARCH=gfx942
ENV NVTE_USE_HIPBLASLT=1
ENV NVTE_USE_ROCM=1
# export CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr:${CMAKE_PREFIX_PATH:-}"
ENV CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr"
RUN MAX_JOBS=${MAX_JOBS} pip install . -vvv
WORKDIR /workspace/
###########################################
###########################################
###########################################
####################################################################################
################Install vllm - sglang require vllm 0.6.7 dependency#################
####################################################################################
#### Require vllm 0.6.7 - checkout 113274a0
WORKDIR /workspace/
RUN rm -rf vllm
RUN pip uninstall -y vllm
# Refer to here (down-grade vllm to 0.6.3): https://docs.vllm.ai/en/v0.6.3/getting_started/amd-installation.html
RUN git clone https://github.com/ROCm/vllm.git
# git clone https://github.com/vllm-project/vllm.git
WORKDIR /workspace/vllm
RUN git checkout 113274a0
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
#ENV MAX_JOBS=512
ENV MAX_JOBS=${MAX_JOBS}
RUN pip install "boto3>=1.26.0"
RUN pip install setuptools_scm
# will add src into py. You can delete the repo
RUN python3 setup.py install
WORKDIR /workspace/
####################################################################################
####################################################################################
####################################################################################
###########################################
############For hack docker################
###########################################
RUN pip install setuptools==75.8.0
###########################################
###########################################
###########################################
###########################################
############build sglang###################
###########################################
# Set environment variables
ENV BASE_DIR=/sgl-workspace
ENV BUILD_TYPE=all
ENV SGL_REPO=https://github.com/sgl-project/sglang
ENV SGL_BRANCH=v0.4.6.post5
ENV TRITON_REPO=https://github.com/ROCm/triton.git
ENV TRITON_COMMIT=improve_fa_decode_3.0.0
ENV AITER_REPO=https://github.com/ROCm/aiter.git
ENV AITER_COMMIT=v0.1.2
# v0.1.2 version - commit id: 9d11f47
# ENV AITER_COMMIT=9d11f47
ENV HIP_FORCE_DEV_KERNARG=1
ENV HSA_NO_SCRATCH_RECLAIM=1
ENV SGLANG_SET_CPU_AFFINITY=1
ENV SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
ENV NCCL_MIN_NCHANNELS=112
ENV MOE_PADDING=1
ENV VLLM_FP8_PADDING=1
ENV VLLM_FP8_ACT_PADDING=1
ENV VLLM_FP8_WEIGHT_PADDING=1
ENV VLLM_FP8_REDUCE_CONV=1
ENV TORCHINDUCTOR_MAX_AUTOTUNE=1
ENV TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE=1
ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"
ENV AMDGPU_TARGETS=gfx942
ENV ROCM_ARCH=gfx942
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
# Switch to working directory
WORKDIR /sgl-workspace
# Clean and create directory
RUN rm -rf /sgl-workspace && mkdir -p /sgl-workspace
# Clone and build sglang
RUN git clone ${SGL_REPO} \
&& cd sglang \
&& git checkout ${SGL_BRANCH} || echo "Using default branch" \
&& cd sgl-kernel \
&& rm -f pyproject.toml \
&& mv pyproject_rocm.toml pyproject.toml \
&& python setup_rocm.py install \
&& cd .. \
&& if [ "$BUILD_TYPE" = "srt" ]; then \
python -m pip --no-cache-dir install -e "python[srt_hip]"; \
else \
python -m pip --no-cache-dir install -e "python[all_hip]"; \
fi \
&& cd /sgl-workspace \
&& cp -r /sgl-workspace/sglang /sglang \
&& python -m pip cache purge
# Install common Python packages
RUN pip install IPython orjson python-multipart torchao pybind11
# Rebuild Triton
RUN pip uninstall -y triton || true \
&& git clone ${TRITON_REPO} \
&& cd triton \
&& git checkout ${TRITON_COMMIT} \
&& cd python \
&& python3 setup.py install \
&& cd /sgl-workspace
# ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942 --amdgpu-lower-module-lds-strategy=1"
# ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"
# Build aiter
#version: Commit 9d11f47
# && git checkout ${AITER_COMMIT} \
RUN pip uninstall -y aiter || true
RUN git clone ${AITER_REPO} \
&& cd aiter \
&& git checkout ${AITER_COMMIT} \
&& git submodule sync \
&& git submodule update --init --recursive \
&& PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py install \
&& cd /sgl-workspace
# Copy MI300X config
RUN find /sgl-workspace/sglang/python/sglang/srt/layers/quantization/configs/ \
/sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/ \
-type f -name '*MI300X*' | \
xargs -I {} sh -c 'vf_config=$(echo "$1" | sed "s/MI300X/MI300X_VF/"); cp "$1" "$vf_config"' -- {}
# Environment setup complete.
RUN echo "Environment setup complete."
WORKDIR /workspace/
###########################################
###########################################
###########################################
###########################################
###############vllm v0.8.5#################
###########################################
WORKDIR /workspace/
ENV VLLM_TARGET_DEVICE=rocm
ENV ROCM_PATH=/opt/rocm
ENV SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev
# Find the repo path in: DockerFile/Dockerfile.rocm_yang
# RUN git clone https://github.com/RLFoundation/vllm-patch.git
RUN pip uninstall -y vllm || true
RUN rm -rf vllm-patch
RUN git clone https://github.com/RLFoundation/vllm-patch.git \
&& cd vllm-patch \
&& git checkout v0.8.5-sleep-numa \
&& rm -rf build/ dist/ *.egg-info \
&& ln -sf /opt/rocm/lib/libamdhip64.so /usr/lib/libamdhip64.so \
&& SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py install
# RUN SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py develop
WORKDIR /workspace/
###########################################
###########################################
###########################################
#########################################
#### Install megatron-core###############
#########################################
RUN pip uninstall -y megatron-core && \
git clone https://github.com/yushengsu-thu/Megatron-LM-amd_version.git && \
cd Megatron-LM-amd_version && \
pip install -vvv -e . && \
cd /workspace/
#########################################
#########################################
#########################################
#######################################
################apex###################
#######################################
WORKDIR /workspace/
RUN pip uninstall -y apex && \
git clone https://github.com/ROCm/apex.git && \
cd apex && \
python setup.py install && \
cd /workspace/
#######################################
#######################################
#######################################
################################################################################
###########################Add torch_memory_saver###############################
################################################################################
# Set environment variables
ENV HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
ENV CFLAGS="-D__HIP_PLATFORM_AMD__"
ENV CXXFLAGS="-D__HIP_PLATFORM_AMD__"
RUN pip install "git+https://github.com/YangWang92/torch_memory_saver_numa.git@numa"
################################################################################
################################################################################
################################################################################
########################################
######Install ray#######################
########################################
# need to add this patch: https://github.com/ray-project/ray/pull/53531/files
RUN pip uninstall ray -y
RUN pip install "ray[data,train,tune,serve]>=2.47.0"
########################################
########################################
########################################
##########################################
#######Install other dependencies#########
##########################################
RUN pip install "tensordict==0.6.2" --no-deps && \
pip install accelerate \
codetiming \
datasets \
dill \
hydra-core \
liger-kernel \
numpy \
pandas \
peft \
"pyarrow>=15.0.0" \
pylatexenc \
torchdata \
wandb \
orjson \
pybind11
WORKDIR /workspace/
RUN git clone https://github.com/volcengine/verl.git && \
cd verl && \
pip install -e .
##########################################
##########################################
##########################################
WORKDIR /workspace/
CMD ["/usr/bin/bash"]
Build the image:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
docker build -f docker/Dockerfile.rocm -t verl-rocm .
Run the container
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Note: You can pull the prebuilt image from this DockerHub organization: `RLSys Foundation <https://hub.docker.com/u/yushengsuthu>`_
Pull the image:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
docker pull rlsys/verl:verl-0.4.1_ubuntu-22.04_rocm6.3.4-numa-patch_vllm0.8.5_sglang0.4.6.post4
docker tag rlsys/verl:verl-0.4.1_ubuntu-22.04_rocm6.3.4-numa-patch_vllm0.8.5_sglang0.4.6.post4 verl-rocm:latest
Run the container
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Optional: Running without root and with user permissions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code-block:: bash
docker run --rm -it \
--device /dev/dri \
--device /dev/kfd \
-p 8265:8265 \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME/.ssh:/root/.ssh \
-v $HOME:$HOME \
--shm-size 128G \
-w $PWD \
verl-rocm \
/bin/bash
(Optional): If you do not want to run as root and need to run as your own user,
add ``-e HOST_UID=$(id -u)`` and ``-e HOST_GID=$(id -g)`` to the docker launch command above.
Example
-------
Due to special settings in AMD (ROCm) torch:
1. If you use ``ray>=2.45.0`` (default), you need to set ``RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES`` when starting ray in verl's RLHF training and apply this `patch <https://github.com/ray-project/ray/pull/53531/files>`_.
2. If you use ``ray<2.45.0``, you need to set ``RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES`` when starting ray in verl's RLHF training.
Inference ``$ENGINE`` can be ``vllm`` or ``sglang``. We choose ``vllm`` as default in the following examples.
PPO
~~~
.. code-block:: bash
YOUR_PROJECT_NAME=r1-verl-ppo-upstream
YOUR_RUN_NAME=r1-training_ppo-upstream
# export HYDRA_FULL_ERROR=1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# [ray] < 2.45.0
#export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
# [ray] >= 2.45.0
export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 # Patch with https://github.com/ray-project/ray/pull/52794
GPUS_PER_NODE=8
MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
python3 examples/data_preprocess/gsm8k.py --local_save_dir data/gsm8k
python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"
ENGINE=vllm #sglang
PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
data.train_files=data/gsm8k/train.parquet \
data.val_files=data/gsm8k/test.parquet \
data.train_batch_size=256 \
data.val_batch_size=1312 \
data.max_prompt_length=512 \
data.max_response_length=256 \
actor_rollout_ref.model.path=$MODEL_PATH \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=64 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.name=$ENGINE \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
critic.optim.lr=1e-5 \
critic.model.path=$MODEL_PATH \
critic.ppo_micro_batch_size_per_gpu=4 \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.logger=console \
trainer.project_name=$YOUR_PROJECT_NAME \
trainer.experiment_name=$YOUR_RUN_NAME \
trainer.val_before_train=False \
trainer.n_gpus_per_node=$GPUS_PER_NODE \
trainer.nnodes=1 \
trainer.save_freq=10 \
trainer.test_freq=10 \
trainer.total_epochs=15 #2>&1 | tee verl_demo.log
GRPO
~~~~
.. code-block:: bash
YOUR_PROJECT_NAME=r1-verl-grpo-upstream
YOUR_RUN_NAME=r1-training_grpo-upstream
# export HYDRA_FULL_ERROR=1
# export FSDP_VERBOSE=1
#export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# [ray] < 2.45.0
#export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
# [ray] >= 2.45.0
export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 # Patch with https://github.com/ray-project/ray/pull/52794
GPUS_PER_NODE=8
MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
# MODEL_PATH=Qwen/Qwen2-7B-Instruct
python3 examples/data_preprocess/gsm8k.py --local_save_dir data/gsm8k
python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"
ENGINE=vllm #sglang
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=data/gsm8k/train.parquet \
data.val_files=data/gsm8k/test.parquet \
data.train_batch_size=1024 \
data.val_batch_size=1312 \
data.max_prompt_length=512 \
data.max_response_length=1024 \
actor_rollout_ref.model.path=$MODEL_PATH \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.use_dynamic_bsz=True \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.model.enable_gradient_checkpointing=False \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=$ENGINE \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.fsdp_config.param_offload=False \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=console \
trainer.project_name=$YOUR_PROJECT_NAME \
trainer.experiment_name=$YOUR_RUN_NAME \
trainer.n_gpus_per_node=$GPUS_PER_NODE \
trainer.val_before_train=False \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=10 \
trainer.total_epochs=15
Multi-node training: slurm with Docker/Podman container
---------------------------------------------------------------------------------------
If you want to run multi-node training with slurm, you can use the following script.
.. note::
1. You need to use ``podman`` or ``docker`` in the following script. We will release the apptainer script later.
2. If you want to use ``podman``, you just replace ``docker`` with ``podman`` in the following script.
The script includes the following steps:
1. SLURM Configuration
2. Environment Setup
3. Docker/Podman Container Setup
4. Ray Cluster Initialization
5. Data Preprocessing
6. Model Setup
7. Training Launch
slurm_script.sh
~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
#!/bin/bash
#SBATCH --job-name=verl-ray-on-slurm
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=2
#SBATCH --mem=200G
#SBATCH --time=30-00:00:00
#SBATCH --gpus-per-node=8
#SBATCH --cpus-per-task=28
#SBATCH --output=../verl_log/slurm-%j.out
#SBATCH --error=../verl_log/slurm-%j.err
#SBATCH --nodelist=gpu-[0,1]
# load necessary modules
### Run this setup
# [Cluster]: Use docker
# docker pull docker.io/rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
##########################################################################
###The following setting should be set in different project and cluster###
##########################################################################
### Project
CONTAINER_NAME="multinode_verl_training"
IMG="verl.rocm"
DOCKERFILE="docker/Dockerfile.rocm"
# echo $PWD
verl_workdir="${HOME}/projects/verl_upstream"
export TRANSFORMERS_CACHE="${HOME}/.cache/huggingface"
export HF_HOME=$TRANSFORMERS_CACHE
### Cluster Network Setting
export NCCL_DEBUG=TRACE
export GPU_MAX_HW_QUEUES=2
export TORCH_NCCL_HIGH_PRIORITY=1
export NCCL_CHECKS_DISABLE=1
# export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
export NCCL_IB_GID_INDEX=3
export NCCL_CROSS_NIC=0
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_PROTO=Simple
export RCCL_MSCCL_ENABLE=0
export TOKENIZERS_PARALLELISM=false
export HSA_NO_SCRATCH_RECLAIM=1
##########################################################################
## Assign using GPUs
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
### For rocm and training script
# [ray] < 2.45.0
#export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
# [ray] >= 2.45.0
export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 # Patch with https://github.com/ray-project/ray/pull/52794
# Build and launch the Docker container
srun bash -c "
# Exit on any error
set -e
# Clean up dangling images (images with <none> tag)
docker image prune -f
# Need to pull the docker first
docker pull rlsys/verl:verl-0.4.1_ubuntu-22.04_rocm6.3.4-numa-patch_vllm0.8.5_sglang0.4.6.post4
if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "${IMG}"; then
echo \"Building ${IMG} image...\"
docker build -f \"${DOCKERFILE}\" -t \"${IMG}\" .
else
echo \"${IMG} image already exists, skipping build\"
fi
# Removing old container if exists
docker rm \"${CONTAINER_NAME}\" 2>/dev/null || true
# Checking network devices
ibdev2netdev
# Launch the docker
docker run --rm -d \
-e HYDRA_FULL_ERROR=1 \
-e RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 \
-e RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 \
-e NCCL_DEBUG=${NCCL_DEBUG} \
-e GPU_MAX_HW_QUEUES=${GPU_MAX_HW_QUEUES} \
-e TORCH_NCCL_HIGH_PRIORITY=${TORCH_NCCL_HIGH_PRIORITY} \
-e NCCL_CHECKS_DISABLE=${NCCL_CHECKS_DISABLE} \
-e NCCL_IB_HCA=${NCCL_IB_HCA} \
-e NCCL_IB_GID_INDEX=${NCCL_IB_GID_INDEX} \
-e NCCL_CROSS_NIC=${NCCL_CROSS_NIC} \
-e CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS} \
-e NCCL_PROTO=${NCCL_PROTO} \
-e RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE} \
-e TOKENIZERS_PARALLELISM=${TOKENIZERS_PARALLELISM} \
-e HSA_NO_SCRATCH_RECLAIM=${HSA_NO_SCRATCH_RECLAIM} \
-e TRANSFORMERS_CACHE=${TRANSFORMERS_CACHE} \
-e HF_HOME=${HF_HOME} \
--network host \
--device /dev/dri \
--device /dev/kfd \
--device /dev/infiniband \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v \${HOME}:\${HOME} \
-v \${HOME}/.ssh:/root/.ssh \
-w "${verl_workdir}" \
--shm-size 128G \
--name \"${CONTAINER_NAME}\" \
\"${IMG}\" \
tail -f /dev/null
echo \"Container setup completed\"
"
# (Optional): If you do not want to run as root and need to run as your own user,
# Please add `-e HOST_UID=$(id -u)` and `-e HOST_GID=$(id -g)` into the above docker launch script.
### Ray launch the nodes before training
# Getting the node names
nodes_array=($(scontrol show hostnames "$SLURM_JOB_NODELIST" | tr '\n' ' '))
head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
# if we detect a space character in the head node IP, we'll
# convert it to an ipv4 address. This step is optional.
if [[ "$head_node_ip" == *" "* ]]; then
IFS=' ' read -ra ADDR <<<"$head_node_ip"
if [[ ${#ADDR[0]} -gt 16 ]]; then
head_node_ip=${ADDR[1]}
else
head_node_ip=${ADDR[0]}
fi
echo "IPV6 address detected. We split the IPV4 address as $head_node_ip"
fi
port=6379
ip_head=$head_node_ip:$port
export ip_head
echo "IP Head: $ip_head"
# make sure we set environment variables before Ray initialization
# Print out all env variables
printenv
echo "Starting HEAD at $head_node"
srun --nodes=1 --ntasks=1 -w "$head_node" \
docker exec "${CONTAINER_NAME}" \
ray start --head --node-ip-address="$head_node_ip" --port=$port \
--dashboard-port=8266 \
--num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block &
# optional, though may be useful in certain versions of Ray < 1.0.
sleep 10
# number of nodes other than the head node
worker_num=$((SLURM_JOB_NUM_NODES - 1))
for ((i = 1; i <= worker_num; i++)); do
node_i=${nodes_array[$i]}
echo "Debug: Starting worker on node_i = ${node_i}"
if [ -z "$node_i" ]; then
echo "Error: Empty node name for worker $i"
continue
fi
echo "Starting WORKER $i at $node_i"
srun --nodes=1 --ntasks=1 -w "$node_i" \
docker exec "${CONTAINER_NAME}" \
ray start --address "$ip_head" --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block &
sleep 5
done
# Ray initialization test (check whether any error occurred in the steps above)
echo "Testing Ray initialization in the slurm nodes..."
docker exec "${CONTAINER_NAME}" python3 -c '
import ray
try:
ray.init(address="auto")
print("\n=== Ray Cluster Status ===")
print(f"Number of nodes: {len(ray.nodes())}")
for node in ray.nodes():
print("Node: {}, Status: {}".format(node["NodeManagerHostname"], node["Alive"]))
# print(f"Node: {node}")
ray.shutdown()
print("Ray initialization successful!")
except Exception as e:
print(f"Ray initialization failed: {str(e)}")
'
echo "=== Ray test completed ==="
######
# Run data preprocessing
echo "Starting data preprocessing..."
docker exec "${CONTAINER_NAME}" \
python3 "examples/data_preprocess/gsm8k.py" "--local_save_dir" "../data/gsm8k"
echo "Starting data preprocessing..."
docker exec "${CONTAINER_NAME}" \
python3 "examples/data_preprocess/math_dataset.py" "--local_dir" "../data/math"
train_files="../data/gsm8k/train.parquet"
val_files="../data/gsm8k/test.parquet"
# Download and test model
echo "Loading model..."
docker exec "${CONTAINER_NAME}" \
python3 -c "import transformers; transformers.pipeline('text-generation', model='Qwen/Qwen2-7B-Instruct')"
MODEL_PATH="Qwen/Qwen2-7B-Instruct"
# Set model path after pipeline test
MODEL_PATH="Qwen/Qwen2.5-0.5B-Instruct"
echo "== Data and model loading Done =="
echo "Start to train..."
docker exec "${CONTAINER_NAME}" \
python3 -c "import transformers; transformers.pipeline('text-generation', model='Qwen/Qwen2-7B-Instruct')"
MODEL_PATH="Qwen/Qwen2-7B-Instruct"
PYTHONUNBUFFERED=1 srun --overlap --nodes=${SLURM_NNODES} --ntasks=1 -w "$head_node" \
docker exec "${CONTAINER_NAME}" \
python3 -m verl.trainer.main_ppo \
data.train_files=$train_files \
data.val_files=$val_files \
data.train_batch_size=1024 \
data.max_prompt_length=1024 \
data.max_response_length=1024 \
actor_rollout_ref.model.path=$MODEL_PATH \
actor_rollout_ref.model.enable_gradient_checkpointing=False \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding=True \
critic.model.path=$MODEL_PATH \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size_per_gpu=8 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
algorithm.kl_ctrl.kl_coef=0.0001 \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='verl_example' \
trainer.experiment_name='Qwen2.5-32B-Instruct_function_rm' \
trainer.n_gpus_per_node=${SLURM_GPUS_PER_NODE} \
trainer.val_before_train=False \
trainer.nnodes=${SLURM_NNODES} \
trainer.save_freq=-1 \
trainer.test_freq=10 \
trainer.total_epochs=15
Run slurm_script.sh
~~~~~~~~~~~~~~~~~~~~
Just sbatch your slurm_script.sh
.. code-block:: bash
sbatch slurm_script.sh
================================================
FILE: docs/amd_tutorial/amd_vllm_page.rst
================================================
verl performance tuning for AMD (ROCm Kernel)
=====================================================
Last updated: 11/13/2025.
Author: `Yang Wang <https://github.com/YangWang92/>`_, `Songlin Jiang <https://github.com/HollowMan6/>`_
Use vLLM Sleep Mode for AMD MI3xx series GPUs
--------------------------------------------------------------
By default, verl requires vLLM to enable sleep mode, which allows vLLM to offload GPU memory to CPU memory after rollout. This feature has been merged into the main branch of vLLM for versions later than 0.11.0.
For now, you can use the vLLM main branch and build it from the source code, or you can directly install vLLM from the pre-built ROCm wheels for vLLM version later than 0.11.0 when it's available.
1. Clone the vLLM repository and build it with the following commands:
.. code-block:: bash
git clone https://github.com/vllm-project/vllm.git
cd vllm
git reset --hard 4ca5cd5740c0cd7788cdfa8b7ec6a27335607a48 # You can also use a later commit as you wish
python -m pip install -r requirements/rocm.txt
VLLM_TARGET_DEVICE=rocm ROCM_PATH=/opt/rocm/ python3 setup.py develop
2. Additionally, we recommend using ROCm 7.0 or later.
After the upgrade, you can verify whether sleep mode is working by trying out `these scripts `_.
If sleep mode is working, you should see the memory usage reduce after sleep.
After applying the vLLM patch and completing the installation, you can enable sleep mode in verl to reduce memory overhead. This allows verl to offload unused GPU memory during rollout, significantly lowering the memory footprint during long-context training or multi-node reinforcement learning.
Enable CUDA Graph and Bypass ROCm-related issues
--------------------------------------------------------------
Due to potential issues with CUDA graph capture in ROCm, we've found that vLLM's CUDA graph feature cannot be enabled on multiple nodes in verl on AMD platforms with vLLM V1 mode. This leads to significantly slower rollout performance.
Our investigation shows that ROCm may trigger an unexpected crash when attempting to capture large batches with CUDA graph. One workaround is to set ``actor_rollout_ref.rollout.cudagraph_capture_sizes`` to values such as ``[1, 2, 4, 8, 16, 32, 64]`` (change depending on your GPU memory size).
Then, you can choose to enable CUDA graph by setting ``actor_rollout_ref.rollout.enforce_eager`` to ``False`` in your verl configuration file.
================================================
FILE: docs/api/data.rst
================================================
Data interface
=========================
Last updated: 05/19/2025 (API docstrings are auto-generated).
DataProto is the interface for data exchange.
The :class:`verl.DataProto` class contains two key members:
- batch: a :class:`tensordict.TensorDict` object for the actual data
- meta_info: a :class:`Dict` with additional meta information
TensorDict
~~~~~~~~~~~~
:attr:`DataProto.batch` is built on top of :class:`tensordict`, a project in the PyTorch ecosystem.
A TensorDict is a dict-like container for tensors. To instantiate a TensorDict, you must specify key-value pairs as well as the batch size.
.. code-block:: python
>>> import torch
>>> from tensordict import TensorDict
>>> tensordict = TensorDict({"zeros": torch.zeros(2, 3, 4), "ones": torch.ones(2, 3, 5)}, batch_size=[2,])
>>> tensordict["twos"] = 2 * torch.ones(2, 5, 6)
>>> zeros = tensordict["zeros"]
>>> tensordict
TensorDict(
fields={
ones: Tensor(shape=torch.Size([2, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False),
twos: Tensor(shape=torch.Size([2, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False),
zeros: Tensor(shape=torch.Size([2, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)},
batch_size=torch.Size([2]),
device=None,
is_shared=False)
One can also index a tensordict along its batch_size. The contents of the TensorDict can be manipulated collectively as well.
.. code-block:: python
>>> tensordict[..., :1]
TensorDict(
fields={
ones: Tensor(shape=torch.Size([1, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False),
twos: Tensor(shape=torch.Size([1, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False),
zeros: Tensor(shape=torch.Size([1, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)},
batch_size=torch.Size([1]),
device=None,
is_shared=False)
>>> tensordict = tensordict.to("cuda:0")
>>> tensordict = tensordict.reshape(6)
For more about :class:`tensordict.TensorDict` usage, see the official tensordict_ documentation.
.. _tensordict: https://pytorch.org/tensordict/stable/overview.html
Core APIs
~~~~~~~~~~~~~~~~~
.. autoclass:: verl.DataProto
:members: to, select, union, make_iterator, concat
================================================
FILE: docs/api/single_controller.rst
================================================
Single Controller interface
============================
Last updated: 05/27/2025 (API docstrings are auto-generated).
The Single Controller provides a unified interface for managing distributed workers
using Ray or other backends and executing functions across them.
It simplifies the process of dispatching tasks and collecting results, particularly
when dealing with data parallelism or model parallelism.
Core APIs
~~~~~~~~~~~~~~~~~
.. autoclass:: verl.single_controller.Worker
:members: __init__, __new__, get_master_addr_port, get_cuda_visible_devices, world_size, rank
.. autoclass:: verl.single_controller.WorkerGroup
:members: __init__, world_size
.. autoclass:: verl.single_controller.ClassWithInitArgs
:members: __init__, __call__
.. autoclass:: verl.single_controller.ResourcePool
:members: __init__, world_size, local_world_size_list, local_rank_list
.. autoclass:: verl.single_controller.ray.RayWorkerGroup
:members: __init__
.. autofunction:: verl.single_controller.ray.create_colocated_worker_cls
================================================
FILE: docs/api/trainer.rst
================================================
Trainer Interface
================================
Last updated: 06/08/2025 (API docstrings are auto-generated).
Trainers drive the training loop. Introducing new trainer classes for new training paradigms is encouraged.
.. autosummary::
:nosignatures:
verl.trainer.ppo.ray_trainer.RayPPOTrainer
Core APIs
~~~~~~~~~~~~~~~~~
.. autoclass:: verl.trainer.ppo.ray_trainer.RayPPOTrainer
:members: __init__, init_workers, fit
.. automodule:: verl.utils.tokenizer
:members: hf_tokenizer
.. automodule:: verl.trainer.ppo.core_algos
:members: agg_loss, kl_penalty, compute_policy_loss
.. automodule:: verl.trainer.ppo.reward
:members: load_reward_manager, compute_reward, compute_reward_async
.. autoclass:: verl.workers.reward_manager.NaiveRewardManager
.. autoclass:: verl.workers.reward_manager.DAPORewardManager
================================================
FILE: docs/api/utils.rst
================================================
Utilities
============
Last updated: 05/19/2025 (API docstrings are auto-generated).
This section documents the utility functions and classes in the VERL library.
Python Functional Utilities
------------------------------
.. automodule:: verl.utils.py_functional
:members: append_to_dict
File System Utilities
------------------------
.. automodule:: verl.utils.fs
:members: copy_to_local
Tracking Utilities
---------------------
.. automodule:: verl.utils.tracking
:members: Tracking
Metrics Utilities
---------------------
.. automodule:: verl.utils.metric
:members: reduce_metrics
Checkpoint Management
------------------------
.. automodule:: verl.utils.checkpoint.checkpoint_manager
:members: find_latest_ckpt_path
.. automodule:: verl.utils.checkpoint.fsdp_checkpoint_manager
:members: FSDPCheckpointManager
Dataset Utilities
---------------------
.. automodule:: verl.utils.dataset.rl_dataset
:members: RLHFDataset, collate_fn
Torch Functional Utilities
-----------------------------
.. automodule:: verl.utils.torch_functional
:members: get_constant_schedule_with_warmup, masked_whiten, masked_mean, logprobs_from_logits
Sequence Length Balancing
----------------------------
.. automodule:: verl.utils.seqlen_balancing
:members: get_reverse_idx, rearrange_micro_batches
Ulysses Utilities
--------------------
.. automodule:: verl.utils.ulysses
:members: gather_outputs_and_unpad, ulysses_pad_and_slice_inputs
FSDP Utilities
------------------
.. automodule:: verl.utils.fsdp_utils
:members: get_fsdp_wrap_policy, get_init_weight_context_manager, init_fn, load_fsdp_model_to_gpu, load_fsdp_optimizer, offload_fsdp_model_to_cpu, offload_fsdp_optimizer
Debug Utilities
-------------------
.. automodule:: verl.utils.profiler
:members: log_gpu_memory_usage, GPUMemoryLogger
================================================
FILE: docs/ascend_tutorial/contribution_guide/ascend_ci_guide_zh.rst
================================================
NPU-CI 添加指导
================
Last updated: 02/02/2026.
我们在 verl 上增加基于华为昇腾设备的CI用例添加指导。
verl 仓库使用 GitHub Actions 作为 CI 平台,通过分层测试架构保障代码质量与系统稳定性。
NPU 相关的工作流主要包括:
* ``npu_unit_tests.yml``:运行单元测试。
* 以 ``_ascend.yml`` 结尾的文件:运行针对 Ascend NPU 的端到端测试或专项测试。
添加新用例指南
-----------------------------------
1. 数据集与权重
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
流水机器上的权重与绝对路径:
+---------------------------------------+-------------------------------------------------------------------+
| 模型名称 | 绝对路径 |
+=======================================+===================================================================+
| Qwen3-30B-A3B-Instruct-2507 | ``${HOME}/.cache/models/Qwen/Qwen3-30B-A3B-Instruct-2507`` |
+---------------------------------------+-------------------------------------------------------------------+
| Qwen2.5-VL-3B-Instruct | ``${HOME}/.cache/models/Qwen/Qwen2.5-VL-3B-Instruct`` |
+---------------------------------------+-------------------------------------------------------------------+
| Qwen2.5-0.5B | ``${HOME}/.cache/models/Qwen/Qwen2.5-0.5B`` |
+---------------------------------------+-------------------------------------------------------------------+
| Qwen2.5-0.5B-Instruct | ``${HOME}/.cache/models/Qwen/Qwen2.5-0.5B-Instruct`` |
+---------------------------------------+-------------------------------------------------------------------+
| Qwen2.5-1.5B-Instruct | ``${HOME}/.cache/models/Qwen/Qwen2.5-1.5B-Instruct`` |
+---------------------------------------+-------------------------------------------------------------------+
| Skywork-Reward-V2-Llama-3.2-1B | ``${HOME}/.cache/models/Skywork/Skywork-Reward-V2-Llama-3.2-1B`` |
+---------------------------------------+-------------------------------------------------------------------+
流水机器上的数据集与绝对路径:
+--------------+---------------------------------------------------+
| 数据集名称 | 绝对路径 |
+==============+===================================================+
| gsm8k | ``${HOME}/.cache/datasets/openai/gsm8k`` |
+--------------+---------------------------------------------------+
| geo3k | ``${HOME}/.cache/datasets/hiyouga/geometry3k`` |
+--------------+---------------------------------------------------+
**Note**
``${HOME}`` 为 ``/root``。
gpu用例中权重在~/models/路径下,如需适配可以用软链接,``ln -s /root/.cache/models ~/models``
此处为原始数据集,按需进行数据处理,如下。
``python examples/data_preprocess/gsm8k_multiturn_sft.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k``
2. 工作流 YAML 模板
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
如需新增一个工作流,可参考以下模板创建 ``.github/workflows/your_yml_ascend.yml`` 文件。
主要修改部分包括:
* 工作流名称(``name``)
* 触发条件(``on``)
* 运行环境(``runs-on``)
* 容器镜像(``container.image``)
* 具体执行步骤(``jobs.<job_name>.steps``)
.. code-block:: yaml
:linenos:
name: your_yml_ascend # 工作流唯一标识
# 触发条件配置
on:
push:
branches:
- main
- v0.*
pull_request:
branches:
- main
paths:
- ".github/workflows/your_yml_ascend.yml" # 必须包含此工作流文件路径
- "path/to/affected_files" # 需监控的相关代码路径
# 并发控制策略
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} # 仅非main分支取消进行中的任务
permissions:
contents: read # 最小权限原则
jobs:
your_job_name: # 任务唯一标识
if: github.repository_owner == 'verl-project' # 仅在主仓库运行
runs-on: linux-aarch64-a2-4 # 硬件规格:a2实例,4卡NPU
timeout-minutes: 60 # 任务超时阈值(分钟)
container:
#运行镜像 该示例为vllm的镜像
image: swr.ap-southeast-1.myhuaweicloud.com/base_image/ascend-ci/verl/verl:verl-8.5.0-910b-ubuntu22.04-py3.11-latest
options: >-
--shm-size 16g # 共享内存配置
env:
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0"
steps:
- name: Check npu and CANN info
run: |
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
npu-smi info
- name: Check initial pip list from image
run: pip list
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 0
clean: true
- name: Install dependencies
run: |
pip install -r requirements-npu.txt
pip install -e .
- name: Verify environment
run: pip list
# 以下为具体测试步骤(根据需求定制)
- name: Preprocess dataset
run: python examples/data_preprocess/your_script.py --local_dataset_path ${HOME}/.cache/datasets/your_dataset
- name: Execute NPU test
run: |
ray stop --force
bash tests/special_npu/your_test_script.sh
**Note**
${HOME}/.cache/文件夹内一旦添加新内容,不会因CI跑完容器销毁而删除,请避免往该文件夹添加内容。
3. 添加单元测试
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
步骤:
(1) 在 ``tests/`` 目录下创建或修改单元测试文件(例如 ``test_xxx.py``)。
(2) 若测试文件路径未被 ``npu_unit_tests.yml`` 中的 ``--ignore-glob`` 规则排除,则会在以下命令中自动执行:
.. code-block:: yaml
pytest -s -x --ignore-glob="xxx" --ignore-glob="xxx" tests/
(3) 若测试路径在 ``--ignore-glob`` 排除范围内,需在 ``npu_unit_tests.yml`` 中新增一个 step 来显式运行该测试。
(4) 如新增一批相关用例,建议单独创建专门的工作流文件以保持清晰。
4. 添加端到端测试脚本
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
步骤:
(1) 在 ``tests/special_npu/`` 目录下创建端到端测试脚本。
(2) 在 ``.github/workflows/`` 目录中找到功能最接近的以 ``_ascend.yml`` 结尾的工作流文件,在其中添加一个 step 调用该脚本。
(3) 若测试场景独立或较复杂,可考虑单独创建新的工作流文件。
5. 测试策略建议
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* **单元测试**:覆盖核心函数、类与方法,确保逻辑正确。
* **集成/端到端测试**:覆盖典型训练、推理 pipeline,验证多模块协同与硬件适配。
* **资源管理**:一个workflow里的多个job为并行运行,请合理设置超时时间,避免任务长时间挂起,请控制单个 job 的运行时间在 40min 以内。
通过以上步骤,可系统化地为 verl 仓库添加 NPU 相关的自动化测试,确保代码变更在合并前经过充分验证。
================================================
FILE: docs/ascend_tutorial/examples/ascend_performance_analysis_guide.md
================================================
# Ascend Performance Analysis Guide
Last updated: 02/24/2026.
## 背景介绍
随着DeepSeek-R1的发布,大模型强化学习(RL)训练受到广泛关注。在昇腾NPU环境下,verl框架已积累了丰富的性能调优经验。本文系统总结了包括性能数据采集与分析在内的方法论,旨在帮助开发者更高效地运用MindStudio工具链,实现强化学习场景下的性能优化。
### 强化学习计算流程概述
1. **Rollout**:策略(actor)模型基于输入的prompt序列,推理生成回答(response序列)
2. **ref logprob**:基于prompt和生成的response,reference模型计算ref logprob用于KL散度计算
3. **logprob**:基于prompt和生成的response,actor模型计算logprob用于重要性采样
4. **reward**:基于prompt和生成的response,奖励模型评估奖励值R_N。
5. **update**:基于计算得到的R_N、ref logprob、logprob计算优化函数和策略梯度,对actor模型进行更新

## profiling工具使能
### 使能方法
使能和配置教程可参考:[verl/docs/ascend_tutorial/profiling/ascend_profiling_zh.rst at main · verl-project/verl](https://github.com/verl-project/verl/raw/main/docs/ascend_tutorial/profiling/ascend_profiling_zh.rst)
## 性能分析方法论
### 整体性能概览分析
#### 1. 长耗时任务与资源空泡分析
- **操作**:使用MindStudio Insight加载profiling数据,自动识别不同计算阶段,通过RL页签流水图定位长耗时任务与NPU资源空泡
- **价值**:快速掌握不同阶段耗时占比
- **效果展示**:

#### 2. 负载均衡分析
- **操作**:通过MindStudio Insight直接查看MSTX打点数据,观察Rollout阶段不同DP Rank的负载均衡情况
- **价值**:快速识别负载不均问题
- **效果展示:**

#### 3. 集群整体性能分析
- **操作**:结合MSTT的rl_analysis功能,生成集群Timeline缩略图,观察各阶段整体耗时
- **价值**:宏观掌握集群性能瓶颈
- **操作指南**:[rl_analysis使用文档](https://gitcode.com/Ascend/mstt/raw/pre-research/profiler/msprof_analyze/docs/features/rl_analysis.md)
- **效果展示**:

### 细粒度分析
#### 性能分析
- **操作**:可通过 MindStudio Insight Windows 或 Linux 版本加载 Profiling 数据
- **价值**:MindStudio Insight 支持分析任务调度效率、算子执行性能、计算资源利用率、集合通信性能等。其 Timeline 视图具备任务拆解与 Overlap 分析功能(**为 MindStudio 独有核心特性,在 NV 及其他竞品中不具备,是 AI 调优的必备工具**),并支持鼠标交互式分析。
- **效果展示**:

#### 内存分析
##### **通过 Profiling 结合调用栈分析系统内存变化**
- **操作**:采集数据时开启调用栈和内存视图功能。
- **价值**:观察框架、CANN内存申请释放情况,可结合调用栈跟踪到前端python代码。
- **效果展示**:结合调用栈进行内存变化分析。效果如下所示:

##### **使用 msleaks 工具进行深层次内存分析**
- **操作步骤**:参考 [msleaks 工具使用指南](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/83RC1alpha003/devaids/msleaks/atlas_msleaks_0001.html)。
- **价值**:可以查看框架内存申请总量折线图/内存块图,并直接对应调用栈,可深层次分析框架内存使用情况。
- **效果展示**:

## 性能分析案例
要做具体的性能分析,profiling要开启**level1**,否则算子的关键信息会缺失。
### 1.host bound诊断
host bound是指CPU任务量综合大于NPU,导致NPU执行出现空泡的现象。可以通过看Host2Device的同步连线来判断:如果连线都是歪的,那证明这里的set信号早于wait信号,NPU一ready就执行了,那就是device bound:

如果确诊为host bound,那么我们可以打开CPU侧,找出各算子的下发耗时。注意找的时候需要找出所有CPU耗时的累加值,而不能找单层,因为首次调用的耗时是很长的。例如下图的GmmSwigluQuant,CPU上首次调用需要1ms,后续每次只需要200us。

此时有的算子在负重前行,有的算子拖了后腿,且后者多于前者。我们优先**找出来host耗时大于device的top算子,这些算子是拖后腿的**,可以交予算子团队重点分析。
### 2.组网合理性分析
有的时候,模型组网没有按照最高效的方式来,这一点在profiling中是非常易于识别的,下面会介绍一下分析思路并给出例子。
通常来讲,LLM中的大的热点算子是Attention和FFN中的矩阵乘计算,二者加起来在prefill下可能达到计算耗时的70%+,decode下可能达到50%+。如果整体的耗时比例不符合预期,或者profiling中出现了一些新面孔,或者拼接类算子太多了,这都值得我们去分析一下模型组网,是不是使用算子的方式错了?尤其是拼接类算子,是值得我们逐一分析的。
对于slice/split/concat这样的拼接类算子,还有transpose/cast这种转换算子,他们的存在往往是前后算子不直接配套造成的。如果前一个算子可以直接对输出做好尾处理,往往可以节省一个算子的启动开销和一次冗余读写。但这样的改变不一定符合算子的基本设计原则。
举一个正例,对于某次Matmul的输出shape为[m, n0 + n1],在这后面我们接了两个slice,输入均为这个[m, n0 + n1]的tensor,输出分别为[m, n0]和[m, n1]。第一个优化的思路是将两个slice改为一个split,这样耗时可以基本减半,[m, n0 + n1]的显存也可以尽早释放。进一步优化的思路是将矩阵乘的权重从[k, n0 + n1]分割为[k, n0]和[k, n1],将原来的矩阵乘任务分成两个(前提是这两个的耗时加起来不比之前的劣化太多,分核策略不能出问题),从而彻底消除这个slice/split操作。

举一个反例,Rmsnorm(fp16)+Cast(fp16->fp32)+Matmul(fp32),Rmsnorm虽然输入输出都是fp16,但考虑到累加运算的精度,内部是fp32做计算的。如果将Cast融到Rmsnorm内,本就内部使用fp32做计算的Rmsnorm就可以省去一个末尾fp32->fp16的cast,加上我们干掉的Cast,总共节省两个cast的同时避免了一次精度丢失。虽然这样看起来精度性能双收了,但fp16进,fp32出的Rmsnorm是反原则的(核心输入和输出需要是同数据类型),除非我们能在广大开源模型中频繁找到这样的结构,证明它的普适性,否则算子团队是不允许做这样的算子的。

### 3.算子性能初诊
需要利用`".\ASCEND_PROFILER_OUTPUT\operator_details.csv"`来做分析,从而判断算子是否有性能问题。
Profiling工具会统计这些流水线在不同核上的平均繁忙时间(xxx_time),与最慢核的完整kernel耗时(task_duration)做除法,得到流水线利用率(xxx_ratio)。这些流水线之间虽然互有依赖,且搬运类流水线会互抢带宽,但算子只要设计得当,是可以做到互相掩盖的。因此我们可以初步认为,**当算子的执行耗时大到一定程度时,算子应当在某一条流水线上形成bound**,即利用率要高到一定程度。经验上,在单算子耗时达到50μs时,就可以认为算子应当在bound流水线上,达成80%+的占用率了。
以下图为例,第一行是一个FA算子,第二行是一个Matmul算子,FA在vec流水线上达到了88.1%的利用率,Matmul算子在mac流水线上达到了89.8%的利用率,他们的性能可以认为是合格的。

### 4.亲和shape调整
对于一个模型而言,超参是我们控制不了的,但我们可以控制并发度、权重格式、切分策略等因素来迎合算子,使其发挥出最大的性能,这一节主要从算子搬运效率和负载均衡两个方面出发,讨论模型侧值得尝试的调整方向。
#### 4.1 搬运效率亲和的shape
mte2是一个自身效率严重受shape影响的流水线。要想让mte2保证最大搬运效率,我们需要保障如下两个条件至少满足其一:
**(1)被搬运的矩阵使用nz作为format(最优)
(2)被搬运的矩阵的尾轴512B对齐,且不为16KB的整数倍(近似最优)**
对于权重矩阵来说,推理阶段尤其是decode,我们通常满足(1),训练阶段我们通常满足(2)。**如果我们做不到(1),我们就要迎合(2)**。典型的手段有:
1,如果未满足条件(2)的矩阵首轴是亲和的而尾轴不亲和,那么对它做transpose
2,调整TP切分策略,避免出现不亲和的尾轴
#### 4.2 负载均衡亲和的shape
在算子shape不大时,受制于算子语义,我们有可能不能把所有核都利用起来,或者即使开满核,负载均衡却很差。这一小节主要是对decode阶段的小shape做分析。
首先,我们明确出当前NPU卡是多少核的,如果不清楚,跑出来的profiling里都是20,40这样的数,就说明是20核,反之是24核。这里所说的24核其实是代表了一个cube和两个vector组成的小组,我们可以认为是一个cube作为主核,带了两个vector作为从核。如果一个算子是纯vector算子,那么就不再有组的概念,40或48个vector核会作为主核直接独立去拿逻辑任务。
对于LLM中的vector算子,它的一种常见分核策略有可能是分在最高维,也就是batch维,常见于对低维(也叫尾轴)有规约操作的norm类、动态量化类等算子;另一种是整体拍平,适用于允许切分得非常细的算子,如elementwise算子。对于第一种,我们就可以在模型侧关注它的负载均衡问题。例如我们打48batch,而硬件却是个40个vector核,那这40个核会循环2次,第二次有多数的核会无事可做,这个batch数就可以认为是不友好的。如果将batch打到64或80,性能可以预见会是无损的。同样的情况下,如果是48核的卡,那我们可以认为这就是个非常友好的batch数。
对于cube类算子,它常见的分核策略是以base块去切分M和N(K轴是累加轴,对它分核会引入确定性问题)。最常见的分块是baseM=128,baseN=256。在decode阶段,我们的耗时基本可以看做都是在搬权重,这是因为激活的M极小,M方向大概率只分了一块,那么右矩阵就只需要搬一次。所以我们在M≤128的范围内可以尽情提高M,对性能都基本是无损的,如果M大于128,可以认为(128, 256]是下一个性能分档。
除了M外,N轴切分的任务也影响算子亲和性,以deepseekR1中的MLA预处理为例,它会使用同一个激活(shape为[batch_size, 7168])与两个权重做矩阵乘(shape为[7168, 1536]和[7168, 576])。在batch_size打不大的情况下,即使baseN缩短为128,N轴都不能用满核数,所以此时这两个矩阵乘各自的耗时,会约等于将他们权重N轴拼起来乘(shape为[7168, 2112])的矩阵乘的耗时。如果仅考虑模型竞争力,我们更希望对这两个权重做合并,否则两个小的矩阵乘带宽利用率都会非常差。
对于Attention算子,它常见的分核策略是q_seqlen、batch_size和kv_headnum。增量阶段q_seqlen会以MTP和GQA倍数做合并,但是通常也不会大过128,划分不出第二个任务,那么并行度基本就是batch_size * kv_headnum。
总的来说,我们可以依据shape信息和算子类别,对算子是否有负载均衡问题作出识别,从而对我们切分策略选择,最高吞吐量的batch策略作出预判。
================================================
FILE: docs/ascend_tutorial/examples/ascend_retool_best_pratice.rst
================================================
Ascend Retool Best Practice
===================================
Last updated: 03/01/2026.
引言
----------------------------------
Retool论文参考:`Retool <https://arxiv.org/pdf/2504.11536>`_
集成代码解释器工具,通过多轮实时代码执行进行策略部署,并教会模型根据结果反馈学习何时以及如何调用工具。
1. 环境构建
2. 模型训练
用例模型脚本以及其需要的硬件条件各自如下:
=============== ============ ============ ===============
模型 NPU型号 节点数量 训推后端
=============== ============ ============ ===============
``Qwen2.5-7B`` Atlas 900 A2 1 ``vllm + FSDP``
=============== ============ ============ ===============
环境构建
-----------------------------------
1.从自定义Conda环境进行构建
============ ============================================================
software version
============ ============================================================
Python ``>= 3.10, <3.12``
CANN ``== 8.3.RC1``
torch ``== 2.7.1``
torch_npu ``== 2.7.1``
verl ``v0.6.1 commitId=d62da4950573d7a4b7ef2362337952e7ab59e78d``
vllm ``v0.11.0``
vllm-ascend ``v0.11.0-dev``
transformers ``4.57.6``
============ ============================================================
模型训练与评估
-----------------------------------
1.模型数据准备
^^^^^^^^^^^
`Qwen2.5-7B`
^^^^^^^^^^^
**下载模型权重**
--local-dir: 模型保存路径
.. code-block:: bash
git clone https://huggingface.co/Qwen/Qwen2.5-7B-Instruct
**下载训练数据集**
.. code-block:: bash
git clone https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k
**下载评估数据集**
.. code-block:: bash
git clone https://huggingface.co/datasets/Maxwell-Jia/AIME_2024
**下载预训练数据集**
.. code-block:: bash
python3 recipe/retool/retool_sft_preprocess.py
*注:自动下载ReTool-SFT,最后生成数据默认保存在~/ReTool-SFT/data目录下*
**执行预训练脚本**
.. code-block:: bash
bash recipe/retool/run_qwen2_7b_sft_npu.sh # 需适配脚本中路径
**合并预训练权重生成checkpoint**
.. code-block:: bash
python3 -m verl.model_merger merge --backend fsdp \
--local_dir ${DATASETS}/checkpoint/multiturn-sft-qwen-2.5-7b-instruct/global_step_372 \
--target_dir ${DATASETS}/checkpoint/multiturn-sft-qwen-2.5-7b-instruct/global_step_372/huggingface
2.代码沙箱准备
开源沙箱代码及部署参考
https://github.com/bytedance/SandboxFusion
**沙箱代码下载**
.. code-block:: bash
git clone -b main https://github.com/bytedance/SandboxFusion.git
**沙箱安装**
.. code-block:: bash
cd SandboxFusion
conda create -n sandbox -y python=3.11
conda activate sandbox
pip install poetry
poetry lock
poetry install
mkdir -p docs/build
cd runtime/python
bash install-python-runtime.sh
cd ../../
make run-online
3.训练
示例配置文件如下,在recipe/retool目录下创建一个run_qwen2.5_7b_dapo_npu.sh
根据开发者实际路径配置情况修改模型训练脚本中的以下参数
.. code-block:: bash
set -x
export VLLM_USE_V1=1
export TORCHDYNAMO_DISABLE=1
export VLLM_ASCEND_ENABLE_NZ=0
export TASK_QUEUE_ENABLE=1
export VLLM_ENABLE_GRAPH_MODE=1
export HCCL_OP_EXPANSION_MODE="AIV"
export VLLM_ASCEND_ENABLE_MLP_OPTIMIZE=1
export LD_PRELOAD=/usr/local/lib/libjemalloc.so.2
# ================= data/model/tool =================
HDFS_ROOT=${HDFS_ROOT:-"${PWD}"}
DATA_ROOT=${DATA_ROOT:-"${PWD}"}
dapo_math_17k=$DATA_ROOT/dataset/BytedTsinghua-SIA/DAPO-Math-17k
aime_2024=$DATA_ROOT/dataset/Maxwell-Jia/AIME_2024
#aime_2025=$DATA_ROOT/dataset/yentinglin/aime_2025
model_path=$DATA_ROOT/dataset/checkpoint/multiturn-sft-qwen-2.5-7b-instruct/global_step_372/huggingface
train_files="['$dapo_math_17k']"
test_files="['$aime_2024']"
# tool
tool_config_path=recipe/retool/sandbox_fusion_tool_config.yaml
# wandb
project_name=retool
experiment_name=qwen2.5-7b_dapo
default_local_dir=$DATA_ROOT/checkpoint/$experiment_name
# 创建日志文件
export TIMESTAMP=$(date +%Y%m%d_%H%M%S)
LOG_DIR="$HDFS_ROOT/verl/logs/$project_name/$experiment_name"
# 判断路径是否存在
if [ ! -d "$LOG_DIR" ]; then
# 路径不存在,创建路径
mkdir -p "$LOG_DIR"
echo "Directory $LOG_DIR created."
else
echo "Directory $LOG_DIR already exists."
fi
LOG_FILE="${LOG_DIR}/${TIMESTAMP}.log"
touch "$LOG_FILE"
echo "Log file $LOG_FILE created."
# ================= algorithm =================
adv_estimator=grpo
use_kl_in_reward=False
kl_coef=0.0
use_kl_loss=False
kl_loss_coef=0.0
clip_ratio_low=0.2
clip_ratio_high=0.28
max_turns=16
max_prompt_length=2048
max_response_length=20480
actor_lr=1e-6
train_batch_size=32
ppo_mini_batch_size=16
n_resp_per_prompt=16
n_resp_per_prompt_val=30
# ================= performance =================
infer_tp=2 # vllm
train_sp=4 # train
offload=True
actor_max_token_len_per_gpu=$(( (max_prompt_length + max_response_length) * 1 ))
log_prob_max_token_len_per_gpu=$(( actor_max_token_len_per_gpu * 4 ))
PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=$adv_estimator \
algorithm.use_kl_in_reward=$use_kl_in_reward \
algorithm.kl_ctrl.kl_coef=$kl_coef \
data.train_files="$train_files" \
data.val_files="$test_files" \
data.return_raw_chat=True \
data.train_batch_size=$train_batch_size \
data.max_prompt_length=$max_prompt_length \
data.max_response_length=$max_response_length \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.custom_cls.path=recipe/retool/retool.py \
data.custom_cls.name=CustomRLHFDataset \
custom_reward_function.path=recipe/retool/retool.py \
custom_reward_function.name=compute_score \
actor_rollout_ref.model.path=$model_path \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.use_kl_loss=$use_kl_loss \
actor_rollout_ref.actor.kl_loss_coef=$kl_loss_coef \
actor_rollout_ref.actor.clip_ratio_low=$clip_ratio_low \
actor_rollout_ref.actor.clip_ratio_high=$clip_ratio_high \
actor_rollout_ref.actor.clip_ratio_c=10.0 \
actor_rollout_ref.actor.optim.lr=$actor_lr \
actor_rollout_ref.actor.use_dynamic_bsz=True \
actor_rollout_ref.actor.ppo_mini_batch_size=$ppo_mini_batch_size \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$actor_max_token_len_per_gpu \
actor_rollout_ref.actor.ulysses_sequence_parallel_size=$train_sp \
actor_rollout_ref.actor.fsdp_config.param_offload=$offload \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=$offload \
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=$log_prob_max_token_len_per_gpu \
actor_rollout_ref.rollout.max_num_batched_tokens=$actor_max_token_len_per_gpu \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.mode=async \
actor_rollout_ref.rollout.max_num_seqs=1024 \
actor_rollout_ref.rollout.tensor_model_parallel_size=$infer_tp \
actor_rollout_ref.rollout.multi_turn.enable=True \
actor_rollout_ref.rollout.multi_turn.max_user_turns=$max_turns \
actor_rollout_ref.rollout.multi_turn.max_assistant_turns=$max_turns \
actor_rollout_ref.rollout.multi_turn.tool_config_path=$tool_config_path \
actor_rollout_ref.rollout.multi_turn.format=hermes \
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
actor_rollout_ref.rollout.n=$n_resp_per_prompt \
actor_rollout_ref.rollout.val_kwargs.top_p=0.6 \
actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
actor_rollout_ref.rollout.val_kwargs.n=$n_resp_per_prompt_val \
actor_rollout_ref.rollout.enable_chunked_prefill=True \
actor_rollout_ref.rollout.enforce_eager=False \
trainer.logger=['console'] \
trainer.project_name=$project_name \
trainer.experiment_name=$experiment_name \
trainer.n_gpus_per_node=8 \
trainer.val_before_train=False \
trainer.log_val_generations=20 \
trainer.nnodes=1 \
trainer.save_freq=100 \
trainer.default_local_dir=$default_local_dir \
trainer.test_freq=20 \
trainer.device=npu \
actor_rollout_ref.actor.entropy_from_logits_with_chunking=True \
actor_rollout_ref.ref.entropy_from_logits_with_chunking=True \
actor_rollout_ref.actor.use_torch_compile=False \
actor_rollout_ref.ref.use_torch_compile=False \
actor_rollout_ref.actor.entropy_checkpointing=True \
actor_rollout_ref.ref.entropy_checkpointing=True \
actor_rollout_ref.ref.use_torch_compile=False \
trainer.total_epochs=1 $@ > $LOG_FILE 2>&1 &
================================================
FILE: docs/ascend_tutorial/examples/ascend_sglang_best_practices.rst
================================================
Ascend SGLang Best Practice
===================================
Last updated: 01/27/2026.
.. _Qwen3-30B: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen3moe-30b_sglang_megatron_npu.sh
.. _Qwen2.5-32B: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen2-32b_sglang_fsdp_npu.sh
引言
----------------------------------
SGLang 是当前主流的高性能开源推理引擎, 昇腾已经全面原生支持该推理引擎在verl中使用,
仅需简单的构建流程,开发者即可完成环境构建,本文将提供两个经典用例来帮助开发者了解以下内容:
1. 环境构建
2. 模型训练与评估
3. 性能采集
两个用例模型脚本以及其需要的硬件条件各自如下:
+----------------------+---------------------+----------+------------------------+
| 模型 | NPU型号 | 节点数量 | 训推后端 |
+======================+=====================+==========+========================+
| `Qwen3-30B`_ | Atlas 800T A3 | 1 | SGLang + Megatron |
+----------------------+---------------------+----------+------------------------+
| `Qwen2.5-32B`_ | Atlas 900 A2 | 2 | SGLang + FSDP |
+----------------------+---------------------+----------+------------------------+
环境构建
-----------------------------------
我们在quickstart中提供了两种构建环境的方法, 1.从镜像文件DockerFile进行构建 2.从自定义Conda环境进行构建
在本实践中, 我们额外指定verl 的commit id 以避免引入其他问题
.. code-block:: bash
cd verl
git checkout c98cb8cc
模型训练与评估
-----------------------------------
1.模型数据准备
^^^^^^^^^^^
`Qwen3-30B`_
^^^^^^^^^^^
**下载模型权重**
Qwen3-30B: https://huggingface.co/Qwen/Qwen3-30B-A3B
**下载数据集**
DAPO-Math-17k: https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k
**HuggingFace To Megatron权重转换(可选)**
.. code-block:: bash
python scripts/converter_hf_to_mcore.py \
--hf_model_path Qwen/Qwen3-30B-A3B \
--output_path Qwen/Qwen3-30B-A3B-mcore \
--use_cpu_initialization # Only work for MoE models
*注:verl当前已支持mbridge进行灵活的hf和mcore之间的权重转换,可以修改以下相关参数直接加载hf权重*
.. code-block:: bash
actor_rollout_ref.actor.megatron.use_dist_checkpointing=False
actor_rollout_ref.actor.megatron.use_mbridge=True
`Qwen2.5-32B`_
^^^^^^^^^^^
**下载模型权重**
--local-dir: 模型保存路径
.. code-block:: bash
export HF_ENDPOINT=https://hf-mirror.com
hf download --resume-download Qwen/Qwen2.5-32B --local-dir /path/to/local_dir
**下载及处理数据集**
.. code-block:: bash
wget https://huggingface.co/datasets/agentica-org/DeepScaleR-Preview-Dataset/resolve/main/deepscaler.json
python recipe/r1_ascend/json_to_parquet.py --output_dir ./data/deepscaler --json_path path/to/deepscaler.json --train_data_ratio 0.9
2.训练
^^^^^^^^^^^
根据开发者实际路径配置情况修改模型训练脚本中的以下参数
.. code-block:: bash
# Model Weights Paths
MODEL_PATH=Qwen/Qwen3-30B-A3B
MCORE_MODEL_PATH=Qwen/Qwen3-30B-A3B-mcore
RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
# File System Paths
TRAIN_FILE=$RAY_DATA_HOME/dataset/dapo-math-17k.parquet
TEST_FILE=$RAY_DATA_HOME/dataset/aime-2024.parquet
#保存频率,-1默认不保存,如需评测请修改此参数
trainer.save_freq=-1
对于单机任务 `Qwen3-30B`_ , 可以直接bash执行verl仓上示例脚本
.. code-block:: bash
bash examples/grpo_trainer/run_qwen3moe-30b_sglang_megatron_npu.sh
对于多节点任务 `Qwen2.5-32B`_ ,我们推荐使用以下脚本进行大规模多节点训练拉起
.. code-block:: bash
pkill -9 python
ray stop --force
rm -rf /tmp/ray
export RAY_DEDUP_LOGS=0
export HYDRA_FULL_ERROR=1
# TASK_QUEUE_ENABLE,下发优化,图模式设置为1,非图模式设置为2
export TASK_QUEUE_ENABLE=1
export HCCL_ASYNC_ERROR_HANDLING=0
export HCCL_EXEC_TIMEOUT=3600
export HCCL_CONNECT_TIMEOUT=3600
export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050
export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050
export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8
# 修改为当前需要跑的用例路径
DEFAULT_SH="./run_*.sh"
echo "Use $DEFAULT_SH"
ulimit -n 32768
mkdir logs
NNODES=2
NPUS_PER_NODE=8
# 修改为对应主节点IP
MASTER_ADDR="IP FOR MASTER NODE"
# 修改为当前节点的通信网卡
SOCKET_IFNAME="Your SOCKET IFNAME"
export HCCL_SOCKET_IFNAME="SOCKET IFNAME FOR CURRENT NODE"
export GLOO_SOCKET_IFNAME="SOCKET IFNAME FOR CURRENT NODE"
# 获取当前IP
CURRENT_IP=$(ifconfig $SOCKET_IFNAME | grep -Eo 'inet (addr:)?([0-9]{1,3}\.){3}[0-9]{1,3}' | awk '{print $NF}')
if [ "$MASTER_ADDR" = "$CURRENT_IP" ]; then
# 主节点启动
ray start --head --port 6766 --dashboard-host=$MASTER_ADDR --node-ip-address=$CURRENT_IP --dashboard-port=8260 --resources='{"NPU": '$NPUS_PER_NODE'}'
while true; do
ray_status_output=$(ray status)
npu_count=$(echo "$ray_status_output" | grep -oP '(?<=/)\d+\.\d+(?=\s*NPU)' | head -n 1)
npu_count_int=$(echo "$npu_count" | awk '{print int($1)}')
device_count=$((npu_count_int / $NPUS_PER_NODE))
# 判断device_count 是否与 NNODES 相等
if [ "$device_count" -eq "$NNODES" ]; then
echo "Ray cluster is ready with $device_count devices (from $npu_count NPU resources), starting Python script."
ray status
bash $DEFAULT_SH
break
else
echo "Waiting for Ray to allocate $NNODES devices. Current device count: $device_count"
sleep 5
fi
done
else
# 子节点尝试往主节点注册 ray 直到成功
while true; do
# 尝试连接 ray 集群
ray start --address="$MASTER_ADDR:6766" --resources='{"NPU": '$NPUS_PER_NODE'}' --node-ip-address=$CURRENT_IP
# 检查连接是否成功
ray status
if [ $? -eq 0 ]; then
echo "Successfully connected to the Ray cluster!"
break
else
echo "Failed to connect to the Ray cluster. Retrying in 5 seconds..."
sleep 5
fi
done
fi
sleep 600
DEFAULT_SH:修改为训练所用配置 sh 文件路径。在此案例中修改为 `Qwen2.5-32B`_ 路径。
NNODES 和 NPUS_PER_NODE:修改为使用节点数量和每个节点 NPU 数量。在此案例中分别为2和8。
MASTER_ADDR:修改为对应主节点 IP。即所有节点的 MASTER_ADDR 应该相同。
SOCKET_IFNAME, HCCL_SOCKET_IFNAME, GLOO_SOCKET_IFNAME: 修改为对应通信网卡,通信网卡可以通过以下命令获取:
.. code-block:: bash
ifconfig |grep "$(hostname -I |awk '{print $1}'|awk -F '.' '{print $0}')" -B 1|awk -F ':' '{print$1}' | head -1 | tail -1
3.模型评估
^^^^^^^^^^^
不同模型步骤一致,仅以Qwen3-30b为例列举
我们通过 AISBenchmark 评估模型,该工具支持vllm/sglang多种推理后端的评估
**安装方法**
.. code-block:: bash
git clone https://gitee.com/aisbench/benchmark.git
cd benchmark
pip install -e .
**下载评估数据集**
.. code-block:: bash
cd path/to/benchmark/ais_bench/datasets
wget http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/math.zip
unzip math.zip
rm math.zip
**修改AISBench配置代码使能sglang推理评测**
打开 benchmark/ais_bench/benchmark/configs/models/vllm_api/vllm_api_stream_chat.py 文件,这是推理配置文件
.. code-block:: python
from ais_bench.benchmark.models import VLLMCustomAPIChatStream
from ais_bench.benchmark.utils.model_postprocessors import extract_non_reasoning_content
from ais_bench.benchmark.clients import OpenAIChatStreamClient, OpenAIChatStreamSglangClient
models = [
dict(
attr="service",
type=VLLMCustomAPIChatStream,
abbr='sgl-api-stream-chat',
path="/path/to/Qwen3-30B", # 修改为 Qwen3-30B 模型路径
model="qwen3-30b",
request_rate = 0,
max_seq_len=2048,
retry = 2,
host_ip = "localhost", # 推理服务的IP
host_port = 8005, # 推理服务的端口
max_out_len = 8192, # 最大输出tokens长度
batch_size=48, # 推理的最大并发数
trust_remote_code=False,
custom_client=dict(type=OpenAIChatStreamSglangClient), #使用sglang客户端
generation_kwargs = dict(
temperature = 0,
seed = 1234,
),
pred_postprocessor=dict(type=extract_non_reasoning_content)
)
]
**启动sglang_server服务**
.. code-block:: bash
python -m sglang.launch_server --model-path "/path/to/Qwen3-30B" --tp-size 4 --dp-size 1 --port 8005
**启动sglang_client评测**
.. code-block:: bash
ais_bench --models vllm_api_stream_chat --datasets math500_gen_0_shot_cot_chat_prompt
**评测结果**
经过训练,模型在Math-500上的评分显著上升
+------+----------------------+---------+----------+------+----------------------+
| iter | dataset | version | metric | mode | sgl-api-stream-chat |
+======+======================+=========+==========+======+======================+
| 0 | math_prm800k_500 | c4b6f0 | accuracy | gen | 84.4 |
+------+----------------------+---------+----------+------+----------------------+
| 150 | math_prm800k_500 | c4b6f0 | accuracy | gen | 91.7 |
+------+----------------------+---------+----------+------+----------------------+
性能采集
-----------------------------------
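关于NPU profiling的详细文档请参考 `ascend_profiling_zh <https://github.com/verl-project/verl/blob/main/docs/ascend_tutorial/profiling/ascend_profiling_zh.rst>`_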
在 `Qwen3-30B`_ 的脚本中提供了基本的采集性能选项PROF_CONFIG,默认设置 global_profiler.steps=null 关闭采集, 开发者可根据实际需要进行参数修改
采集完成后,开发者可以使用 `MindStudio Insight `_ 进行数据解析
注: 在verl框架侧采集全量 Profiling 会产生海量且重复的算子记录,可以根据文档修改代码,仅采集关键阶段
================================================
FILE: docs/ascend_tutorial/examples/dapo_multi_model_optimization_practice.md
================================================
# DAPO multi model optimization practice
## DAPO 介绍
Last updated: 03/04/2026.
DAPO的论文可以参考:[DAPO](https://arxiv.org/pdf/2503.14476),其中包含以下几个关键技术。
* **Clip-Higher**: 通过对重要性采样比的上限剪裁促进了系统的多样性并避免了熵坍缩(Entropy Collapse)。
* **Dynamic Sampling**: 提高了训练效率和稳定性。DAPO提出了一种执行动态采样的策略,并过滤掉准确率等于1和0的提示组,从而保持批次间具有有效梯度的提示数量一致。
* **Token-level Policy Gradient Loss**: 在长链思维强化学习 (long-CoT RL) 场景中至关重要。
* **Overlong Reward Shaping**: 减少奖励噪声并稳定了训练。
在verl中,可以进行如下设置,从而进行DAPO算法的运行。
- **奖励模型的管理策略为 DAPO**
在dapo算法中,必须配置成dapo。
```
reward_model.reward_manager.name=dapo
```
- **Clip-Higher 更高裁剪**
`clip_ratio_low` 和 `clip_ratio_high` 用于指定 DAPO 目标函数中的 $\varepsilon_{\text {low }}$ 和 $\varepsilon_{\text {high }}$。
```
clip_ratio_low=0.2 # 裁剪比例下限,默认值为0.2
clip_ratio_high=0.28 # 裁剪比例上限,默认值为0.28
```
- **动态采样的相关配置**
将 `filter_groups.enable` 设置为 `True` 会过滤掉输出 `metric` 完全相同的组,例如对于 `acc` 指标,过滤掉输出准确率全部为 1 或 0 的组。
训练器会使用 `gen_batch_size` 进行重复采样,直到生成足够数量的符合条件的组,或者达到 `max_num_gen_batches` 所指定的上限为止。
```
data.gen_batch_size=${gen_prompt_bsz}
algorithm.filter_groups.enable=${enable_filter_groups} # 动态采样开关
algorithm.filter_groups.metric=${filter_groups_metric} # 使用准确率作为过滤标准
algorithm.filter_groups.max_num_gen_batches=${max_num_gen_batches} # 最大生成批次数量,最多重复生成数据的次数
```
- **Token-level Loss**
将 `loss_agg_mode` 设置为 `token-mean` 意味着计算一个批次中所有序列内所有 token 的(策略梯度)损失的平均值。
```
actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode}
# 注意:“token-mean”是默认行为。
```
- **奖励模型对超长回答的惩罚配置**
将 `overlong_buffer.enable` 设置为 `True` 将对输出长度过长但仍未超过硬上下文限制的输出进行惩罚。具体来说,当输出的长度超过 `max_response_length - overlong_buffer.len` 且超出 `0` 到 `overlong_buffer.len` 个 token 时,惩罚值会从 `0` 线性增加到 `overlong_buffer.penalty_factor`。
```
reward_model.overlong_buffer.enable=${enable_overlong_buffer} # 启用超长缓冲区惩罚,开启对超长输出的惩罚机制
reward_model.overlong_buffer.len=${overlong_buffer_len} # 缓冲区长度,定义缓冲区的 token 数量
reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor} #惩罚因子,最大惩罚强度
```
相关参数涉及的代码可以参考:[Recipe: Decoupled Clip and Dynamic Sampling Policy Optimization (DAPO)](https://github.com/verl-project/verl-recipe/blob/main/dapo/README.md)
## 硬件要求
当前支持Atlas 800T A3 与 Atlas 900 A3 SuperPoD。跑完本次最佳实践需要 2台Atlas 800T A3。关键软件版本可以参考:[Ascend Quickstart](https://github.com/volcengine/verl/blob/main/docs/ascend_tutorial/quick_start/ascend_quick_start.rst)
## 安装基础环境
| software | version|
| --- | --- |
| Python| >= 3.10, <3.12 |
| CANN | == 8.3.RC1 |
| torch | == 2.7.1 |
| torch_npu | == 2.7.1 |
| verl | main分支 commitId=252d76908b903ad8fb6969eb3a5e5f873c95ea2b |
| vllm | v0.11.0 |
| vllm-ascend | v0.11.0-dev|
| transformers | 4.57.3|
在本实践中, 我们通过指定 verl 的commit id 以避免引入其他问题
```
cd verl
git checkout 252d76908b903ad8fb6969eb3a5e5f873c95ea2b
# 指定相应的recipe版本
git submodule update --init --recursive recipe
cd recipe
git checkout main
```
## 模型训练
### 数据集准备
Geometry3k 数据集是由加利福尼亚大学洛杉矶分校与浙江大学联合研发的几何领域专用数据集,核心面向视觉问答(VQA)任务展开研究与模型训练。该数据集总计包含 3002 个样本,采用图像和文本两种模态数据形式构建,其中文本模态涵盖各类几何问题描述,图像则以可视化图表呈现问题中的几何图形信息,包括三角形、圆形、四边形等基础几何形状,以及不同图形间的位置、嵌套、相交等关联关系。可以从Hugging Face库下载对应的原始数据集:[Geometry3k ](https://huggingface.co/datasets/hiyouga/geometry3k)
```python
# 下载原始数据并预处理
python ./examples/data_preprocess/geo3k.py --local_dir=./data/geo3k
```
### 权重下载
从Hugging Face库下载对应的模型权重:[Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct/tree/main)
### jemalloc安装
为了确保 Ray 进程能够正常回收内存,需要安装并使能 jemalloc 库进行内存管理。
#### Ubuntu 操作系统
通过操作系统源安装jemalloc(注意: 要求ubuntu版本>=20.04):
```shell
sudo apt install libjemalloc2
```
在启动任务前执行如下命令通过环境变量导入jemalloc,需先通过 **find /usr -name libjemalloc.so.2** 确认文件是否存在 :
```shell
# arm64架构
export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libjemalloc.so.2
# x86_64架构
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2
```
#### OpenEuler 操作系统
执行如下命令从操作系统源安装jemalloc
```shell
yum install jemalloc
```
如果上述方法无法正常安装,可以通过源码编译安装。前往jemalloc官网下载最新稳定版本,官网地址:https://github.com/jemalloc/jemalloc/releases/
```shell
tar -xvf jemalloc-{version}.tar.bz2
cd jemalloc-{version}
./configure --prefix=/usr/local
make
make install
```
### 全局变量导入
- 为了确保 Ray 进程能够正常回收内存,需要安装并使能 jemalloc 库进行内存管理,用于更好管理内存,避免长跑过程中内存 OOM。
```
# 根据实际安装路径设置 jemalloc 环境变量,例如安装路径为:/usr/local/lib/libjemalloc.so.2(可通过 find /usr -name libjemalloc.so.2 确认文件是否存在)
export LD_PRELOAD=/usr/local/lib/libjemalloc.so.2
```
- 某些模型是通过 vllm ascend 进行优化的。但在某些情况下,优化后的模型可能并不适用。此时,将此值设置为 0 即可禁用优化后的模型。
```
export USE_OPTIMIZED_MODEL=0
```
- 启用vLLM V1
```
export VLLM_USE_V1=1
```
- 昇腾多卡通信的兜底配置,延长连接超时时间,避免集群环境下训练启动因连接慢而失败
```
export HCCL_CONNECT_TIMEOUT=5400
```
- 控制 vLLM 在昇腾芯片上是否启用NZ优化
```
export VLLM_ASCEND_ENABLE_NZ=0
```
### 训练
```
# Model Weights Paths
MODEL_PATH=hf_weights/Qwen3-VL-30B-A3B-Instruct
RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
# File System Paths
TRAIN_FILE=$RAY_DATA_HOME/datasets/geo3k/train.parquet
TEST_FILE=$RAY_DATA_HOME/datasets/geo3k/test.parquet
# 保存频率,-1默认不保存,如需评测请修改此参数
trainer.save_freq=-1
```
对于单机任务 Qwen3-VL-30B , 修改脚本中参数`trainer.nnodes`为 1, `trainer.n_gpus_per_node` 为16,然后直接bash执行verl仓上示例脚本
```
bash recipe/dapo/run_dapo_qwen3_vl_30b_fsdp2_npu.sh
```
对于多节点任务 Qwen3-VL-30B ,我们推荐使用以下脚本进行大规模多节点训练拉起
```
# Multi-node launcher: run this same script on every node.
# The node whose IP equals MASTER_ADDR becomes the Ray head and starts training
# once all nodes have joined; every other node joins the head as a worker.

# Clean up processes and Ray state left over from a previous run.
pkill -9 python
ray stop --force
rm -rf /tmp/ray

export VLLM_USE_V1=1
export HCCL_CONNECT_TIMEOUT=5400
export VLLM_ASCEND_ENABLE_NZ=0
export LD_PRELOAD=/usr/local/lib/libjemalloc.so.2
# Some models are optimized by vllm ascend. While in some case, e.g. rlhf training,
# the optimized model may not be suitable. In this case, set this value to 0 to disable the optimized model.
export USE_OPTIMIZED_MODEL=0

# Change this to the path of the training script you want to run.
DEFAULT_SH="./run_*.sh"
echo "Use $DEFAULT_SH"

ulimit -n 32768
# -p: do not fail when the directory already exists (e.g. on a rerun).
mkdir -p logs

NNODES=2
NPUS_PER_NODE=8
# Change this to the IP of the head node; it must be identical on all nodes.
MASTER_ADDR="IP FOR MASTER NODE"
# Change this to the communication network interface of the current node.
SOCKET_IFNAME="Your SOCKET IFNAME"
# HCCL and Gloo use the same interface, so derive both from SOCKET_IFNAME
# instead of maintaining three separate values.
export HCCL_SOCKET_IFNAME="$SOCKET_IFNAME"
export GLOO_SOCKET_IFNAME="$SOCKET_IFNAME"

# Get the IPv4 address of the current node from the selected interface.
# The second grep strips the "inet " / "inet addr:" prefix, so both the modern
# and the legacy ifconfig output formats yield a bare address.
CURRENT_IP=$(ifconfig "$SOCKET_IFNAME" | grep -Eo 'inet (addr:)?([0-9]{1,3}\.){3}[0-9]{1,3}' | grep -Eo '([0-9]{1,3}\.){3}[0-9]{1,3}' | head -n 1)
if [ -z "$CURRENT_IP" ]; then
    # Without an IP the script would silently take the worker branch with an
    # empty --node-ip-address, so fail early with a clear message instead.
    echo "Failed to get the IP address of interface '$SOCKET_IFNAME'; please check SOCKET_IFNAME." >&2
    exit 1
fi

if [ "$MASTER_ADDR" = "$CURRENT_IP" ]; then
    # Head node: start the Ray head, then poll until all nodes have registered.
    ray start --head --port 6766 --dashboard-host=$MASTER_ADDR --node-ip-address=$CURRENT_IP --dashboard-port=8260 --resources='{"NPU": '$NPUS_PER_NODE'}'

    while true; do
        ray_status_output=$(ray status)
        # Extract the number that follows "/" on the NPU line of `ray status`.
        npu_count=$(echo "$ray_status_output" | grep -oP '(?<=/)\d+\.\d+(?=\s*NPU)' | head -n 1)
        npu_count_int=$(echo "$npu_count" | awk '{print int($1)}')
        device_count=$((npu_count_int / $NPUS_PER_NODE))

        # Start training only when the number of joined nodes equals NNODES.
        if [ "$device_count" -eq "$NNODES" ]; then
            echo "Ray cluster is ready with $device_count devices (from $npu_count NPU resources), starting Python script."
            ray status
            # Intentionally unquoted so that a glob pattern in DEFAULT_SH still expands.
            bash $DEFAULT_SH
            break
        else
            echo "Waiting for Ray to allocate $NNODES devices. Current device count: $device_count"
            sleep 5
        fi
    done
else
    # Worker node: keep trying to register with the head node until it succeeds.
    while true; do
        # Try to join the Ray cluster.
        ray start --address="$MASTER_ADDR:6766" --resources='{"NPU": '$NPUS_PER_NODE'}' --node-ip-address=$CURRENT_IP

        # `ray status` exits with 0 only when the cluster is reachable.
        if ray status; then
            echo "Successfully connected to the Ray cluster!"
            break
        else
            echo "Failed to connect to the Ray cluster. Retrying in 5 seconds..."
            sleep 5
        fi
    done
fi

# Wait 600 seconds before the script exits.
sleep 600
```
DEFAULT_SH: 修改为训练所用配置 sh 文件路径。在此案例中修改为 [Qwen3_VL_30B](https://github.com/verl-project/verl-recipe/blob/main/dapo/run_dapo_qwen3_vl_30b_fsdp2_npu.sh) 路径。
NNODES 和 NPUS_PER_NODE: 修改为使用节点数量和每个节点 NPU 数量。在此案例中分别为2和8。
MASTER_ADDR:修改为对应主节点 IP。即所有节点的 MASTER_ADDR 应该相同。
SOCKET_IFNAME, HCCL_SOCKET_IFNAME, GLOO_SOCKET_IFNAME: 修改为对应通信网卡,通信网卡可以通过以下命令获取:
```
ifconfig |grep "$(hostname -I |awk '{print $1}'|awk -F '.' '{print $0}')" -B 1|awk -F ':' '{print$1}' | head -1 | tail -1
```
## 优化参考
- **启动动态批次大小**
根据单 GPU 的最大 Token 总数(ppo_max_token_len_per_gpu)动态调整批次大小
```
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
```
- **单个 GPU 能处理的最大 Token 总数**
当`use_dynamic_bsz=True`时,单 GPU 在一个微批次中能处理的最大 Token 数量
```
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len}
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
```
- **单个 GPU 微批次大小**
当`use_dynamic_bsz=True`时,框架会以该值为初始批次大小,再根据`ppo_max_token_len_per_gpu`向上 / 向下调整
```
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2
```
- **启用 FSDP2 框架**
“将模型参数、梯度、优化器状态分片存储在不同 GPU 上”,避免单卡加载全量模型导致显存溢出。
```
# 启用 FSDP2 框架
actor_rollout_ref.actor.strategy=fsdp2
actor_rollout_ref.ref.strategy=fsdp2
critic.strategy=fsdp2
# 仅用于 FSDP2:前向传播后重新分片以减少内存占用。
actor_rollout_ref.actor.fsdp_config.reshard_after_forward=True
# 仅用于 FSDP2:是否在模型前向传播后重新分片以节省内存。
actor_rollout_ref.ref.fsdp_config.reshard_after_forward=True
```
- **启用专家并行配置**
指定有多少个 GPU用于并行计算不同的专家网络
```
# MoE 架构 Actor 模型的专家并行配置
actor_rollout_ref.rollout.expert_parallel_size=8
```
================================================
FILE: docs/ascend_tutorial/examples/gspo_optimization_practice.md
================================================
# NPU Qwen3-32B GSPO Optimization Practice
Last updated: 02/26/2026.
本文章对应脚本地址:[qwen3_32b_gspo_npu](https://github.com/volcengine/verl/blob/main/examples/gspo_trainer/run_qwen3_32b_gspo_npu.sh)
## 算法适配
GSPO通过将优化颗粒度从**token级**提升到**sequence级**,规避了GRPO会遇到的**方差急剧增大**导致训练不稳定的情况,增加了训练的稳定性,同时该算法也在一定程度上提升了算法的收敛速度。
想要在verl仓库中成功调用GSPO算法,需要进行如下的必要配置
~~~python
# 核心算法配置
algorithm.adv_estimator=grpo \ # 使用GRPO优势估计器
algorithm.use_kl_in_reward=False \ # 不在奖励中添加KL惩罚
# GSPO策略损失模式
actor_rollout_ref.actor.policy_loss.loss_mode=gspo \ # 启用GSPO策略损失
# 极小裁剪范围(GSPO特色)
actor_rollout_ref.actor.clip_ratio_low=0.0003 \ # 裁剪下界,论文推荐值
actor_rollout_ref.actor.clip_ratio_high=0.0004 \ # 裁剪上界,论文推荐值
# KL配置(GSPO不使用KL loss)
actor_rollout_ref.actor.use_kl_loss=False \ # 禁用KL损失
actor_rollout_ref.actor.kl_loss_coef=0.0 \ # KL损失系数设为0
# 序列级损失聚合模式(GSPO核心)
actor_rollout_ref.actor.loss_agg_mode=seq-mean-token-mean \ # 序列级平均,GSPO论文推荐
# 批次配置
actor_rollout_ref.rollout.n=16 \ # 每个prompt生成16个响应(组采样)
~~~
一般选择入口函数为`verl.trainer.main_ppo`
## 基础环境
当前支持Atlas 800T A3 与 Atlas 900 A3 SuperPoD。跑完本次最佳实践需要 4 台 Atlas 800T A3。关键软件版本可以参考:[Ascend Quickstart](https://github.com/volcengine/verl/blob/main/docs/ascend_tutorial/quick_start/ascend_quick_start.rst)
### 安装基础环境
| software | version |
| ------------ | ---------------------------------------------------------- |
| Python | >= 3.10, <3.12 |
| CANN | == 8.3.RC1 |
| torch | == 2.7.1 |
| torch_npu | == 2.7.1 |
| verl | main分支 commitId=252d76908b903ad8fb6969eb3a5e5f873c95ea2b |
| vllm | v0.11.0 |
| vllm-ascend | v0.11.0-dev |
| transformers | 4.57.3 |
在本实践中, 我们通过指定 verl 的commit id 以避免引入其他问题
~~~bash
cd verl
git checkout 252d76908b903ad8fb6969eb3a5e5f873c95ea2b
# 指定相应的recipe版本
git submodule update --init --recursive recipe
~~~
### 权重获取
从Hugging Face库下载对应的模型权重:[Qwen/Qwen3-32B · Hugging Face](https://huggingface.co/Qwen/Qwen3-32B)
### 数据集准备
~~~bash
# 下载math-17k数据集
git clone https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k
# 下载AIME_2024测试数据集
git clone https://huggingface.co/datasets/Maxwell-Jia/AIME_2024
~~~
### jemalloc安装
为了确保 Ray 进程能够正常回收内存,需要安装并使能 jemalloc 库进行内存管理。
#### Ubuntu 操作系统
通过操作系统源安装jemalloc(注意: 要求ubuntu版本>=20.04):
```shell
sudo apt install libjemalloc2
```
在启动任务前执行如下命令通过环境变量导入jemalloc,需先通过 **find /usr -name libjemalloc.so.2** 确认文件是否存在 :
```shell
# arm64架构
export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libjemalloc.so.2
# x86_64架构
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2
```
#### OpenEuler 操作系统
执行如下命令从操作系统源安装jemalloc
```shell
yum install jemalloc
```
如果上述方法无法正常安装,可以通过源码编译安装。前往 jemalloc 发布页下载最新稳定版本,地址:https://github.com/jemalloc/jemalloc/releases/
```shell
tar -xvf jemalloc-{version}.tar.bz2
cd jemalloc-{version}
./configure --prefix=/usr/local
make
make install
```
在启动任务前执行如下命令通过环境变量导入jemalloc:
```shell
# Set LD_PRELOAD according to the actual install path. With the source build
# above (--prefix=/usr/local) the library is /usr/local/lib/libjemalloc.so.2;
# run `find /usr -name libjemalloc.so.2` to confirm where the file is.
export LD_PRELOAD=/usr/local/lib/libjemalloc.so.2
```
### 多机任务拉起
针对本实践提供的多机任务,可用下面的脚本拉起
~~~bash
# Multi-node launcher: run this same script on every node.
# The node whose IP equals MASTER_ADDR becomes the Ray head and starts training
# once all nodes have joined; every other node joins the head as a worker.

# Clean up processes and Ray state left over from a previous run.
pkill -9 python
ray stop --force
rm -rf /tmp/ray

export RAY_DEDUP_LOGS=0
export HYDRA_FULL_ERROR=1
export TASK_QUEUE_ENABLE=1
export HCCL_EXEC_TIMEOUT=3600
export HCCL_CONNECT_TIMEOUT=3600
export HCCL_ASYNC_ERROR_HANDLING=0
export CPU_AFFINITY_CONF=1
export VLLM_USE_V1=1
export VLLM_ATTENTION_BACKEND=XFORMERS
export VLLM_ASCEND_ENABLE_FLASHCOMM=1
export VLLM_ASCEND_ENABLE_PREFETCH_MLP=1
export VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE=1
export LD_PRELOAD=/usr/local/lib/libjemalloc.so.2

# Change this to the path of the training script you want to run.
DEFAULT_SH="./run_*.sh"
echo "Use $DEFAULT_SH"

ulimit -n 32768
# -p: do not fail when the directory already exists (e.g. on a rerun).
mkdir -p logs

NNODES=4
NPUS_PER_NODE=16
# Change this to the IP of the head node; it must be identical on all nodes.
MASTER_ADDR="IP FOR MASTER NODE"
# Change this to the communication network interface of the current node.
SOCKET_IFNAME="Your SOCKET IFNAME"
# HCCL and Gloo use the same interface, so derive both from SOCKET_IFNAME
# instead of maintaining three separate values.
export HCCL_SOCKET_IFNAME="$SOCKET_IFNAME"
export GLOO_SOCKET_IFNAME="$SOCKET_IFNAME"

# Get the IPv4 address of the current node from the selected interface.
# The second grep strips the "inet " / "inet addr:" prefix, so both the modern
# and the legacy ifconfig output formats yield a bare address.
CURRENT_IP=$(ifconfig "$SOCKET_IFNAME" | grep -Eo 'inet (addr:)?([0-9]{1,3}\.){3}[0-9]{1,3}' | grep -Eo '([0-9]{1,3}\.){3}[0-9]{1,3}' | head -n 1)
if [ -z "$CURRENT_IP" ]; then
    # Without an IP the script would silently take the worker branch with an
    # empty --node-ip-address, so fail early with a clear message instead.
    echo "Failed to get the IP address of interface '$SOCKET_IFNAME'; please check SOCKET_IFNAME." >&2
    exit 1
fi

if [ "$MASTER_ADDR" = "$CURRENT_IP" ]; then
    # Head node: start the Ray head, then poll until all nodes have registered.
    ray start --head --port 6766 --dashboard-host=$MASTER_ADDR --node-ip-address=$CURRENT_IP --dashboard-port=8260 --resources='{"NPU": '$NPUS_PER_NODE'}'

    while true; do
        ray_status_output=$(ray status)
        # Extract the number that follows "/" on the NPU line of `ray status`.
        npu_count=$(echo "$ray_status_output" | grep -oP '(?<=/)\d+\.\d+(?=\s*NPU)' | head -n 1)
        npu_count_int=$(echo "$npu_count" | awk '{print int($1)}')
        device_count=$((npu_count_int / $NPUS_PER_NODE))

        # Start training only when the number of joined nodes equals NNODES.
        if [ "$device_count" -eq "$NNODES" ]; then
            echo "Ray cluster is ready with $device_count devices (from $npu_count NPU resources), starting Python script."
            ray status
            # Intentionally unquoted so that a glob pattern in DEFAULT_SH still expands.
            bash $DEFAULT_SH
            break
        else
            echo "Waiting for Ray to allocate $NNODES devices. Current device count: $device_count"
            sleep 5
        fi
    done
else
    # Worker node: keep trying to register with the head node until it succeeds.
    while true; do
        # Try to join the Ray cluster.
        ray start --address="$MASTER_ADDR:6766" --resources='{"NPU": '$NPUS_PER_NODE'}' --node-ip-address=$CURRENT_IP

        # `ray status` exits with 0 only when the cluster is reachable.
        if ray status; then
            echo "Successfully connected to the Ray cluster!"
            break
        else
            echo "Failed to connect to the Ray cluster. Retrying in 5 seconds..."
            sleep 5
        fi
    done
fi

# Wait 600 seconds before the script exits.
sleep 600
~~~
DEFAULT_SH:修改为训练所用配置 sh 文件路径。在此案例中修改为 [Qwen3-32B](https://github.com/volcengine/verl/blob/main/examples/gspo_trainer/run_qwen3_32b_gspo_npu.sh) 路径。
NNODES 和 NPUS_PER_NODE:修改为使用节点数量和每个节点 NPU 数量。在此案例中分别为4和16。
MASTER_ADDR:修改为对应主节点 IP。即所有节点的 MASTER_ADDR 应该相同。
SOCKET_IFNAME, HCCL_SOCKET_IFNAME, GLOO_SOCKET_IFNAME: 修改为对应通信网卡,通信网卡可以通过以下命令获取:
```
ifconfig |grep "$(hostname -I |awk '{print $1}'|awk -F '.' '{print $0}')" -B 1|awk -F ':' '{print$1}' | head -1 | tail -1
```
## 性能调优
优化从训练、推理、调度和其他四个方面入手。
### 训练
#### 动态bsz
~~~bash
actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size))
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size))
~~~
**这个优化点主要调整上面这两个参数,不过需要注意这两个参数调整的太大会导致OOM**
**主要调整**`actor_ppo_max_token_len`,调大了会降低训练的耗时,调整`infer_ppo_max_token_len`没有明显的收益,可以不动
**这两个参数的作用介绍如下:**
**这两个参数用于控制动态批处理(dynamic batch size)模式下每个GPU处理的最大token数量**
- **`actor_ppo_max_token_len`**: Actor模型在PPO更新(前向+反向传播)时每个GPU能处理的最大token数
- **`infer_ppo_max_token_len`**: 推理阶段(Reference policy和Rollout)计算log概率时每个GPU能处理的最大token数
### 推理
#### ACLgraph+FULL_DECODE_ONLY
推理算子下发方面的优化,平均能有`15%~20%`左右的性能收益。
先看单开**ACLgraph**,如下:
~~~bash
# 开启ACLgraph+FULL_DECODE_ONLY(注意:当设置此参数为False时,TASK_QUEUE_ENABLE必须设置为1,不然会报错)
actor_rollout_ref.rollout.enforce_eager=False
actor_rollout_ref.rollout.engine_kwargs.vllm.compilation_config.cudagraph_capture_sizes='[8,16,32,64,128]' \
actor_rollout_ref.rollout.engine_kwargs.vllm.compilation_config.cudagraph_mode='FULL_DECODE_ONLY' \
~~~
`FULL_DECODE_ONLY`开启成功后有如下输出:

**`cudagraph_capture_sizes`参数设置指南**
cudagraph_capture_sizes设置的值对应的是批大小,这里的批大小不是配置里的DP域对应的那个批次大小,这里是相较于vllm来说的批大小,单位为**token**
默认生成的算法如下,可做参考

##### 推理后端切换
使用方式:`export VLLM_ATTENTION_BACKEND=XFORMERS`

注:需要注意某些后端在一些比较老的vllm-ascend版本内并不支持
##### 使能vllm v1版本
使用方式:`export VLLM_USE_V1=1`
可以常开,一般都是正收益。
### 调度
#### AIV
打开方式:设置`export HCCL_OP_EXPANSION_MODE="AIV"`
HCCL_OP_EXPANSION_MODE环境变量用于配置通信算法的编排展开位置,支持如下取值:
- AI_CPU:代表通信算法的编排展开位置在Device侧的AI CPU计算单元。
- AIV:代表通信算法的编排展开位置在Device侧的Vector Core计算单元。
- HOST:代表通信算法的编排展开位置为Host侧CPU,Device侧根据硬件型号自动选择相应的调度器。
- HOST_TS:代表通信算法的编排展开位置为Host侧CPU,Host向Device的Task Scheduler下发任务,Device的Task Scheduler进行任务调度执行。
下面介绍两种展开机制
##### HOST展开
- 软件栈工作在hostcpu,通信算法展开一个个task
- 每个task调用runtime接口,下发到device的rtsqueue
- STARS从rtsqueue上顺序拿取task
- 根据task类型分别调用SDMA和RDMA引擎。
**单算子瓶颈**:hostbound 每个task提交是2~5us,一个通信算子有几百个task,单算子场景不会在device上缓存,下发一个执行一个
##### AICpu机制展开
- host侧不下发一个个task,把通信算子作为一个个kernel,放在通信算子kernel的队列上去。
- STARS调度kernel队列流上的kernel,把kernel放到AiCPU上去执行。
- AICPU调用函数(kernel),用一个线程执行kernel 函数,在函数内把通信task展开,把task放到rtsqueue上,STARS调用。
- 降低host和aicpu交互,由几百次降低为一次。
- task的提交在AICPU上提交,做了提交的部分合并。
#### TASK_QUEUE_ENABLE
**使用方式:**`export TASK_QUEUE_ENABLE=2`
TASK_QUEUE_ENABLE,下发优化,图模式设置为1(即开启图模式的时候这个要设置为1),非图模式设置为2
示意图:

##### 绑核优化
**使用方式:**`export CPU_AFFINITY_CONF=1`
详细设置原理可看:https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0059.html
### 其他
以下内容汇总了若干全局环境变量的调优配置。由于这些参数在训练阶段与推理阶段往往都能带来正向收益,且目前尚缺乏足够精细的消融实验来严格区分它们各自对训练或推理的贡献占比,故统一归拢在此,供后续持续监控与进一步拆解分析。
#### 使能jemalloc
使用方式(注意需要先安装jemalloc库):`export LD_PRELOAD=/usr/local/lib/libjemalloc.so.2`
**安装使用教程:**[MindSpeed-RL/docs/install_guide.md · Ascend/MindSpeed-RL - AtomGit | GitCode](https://gitcode.com/Ascend/MindSpeed-RL/blob/master/docs/install_guide.md#高性能内存库-jemalloc-安装)
#### 多流复用
内存方面有优化
使能方式:`export MULTI_STREAM_MEMORY_REUSE=1`
原理介绍:https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0040.html
#### VLLM_ASCEND_ENABLE_FLASHCOMM
使用方式:`export VLLM_ASCEND_ENABLE_FLASHCOMM=1`
启用昇腾 NPU 特有的FLASHCOMM高速通信优化技术
地址:https://vllm-ascend.readthedocs.io/zh-cn/latest/user_guide/release_notes.html
#### VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE
使用方式:`export VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE=1`
启用昇腾 NPU针对大模型推理的稠密计算优化
地址:https://vllm-ascend.readthedocs.io/zh-cn/latest/user_guide/release_notes.html
#### VLLM_ASCEND_ENABLE_PREFETCH_MLP
使用方式:`export VLLM_ASCEND_ENABLE_PREFETCH_MLP=1`
启用 MLP 层的权重预取机制
### verl框架参数设置
主要是内存方面的一些设置开关(注意,这个里面的优化都或多或少会导致吞吐量有一定程度的劣化)
~~~bash
# 梯度检查点 (Gradient Checkpointing)
# 作用: 通过重新计算激活值来节省显存,以计算换内存。在前向传播时不保存中间激活值,反向传播时重新计算,可以显著降低显存占用,允许使用更大的batch size。
actor_rollout_ref.model.enable_gradient_checkpointing=True
# 参数卸载 (Parameter Offload)
# 作用: 将模型参数卸载到CPU内存,训练时再加载回GPU。
actor_rollout_ref.actor.fsdp_config.param_offload=${offload} # True
actor_rollout_ref.ref.fsdp_config.param_offload=${offload} # True
# 优化器状态卸载 (Optimizer Offload)
# 作用: 将优化器状态(如Adam的动量)卸载到CPU。优化器状态通常占用大量显存(对于Adam,每个参数需要额外8字节),卸载可以节省显存。
actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} # True
# 释放推理引擎缓存 (Free Cache Engine)
# 作用: 在训练阶段释放推理引擎的KV cache和权重。这是3D-HybridEngine的核心优化,允许在同一GPU上交替进行推理和训练,显著降低显存需求。
actor_rollout_ref.rollout.free_cache_engine=True
# 熵计算优化
# entropy_checkpointing: 在训练时对熵计算启用重计算,降低显存峰值
# entropy_from_logits_with_chunking: 分块处理logits张量(如2048 tokens一组),避免一次性加载整个[bsz*seq_len, vocab]张量
actor_rollout_ref.actor.entropy_checkpointing=True
actor_rollout_ref.ref.entropy_checkpointing=True
actor_rollout_ref.actor.entropy_from_logits_with_chunking=True
actor_rollout_ref.ref.entropy_from_logits_with_chunking=True
# 推理引擎显存配置
# gpu_memory_utilization: 控制vLLM使用的GPU显存比例(0.90 = 90%)
# enforce_eager=False: 启用CUDA graphs加速推理,但会占用额外显存
actor_rollout_ref.rollout.gpu_memory_utilization=0.90
actor_rollout_ref.rollout.enforce_eager=False
~~~
## NPU调优参考文章
环境变量相关:[环境变量列表-Ascend Extension for PyTorch6.0.0-昇腾社区](https://www.hiascend.com/document/detail/zh/Pytorch/600/apiref/Envvariables/Envir_001.html)
社区性能调优教程:[性能调优流程-Ascend Extension for PyTorch6.0.0-昇腾社区](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0001.html)
================================================
FILE: docs/ascend_tutorial/examples/run_qwen3_32B_megatron_1k_256k_npu.md
================================================
# Long Sequence Qwen3-32B 1k-to-256k Example
Last updated: 6/3/2026.
本章对Qwen3-32B进行了长序列开发。Qwen3-32B的模型能力为最长推到40k
## 全层实验
对Qwen3-32B进行了长序列开发,脚本如下:
```bash
# GRPO training of Qwen3-32B with the Megatron backend on NPU,
# using a 1k prompt length and a 256k response length.
set -x

# Runtime environment for vLLM / vllm-ascend and the NPU memory allocator.
export USE_OPTIMIZED_MODEL=0
export VLLM_USE_V1=1
export VLLM_ASCEND_ENABLE_NZ=0
export VLLM_VERSION="0.13.0"
export LD_PRELOAD=/usr/local/lib/libjemalloc.so.2
export PYTORCH_NPU_ALLOC_CONF="max_split_size_mb:2048"

# Experiment naming and checkpoint location.
PROJECT_NAME="GRPO-Qwen3-32B"
EXPERIMENT_NAME="GRPO-Qwen3-32B-megatron-gsm8k"
SAVE_CHECKPOINT_DIR=$HOME/verl_checkpoints

# Dataset paths.
math_train_path=$HOME/datasets/gsm8k/train.parquet
math_test_path=$HOME/datasets/gsm8k/test.parquet
train_files="['$math_train_path']"
test_files="['$math_test_path']"

use_dynamic_bsz=False
enable_chunked_prefill=True

# Sequence lengths and the derived per-device token budgets.
tp_size=8
max_prompt_length=1024
max_response_length=$((1024*256))
actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) / tp_size))
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) / tp_size))

# Context parallel size; referenced below as ${cp_size}.
cp_size=4

python3 -m verl.trainer.main_ppo \
    --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    algorithm.adv_estimator=grpo \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.shuffle=False \
    data.validation_shuffle=False \
    data.train_batch_size=64 \
    data.max_prompt_length=${max_prompt_length} \
    data.max_response_length=${max_response_length} \
    data.filter_overlong_prompts=False \
    data.truncation='error' \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.model.path=$HOME/hf_weights/Qwen3-32B \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=16 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=8 \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=1 \
    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
    actor_rollout_ref.actor.megatron.context_parallel_size=${cp_size} \
    +actor_rollout_ref.actor.megatron.override_transformer_config.context_parallel_size=${cp_size} \
    +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True \
    +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1 \
    +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \
    +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \
    actor_rollout_ref.actor.megatron.param_offload=True \
    actor_rollout_ref.actor.megatron.optimizer_offload=True \
    actor_rollout_ref.actor.megatron.grad_offload=True \
    actor_rollout_ref.actor.use_torch_compile=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=8 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
    actor_rollout_ref.rollout.enable_chunked_prefill=${enable_chunked_prefill} \
    actor_rollout_ref.rollout.enable_prefix_caching=True \
    actor_rollout_ref.rollout.enforce_eager=False \
    actor_rollout_ref.rollout.free_cache_engine=True \
    actor_rollout_ref.ref.megatron.param_offload=True \
    actor_rollout_ref.ref.use_torch_compile=False \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
    actor_rollout_ref.ref.megatron.use_dist_checkpointing=False \
    actor_rollout_ref.ref.megatron.dist_checkpointing_path=${SAVE_CHECKPOINT_DIR} \
    actor_rollout_ref.actor.megatron.use_dist_checkpointing=False \
    actor_rollout_ref.actor.megatron.dist_checkpointing_path=${SAVE_CHECKPOINT_DIR} \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger=console \
    trainer.n_gpus_per_node=16 \
    trainer.nnodes=2 \
    trainer.save_freq=100 \
    trainer.test_freq=-1 \
    trainer.total_training_steps=100 \
    trainer.device=npu \
    trainer.project_name=${PROJECT_NAME} \
    trainer.experiment_name=${EXPERIMENT_NAME} \
    trainer.total_epochs=30
```
- 相关实验结果

## 减层实验
在实际推理中,我们发现其最大在20k左右,因此对其进行减层实验,其response能达到40k。
在权重的`config.json`文件中,我们将`num_hidden_layers`从64减层到16
```
{
"architectures": [
"Qwen3ForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151645,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 5120,
"initializer_range": 0.02,
"intermediate_size": 25600,
"max_position_embeddings": 40960,
"max_window_layers": 64,
"model_type": "qwen3",
"num_attention_heads": 64,
"num_hidden_layers": 16,
"num_key_value_heads": 8,
"rms_norm_eps": 1e-06,
"rope_scaling": null,
"rope_theta": 1000000,
"sliding_window": null,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.51.0",
"use_cache": true,
"use_sliding_window": false,
"vocab_size": 151936
}
```
- 其实验结果如下:

================================================
FILE: docs/ascend_tutorial/faq/faq.rst
================================================
Last updated: 03/16/2026.
================================================
FILE: docs/ascend_tutorial/features/ascend_backend_features.md
================================================
# Ascend Backend Features Guide
Last updated: 03/03/2026.
昇腾全面支持verl生态建设,本文将介绍NPU上对于verl的适配工作及后端特性支持供开发者进行参考
---
## 推理后端
当前verl支持vllm/sglang这两种主流推理后端,均可在昇腾NPU上运行。
### 1. vllm:
昇腾通过vllm-ascend插件来支持vllm推理后端,该插件是 vLLM 社区支持 Ascend 后端的推荐方法。它遵循[[RFC]](https://github.com/vllm-project/vllm/issues/11162),提供了一个可插拔接口,将 Ascend NPU 与 vLLM 解耦。
##### 参数特性支持
| vllm参数| verl对应通用参数 | 简介|
| --- | --- | --- |
| `model_path` | `actor_rollout_ref.model.path` |模型权重文件的路径|
| `gpu_memory_utilization` | `actor_rollout_ref.rollout.gpu_memory_utilization` |用于控制每个阶段可使用的 GPU 内存量。它被指定为一个介于 0.0 和 1.0 之间的分数,其中:- 0.8 表示 GPU 总内存的 80%- 1.0 表示 GPU 总内存的 100%(不推荐,没有预留缓冲)|
| `enforce_eager`| `actor_rollout_ref.rollout.enforce_eager` |禁用图模式,verl默认为False|
| `enable_chunked_prefill`| `actor_rollout_ref.rollout.enable_chunked_prefill` | 分块预填充允许将大预填充分块成更小的块,并将它们与解码请求一起批处理。|
| `free_cache_engine`| `actor_rollout_ref.rollout.free_cache_engine` |在部署生成阶段之后卸载 KVCache,默认值为 True。|
| `max_model_len` | `actor_rollout_ref.rollout.max_model_len` | 模型能够处理的最大序列长度。它限制了单个输入序列的最大长度 |
| `tp_size`| `actor_rollout_ref.rollout.tensor_model_parallel_size * data_parallel_size`|TP并行度|
| `dp_size`| `actor_rollout_ref.rollout.data_parallel_size`|DP并行度|
| `ep_size`| `actor_rollout_ref.rollout.expert_parallel_size`|EP并行度|
| `node_rank`| `无,根据实际实例和卡数自动计算` |实例中的节点排序|
| `load_format`| `actor_rollout_ref.rollout.load_format` |要加载的模型权重格式|
| `disable_log_stats`| `actor_rollout_ref.rollout.disable_log_stats`|是否禁用统计信息日志的输出 |
| `nnodes`| `无,根据实际实例和卡数自动计算` | 每个实例包含的节点数量 |
| `trust_remote_code`| `actor_rollout_ref.model.trust_remote_code`|是否允许在 Hub 上定义自定义模型,并将其写入自己的建模文件中|
| `max_num_seqs` | `actor_rollout_ref.rollout.max_num_seqs` |正在运行的请求的最大数量|
| `max_num_batched_tokens`| `actor_rollout_ref.rollout.max_num_batched_tokens` |在一次批处理(batch)中可以处理的最大总Token数|
| `skip_tokenizer_init`| `actor_rollout_ref.rollout.skip_tokenizer_init` |跳过初始化分词器并将 input_ids 传递到推理请求中|
| `enable_prefix_caching` | `actor_rollout_ref.rollout.enable_prefix_caching`|`用于启用自动前缀缓存` |
| `quantization`| `actor_rollout_ref.rollout.quantization,默认为None`|`量化方法`|
| `enforce_eager`|`actor_rollout_ref.rollout.enforce_eager`|标志用于强制使用PyTorch的eager执行模式,而非默认的图执行模式|
### 2. sglang:
对于sglang推理后端,昇腾通过直接向sglang社区进行持续建设与维护来支持相关功能。
此外在verl中使用sglang还涉及以下组件, 我们在[quick start](https://github.com/verl-project/verl/blob/main/docs/ascend_tutorial/quick_start/ascend_sglang_quick_start.rst)中提供详细说明与一键安装脚本。
| 组件| 描述|
| --- | --- |
| [sgl_kernel_npu](https://github.com/sgl-project/sgl-kernel-npu/blob/main/python/sgl_kernel_npu/README.md) | Ascend NPU SGL 优化推理内核集合,包括注意力机制、归一化、激活函数、LoRA 适配器等。 |
| [deepep](https://github.com/sgl-project/sgl-kernel-npu/blob/main/python/deep_ep/README.md) | DeepEP的 Ascend 实现,为MoE模型提供高度优化的专家并行 (EP) 通信内核 |
##### 参数特性支持
verl中通过rollout config管理推理后端参数使能,包含通用参数和engine_kwargs自定义传参。
以下列举在verl中常见设置的sglang特性参数,更多参数介绍请参考 [sglang社区NPU特性支持](https://docs.sglang.io/platforms/ascend_npu_support_features.html)
| sglang参数| verl对应通用参数 | 简介|
| --- | --- | --- |
| model_path | actor_rollout_ref.model.path|模型权重文件的路径|
| mem_fraction_static| actor_rollout_ref.rollout.gpu_memory_utilization |用于静态分配(模型权重和键值缓存内存池)的内存比例|
| disable_cuda_graph| actor_rollout_ref.rollout.enforce_eager|禁用图模式,verl默认为False|
| enable_memory_saver| 无,verl中默认设置为True | 允许使用 release_memory_occupation 和 resume_memory_occupation 来节省内存|
| base_gpu_id| 无,根据实际实例和卡数自动计算 |用于分配每个实例上计算卡资源时的初始ID|
| gpu_id_step| 无,默认设置为1| 使用的连续计算卡ID 之间的差值|
| tp_size| actor_rollout_ref.rollout.tensor_model_parallel_size * data_parallel_size|TP并行度|
| dp_size| actor_rollout_ref.rollout.data_parallel_size|DP并行度|
| ep_size| actor_rollout_ref.rollout.expert_parallel_size|EP并行度|
| node_rank| 无,根据实际实例和卡数自动计算 |实例中的节点排序|
| load_format| actor_rollout_ref.rollout.load_format|要加载的模型权重格式|
| dist_init_addr| 无,自动计算|用于初始化分布式后端的主机地址|
| nnodes| 无,根据实际实例和卡数自动计算|每个实例包含的节点数量|
| trust_remote_code| actor_rollout_ref.model.trust_remote_code|是否允许在 Hub 上定义自定义模型,并将其写入自己的建模文件中|
| max_running_requests| actor_rollout_ref.rollout.max_num_seqs |正在运行的请求的最大数量|
| log_level| 无,默认设置为error |日志记录器的日志级别|
| skip_tokenizer_init| actor_rollout_ref.rollout.skip_tokenizer_init |跳过初始化分词器并将 input_ids 传递到推理请求中|
| skip_server_warmup| 无,默认设置为True |跳过预热|
| quantization| actor_rollout_ref.rollout.quantization,默认为None|量化方法|
| attention_backend|actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend|attention内核,NPU应该设置为ascend|
---
## 训练后端
### 1. FSDP
昇腾通过torch_npu提供FSDP相关支持能力,当前pytorch api支持度参照[版本说明](https://www.hiascend.com/document/detail/zh/Pytorch/730/apiref/PyTorchNativeapi/docs/zh/native_apis/pytorch_2-7-1/torch-distributed-fsdp.md)。
#### FSDP1
##### 参数特性支持
| verl参数 | 简介|
| --- | --- |
| `actor_rollout_ref.actor.fsdp_config.param_offload` |是否卸载模型权重到CPU,默认值为False|
| `actor_rollout_ref.actor.fsdp_config.optimizer_offload` |是否卸载优化器状态到CPU,默认值为False|
| `actor_rollout_ref.actor.fsdp_config.reshard_after_forward` |控制前向计算后的参数行为,平衡内存与通信。默认值为True:前向后重新分片参数,反向时重新全收集|
| `actor_rollout_ref.actor.fsdp_config.fsdp_size` | 每个FSDP分片组中的NPU数量;默认值-1表示自动。|
| `actor_rollout_ref.actor.fsdp_config.forward_prefetch` |在前向计算完成前预取下一次前向传播的 all-gather,仅用于FSDP1,默认值为False|
| `actor_rollout_ref.actor.fsdp_config.use_orig_params` | FSDP是否会使用module的原始参数来初始化,仅用于FSDP1,默认值为False|
| `actor_rollout_ref.actor.ulysses_sequence_parallel_size`|Ulysses序列并行大小|
| `actor_rollout_ref.actor.entropy_from_logits_with_chunking`|通过分块计算熵以减少显存峰值,默认值为False|
| `actor_rollout_ref.actor.fsdp_config.entropy_checkpointing`|在训练时对熵计算启用重计算,降低显存峰值,默认值为False|
| `actor_rollout_ref.actor.fsdp_config.forward_only` |是否只进行前向计算,默认值为False|
#### FSDP2
##### 参数特性支持
| verl参数 | 简介|
| --- | --- |
| `actor_rollout_ref.actor.fsdp_config.param_offload` |是否卸载模型权重到CPU,默认值为False|
| `actor_rollout_ref.actor.fsdp_config.optimizer_offload` |是否卸载优化器状态到CPU,默认值为False|
| `actor_rollout_ref.actor.fsdp_config.reshard_after_forward` |控制前向计算后的参数行为,平衡内存与通信。默认值为True:前向后重新分片参数,反向时重新全收集|
| `actor_rollout_ref.actor.fsdp_config.fsdp_size` | 每个FSDP分片组中的NPU数量;默认值-1表示自动。|
| `actor_rollout_ref.actor.ulysses_sequence_parallel_size`|Ulysses序列并行大小|
| `actor_rollout_ref.actor.entropy_from_logits_with_chunking`|通过分块计算熵以减少显存峰值,默认值为False|
| `actor_rollout_ref.actor.fsdp_config.entropy_checkpointing`|在训练时对熵计算启用重计算,降低显存峰值,默认值为False|
| `actor_rollout_ref.actor.fsdp_config.forward_only` |是否只进行前向计算,默认值为False|
### 2. Megatron
Megatron 是 NVIDIA 推出的一个专注于模型并行的训练框架仓库。如果一个仓库(例如 Verl)的训练后端使用了 Megatron,同时又希望在 NPU 上运行该仓库,那么就需要额外安装 MindSpeed 来提供底层支持。下文将介绍 MindSpeed 是如何实现无感替换 Megatron 中的关键组件,从而使其能够适配 NPU 的。
MindSpeed 底层的替换原理采用了 Monkey Patch 技术
* MindSpeed Monkey Patch框架
在verl里面通过`from mindspeed.megatron_adaptor import repatch `触发patch,调用栈如下:
~~~
from mindspeed.megatron_adaptor import repatch
├── 执行 megatron_adaptor.py 模块导入
├── 导入 features_manager 模块
├── 执行 mindspeed/features_manager/__init__.py
├── @AutoExecuteFunction 装饰器触发
├── patch_features() 自动执行
└── 进行`apply_features_pre_patches`和`apply_features_patches`操作
~~~
`Patch`类是整个patch系统的核心,实现了函数/类的动态替换
~~~python
class Patch
~~~
`parse_path`方法实现了动态模块导入和创建
~~~python
def parse_path(module_path, function_name, create_dummy)
~~~
patch系统支持多层装饰器叠加
~~~
def apply_patch(self):
final_patch_func = self.orig_func
if self.patch_func is not None:
final_patch_func = self.patch_func
# 应用所有装饰器
for wrapper in self.wrappers:
final_patch_func = wrapper(final_patch_func)
~~~
* MindSpeedPatchesManager类
`MindSpeedPatchesManager`作为全局单例管理所有patch
~~~python
class MindSpeedPatchesManager:
patches_info: Dict[str, Patch] = {}
~~~
* Feature集成模式
各个Feature通过继承`MindSpeedFeature`基类集成patch系统
~~~python
class MindSpeedFeature:
"""Base class for mindspeed features."""
def __init__(self, feature_name: str, optimization_level: int = 2):
self.feature_name = feature_name.lower().strip().replace('-', '_')
self.optimization_level = optimization_level
self.default_patches = self.optimization_level == 0
def is_need_apply(self, args):
"""Check the feature is need to apply."""
return (self.optimization_level <= args.optimization_level and getattr(args, self.feature_name, None)) \
or self.default_patches
def register_args(self, parser: ArgumentParser):
"""Register cli arguments to enable the feature."""
pass
def pre_validate_args(self, args: Namespace):
"""Validate the arguments of mindspeed before megatron args validation
and store some arguments of the mindspeed temporarily,
incase that megatron validate faile.
for example:
```python
origin_context_parallel_size = args.context_parallel_size
args.context_parallel_size = 1
```
"""
pass
def validate_args(self, args: Namespace):
"""Restore the arguments of the mindspeed.
for example:
```python
args.context_parallel_size = origin_context_parallel_size
```
"""
pass
def post_validate_args(self, args: Namespace):
"""validate mindspeed arguments after megatron arguments validation."""
pass
def pre_register_patches(self, patch_manager: MindSpeedPatchesManager, args: Namespace):
"""Register all patch functions before import megatron"""
pass
def register_patches(self, patch_manager: MindSpeedPatchesManager, args: Namespace):
"""Register all patch functions the feature is related."""
pass
def incompatible_check(self, global_args, check_args):
"""Register all incompatible functions the feature is related."""
if getattr(global_args, self.feature_name, None) and getattr(global_args, check_args, None):
raise AssertionError('{} and {} are incompatible.'.format(self.feature_name, check_args))
def dependency_check(self, global_args, check_args):
"""Register all dependency functions the feature is related."""
if getattr(global_args, self.feature_name, None) and not getattr(global_args, check_args, None):
raise AssertionError('{} requires {}.'.format(self.feature_name, check_args))
@staticmethod
def add_parser_argument_choices_value(parser, argument_name, new_choice):
"""Add a new choice value to the existing choices of a parser argument."""
for action in parser._actions:
exist_arg = isinstance(action, argparse.Action) and argument_name in action.option_strings
if exist_arg and action.choices is not None and new_choice not in action.choices:
action.choices.append(new_choice)
~~~
##### 参数特性支持
| verl参数 | 简介|
| --- | --- |
| `actor_rollout_ref.actor.megatron.optimizer_offload` |是否卸载模型优化器到CPU,默认值为False|
| `actor_rollout_ref.actor.megatron.use_mbridge` |是否使用mbridge进行权重转换|
| `actor_rollout_ref.actor.megatron.param_offload` |是否卸载模型权重到CPU,默认值为False|
| `actor_rollout_ref.actor.megatron.tensor_model_parallel_size` | 张量并行大小;默认值为1。|
| `actor_rollout_ref.actor.megatron.pipeline_model_parallel_size` |流水并行大小,默认值为1|
| `actor_rollout_ref.actor.megatron.expert_model_parallel_size` | 专家并行大小,默认值为1|
| `actor_rollout_ref.actor.megatron.expert_tensor_parallel_size`|TP拓展EP大小,默认值为null|
| `actor_rollout_ref.actor.megatron.context_parallel_size`|上下文并行(序列并行)大小,默认值为1|
| `actor_rollout_ref.actor.megatron.override_transformer_config.deallocate_pipeline_outputs`|张量在发送到下一个pp stage后,输出数据被释放,降低显存峰值,默认值为False|
| `actor_rollout_ref.actor.megatron.override_transformer_config.persist_layer_norm` |是否使用持久化 LayerNorm,默认值为False|
| `actor_rollout_ref.actor.megatron.override_transformer_config.moe_grouped_gemm` |是否使用Grouped GEMM,默认值为False|
| `actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype` |用于路由和专家输出加权平均的数据类型。使用 fp32 或 fp64 可以提高稳定性,尤其是在专家数量较多时,默认值为fp32|
| `actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split` |如果设置为 True,在流水线并行的划分和放置策略中,loss 层会被视为一个标准的 Transformer 层来处理。默认为False。|
| `actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split` |如果设置为 True,在流水线并行的划分和放置策略中,输入embedding 层会被视为一个标准的 Transformer 层来处理。默认为False。|
| `actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity` |重新计算激活的粒度,可选项为'full'、'selective'和'none'。其中full代表重新计算整个transformer layer,selective代表只重新计算transformer layer中的核心注意力部分。默认为'none'。|
| `actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method` |该参数需将recompute_granularity设置为'full'才生效,可选项为'uniform', 'block'。默认为None。|
| `actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers` |该参数需将recompute_granularity设置为'full'才生效,默认为None。若recompute_method设置为uniform,该参数含义为每个均匀划分的重新计算单元的transformer layers数量。例如你可以指定为--recompute_granularity full --recompute_method uniform --recompute_num_layers 4。recompute_num_layers越大,显存占用越小,计算成本越大。注意:当前进程中的模型层数需能被recompute_num_layers整除。默认为None。|
| `actor_rollout_ref.actor.megatron.use_dist_checkpointing` |是否使用分布式权重,默认值为False|
| `actor_rollout_ref.actor.megatron.dist_checkpointing_path` |分布式权重路径,默认值为null|
| `actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn` |是否使用Flash Attention(FA),默认值为true|
| `actor_rollout_ref.actor.megatron.override_transformer_config.use_fused_rotary_pos_emb` |是否使用融合旋转位置编码,默认值为False|
| `actor_rollout_ref.actor.megatron.override_transformer_config.use_fused_swiglu` |是否使用融合swiglu,默认值为False|
| `actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_first_pipeline_stage` |第一个pipeline stage 的层数,默认值为none|
| `actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage` |最后一个pipeline stage 的层数,默认值为none|
================================================
FILE: docs/ascend_tutorial/features/ascend_consistency.rst
================================================
推理一致性指导
====================================
在昇腾设备上对齐verl和vLLM两个框架下的推理结果。
Last updated: 11/17/2025.
这是一份在昇腾设备上对齐verl和vLLM两个框架下推理结果的教程。
环境变量配置
~~~~~~~~~~~~
在多卡通信情况下:
- HCCL通信下(默认场景):
- export CLOSE_MATMUL_K_SHIFT=1
- export ATB_MATMUL_SHUFFLE_K_ENABLE=0
- export HCCL_DETERMINISTIC="true"
- export VLLM_ENABLE_V1_MULTIPROCESSING=0
- LCCL通信下(通过export HCCL_OP_EXPANSION_MODE="AIV"使能):
- export CLOSE_MATMUL_K_SHIFT=1
- export ATB_MATMUL_SHUFFLE_K_ENABLE=0
- export LCCL_DETERMINISTIC=1
- export ATB_LLM_LCOC_ENABLE=0
- export VLLM_ENABLE_V1_MULTIPROCESSING=0
在单卡无通信情况下:
- HCCL和LCCL通信下:
- export CLOSE_MATMUL_K_SHIFT=1
- export ATB_MATMUL_SHUFFLE_K_ENABLE=0
- export VLLM_ENABLE_V1_MULTIPROCESSING=0
vLLM初始化参数
~~~~~~~~~~~~~~~~
需要对 SamplingParams 参数里单独设置seed, 保持vLLM和verl推理结果一致, 举例修改如下:
.. code:: python
sampling_params = SamplingParams(n=1,
logprobs=0, # can be set to 0 and let actor to recompute
max_tokens=config.response_length,
repetition_penalty=config.get("repetition_penalty", 1.0),
seed=1234)
================================================
FILE: docs/ascend_tutorial/profiling/ascend_profiling_en.rst
================================================
Profiling Data Collection Guide
==========================================================================================
Last updated: 12/20/2025.
This is a tutorial for collecting profiling (performance) data on Ascend devices
when training with the GRPO or DAPO algorithm on the FSDP or MindSpeed (Megatron) backend.
Configuration
-------------
Leverage two levels of configuration to control data collection:
- **Global profiler control**: Use parameters in ``verl/trainer/config/ppo_trainer.yaml`` (FSDP) or ``verl/trainer/config/ppo_megatron_trainer.yaml`` (MindSpeed) to control the collection mode and steps.
- **Role profiler control**: Use parameters in each role's ``profiler`` field to control how data is collected for that role.
Global collection control
~~~~~~~~~~~~~~~~~~~~~~~~~
Use parameters in ppo_trainer.yaml to control the collection mode
and steps.
- global_profiler: Control the ranks and mode of profiling
- tool: The profiling tool to use, options are nsys, npu, torch,
torch_memory.
- steps: This parameter can be set as a list that has
collection steps, such as [2, 4], which means it will collect steps 2
and 4. If set to null, no collection occurs.
- save_path: The path to save the collected data. Default is
"outputs/profile".
Role collection control
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In each role's ``profiler`` field, you can control the collection mode for that role.
- enable: Whether to enable profiling for this role.
- all_ranks: Whether to collect data from all ranks.
- ranks: A list of ranks to collect data from. If empty, no data is collected.
- tool_config: Configuration for the profiling tool used by this role.
Use parameters in each role's ``profiler.tool_config.npu`` to control npu profiler behavior:
- level: Collection level—options are level_none, level0, level1, and
level2
- level_none: Disables all level-based data collection (turns off profiler_level).
- level0: Collect high-level application data, underlying NPU data, and operator execution details on NPU. After balancing data volume and analytical capability, Level 0 is recommended as the default configuration.
- level1: Extends level0 by adding CANN-layer AscendCL data and AI Core performance metrics on NPU.
- level2: Extends level1 by adding CANN-layer Runtime data and AI CPU metrics.
- contents: A list of options to control the collection content, such as
npu, cpu, memory, shapes, module, stack.
- npu: Whether to collect device-side performance data.
- cpu: Whether to collect host-side performance data.
- memory: Whether to enable memory analysis.
- shapes: Whether to record tensor shapes.
- module: Whether to record framework-layer Python call stack information. It is recommended to use 'module' instead of 'stack' for recording call stack information, as it incurs less performance overhead.
- stack: Whether to record operator call stack information.
- analysis: Enables automatic data parsing.
- discrete: Whether to enable discrete mode.
Examples
--------
Disabling collection
~~~~~~~~~~~~~~~~~~~~
.. code:: yaml
global_profiler:
steps: null # disable profile
End-to-End collection
~~~~~~~~~~~~~~~~~~~~~
.. code:: yaml
global_profiler:
steps: [1, 2, 5]
save_path: ./outputs/profile
actor_rollout_ref:
actor: # Set actor role profiler collection configuration parameters
profiler:
enable: True
all_ranks: True
tool_config:
npu:
discrete: False
contents: [npu, cpu] # Control collection list, default cpu, npu, can configure memory, shapes, module, etc.
# rollout & ref follow actor settings
Discrete Mode Collection
~~~~~~~~~~~~~~~~~~~~~~~~
.. code:: yaml
global_profiler:
steps: [1, 2, 5]
save_path: ./outputs/profile
actor_rollout_ref:
actor:
profiler:
enable: True # Set to True to profile training
all_ranks: False
ranks: [0] # Global Rank 0
tool_config:
npu:
discrete: True
contents: [npu, cpu]
rollout:
profiler:
enable: True # Set to True to profile inference
all_ranks: False
ranks: [0] # In Agent Loop mode, this is the Replica Rank (e.g., 0-th instance)
tool_config:
npu:
discrete: True # Must be enabled in Agent Loop mode
# ref follow actor settings
**Agent Loop Mode Description**:
When Rollout runs in `Agent Loop <../advance/agent_loop.rst>`_ mode, performance data for the Rollout phase **must be collected using discrete mode**. In this case, the Profiler is triggered by the inference engine backend.
1. Rank Definition: ranks in the Rollout configuration refers to Replica Rank (inference instance index), not Global Rank.
2. Inference Engine Support: Currently, vLLM and SGLang engines are supported without additional settings. Specific details are as follows:
- vLLM Engine: Automatically collects AsyncLLM scheduling stacks and inference process performance data. Does not support setting analysis (defaults to no analysis, requires offline analysis) and profiler_level (defaults to level1).
- SGLang Engine: Automatically collects inference process performance data. Does not support the memory option in contents. Does not support setting analysis (defaults to enabled) and profiler_level (defaults to level0).
Visualization
-------------
Collected data is stored in the user-defined save_path and can be
visualized by using the `MindStudio Insight `_ tool.
Additionally, in a Linux environment, the MindStudio Insight tool is provided in the form of a `JupyterLab Plugin `_, offering a more intuitive and highly interactive user interface. The advantages of the JupyterLab plugin are as follows:
- Seamless integration: Supports running the MindStudio Insight tool directly within the Jupyter environment, eliminating the need to switch platforms or copy data from the server, enabling data to be collected and used immediately.
- Fast startup: Allows MindStudio Insight to be launched quickly via the JupyterLab command line or graphical interface.
- Smooth operation: In a Linux environment, launching MindStudio Insight through JupyterLab effectively alleviates performance lag compared to the full-package communication mode, significantly improving the user experience.
- Remote access: Supports remotely launching MindStudio Insight. Users can connect to the service via a local browser for direct visual analysis, reducing the difficulty of uploading and downloading data during large-model training or inference.
If the analysis parameter is set to False, offline parsing is required after data collection:
.. code:: python
import torch_npu
# Set profiler_path to the parent directory of the "localhost.localdomain___ascend_pt" folder
torch_npu.profiler.profiler.analyse(profiler_path=profiler_path)
Advanced Guide: Fine-grained Collection
---------------------------------------
Background and Challenges
~~~~~~~~~~~~~~~~~~~~~~~~~
Although the configuration-based collection method mentioned above is convenient, it faces challenges in training scenarios with **long sequences (Long Context)** or **large global batch sizes (Large Global Batch Size)**. Within a complete training step (Step), model computation exhibits high-frequency and repetitive characteristics:
1. **Rollout phase**: Sequence generation (Generate Sequence) is an autoregressive process involving thousands of forward computations of the Decoder model.
2. **Training phase**: To control peak memory usage, verl typically adopts a Micro-Batch strategy, dividing large data streams into multiple micro-batches for computation.
- **compute_log_prob (Actor/Ref)**: Involves multiple rounds of pure forward propagation.
- **update_policy (Actor/Critic)**: Involves multiple rounds of forward and backward propagation.
This characteristic leads to massive and repetitive operator records from full profiling. As shown in the image below:
.. image:: https://raw.githubusercontent.com/mengchengTang/verl-data/master/verl_ascend_profiler.png
Even with ``discrete`` mode enabled, performance data files for a single stage can still reach several TB, leading to **parsing failures** or **visualization tool lag**.
Solution: Critical Path Sampling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To solve the above problems, we can adopt a **critical path sampling** strategy: Based on the API interface provided by `torch_npu.profiler `_, directly modify Python source code to collect only representative data segments (such as specific Decode Steps or the first Micro-Batch).
**Important Notes**
1. This chapter involves direct source code modification. It is recommended to back up files before modification and restore them after debugging.
2. When using code instrumentation for collection, be sure to **disable global collection** (``global_profiler: steps: null``) in ``ppo_trainer.yaml`` or ``ppo_megatron_trainer.yaml`` to avoid Profiler conflicts.
1. Fine-grained Collection in Rollout Phase
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
For vLLM or SGLang inference engines, we can control the ``schedule`` parameter to collect model forward propagation performance data for specific tokens.
**vLLM Engine**
- **Reference Version**: vLLM v0.11.0, vLLM-Ascend v0.11.0rc1
- **Modified File**: ``vllm-ascend/vllm_ascend/worker/worker_v1.py``
.. code-block:: diff
class NPUWorker(WorkerBase):
def __init__(self, *args, **kwargs):
# ... existing code ...
+ # Initialize profiler
+ import torch_npu
+ experimental_config = torch_npu.profiler._ExperimentalConfig(
+ profiler_level=torch_npu.profiler.ProfilerLevel.Level1,
+ export_type=torch_npu.profiler.ExportType.Db, # You can choose torch_npu.profiler.ExportType.Text format
+ )
+ self.profiler_npu = torch_npu.profiler.profile(
+ activities=[torch_npu.profiler.ProfilerActivity.CPU, torch_npu.profiler.ProfilerActivity.NPU],
+ with_modules=False, # Collect call stack
+ profile_memory=False, # Collect memory
+ experimental_config=experimental_config,
+ # Skip first step, warmup one step, collect 3 steps, repeat 1 time. If you want to collect decode steps 31~60 (skip the first 29 steps and use step 30 as warmup), set schedule=torch_npu.profiler.schedule(wait=29, warmup=1, active=30, repeat=1)
+ schedule=torch_npu.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
+ on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/vllm_profile", analyse_flag=True) # Data save path and whether to parse online
+ )
+ self.profiler_npu.start()
# ... existing code ...
def execute_model(self, scheduler_output=None, intermediate_tensors=None, **kwargs):
# ... existing code ...
output = self.model_runner.execute_model(scheduler_output,
intermediate_tensors)
+ self.profiler_npu.step() # Drive schedule to collect partial decode steps
# ... existing code ...
**SGLang Engine**
- **Reference Version**: SGLang master branch
- **Modified File**: ``sglang/python/sglang/srt/model_executor/model_runner.py``
.. code-block:: diff
# ... existing imports ...
+ import torch_npu
class ModelRunner:
def __init__(self, *args, **kwargs):
# ... existing init code ...
+ # Initialize profiler (same configuration as above, omitted)
+ experimental_config = torch_npu.profiler._ExperimentalConfig(...)
+ self.profiler_npu = torch_npu.profiler.profile(
+ # ...
+ # Skip first step, warmup one step, collect 3 steps, repeat 1 time.
+ schedule=torch_npu.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
+ on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/sglang_profile", analyse_flag=True)
+ )
+ self.profiler_npu.start()
def forward(self, forward_batch, **kwargs):
# ... existing code ...
+ self.profiler_npu.step() # Drive schedule to collect partial decode steps
return output
2. Fine-grained Collection in compute_log_prob (Actor & Ref) Phase
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This phase computes probability distributions for new and old policies.
**FSDP Backend**
The FSDP backend allows fine-grained control at the Micro-Batch level.
- **Modified File**: ``verl/workers/actor/dp_actor.py``
.. code-block:: diff
# ... import dependencies ...
+ import torch_npu
class DataParallelPPOActor(BasePPOActor):
def compute_log_prob(self, data: DataProto, calculate_entropy=False) -> torch.Tensor:
+ role = "Ref" if self.actor_optimizer is None else "Actor"
+ # Prepare profiler (same configuration as above, omitted)
+ experimental_config = torch_npu.profiler._ExperimentalConfig(...)
+ self.prof_npu = torch_npu.profiler.profile(
+ # ...
+ # wait=0, warmup=0, active=1: directly collect first micro-batch
+ schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1),
+ on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(f"./outputs/{role}_compute_log_prob", analyse_flag=True)
+ )
+ # This function is shared by ref and actor, set role flag to distinguish. If you want to collect actor_compute_log_prob, set if role=="Actor":
+ if role=="Ref":
+ self.prof_npu.start()
for micro_batch in micro_batches:
# ... original computation logic ...
with torch.no_grad():
entropy, log_probs = self._forward_micro_batch(...)
+ # Drive schedule to collect micro batch
+ if role=="Ref":
+ self.prof_npu.step()
# ...
**Megatron Backend**
The Micro-Batch scheduling in the Megatron backend is managed internally by the framework and does not currently support fine-grained collection at the Micro-Batch level through simple code instrumentation. It is recommended to use global configuration for collection.
3. Fine-grained Collection in update_policy (Actor & Critic) Phase
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The Update phase includes forward and backward propagation.
**FSDP Backend**
The FSDP backend supports collection at both Mini-Batch and Micro-Batch granularities.
- **Modified File**: ``verl/workers/actor/dp_actor.py``
.. code-block:: diff
# ... import dependencies ...
+ import torch_npu
class DataParallelPPOActor(BasePPOActor):
def update_policy(self, data: DataProto):
+ # Prepare profiler (same configuration as above, omitted)
+ experimental_config = torch_npu.profiler._ExperimentalConfig(...)
+ self.prof_npu = torch_npu.profiler.profile(
+ # ...
+ # Only collect first Mini Batch (including all Micro-Batch computations and one optimizer update)
+ schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1),
+ on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/fsdp_actor_update_profile", analyse_flag=True)
+ )
+ self.prof_npu.start()
# ... PPO Epochs loop ...
for _ in range(self.config.ppo_epochs):
# ... Mini Batch loop ...
for batch_idx, mini_batch in enumerate(mini_batches):
# ... mini_batches split ...
for i, micro_batch in enumerate(micro_batches):
# ... Original Forward & Backward logic ...
# ... loss.backward() ...
pass
grad_norm = self._optimizer_step()
+ # Drive schedule to collect mini batch, if you want micro batch collection, move self.prof_npu.step() inside the micro_batch loop
+ self.prof_npu.step()
**Megatron Backend**
The Megatron backend supports collection at the Mini-Batch granularity.
- **Modified File**: ``verl/workers/actor/megatron_actor.py``
.. code-block:: diff
class MegatronPPOActor(BasePPOActor):
def update_policy(self, dataloader: Iterable[DataProto]) -> dict:
# ...
+ # Prepare profiler (same configuration as above, omitted)
+ experimental_config = torch_npu.profiler._ExperimentalConfig(...)
+ self.prof_npu = torch_npu.profiler.profile(
+ # ...
+ # Only collect computation of first Mini Batch (including all Micro-Batches) and one optimizer update
+ schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1),
+ on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/megatron_actor_update_profile", analyse_flag=True)
+ )
+ self.prof_npu.start()
for data in dataloader:
# ... internally calls self.forward_backward_batch for computation ...
# ... metric_micro_batch = self.forward_backward_batch(...)
# ... self.actor_optimizer.step() ...
+ # Drive schedule to collect mini batch
+ self.prof_npu.step()
================================================
FILE: docs/ascend_tutorial/profiling/ascend_profiling_zh.rst
================================================
Profiling采集指导
==================================================================================
Last updated: 12/20/2025.
这是一份在昇腾设备上基于FSDP或MindSpeed(Megatron)后端,使用GRPO或DAPO算法进行数据采集的教程。
配置
----
使用两级profile设置来控制数据采集
- 全局采集控制:使用verl/trainer/config/ppo_trainer.yaml(FSDP),或verl/trainer/config/ppo_megatron_trainer.yaml(MindSpeed)中的配置项控制采集的模式和步数。
- 角色profiler控制:通过每个角色 ``profiler`` 字段中的配置项控制该角色的各项采集参数。
全局采集控制
~~~~~~~~~~~~
通过 ppo_trainer.yaml 中的参数控制采集步数和模式:
- global_profiler: 控制采集的rank和模式
- tool: 使用的采集工具,选项有 nsys、npu、torch、torch_memory。
- steps: 此参数可以设置为包含采集步数的列表,例如 [2, 4],表示将采集第2步和第4步。如果设置为 null,则不进行采集。
- save_path: 保存采集数据的路径。默认值为 "outputs/profile"。
角色profiler控制
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
在每个角色的 ``profiler`` 字段中,您可以控制该角色的采集模式。
- enable: 是否为此角色启用性能分析。
- all_ranks: 是否从所有rank收集数据。
- ranks: 要收集数据的rank列表。如果为空,则不收集数据。
- tool_config: 此角色使用的性能分析工具的配置。
通过每个角色的 ``profiler.tool_config.npu`` 中的参数控制具体采集行为:
- level: 采集级别—选项有 level_none、level0、level1 和 level2
- level_none: 禁用所有基于级别的数据采集(关闭 profiler_level)。
- level0: 采集高级应用数据、底层NPU数据和NPU上的算子执行详情。在权衡数据量和分析能力后,level0是推荐的默认配置。
- level1: 在level0基础上增加CANN层AscendCL数据和NPU上的AI Core性能指标。
- level2: 在level1基础上增加CANN层Runtime数据和AI CPU指标。
- contents: 控制采集内容的选项列表,例如
npu、cpu、memory、shapes、module、stack。
- npu: 是否采集设备端性能数据。
- cpu: 是否采集主机端性能数据。
- memory: 是否启用内存分析。
- shapes: 是否记录张量形状。
- module: 是否记录框架层Python调用栈信息。相较于stack,更推荐使用module记录调用栈信息,因其产生的性能膨胀更低。
- stack: 是否记录算子调用栈信息。
- analysis: 启用自动数据解析。
- discrete: 使用离散模式。
示例
----
禁用采集
~~~~~~~~~~~~~~~~~~~~
.. code:: yaml
global_profiler:
steps: null # disable profile
端到端采集
~~~~~~~~~~~~~~~~~~~~~
.. code:: yaml
global_profiler:
steps: [1, 2, 5]
save_path: ./outputs/profile
actor_rollout_ref:
actor: # 设置 actor role 的 profiler 采集配置参数
profiler:
enable: True
all_ranks: True
tool_config:
npu:
discrete: False
contents: [npu, cpu] # 控制采集列表,默认cpu、npu,可配置memory、shapes、module等
# rollout & ref follow actor settings
离散模式采集
~~~~~~~~~~~~~~~~~~~~~~~~
.. code:: yaml
global_profiler:
steps: [1, 2, 5]
save_path: ./outputs/profile
actor_rollout_ref:
actor:
profiler:
enable: True # 设置为 True 以采集训练阶段
all_ranks: False
ranks: [0] # 全局 Rank 0
tool_config:
npu:
discrete: True
contents: [npu, cpu]
rollout:
profiler:
enable: True # 设置为 True 以采集推理阶段
all_ranks: False
ranks: [0] # 在 Agent Loop 模式下,此处指推理实例的 Replica Rank (例如第 0 个实例)
tool_config:
npu:
discrete: True # Agent Loop 模式下必须开启离散模式
# ref follow actor settings
**Agent Loop 模式说明**:
在 `Agent Loop <../advance/agent_loop.rst>`_ 模式下,Rollout 阶段的性能数据 **必须使用离散模式** 采集,此时 Profiler 由推理引擎后端触发。
1. Rank 定义:Rollout 配置中的 ranks 指代 Replica Rank(推理实例索引),而非全局 Rank。
2. 推理引擎支持:当前支持vLLM和SGLang引擎,无需额外设置。具体说明如下:
- vLLM 引擎:自动采集 AsyncLLM 调度栈及推理进程性能数据。不支持设置 analysis(默认不解析,需离线解析)和 profiler_level(默认 level1)。
- SGLang 引擎:自动采集推理进程性能数据。不支持 contents 中的 memory 配置项。不支持设置 analysis(默认解析)和 profiler_level(默认 level0)。
可视化
------
采集后的数据存放在用户设置的save_path下,可通过 `MindStudio Insight `_ 工具进行可视化。
另外在Linux环境下,MindStudio Insight工具提供了 `JupyterLab插件 `_ 形态,提供更直观、交互性更强的操作界面。JupyterLab插件优势如下:
- 无缝集成:支持在Jupyter环境中直接运行MindStudio Insight工具,无需切换平台,无需拷贝服务器上的数据,实现数据即采即用。
- 快速启动:通过JupyterLab的命令行或图形界面,可快速启动MindStudio Insight工具。
- 运行流畅:在Linux环境下,通过JupyterLab环境启动MindStudio Insight,相较于整包通信,有效解决了运行卡顿问题,操作体验显著提升。
- 远程访问:支持远程启动MindStudio Insight,可通过本地浏览器远程连接服务直接进行可视化分析,缓解了大模型训练或推理数据上传和下载的困难。
如果analysis参数设置为False,采集之后需要进行离线解析:
.. code:: python
import torch_npu
# profiler_path请设置为"localhost.localdomain___ascend_pt"目录的上一级目录
torch_npu.profiler.profiler.analyse(profiler_path=profiler_path)
进阶指南:精细化采集
--------------------
背景与挑战
~~~~~~~~~~
上述基于配置文件的采集方式虽然便捷,但在 **长序列 (Long Context)** 或 **大全局批量 (Large Global Batch Size)** 的训练场景中面临挑战。
在一个完整的训练步 (Step) 内,模型计算呈现出高频次、重复性的特征:
1. Rollout 阶段:序列生成 (Generate Sequence) 是一个自回归过程,涉及成千上万次 Decoder 模型的前向计算。
2. Training 阶段:为了控制显存峰值,verl 通常采用 Micro-Batch 策略,将庞大的数据流切分为多个微批次进行计算。
- compute_log_prob (Actor/Ref):涉及多轮纯前向传播。
- update_policy (Actor/Critic):涉及多轮前向与反向传播。
这种特性会导致全量 Profiling 产生海量且重复的算子记录。如下图所示:
.. image:: https://raw.githubusercontent.com/mengchengTang/verl-data/master/verl_ascend_profiler.png
即使使用了 ``discrete`` 模式,单个阶段的性能数据文件仍可能达到数 TB,导致 **解析失败** 或 **可视化工具卡顿** 。
解决方案:关键路径采样
~~~~~~~~~~~~~~~~~~~~~~
为了解决上述问题,我们可以采用 **关键路径采样** 策略:基于 `torch_npu.profiler `_ 提供的API接口,直接修改 Python 源码,仅采集具有代表性的数据片段(如特定 Decode Step 或首个 Micro-Batch)。
**重要提示**
1. 本章节涉及直接修改源码。建议修改前备份文件,调试完成后恢复。
2. 使用代码插桩采集时,请务必在 ``ppo_trainer.yaml`` 或 ``ppo_megatron_trainer.yaml`` 中 **禁用全局采集** (``global_profiler: steps: null``),以避免 Profiler 冲突。
1. Rollout 阶段精细化采集
~~~~~~~~~~~~~~~~~~~~~~~~~
对于 vLLM 或 SGLang 推理引擎,我们可以通过控制 ``schedule`` 参数来控制采集模型在特定token的前向传播性能数据。
**vLLM 引擎**
- **参考版本**:vLLM v0.11.0, vLLM-Ascend v0.11.0rc1
- **修改文件**:``vllm-ascend/vllm_ascend/worker/worker_v1.py``
.. code-block:: diff
class NPUWorker(WorkerBase):
def __init__(self, *args, **kwargs):
# ... existing code ...
+ # Initialize profiler
+ import torch_npu
+ experimental_config = torch_npu.profiler._ExperimentalConfig(
+ profiler_level=torch_npu.profiler.ProfilerLevel.Level1,
+ export_type=torch_npu.profiler.ExportType.Db, # 可选择torch_npu.profiler.ExportType.Text格式
+ )
+ self.profiler_npu = torch_npu.profiler.profile(
+ activities=[torch_npu.profiler.ProfilerActivity.CPU, torch_npu.profiler.ProfilerActivity.NPU],
+ with_modules=False, # 采集调用栈
+ profile_memory=False, # 采集内存
+ experimental_config=experimental_config,
+ # 跳过第一步,warmup一步,采集3步,重复1次。如果想采集第31~60个decode step(跳过前29步,第30步用于warmup),可以设置为schedule=torch_npu.profiler.schedule(wait=29, warmup=1, active=30, repeat=1)
+ schedule=torch_npu.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
+ on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/vllm_profile", analyse_flag=True) # 采集数据保存路径,是否在线解析
+ )
+ self.profiler_npu.start()
# ... existing code ...
def execute_model(self, scheduler_output=None, intermediate_tensors=None, **kwargs):
# ... existing code ...
output = self.model_runner.execute_model(scheduler_output,
intermediate_tensors)
+ self.profiler_npu.step() # 驱动 schedule,对部分decode step进行采集
# ... existing code ...
**SGLang 引擎**
- **参考版本**:SGLang master 分支
- **修改文件**:``sglang/python/sglang/srt/model_executor/model_runner.py``
.. code-block:: diff
# ... existing imports ...
+ import torch_npu
class ModelRunner:
def __init__(self, *args, **kwargs):
# ... existing init code ...
+ # Initialize profiler (配置同上,略)
+ experimental_config = torch_npu.profiler._ExperimentalConfig(...)
+ self.profiler_npu = torch_npu.profiler.profile(
+ # ...
+ # 跳过第一步,warmup一步,采集3步,重复1次。
+ schedule=torch_npu.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
+ on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/sglang_profile", analyse_flag=True)
+ )
+ self.profiler_npu.start()
def forward(self, forward_batch, **kwargs):
# ... existing code ...
+ self.profiler_npu.step() # 驱动 schedule,对部分decode step进行采集
return output
2. compute_log_prob (Actor & Ref) 阶段精细化采集
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
该阶段计算新旧策略的概率分布。
**FSDP 后端**
FSDP 后端允许在 Micro-Batch 级别进行精细控制。
- **修改文件**:``verl/workers/actor/dp_actor.py``
.. code-block:: diff
# ... 引入依赖 ...
+ import torch_npu
class DataParallelPPOActor(BasePPOActor):
def compute_log_prob(self, data: DataProto, calculate_entropy=False) -> torch.Tensor:
+ role = "Ref" if self.actor_optimizer is None else "Actor"
+ # 准备 profiler (配置同上,略)
+ experimental_config = torch_npu.profiler._ExperimentalConfig(...)
+ self.prof_npu = torch_npu.profiler.profile(
+ # ...
+ # wait=0, warmup=0, active=1: 直接采集第一个 micro-batch
+ schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1),
+ on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(f"./outputs/{role}_compute_log_prob", analyse_flag=True)
+ )
+ # 此函数ref和actor共用,设置role标志位来区分。如果想采集actor_compute_log_prob,可设置if role=="Actor":
+ if role=="Ref":
+ self.prof_npu.start()
for micro_batch in micro_batches:
# ... 原始计算逻辑 ...
with torch.no_grad():
entropy, log_probs = self._forward_micro_batch(...)
+ # 驱动 schedule,对micro batch进行采集
+ if role=="Ref":
+ self.prof_npu.step()
# ...
**Megatron 后端**
Megatron 后端的 Micro-Batch 调度由框架内部管理,暂不支持通过简单的代码插桩进行 Micro-Batch 级别的精细化采集。建议使用全局配置进行采集。
3. update_policy (Actor & Critic) 阶段精细化采集
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Update 阶段包含前向和反向传播。
**FSDP 后端**
FSDP 后端支持设置对 Mini-Batch 和 Micro-Batch 的粒度进行采集。
- **修改文件**:``verl/workers/actor/dp_actor.py``
.. code-block:: diff
# ... 引入依赖 ...
+ import torch_npu
class DataParallelPPOActor(BasePPOActor):
def update_policy(self, data: DataProto):
+ # 准备 profiler (配置同上,略)
+ experimental_config = torch_npu.profiler._ExperimentalConfig(...)
+ self.prof_npu = torch_npu.profiler.profile(
+ # ...
+ # 仅采集第一个 Mini Batch(包含所有 Micro-Batch 的计算和一次优化器更新)
+ schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1),
+ on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/fsdp_actor_update_profile", analyse_flag=True)
+ )
+ self.prof_npu.start()
# ... PPO Epochs 循环 ...
for _ in range(self.config.ppo_epochs):
# ... Mini Batch 循环 ...
for batch_idx, mini_batch in enumerate(mini_batches):
# ... mini_batches 切分 ...
for i, micro_batch in enumerate(micro_batches):
# ... 原始 Forward & Backward 逻辑 ...
# ... loss.backward() ...
pass
grad_norm = self._optimizer_step()
+ # 驱动 schedule,对mini batch进行采集,如果想对micro batch进行,则将self.prof_npu.step()移动到micro_batch的循环内
+ self.prof_npu.step()
**Megatron 后端**
Megatron 后端支持以 Mini-Batch 的粒度进行采集。
- **修改文件**:``verl/workers/actor/megatron_actor.py``
.. code-block:: diff
class MegatronPPOActor(BasePPOActor):
def update_policy(self, dataloader: Iterable[DataProto]) -> dict:
# ...
+ # 准备 profiler (配置同上,略)
+ experimental_config = torch_npu.profiler._ExperimentalConfig(...)
+ self.prof_npu = torch_npu.profiler.profile(
+ # ...
+ # 仅采集第一个 Mini Batch 的计算(含所有 Micro-Batch)和一次优化器更新
+ schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1),
+ on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/megatron_actor_update_profile", analyse_flag=True)
+ )
+ self.prof_npu.start()
for data in dataloader:
# ... 内部会调用 self.forward_backward_batch 进行计算 ...
# ... metric_micro_batch = self.forward_backward_batch(...)
# ... self.actor_optimizer.step() ...
+ # 驱动 schedule,对mini batch进行采集
+ self.prof_npu.step()
================================================
FILE: docs/ascend_tutorial/quick_start/ascend_quick_start.rst
================================================
Ascend Quickstart
===================================
Last updated: 03/03/2026.
关键更新
----------------------------------
2025/12/11:verl 存量场景目前支持自动识别 NPU 设备类型, GPU 脚本在昇腾上运行,原则上不再需要显式设置 trainer.device=npu 参数,新增特性通过设置 trainer.device 仍可优先使用,逐步适配自动识别能力。
[说明] 自动识别 NPU 设备类型的前提,是运行程序所在环境包含 torch_npu 软件包。如不包含该软件包,仍需显式指定 trainer.device=npu 参数。
硬件支持
-----------------------------------
Atlas 200T A2 Box16
Atlas 900 A2 PODc
Atlas 800T A3
安装流程
-----------------------------------
DockerFile镜像构建 & 获取 & 使用
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
如需要通过 DockerFile 构建镜像,或希望使用基于 verl 构建的镜像,请参考 `文档 `_
如果想直接获取镜像,请前往 `quay.io/ascend/verl `_ 进行获取,镜像中已包含基础环境和依赖软件包。
安装基础环境
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1. 基础环境涉及以下软件包,请参考 `文档 `_ 安装。
+---------------+----------------------+
| software | version |
+---------------+----------------------+
| Python | >= 3.10, <3.12 |
+---------------+----------------------+
| CANN | == 8.5.0 |
+---------------+----------------------+
| torch | == 2.8.0 |
+---------------+----------------------+
| torch_npu | == 2.8.0 |
+---------------+----------------------+
2. (可选)在 x86 平台安装时,pip 需要配置额外的源,指令如下:
.. code-block:: bash
pip config set global.extra-index-url "https://download.pytorch.org/whl/cpu/"
安装其他软件包
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
基础环境准备完毕后,需要通过指令安装以下软件包:
+---------------+----------------------+
| torchvision | == 0.22.1 |
+---------------+----------------------+
| triton-ascend | == 3.2.0 |
+---------------+----------------------+
| transformers | == 4.57.6 |
+---------------+----------------------+
Tips: verl does not support transformers 5.0.0 or higher.
安装指令:
.. code-block:: bash
# 安装torchvision,版本需要和torch匹配
pip install torchvision==0.22.1
# 清理环境上可能存在的历史triton/triton-ascend软件包残留
pip uninstall -y triton triton-ascend
# 安装triton-ascend,不需要单独安装triton
pip install triton-ascend==3.2.0
安装 vllm & vllm-ascend
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1. 需确保CANN ascend-toolkit 和 nnal 环境变量被激活,对于CANN默认安装路径 /usr/local/Ascend 而言,激活指令如下:
.. code-block:: bash
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
2. vllm 源码安装指令:
.. code-block:: bash
git clone --depth 1 --branch v0.13.0 https://github.com/vllm-project/vllm.git
cd vllm && pip install -r requirements/build.txt
VLLM_TARGET_DEVICE=empty pip install -v -e. && cd ..
3. vllm-ascend 源码安装指令:
.. code-block:: bash
git clone -b releases/v0.13.0 https://github.com/vllm-project/vllm-ascend.git
cd vllm-ascend && pip install -r requirements.txt
export COMPILE_CUSTOM_KERNELS=1 && pip install -v -e . && cd ..
安装 MindSpeed
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
MindSpeed 源码安装指令:
.. code-block:: bash
# 下载 MindSpeed,切换到指定commit-id,并下载 Megatron-LM
git clone https://gitcode.com/Ascend/MindSpeed.git
cd MindSpeed && git checkout 2.3.0_core_r0.12.1 && cd ..
git clone --depth 1 --branch core_v0.12.1 https://github.com/NVIDIA/Megatron-LM.git
# 安装 MindSpeed & Megatron
pip install -e MindSpeed
pip install -e Megatron-LM
# 安装 mbridge
pip install mbridge
MindSpeed 对应 Megatron-LM 后端使用场景,使用方式如下:
1. 将 verl worker 模型的 ``strategy`` 配置为 ``megatron`` ,例如 ``actor_rollout_ref.actor.strategy=megatron``。
2. MindSpeed 自定义入参可通过 ``override_transformer_config`` 参数传入,例如对 actor 模型开启 FA 特性可使用 ``+actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True``。
3. 更多特性信息可参考 `MindSpeed & verl 文档 `_ 。
安装verl
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code-block:: bash
git clone --recursive https://github.com/volcengine/verl.git
cd verl && pip install -r requirements-npu.txt && pip install -v -e . && cd ..
# (可选)提示:为了更佳的使用体验,最好将recipe子模块更新至最新commit
cd verl/recipe && git checkout main && cd ../..
昇腾暂不支持生态库说明
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
verl 中昇腾暂不支持生态库如下:
+---------------+----------------+
| software | description |
+---------------+----------------+
| flash_attn | not supported |
+---------------+----------------+
| liger-kernel | not supported |
+---------------+----------------+
1. 不支持通过 flash_attn 使能 flash attention 加速,支持通过 transformers 使用。
2. 不支持 liger-kernel 使能。
快速开始
-----------------------------------
正式使用前,建议您通过对Qwen2.5-0.5B GRPO的训练尝试以检验环境准备和安装的正确性。
1.下载数据集并将数据集预处理为parquet格式,以便包含计算RL奖励所需的必要字段
.. code-block:: bash
python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k
2.执行训练
.. code-block:: bash
set -x
export VLLM_ATTENTION_BACKEND=XFORMERS
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.train_batch_size=128 \
data.max_prompt_length=512 \
data.max_response_length=128 \
data.filter_overlong_prompts=True \
data.truncation='error' \
actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
actor_rollout_ref.actor.optim.lr=5e-7 \
actor_rollout_ref.model.use_remove_padding=False \
actor_rollout_ref.actor.entropy_coeff=0.001 \
actor_rollout_ref.actor.ppo_mini_batch_size=64 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=20 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \
actor_rollout_ref.rollout.enable_chunked_prefill=False \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=console \
trainer.project_name='verl_grpo_example_gsm8k' \
trainer.experiment_name='qwen2_7b_function_rm' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=5 \
trainer.total_epochs=1 $@
算法支持现状
-----------------------------------
**表1** RL类算法
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
| algorithm | model | download link | actor.strategy | rollout.name | shell location | hardware |
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
| GRPO | Qwen2.5-7B-instruct |`7B `_ | FSDP | vllm-ascend |`qwen2_5_7b_grpo_npu `_ | Atlas 200T A2 Box16 |
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
| GRPO | Qwen2.5-32B-instruct |`32B `_ | FSDP | vllm-ascend |`qwen2_5_32b_grpo_npu `_ | Atlas 200T A2 Box16 |
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
| GRPO | Qwen2.5-VL-3B-instruct |`3B `_ | FSDP | vllm-ascend |`qwen2_5_vl_3b_npu `_ | Atlas 200T A2 Box16 |
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
| GRPO | Qwen2.5-VL-7B-instruct |`7B `_ | FSDP | vllm-ascend |`qwen2_5_vl_7b_npu `_ | Atlas 200T A2 Box16 |
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
| GRPO | Qwen2.5-VL-32B-instruct |`32B `_ | FSDP | vllm-ascend |`qwen2_5_vl_32b_npu `_ | Atlas 200T A2 Box16 |
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
| GRPO | Qwen3-4B |`4B `_ | FSDP | vllm-ascend |`qwen3-4B_npu `_ | Atlas 800T A3 |
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
| GRPO | Qwen3-8B |`8B `_ | FSDP | vllm-ascend |`qwen3_8b_vllm_npu `_ | Atlas 200T A2 Box16 |
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
| GRPO | Qwen3-8B |`8B `_ | FSDP | sglang |`qwen3_8b_sglang_npu `_ | Atlas 200T A2 Box16 |
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
| GRPO | Qwen3-32B |`32B `_ | FSDP | vllm-ascend |`qwen3-32B_npu `_ | Atlas 200T A2 Box16 |
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
| GRPO | DeepSeekv3-671B |`671B `_ | Megatron | vllm-ascend |`deepseek_v3_megatron_npu `_ | Atlas 200T A2 Box16 |
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
| DAPO | Qwen2.5-7B-instruct |`7B `_ | FSDP | vllm-ascend |`qwen2.5_7b_npu `_ | Atlas 200T A2 Box16 |
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
| DAPO | Qwen2.5-32B |`32B `_ | FSDP | vllm-ascend |`qwen2.5_32b_npu `_ | Atlas 200T A2 Box16 |
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
| DAPO | Qwen3-8B-base |`8B `_ | FSDP | vllm-ascend |`qwen3_8b_npu `_ | Atlas 200T A2 Box16 |
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
| DAPO | Qwen3-14B-base |`14B `_ | FSDP | vllm-ascend |`qwen3_14b_npu `_ | Atlas 200T A2 Box16 |
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
| DAPO | Qwen3-30B-A3B-base |`30B `_ | FSDP | vllm-ascend |`qwen3_30b_fsdp_npu `_ | Atlas 200T A2 Box16 |
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
| DAPO | Qwen3-30B-A3B-base |`30B `_ | Megatron | vllm-ascend |`qwen3_30b_megatron_npu `_ | Atlas 200T A2 Box16 |
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
| PPO | Qwen3-8B |`8B `_ | FSDP | vllm-ascend |`qwen3_8b_ppo_npu `_ | Atlas 900 A2 PODc |
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
| One_Step_Off_Policy | Qwen3-8B |`8B `_ | FSDP2 | vllm-ascend |`qwen3_8b_fsdp2_npu `_ | Atlas 800T A3 |
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
**表2** SFT类算法
+-----------+-------------------------+------------------------------------------------------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
| algorithm | model | download link | actor.strategy | shell location | hardware |
+-----------+-------------------------+------------------------------------------------------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
| SFT-PEFT | Qwen3-8B |`8B `_ | FSDP |`sft_peft_sp2_npu `_ | Atlas 900 A2 PODc |
+-----------+-------------------------+-------------------------+----------------------------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
| ReTool-SFT| Qwen2-7B-instruct |`7B `_ | FSDP |`qwen2_7b_sft_npu `_ | Atlas 900 A2 PODc |
+-----------+-------------------------+-------------------------+----------------------------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
声明
-----------------------------------
verl中提供的ascend支持代码、Dockerfile、镜像皆为参考样例,如在生产环境中使用请通过官方正式途径沟通,谢谢。
================================================
FILE: docs/ascend_tutorial/quick_start/ascend_sglang_quick_start.rst
================================================
Ascend Quickstart with SGLang Backend
=====================================
Last updated: 01/27/2026.
我们在 verl 上增加对华为昇腾设备的支持。
硬件支持
-----------------------------------
Atlas 200T A2 Box16
Atlas 900 A2 PODc
Atlas 800T A3
安装
-----------------------------------
关键支持版本
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+-----------+-----------------+
| software | version |
+===========+=================+
| Python | == 3.11 |
+-----------+-----------------+
| HDK | >= 25.3.RC1 |
+-----------+-----------------+
| CANN | >= 8.3.RC1 |
+-----------+-----------------+
| torch | >= 2.7.1 |
+-----------+-----------------+
| torch_npu | >= 2.7.1.post2 |
+-----------+-----------------+
| sglang | v0.5.8 |
+-----------+-----------------+
从 Docker 镜像进行安装
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
我们提供了DockerFile进行构建,详见 `dockerfile_build_guidance `_ ,请根据设备自行选择对应构建文件
从自定义环境安装
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
**1. 安装HDK&CANN依赖并激活**
异构计算架构CANN(Compute Architecture for Neural Networks)是昇腾针对AI场景推出的异构计算架构, 为了使训练和推理引擎能够利用更好、更快的硬件支持, 我们需要安装以下 `先决条件 `_
+-----------+-------------+
| HDK | >= 25.3.RC1 |
+-----------+-------------+
| CANN | >= 8.3.RC1 |
+-----------+-------------+
安装完成后请激活环境
.. code-block:: bash
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
**2. 创建conda环境**
.. code-block:: bash
# create conda env
conda create -n verl-sglang python==3.11
conda activate verl-sglang
**3. 然后,执行我们在 verl 中提供的脚本** `install_sglang_mcore_npu.sh `_
如果在此步骤中遇到错误,请检查脚本并手动按照脚本中的步骤操作。
.. code-block:: bash
git clone https://github.com/volcengine/verl.git
# Make sure you have activated verl conda env
# NPU_DEVICE=A3 or A2 depends on your device
# USE_MEGATRON=1 if you need to install megatron backend
NPU_DEVICE=A3 USE_MEGATRON=1 bash verl/scripts/install_sglang_mcore_npu.sh
**4. 安装verl**
.. code-block:: bash
cd verl
pip install --no-deps -e .
pip install -r requirements-npu.txt
快速开始
-----------------------------------
**1.当前NPU sglang脚本一览**
.. _Qwen3-30B: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen3moe-30b_sglang_megatron_npu.sh
.. _Qwen2.5-32B: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen2-32b_sglang_fsdp_npu.sh
.. _Qwen3-8B-1k: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh
.. _Qwen3-8B-32k: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_32k_spmd_npu.sh
+-----------------+----------------+----------+-------------------+
| 模型 | 推荐NPU型号 | 节点数量 | 训推后端 |
+=================+================+==========+===================+
| `Qwen3-30B`_ | Atlas 800T A3 | 1 | SGLang + Megatron |
+-----------------+----------------+----------+-------------------+
| `Qwen2.5-32B`_ | Atlas 900 A2 | 2 | SGLang + FSDP |
+-----------------+----------------+----------+-------------------+
| `Qwen3-8B-1k`_ | Atlas A3/A2 | 1 | SGLang + FSDP |
+-----------------+----------------+----------+-------------------+
| `Qwen3-8B-32k`_ | Atlas A3/A2 | 1 | SGLang + FSDP |
+-----------------+----------------+----------+-------------------+
**2.最佳实践**
我们提供基于verl+sglang `Qwen3-30B`_ 以及 `Qwen2.5-32B`_ 的 `最佳实践 `_ 作为参考
**3.环境变量与参数**
当前NPU上支持sglang后端必须添加以下环境变量
.. code-block:: bash
#支持NPU单卡多进程 https://www.hiascend.com/document/detail/zh/canncommercial/850/commlib/hcclug/hcclug_000091.html
export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050
export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050
#规避ray在device侧调用无法根据is_npu_available接口识别设备可用性
export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
#根据当前设备和需要卡数定义
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
#使能推理EP时需要
export SGLANG_DEEPEP_BF16_DISPATCH=1
当前verl已解析推理常见参数, 详见 `async_sglang_server.py `_ 中 ServerArgs初始化传参,其他 `sglang参数 `_ 均可通过engine_kwargs 进行参数传递
vllm后端推理脚本转换为sglang, 需要添加修改以下参数
.. code-block:: bash
#必须
actor_rollout_ref.rollout.name=sglang
+actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend="ascend"
#可选
#使能推理EP,详细使用方法见 https://github.com/sgl-project/sgl-kernel-npu/blob/main/python/deep_ep/README_CN.md
++actor_rollout_ref.rollout.engine_kwargs.sglang.deepep_mode="auto"
++actor_rollout_ref.rollout.engine_kwargs.sglang.moe_a2a_backend="deepep"
#Moe模型多DP时必须设置为True
+actor_rollout_ref.rollout.engine_kwargs.sglang.enable_dp_attention=False
#chunked_prefill默认关闭
+actor_rollout_ref.rollout.engine_kwargs.sglang.chunked_prefill_size=-1
================================================
FILE: docs/ascend_tutorial/quick_start/dockerfile_build_guidance.rst
================================================
Ascend Dockerfile Build Guidance
===================================
Last updated: 03/03/2025.
镜像获取 & 公开镜像地址
------------------------
昇腾在 `quay.io/ascend/verl `_ 中托管每日构建的 A2/A3 镜像,基于上述 Dockerfile 构建。
每日构建镜像名格式:verl-{CANN版本}-{NPU设备类型}-{操作系统版本}-{python版本}-latest
verl release版本镜像名格式:verl-{CANN版本}-{NPU设备类型}-{操作系统版本}-{python版本}-{verl release版本号}
镜像硬件支持
-----------------------------------
Atlas 200T A2 Box16
Atlas 900 A2 PODc
Atlas 800T A3
镜像内各组件版本信息清单
------------------------
================= ============
组件 版本
================= ============
基础镜像 Ubuntu 22.04
Python 3.11
CANN 8.5.0
torch 2.8.0
torch_npu 2.8.0
torchvision 0.22.1
vLLM 0.13.0
vLLM-ascend 0.13.0
Megatron-LM v0.12.1
MindSpeed 2.3.0_core_r0.12.1
triton-ascend 3.2.0
mbridge latest version
SGLang v0.5.8
sgl-kernel-npu (46b73de)
================= ============
Dockerfile构建镜像脚本清单
---------------------------
============== ============== ============== ==============================================================
设备类型 基础镜像版本 推理后端 参考文件
============== ============== ============== ==============================================================
A2 8.2.RC1 vLLM `Dockerfile.ascend_8.2.rc1_a2 `_
A2 8.3.RC1 vLLM `Dockerfile.ascend_8.3.rc1_a2 `_
A2 8.5.0 vLLM `Dockerfile.ascend_8.5.0_a2 `_
A2 8.3.RC1 SGLang `Dockerfile.ascend.sglang_8.3.rc1_a2 `_
A3 8.2.RC1 vLLM `Dockerfile.ascend_8.2.rc1_a3 `_
A3 8.3.RC1 vLLM `Dockerfile.ascend_8.3.rc1_a3 `_
A3 8.5.0 vLLM `Dockerfile.ascend_8.5.0_a3 `_
A3 8.3.RC1 SGLang `Dockerfile.ascend.sglang_8.3.rc1_a3 `_
============== ============== ============== ==============================================================
镜像构建命令示例
--------------------
.. code:: bash
# Navigate to the directory containing the Dockerfile
cd {verl-root-path}/docker/ascend
# Build the image
# vLLM
docker build -f Dockerfile.ascend_8.3.rc1_a2 -t verl-ascend:8.3.rc1-a2 .
# SGLang
docker build -f Dockerfile.ascend.sglang_8.3.rc1_a2 -t verl-ascend-sglang:8.3.rc1-a2 .
声明
--------------------
verl中提供的ascend相关Dockerfile、镜像皆为参考样例,可用于尝鲜体验,如在生产环境中使用请通过官方正式途径沟通,谢谢。
================================================
FILE: docs/blog/v0.7.md
================================================
# verl 0.7 release blog
**Author:** verl team
Last updated: 01/03/2026.
## Overview
verl adopts a Hybrid-Controller architecture (also known as HybridFlow). Sharing design principles with asynchronous sharded dataflow systems like Google Pathways, verl models Reinforcement Learning (RL) algorithms, such as PPO, GRPO, DAPO, and others, as a multi-stage, multi-model and parallelizable dataflow graph.
To balance flexibility with performance, verl unifies two distinct programming models:
**High-Level Single-Controller (MPMD)**: At the orchestration level, a single process `RLTrainer` manages the global computation graph. It handles macro-tasks such as scheduling rollout generation, triggering reward scoring, and dispatching distributed training jobs.
**Internal Multi-Controller (SPMD)**: Internally, the Model Engine operates in standard distributed training mode. Workers execute identical programs, via trainer backends like FSDP, Megatron, or VeOmni, or rollout executors (not rollout server) like vLLM/SGLang/TensorRT-LLM, to perform heavy distributed computation, synchronizing via collective communication.
This hybrid approach offers significant advantages:
**Flexible Orchestration**: The single-controller design allows verl to dynamically manage complex constraints within the computation graph, including flexible data dependencies, diverse resource allocation and model placement, and fine-grained asynchronous staleness control.
**Abstraction of Complexity**: We encapsulate complex parallel strategies—such as 5D parallelism (DP, TP, CP, PP, and EP)—strictly within the Model Engine. This allows users to focus entirely on RL algorithm implementation without getting bogged down by the details of distributed training.
Furthermore, leveraging Ray placement groups, verl provides `ResourcePool` and `WorkerGroup` abstractions. These enable flexible GPU sharing among the various roles in the RL process—such as actor, critic, reward, and rollout—allowing components to share resources efficiently while remaining isolated.
As illustrated in the diagram below, the overall architecture of verl is divided into two layers:
- **verl-core**: provides four components required for the RL pipeline: model engine, rollout engine, checkpoint engine, and transfer queue. Each component exposes abstract interfaces, making them both extensible and pluggable.
- **verl-trainer**: builds upon these components to construct various RL pipelines—such as on-policy, one-step-off-policy, and fully asynchronous—tailored to meet the demands of diverse scenarios.
## verl-core
### Model Engine
The Model Engine serves as verl's core training engine, defining a set of abstract interfaces that support pluggable backends. It operates in SPMD mode:
- SFT: Workers are launched via torchrun.
- RL: Workers are executed via the WorkerGroup API, invoked by the single-controller.
The abstract interfaces include methods like `initialize`, `forward`, `optimizer_step`, and `load`/`offload`. Integrating a new training engine simply requires inheriting and implementing these interfaces. Crucially, because all backends adhere to this unified abstraction, adding a new Model Engine requires absolutely no code modification on the caller side. The RLTrainer remains completely agnostic to the backend's specific parallel strategy when calling these interfaces, while the WorkerGroup automatically handles data dispatch and collection based on the underlying parallelism.
Currently, the Model Engine supports the following backends (more backends may be supported in the future, e.g., torchtitan):
|Backend|Parallelism|Performance|Support Model|New Model Support Time
|-----|-----|----|----|----|
|FSDP| FSDP+SP|Dense medium/MoE low| all transformer models|Day 0
|MCore| DP+TP+PP+EP+CP|High| see [Megatron-Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge) support model list|few weeks or month
|VeOmni| FSDP+SP+EP|Medium| see [VeOmni](https://github.com/ByteDance-Seed/VeOmni) support model list|~1 week
```python
class BaseEngine:
def initialize(self):
"""Instantiate or load the model, optimizer, and learning rate scheduler."""
raise NotImplementedError
def optimizer_zero_grad(self):
"""Zero the gradients of the optimizer."""
raise NotImplementedError
def optimizer_step(self):
"""Perform an optimization step using the optimizer."""
raise NotImplementedError
def lr_scheduler_step(self):
"""Advance the learning rate scheduler by one step."""
raise NotImplementedError
def forward_backward_batch(self, data: TensorDict, loss_function: Callable, forward_only=False) -> Any:
"""Perform a forward pass and optionally a backward pass on a batch of data."""
raise NotImplementedError
def get_per_tensor_param(self) -> tuple[Generator[tuple[str, torch.Tensor], None, None], Optional[dict]]:
"""Get a generator that yields per-tensor parameters and optional peft config."""
raise NotImplementedError
def to(self, device: str, model: bool = True, optimizer: bool = True, grad: bool = True):
"""Move model parameters, optimizer states, or both to the specified device."""
raise NotImplementedError
```
### Rollout Engine
As LLM reinforcement learning evolves from single-turn, static tasks to multi-turn, dynamic, and interactive agentic tasks, the legacy SPMD rollout mode previously used by verl has become insufficient. Consequently, in verl v0.7, we have removed the SPMD rollout mode and switched to rollout server mode by default.
In the server mode, the LLM server operates as online serving rather than the traditional offline batch inference. Clients send per-sample requests to the server, enabling the engine to utilize dynamic batching. This significantly enhances throughput efficiency for multi-turn conversation. Furthermore, the server-based approach eliminates the need for intrusive modifications to the LLM inference engine, allowing for the seamless integration of modern inference backends such as vLLM, SGLang, and TensorRT-LLM.
On the client side, verl introduces an extensible **AgentLoop** abstraction designed to define custom agentic task loops. This abstraction manages the cycle of requesting responses from the LLM server and interacting with external environments to obtain feedback. We provide two default implementations:
- **SingleTurnAgentLoop**: Designed for standard single-turn tasks.
- **ToolAgentLoop**: Designed for classic ReAct architectures involving multi-turn tool invocation.
Users can implement custom AgentLoop logic tailored to their specific needs, such as [SWEAgentLoop](https://github.com/volcengine/verl/pull/4080) or GUIAgentLoop.
```python
class AgentLoopBase(ABC):
@abstractmethod
async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput:
"""Run agent loop to interact with LLM server and environment.
Args:
sampling_params (Dict[str, Any]): LLM sampling params.
**kwargs: dataset fields from `verl.utils.dataset.RLHFDataset`.
Returns:
AgentLoopOutput: Agent loop output.
"""
raise NotImplementedError
```
### TransferQueue
As mentioned, verl uses a global single-controller RLTrainer to orchestrate the computation graph. A major limitation in the current implementation is that the RLTrainer handles both control and data flow, creating a bottleneck when dispatching data between components. This issue is amplified by the massive data volumes in multimodal training (images, video, audio) and complex algorithms like router replay, which requires transmitting large tensors per sample. Our earlier attempt to solve this using the Ray object store yielded poor performance due to the lack of tensor optimization and fine-grained column access.
In v0.7, we experimentally introduced **TransferQueue** to decouple control flow from data flow. The RLTrainer now only dispatches instructions and metadata, while TransferQueue handles data transmission via reference passing. TransferQueue is specifically optimized for PyTorch tensors (supporting zero-copy and RDMA) and allows for backend extensions like ZeroMQ, NIXL, and Ray RDT. We plan to make this the default transmission method in v0.8.
```python
# In PPOTrainer
def fit(self):
batch = next(dataloader)
gen_batch: BatchMeta = self.rollout_manager.generate_sequences(batch)
output: BatchMeta = self.actor_rollout_wg.compute_log_prob(gen_batch)
gen_batch = gen_batch.union(output)
output = self.actor_rollout_wg.update_actor(gen_batch)
# In Worker
def compute_log_prob(self, batch: BatchMeta) -> BatchMeta:
data = tq.get(batch)
output = self.actor.infer_batch(data=data)
return tq.put(output)
```
### Checkpoint Engine
With the increase in LLM context lengths and the evolution of agentic tasks, the "long-tail" problem in rollout has become prominent, limiting the overall efficiency of RL training.
To mitigate this, a viable strategy is moving from on-policy synchronous training to off-policy asynchronous training, e.g., [Laminar](https://arxiv.org/abs/2510.12633), [AReaL](https://arxiv.org/abs/2505.24298), [StreamRL](https://arxiv.org/abs/2504.15930), [LlamaRL](https://arxiv.org/pdf/2505.24034), [PipelineRL](https://arxiv.org/abs/2509.19128). This involves separating the rollout and model engines onto different nodes (a disaggregated architecture, as opposed to colocated), with data transmitted via queues. This separation alleviates the rollout long-tail issue and enables rollout elastic scaling, fault tolerance, and heterogeneous hardware. However, it introduces a new challenge: efficient cross-node parameter synchronization.
To address this, we introduce the Checkpoint Engine: a unified abstraction layer designed to synchronize weights between various training and inference backends.
- It provides three unified APIs to implement the streaming transmission of parameters.
- Users can extend the Transport Layer implementation based on their specific infrastructure requirements (device, network, local cache, etc.).
Currently, we provide two transport backends: NCCL (for broadcast collective communication) and NIXL (for point-to-point (P2P) communication).
```python
class CheckpointEngine(ABC):
@abstractmethod
async def send_weights(self, weights: Generator[tuple[str, torch.Tensor], None, None]):
"""Send the weights of the model.
Args:
weights: A generator that yields the name of the weight tensor and the tensor itself.
"""
raise NotImplementedError
@abstractmethod
async def receive_weights(self) -> Generator[tuple[str, torch.Tensor], None, None]:
"""Receive the weights of the model.
Yields:
A tuple of the name of the weight tensor and the tensor itself.
"""
raise NotImplementedError
```
## verl-trainer
Building upon the four core components provided by verl-core, verl-trainer constructs several RL training pipelines tailored to specific scenarios. These pipelines are designed to address training efficiency challenges across varying scales and requirements:
**On-policy (Synchronous)**
- Main Features: Executes rollout and training serially, typically sharing GPU resources (Colocate). It strictly adheres to standard on-policy algorithm definitions, where training must wait for all samples to be generated.
- Scenarios: Best for baseline implementations and scenarios where strict algorithmic correctness is prioritized over training throughput.
**One-step-off-policy (Async)**
- Main Features: Parallelizes generation and training by overlapping the current training step with the next batch's generation. It employs resource isolation and uses parameters from the previous step for rollout to minimize GPU idle time.
- Scenarios: Ideal for scenarios requiring moderate efficiency gains (20%–40%) while maintaining training stability very close to strict on-policy methods.
**Fully async (Decoupled & Streaming)**
- Main Features: Completely decouples the Trainer and Rollouter onto separate nodes. It utilizes streaming data transfer, staleness control, and partial rollout mechanisms to maximize throughput and mitigate long-tail generation latency.
- Scenarios: Essential for large-scale training (e.g., 128+ GPUs) or complex reasoning tasks (e.g., long chain-of-thought) where generation latency significantly bottlenecks performance.
## roadmap
### v0.7 release
**Model Engine**
- Integrate Megatron-Bridge and support LoRA/PEFT, see blog post: [How We Build Trillion Parameter Reasoning RL with 10% GPUs](https://macaron.im/mindlab/research/building-trillion-parameter-reasoning-rl-with-10-gpus)
- Support experimental fp8 training for megatron backend
- Support new model for megatron backend: GPT-OSS, Qwen3-Next
- Comprehensive support for the new model engine; the FSDP and Megatron engines are production ready.
- Dispatch tensordict with nested tensor instead of padded DataProto
- Add TrainingWorker that resembles Tinker-like API
- Add VLM support for model engine, SFT and RL trainer
- Add model engine based critic model
- Implement ActorRolloutRefWorker by TrainingWorker, support different backend in one worker
- New VeOmni engine added, still in alpha status.
**Rollout Engine**
- Remove SPMD rollout mode
- Support blockwise fp8 rollout for vllm and sglang; support online quant for vllm with torchao
- Experimental router replay support for vllm
- Optimize multi-modal data fetch and preprocess, support video input
- Upgrade to vllm==0.12.0; sglang==0.5.6
**Reward**
- Support hybrid reward scenarios, including generative, discriminative, rule-based rewards, and their combinations.
- Refactor reward models into server mode, supporting both colocated and standalone deployments.
- Introduce new reward managers to handle more complex scenarios, limited mode for request rate control and remote mode for CPU-intensive tasks.
**Algorithm**
- Add [CISPO](https://arxiv.org/pdf/2506.13585): Clipped IS-weight Policy Optimization
- Add [SAPO](https://arxiv.org/abs/2511.20347): Soft Adaptive Policy Optimization
**Recipe**
- [NEW] VLA: add experimental support for VLA model
- [NEW] [rhymerl](https://arxiv.org/abs/2508.18588): History Rhymes: Accelerating LLM Reinforcement Learning with RhymeRL
- TransferQueue: support multiple data partition and optimize tensor zero-copy serialization
- One-step-off-policy/Fully async: optimize weight synchronization by checkpoint engine with bucket and pipeline support.
### v0.8
**Model Engine**
- Deprecate DataProto in favor of TensorDict for padding-free transmission
- Switch default to new model engine, mark legacy engine (fsdp_workers.py, megatron_workers.py) as deprecated
- Feature parity between new and legacy model engine: LoRA/PEFT, etc
- Polish VeOmni engine to production ready status
- Support MTP RL training
- Optimize GPU memory for long context: fine-grained activation recomputation/offload
- New model support: DeepSeek V3.2, etc
**Rollout Engine**
- New rollout engine TensorRT-LLM
- Separate vllm worker from trainer process, update weights by cuda ipc
**TransferQueue**
- Merge TransferQueue recipe into main
- Optimize e2e image/video vlm training pipeline by TransferQueue
- Optimize router replay transmission by TransferQueue
**Checkpoint Engine**
- Add checkpoint engine abstract interface
- Add NCCL and NIXL transport backend
- Add more transport backend
### v0.9
**Trainer**
- Merge Full async into main: refactor with verl-core component
**Model Engine**
- Remove legacy model engine (fsdp_workers.py, megatron_workers.py)
- Support omni-model RL training: Qwen3-Omni, BAGEL, etc
**Rollout Engine**
- New rollout engine vllm-omni
**More agentic training recipe**
- SWEAgent
- GUIAgent
================================================
FILE: docs/conf.py
================================================
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
# -- Project information -----------------------------------------------------
# -- Project metadata ---------------------------------------------------------
project = "verl"
copyright = "2024 ByteDance Seed Foundation MLSys Team"
author = "Guangming Sheng, Chi Zhang, Yanghua Peng, Haibin Lin"

# -- Core Sphinx settings -----------------------------------------------------

# Name of the root document that holds the top-level toctree.
master_doc = "index"

# Language used for text that Sphinx generates automatically. It is also
# consulted when translating content through gettext catalogs; in that case
# "language" is normally supplied on the command line instead.
language = "en"

# Directories (relative to this file) that are searched for templates.
templates_path = ["_templates"]

# Map each source-file suffix to the parser that should handle it.
source_suffix = {".rst": "restructuredtext", ".md": "markdown"}

# Extension modules to load: the MyST markdown parser followed by the
# extensions bundled with Sphinx itself ('sphinx.ext.*').
extensions = [
    "myst_parser",
    "sphinx.ext.autodoc",
    "sphinx.ext.autosummary",
    "sphinx.ext.autosectionlabel",
    "sphinx.ext.napoleon",
    "sphinx.ext.viewcode",
]

# MyST-Parser syntax extensions:
#   dollarmath -> $...$ and $$...$$ math syntax
#   amsmath    -> amsmath environments
myst_enable_extensions = ["dollarmath", "amsmath"]

# Parse docstrings in Google style rather than NumPy style.
napoleon_google_docstring = True
napoleon_numpy_docstring = False

# Patterns (relative to the source directory) for files and directories that
# must be skipped when collecting sources. The same patterns also apply to
# html_static_path and html_extra_path. Build artifacts and OS droppings come
# first, followed by the README files that are not part of the rendered docs.
exclude_patterns = [
    "_build",
    "Thumbs.db",
    ".DS_Store",
    "README.md",
    "README_vllm0.7.md",
]

# Warning categories that should not be reported during the build.
suppress_warnings = ["ref.duplicate", "ref.myst"]

# -- HTML output ---------------------------------------------------------------

# Theme for HTML and HTML Help pages.
html_theme = "sphinx_rtd_theme"

# Directories with custom static assets (e.g. style sheets). They are copied
# after the builtin static files, so a file named "default.css" here replaces
# the builtin "default.css".
html_static_path = ["_static"]

# Additional JavaScript files to include in the generated pages.
html_js_files = ["js/runllm-widget.js", "js/resizable-sidebar.js"]

# Additional CSS file providing the full-width layout.
html_css_files = ["custom.css"]
================================================
FILE: docs/data/transfer_queue.md
================================================
# TransferQueue Data System
Last updated: 01/07/2026.
This doc introduces [TransferQueue](https://gitcode.com/Ascend/TransferQueue), an asynchronous streaming data management system for efficient post-training.
🔥 **Now TransferQueue is formally open-sourced at [GitCode](https://gitcode.com/Ascend/TransferQueue). We will soon provide a [Github Mirror Repo](https://github.com/Ascend/TransferQueue) for community contributions. You are welcome to submit contributions or propose new ideas on either platform!**
> In the meantime, the early development history remains accessible at: https://github.com/TransferQueue/TransferQueue.
Overview
TransferQueue is a high-performance data storage and transfer module with panoramic data visibility and streaming scheduling capabilities, optimized for efficient dataflow in post-training workflows.
TransferQueue offers **fine-grained, sample-level** data management and **load-balancing** (on the way) capabilities, serving as a data gateway that decouples explicit data dependencies across computational tasks. This enables a divide-and-conquer approach, significantly simplifying the algorithm controller design.
Updates
- **Dec 30, 2025**: **TransferQueue x verl** integration is tested with the DAPO algorithm at scale **(64 nodes, 1024 cards)**. It significantly optimizes host memory utilization and accelerates data transfers. Stay tuned for more details!
- **Dec 20, 2025**: 🔥 The official [tutorial](https://github.com/TransferQueue/TransferQueue/tree/main/tutorial) is released! Feel free to check it out.
- **Nov 10, 2025**: We disentangle the data retrieval logic from TransferQueueController [PR#101](https://github.com/TransferQueue/TransferQueue/pull/101). Now you can implement your own `Sampler` to control how to consume the data.
- **Nov 5, 2025**: We provide a `KVStorageManager` that simplifies the integration with KV-based storage backends [PR#96](https://github.com/TransferQueue/TransferQueue/pull/96). The first available KV-based backend is [Yuanrong](https://gitee.com/openeuler/yuanrong-datasystem).
- **Nov 4, 2025**: Data partition capability is available in [PR#98](https://github.com/TransferQueue/TransferQueue/pull/98). Now you can define logical data partitions to manage your train/val/test datasets.
- **Oct 25, 2025**: We make storage backends pluggable in [PR#66](https://github.com/TransferQueue/TransferQueue/pull/66). You can try to integrate your own storage backend with TransferQueue now!
- **Oct 21, 2025**: Official integration into verl is ready [verl/pulls/3649](https://github.com/volcengine/verl/pull/3649). Following PRs will optimize the single controller architecture by fully decoupling data & control flows.
- **July 22, 2025**: We present a series of Chinese blogs on Zhihu 1, 2.
- **July 21, 2025**: We started an RFC on verl community [verl/RFC#2662](https://github.com/volcengine/verl/discussions/2662).
- **July 2, 2025**: We publish the paper [AsyncFlow](https://arxiv.org/abs/2507.01663).
Components
### Control Plane: Panoramic Data Management
In the control plane, `TransferQueueController` tracks the **production status** and **consumption status** of each training sample as metadata. When all the required data fields are ready (i.e., written to the `TransferQueueStorageManager`), we know that this data sample can be consumed by downstream tasks.
For consumption status, we record the consumption records for each computational task (e.g., `generate_sequences`, `compute_log_prob`, etc.). Therefore, even when different computation tasks require the same data field, they can consume the data independently without interfering with each other.
To make the data retrieval process more customizable, we provide a `Sampler` class that allows users to define their own data retrieval and consumption logic. Refer to the [Customize](#customize) section for details.
> In the future, we plan to support **load-balancing** and **dynamic batching** capabilities in the control plane. Additionally, we will support data management for disaggregated frameworks where each rank manages the data retrieval by itself, rather than coordinated by a single controller.
### Data Plane: Distributed Data Storage
In the data plane, we provide a pluggable design that enables TransferQueue to integrate with different storage backends according to user requirements.
Specifically, we provide a `TransferQueueStorageManager` abstraction class that defines the core APIs as follows:
- `async def put_data(self, data: TensorDict, metadata: BatchMeta) -> None`
- `async def get_data(self, metadata: BatchMeta) -> TensorDict`
- `async def clear_data(self, metadata: BatchMeta) -> None`
This class encapsulates the core interaction logic within the TransferQueue system. You only need to write a simple subclass to integrate your own storage backend. Refer to the [Customize](#customize) section for details.
Currently, we support the following storage backends:
- SimpleStorageUnit: A basic CPU memory storage with minimal data format constraints and easy usability.
- [Yuanrong](https://gitcode.com/openeuler/yuanrong-datasystem) (beta, [#PR107](https://github.com/TransferQueue/TransferQueue/pull/107), [#PR96](https://github.com/TransferQueue/TransferQueue/pull/96)): An Ascend native data system that provides hierarchical storage interfaces including HBM/DRAM/SSD.
- [Mooncake Store](https://github.com/kvcache-ai/Mooncake) (alpha, [#PR162](https://github.com/TransferQueue/TransferQueue/pull/162)): A high-performance, KV-based hierarchical storage that supports RDMA transport between GPU and DRAM.
- [Ray Direct Transport](https://docs.ray.io/en/master/ray-core/direct-transport.html) (alpha, [#PR167](https://github.com/TransferQueue/TransferQueue/pull/167)): Ray's new feature that allows Ray to store and pass objects directly between Ray actors.
Among them, `SimpleStorageUnit` serves as our default storage backend, coordinated by the `AsyncSimpleStorageManager` class. Each storage unit can be deployed on a separate node, allowing for distributed data management.
`SimpleStorageUnit` employs a 2D data structure as follows:
- Each row corresponds to a training sample, assigned a unique index within the corresponding global batch.
- Each column represents the input/output data fields for computational tasks.
This data structure design is motivated by the computational characteristics of the post-training process, where each training sample is generated in a relayed manner across task pipelines. It provides an accurate addressing capability, which allows fine-grained, concurrent data read/write operations in a streaming manner.
### User Interface: Asynchronous & Synchronous Client
The interaction workflow of TransferQueue system is as follows:
1. A process sends a read request to the `TransferQueueController`.
2. `TransferQueueController` scans the production and consumption metadata for each sample (row), and dynamically assembles a micro-batch metadata according to the load-balancing policy. This mechanism enables sample-level data scheduling.
3. The process retrieves the actual data from distributed storage units using the metadata provided by the controller.
To simplify the usage of TransferQueue, we have encapsulated this process into `AsyncTransferQueueClient` and `TransferQueueClient`. These clients provide both asynchronous and synchronous interfaces for data transfer, allowing users to easily integrate TransferQueue into their framework.
> In the future, we will provide a `StreamingDataLoader` interface for disaggregated frameworks as discussed in [issue#85](https://github.com/TransferQueue/TransferQueue/issues/85) and [verl/RFC#2662](https://github.com/volcengine/verl/discussions/2662). Leveraging this abstraction, each rank can automatically get its own data like `DataLoader` in PyTorch. The TransferQueue system will handle the underlying data scheduling and transfer logic caused by different parallelism strategies, significantly simplifying the design of disaggregated frameworks.
🔥 Showcases
### General Usage
The primary interaction points are `AsyncTransferQueueClient` and `TransferQueueClient`, serving as the communication interface with the TransferQueue system.
Core interfaces:
- `(async_)get_meta(data_fields: list[str], batch_size:int, partition_id: str, mode: str, task_name:str, sampling_config: Optional[dict[str, Any]]) -> BatchMeta`
- `(async_)get_data(metadata: BatchMeta) -> TensorDict`
- `(async_)put(data: TensorDict, metadata: Optional[BatchMeta], partition_id: Optional[str])`
- `(async_)clear_partition(partition_id: str)` and `(async_)clear_samples(metadata: BatchMeta)`
**Refer to our [tutorial](https://github.com/TransferQueue/TransferQueue/tree/main/tutorial) for detailed examples.**
### verl Example
The primary motivation for integrating TransferQueue to verl now is to **alleviate the data transfer bottleneck of the single controller `RayPPOTrainer`**. Currently, all `DataProto` objects must be routed through `RayPPOTrainer`, resulting in a single point bottleneck of the whole post-training system.

Leveraging TransferQueue, we separate experience data transfer from metadata dispatch by
- Replacing `DataProto` with `BatchMeta` (metadata) and `TensorDict` (actual data) structures
- Preserving verl's original Dispatch/Collect logic via BatchMeta (maintaining single-controller debuggability)
- Accelerating data transfer by TransferQueue's distributed storage units

You may refer to the [recipe](https://github.com/TransferQueue/TransferQueue/tree/dev/recipe/simple_use_case), where we mimic the verl usage in both async & sync scenarios. Official integration to verl is also available now at [verl/pulls/3649](https://github.com/volcengine/verl/pull/3649) (with subsequent PRs to further optimize the integration).
### Use Python package
```bash
pip install TransferQueue
```
### Build wheel package from source code
Follow these steps to build and install:
1. Clone the source code from the GitHub repository
```bash
git clone https://github.com/TransferQueue/TransferQueue/
cd TransferQueue
```
2. Install dependencies
```bash
pip install -r requirements.txt
```
3. Build and install
```bash
python -m build --wheel
pip install dist/*.whl
```
> Note: The above benchmark for TransferQueue is based on our naive `SimpleStorageUnit` backend. By introducing high-performance storage backends and optimizing serialization/deserialization, we expect to achieve even better performance. Contributions from the community are warmly welcome!
For detailed performance benchmarks, please refer to [this blog](https://www.yuque.com/haomingzi-lfse7/hlx5g0/tml8ke0zkgn6roey?singleDoc#).
We also provide a [stress test report](https://www.yuque.com/haomingzi-lfse7/hlx5g0/ydbwgo5k2umaag78?singleDoc#) that demonstrates **768 concurrent clients writing 1.4 TB of data** into TransferQueue across 4 nodes. The system remains stable without any crashes or data loss, achieving 80% bandwidth.
🛠️ Customize TransferQueue
### Define your own data retrieval logic
We provide a `BaseSampler` abstraction class, which defines the following interface:
```python3
@abstractmethod
def sample(
self,
ready_indexes: list[int],
batch_size: int,
*args: Any,
**kwargs: Any,
) -> tuple[list[int], list[int]]:
"""Sample a batch of indices from the ready indices.
Args:
ready_indexes: List of global indices for which all required fields of the
corresponding samples have been produced, and the samples are not labeled as
consumed in the corresponding task.
batch_size: Number of samples to select
*args: Additional positional arguments for specific sampler implementations
**kwargs: Additional keyword arguments for specific sampler implementations
Returns:
List of sampled global indices of length batch_size
List of global indices of length batch_size that should be labeled as consumed
(will never be retrieved in the future)
Raises:
ValueError: If batch_size is invalid or ready_indexes is insufficient
"""
raise NotImplementedError("Subclasses must implement sample")
```
In this design, we separate data retrieval and data consumption through the two return values, which enables us to easily control sample replacement. We have implemented two reference designs: `SequentialSampler` and `GRPOGroupNSampler`.
The `Sampler` class or instance should be passed to the `TransferQueueController` during initialization. During each `get_meta` call, you can provide dynamic sampling parameters to the `Sampler`.
```python3
from transfer_queue import TransferQueueController, TransferQueueClient, GRPOGroupNSampler, process_zmq_server_info
# Option 1: Pass the sampler class to the TransferQueueController
controller = TransferQueueController.remote(GRPOGroupNSampler)
# Option 2: Pass the sampler instance to the TransferQueueController (if you need custom configuration)
your_own_sampler = YourOwnSampler(config)
controller = TransferQueueController.remote(your_own_sampler)
# Use the sampler
batch_meta = client.get_meta(
data_fields=["input_ids", "attention_mask"],
batch_size=8,
partition_id="train_0",
task_name="generate_sequences",
sampling_config={"n_samples_per_prompt": 4} # Put the required sampling parameters here
)
```
**Refer to [tutorial/04_custom_sampler.py](https://github.com/TransferQueue/TransferQueue/blob/main/tutorial/04_custom_sampler.py) for more details.**
### How to integrate a new storage backend
The data plane is organized as follows:
```text
transfer_queue/
├── storage/
│ ├── __init__.py
│ │── simple_backend.py # Default distributed storage backend (SimpleStorageUnit) by TQ
│ ├── managers/ # Managers are upper level interfaces that encapsulate the interaction logic with TQ system.
│ │ ├── __init__.py
│ │ ├──base.py # TransferQueueStorageManager, KVStorageManager
│ │ ├──simple_backend_manager.py # AsyncSimpleStorageManager
│ │ ├──yuanrong_manager.py # YuanrongStorageManager
│ │ ├──mooncake_manager.py # MooncakeStorageManager
│ │ └──factory.py # TransferQueueStorageManagerFactory
│ └── clients/ # Clients are lower level interfaces that directly manipulate the target storage backend.
│ │ ├── __init__.py
│ │ ├── base.py # TransferQueueStorageKVClient
│ │ ├── yuanrong_client.py # YuanrongStorageClient
│ │ ├── mooncake_client.py # MooncakeStorageClient
│ │ ├── ray_storage_client.py # RayStorageClient
│ │ └── factory.py # TransferQueueStorageClientFactory
```
To integrate TransferQueue with a custom storage backend, start by implementing a subclass that inherits from `TransferQueueStorageManager`. This subclass acts as an adapter between the TransferQueue system and the target storage backend. For KV-based storage backends, you can simply inherit from `KVStorageManager`, which can serve as the general manager for all KV-based backends.
Distributed storage backends often come with their own native clients serving as the interface of the storage system. In such cases, a low-level adapter for this client can be written, following the examples provided in the `storage/clients` directory.
Factory classes are provided for both `StorageManager` and `StorageClient` to facilitate easy integration. Adding necessary descriptions of required parameters in the factory class helps enhance the overall user experience.
✏️ Contribution Guide
**Contributions are warmly welcome!**
New ideas, feature suggestions, and user experience feedback are all encouraged—feel free to submit issues or PRs. We will respond as soon as possible.
We recommend using pre-commit for better code format.
```bash
# install pre-commit
pip install pre-commit
# run the following command in your repo folder, then fix the check before committing your code
pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always
```
Citation
Please kindly cite our paper if you find this repo useful:
```bibtex
@article{han2025asyncflow,
title={AsyncFlow: An Asynchronous Streaming RL Framework for Efficient LLM Post-Training},
author={Han, Zhenyu and You, Ansheng and Wang, Haibo and Luo, Kui and Yang, Guang and Shi, Wenqi and Chen, Menglong and Zhang, Sicheng and Lan, Zeshun and Deng, Chunshi and others},
journal={arXiv preprint arXiv:2507.01663},
year={2025}
}
```
================================================
FILE: docs/examples/config.rst
================================================
.. _config-explain-page:
Config Explanation
===================
Last updated: 06/18/2025.
ppo_trainer.yaml for RL FSDP Backend
-------------------------------------
Data
~~~~
.. code:: yaml
data:
tokenizer: null
train_files: ~/data/rlhf/gsm8k/train.parquet
val_files: ~/data/rlhf/gsm8k/test.parquet
train_max_samples: -1 # set to -1 to use full dataset
val_max_samples: -1 # set to -1 to use full dataset
prompt_key: prompt
max_prompt_length: 512
max_response_length: 512
train_batch_size: 1024
return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs
return_raw_chat: False
return_full_prompt: False
shuffle: True
seed: 42
filter_overlong_prompts: False
filter_overlong_prompts_workers: 1
truncation: error
image_key: images
trust_remote_code: True
custom_cls:
path: null
name: null
- ``data.train_files``: Training set parquet. Can be a list or a single
file. The program will read all files into memory, so it can't be too
large (< 100GB). The path can be either local path or HDFS path. For
HDFS path, we provide utils to download it to DRAM and convert the
HDFS path to local path.
- ``data.val_files``: Validation parquet. Can be a list or a single
file.
- ``data.train_max_samples``: Maximum number of samples to use from the
training dataset. Set to -1 to use the full dataset.
- ``data.val_max_samples``: Maximum number of samples to use from the
validation dataset. Set to -1 to use the full dataset.
- ``data.prompt_key``: The field in the dataset where the prompt is
located. Default is 'prompt'.
- ``data.max_prompt_length``: Maximum prompt length. All prompts will be
left-padded to this length. An error will be reported if the length is
too long
- ``data.max_response_length``: Maximum response length. Rollout in RL
algorithms (e.g. PPO) generates up to this length
- ``data.train_batch_size``: Batch size sampled for one training
iteration of different RL algorithms.
- ``data.return_raw_input_ids``: Whether to return the original
input_ids without adding chat template. This is mainly used to
accommodate situations where the reward model's chat template differs
from the policy. It needs to be decoded first, then apply the RM's
chat template. If using a model-based RM, and the policy and RM
chat_templates are different, this flag needs to be set
- ``data.return_raw_chat``: Whether to return the original chat (prompt)
without applying chat template.
- ``data.return_full_prompt``: Whether to return the full prompt with chat template
- ``data.shuffle``: Whether to shuffle the data in the dataloader.
- ``data.seed``: An integer seed to use when shuffling the data. If not set or set to
`null`, the data shuffling will not be seeded, resulting in a different data order on each run.
- ``data.filter_overlong_prompts``: Whether to filter out overlong prompts. Defaults to False (no filtering).
- ``data.filter_overlong_prompts_workers``: For large-scale datasets, filtering
overlong prompts can be time-consuming. You can set ``filter_overlong_prompts_workers``
to use multiprocessing to speed it up. Defaults to 1.
- ``data.truncation``: How to truncate the input_ids or prompt if they
exceed max_prompt_length. Default is 'error', which does not allow exceeding
max_prompt_length; increase max_prompt_length if this error is raised.
You can also set ``left``, ``right`` or ``middle``.
When ``middle`` is selected, the logic splits the allowed max length roughly in half
and keeps the head and tail of the sequence, effectively discarding the middle section.
- ``data.image_key``: The field in the multi-modal dataset where the image is
located. Default is 'images'.
- ``data.trust_remote_code``: If the remote tokenizer has python file, we can use this field to allow
using remote tokenizer. For example: moonshotai/Moonlight-16B-A3B-Instruct
Customized Dataset
~~~~~~~~~~~~~~~~~~~~~~~~~~
Customized dataset extension is implemented for the SFT trainer and can be extended to other trainers with similar changes.
.. code:: yaml
custom_cls:
path: null
name: null
- ``data.custom_cls.path``: The path to the file containing your customized dataset class. If not specified, pre-implemented dataset will be used.
- ``data.custom_cls.name``: The name of the dataset class within the specified file.
Actor/Rollout/Reference Policy
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code:: yaml
actor_rollout_ref:
hybrid_engine: True
model:
path: ~/models/deepseek-llm-7b-chat
external_lib: null
override_config:
attn_implementation: flash_attention_2 # or eager, sdpa - attention implementation override
model_config: {}
moe_config: # Megatron only, can adjust moe configuration
freeze_moe_router: False # Megatron only, can freeze moe router (no grad)
enable_gradient_checkpointing: False
enable_activation_offload: False
trust_remote_code: False
use_remove_padding: False
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 256
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 8
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
grad_clip: 1.0
clip_ratio: 0.2
entropy_coeff: 0.0
use_kl_loss: False # True for GRPO
# Rollout Correction (corrects distribution mismatch between rollout and training)
rollout_correction:
rollout_is: token # IS weights
rollout_is_threshold: 2.0 # Upper threshold for IS weights
rollout_rs: null # Rejection sampling
rollout_rs_threshold: null # RS upper threshold
use_torch_compile: True # False to disable torch compile
kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
data_loader_seed: null
shuffle: False
ulysses_sequence_parallel_size: 1 # sp size
optim:
lr: 1e-6
lr_warmup_steps: -1 # Prioritized. Negative values mean delegating to lr_warmup_steps_ratio.
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: 0.0 # only used with cosine lr scheduler, default to 0.0
num_cycles: 0.5 # only used with cosine lr scheduler, default to 0.5
lr_scheduler_type: constant # select from constant/cosine
total_training_steps: -1 # must be override by program
fsdp_config:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
param_offload: False
optimizer_offload: False
fsdp_size: -1
checkpoint:
# What to include in saved checkpoints
# with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
save_contents: ['model', 'optimizer', 'extra']
# For more flexibility, you can specify the contents to load from the checkpoint.
load_contents: ${actor_rollout_ref.actor.checkpoint.save_contents}
ref:
fsdp_config:
param_offload: False
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 16
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
rollout:
name: vllm
temperature: 1.0
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1
prompt_length: ${data.max_prompt_length} # not use for opensource
response_length: ${data.max_response_length}
# for vllm rollout
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.5
ignore_eos: False
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor
tensor_model_parallel_size: 2
max_num_batched_tokens: 8192
max_num_seqs: 1024
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 16
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
# for hf rollout
do_sample: True
engine_kwargs: # inference engine parameters, please refer vllm/sglang official doc for detail
vllm: {}
sglang: {}
n: 1 # for each prompt, sample n responses (i.e. num sample times). set it to values > 1 for grpo, rloo
calculate_log_probs: False # set to True for computing log probs via rollouts
val_kwargs:
# sampling parameters for validation
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1.0
temperature: 0
n: 1
do_sample: False # default eager for validation
agent:
custom_async_server: # Use custom async server implementation for rollout
path: null
name: null
**Common config for actor, rollout and reference model**
- ``actor_rollout_ref.hybrid_engine``: Whether it's a hybrid engine,
currently only supports hybrid engine
- ``actor_rollout_ref.model.path``: Huggingface model path. This can be
either local path or HDFS path. For HDFS path, we provide utils to
download it to DRAM and convert the HDFS path to local path.
- ``actor_rollout_ref.model.external_lib``: Additional Python packages
that need to be imported. Used to register models or tokenizers into
the Huggingface system.
- ``actor_rollout_ref.model.override_config``: Used to override some of
the model's original configurations. Common overrides include:
- ``attn_implementation``: Override the attention implementation. Default is ``flash_attention_2``.
Supported values: ``flash_attention_2``, ``eager``, ``sdpa``. Use ``eager`` for debugging or
compatibility issues. See :ref:`attention-implementation-override` for detailed usage.
- ``actor_rollout_ref.model.enable_gradient_checkpointing``: FSDP only. Decides
whether to enable gradient checkpointing for the actor.
Megatron uses the recompute options in ``override_transformer_config`` to set this instead.
- ``actor_rollout_ref.model.enable_activation_offload``: Whether to enable
activation offloading for the actor
- ``actor_rollout_ref.model.trust_remote_code``: Whether to enable loading
a remote code model
- ``actor_rollout_ref.model.use_fused_kernels``: Whether to use fused
kernels in the model. If set to True, the following parameters will be
used.
- ``actor_rollout_ref.model.fused_kernel_options.impl_backend``: The
implementation backend for fused kernels. Options: "triton" or
"torch". Default is "torch".
While in megatron, we only support "triton" as the
implementation backend, so there is no need for this option.
- ``actor_rollout_ref.model.use_remove_padding``: Whether to use remove
padding in the model. If set to True, the model will remove padding
tokens in the input_ids and response_ids. This helps a lot in improving model running efficiency.
- ``actor_rollout_ref.model.tiled_mlp``: TiledMLP configuration for memory-efficient
MLP computation. Reduces peak memory by processing MLP forward/backward in tiles.
Only compatible with FSDP2 (requires ``actor_rollout_ref.actor.strategy=fsdp2``).
- ``actor_rollout_ref.model.tiled_mlp.enabled``: Whether to enable TiledMLP.
Default is False.
- ``actor_rollout_ref.model.tiled_mlp.num_shards``: Number of shards to split
the input. Higher values reduce peak memory but may slightly impact performance.
Default is 4.
**Actor model**
- ``actor_rollout_ref.actor.strategy``: fsdp or megatron. In this
example, we use fsdp backend.
- ``actor_rollout_ref.actor.ppo_mini_batch_size``: One sample is split
into multiple sub-batches with batch_size=ppo_mini_batch_size for PPO
updates. The ppo_mini_batch_size is a global num across all workers/gpus
- ``actor_rollout_ref.actor.ppo_micro_batch_size``: [Will be deprecated, use ppo_micro_batch_size_per_gpu]
Similar to gradient accumulation, the micro batch size for one forward pass,
trading speed for GPU memory. The value represents the global number across all GPUs.
- ``actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu``: Similar to gradient
accumulation, the micro_batch_size_per_gpu for one forward pass, trading speed
for GPU memory. The value represents the local number per GPU.
- ``actor_rollout_ref.actor.grad_clip``: Gradient clipping for actor
updates
- ``actor_rollout_ref.actor.use_kl_loss``: to use kl loss in actor. When used, we are not applying KL in the reward function.
- ``actor_rollout_ref.actor.clip_ratio``: PPO clip ratio
- ``actor_rollout_ref.actor.use_torch_compile``: Whether to use torch compile in actor
- ``actor_rollout_ref.actor.entropy_coeff``: The weight of entropy when
calculating PPO loss. The default value is changed to 0.0 since v0.3.x
- ``actor_rollout_ref.actor.ppo_epochs``: Number of epochs for PPO
updates on one set of sampled data
- ``actor_rollout_ref.actor.data_loader_seed``: Since torch 2.6.0, the Megatron backend can get a wrong seed generated by PyTorch
across cp ranks, causing misalignment between the data on these ranks. The seed should therefore be set manually to avoid a hanging
issue. If ``actor_rollout_ref.actor.shuffle`` is not null, this must be set.
- ``actor_rollout_ref.actor.shuffle``: Whether to shuffle data when
there are multiple epochs
- ``actor_rollout_ref.actor.optim``: Actor's optimizer parameters
- ``actor_rollout_ref.actor.fsdp_config``: FSDP config for actor
training
- ``wrap_policy``: FSDP wrap policy. By default, it uses Huggingface's
wrap policy, i.e., wrapping by DecoderLayer
- No need to set transformer_layer_cls_to_wrap, so we comment it.
- ``*_offload``: Whether to enable parameter, gradient and optimizer
offload
- Trading speed for GPU memory.
- ``actor_rollout_ref.actor.use_kl_loss``: Whether to enable kl loss. Default is False.
- ``actor_rollout_ref.actor.kl_loss_coef``: The coefficient of kl loss. Default is 0.001.
- ``actor_rollout_ref.actor.kl_loss_type``: Support ``kl`` (``k1``), ``abs``, ``mse`` (``k2``), ``low_var_kl`` (``k3``) and ``full``. Appending ``+`` at the end (e.g., ``k1+`` and ``k3+``) uses straight-through to employ ``k2`` for unbiased gradient estimation, regardless of the kl value estimation (see https://github.com/volcengine/verl/pull/2953#issuecomment-3162113848 for more details). This option controls how the kl divergence between the actor and the reference policy is calculated. For specific options, refer to ``kl_penalty()`` in ``core_algos.py``. See this blog post for a detailed analysis: http://joschu.net/blog/kl-approx.html
- ``actor_rollout_ref.actor.checkpoint``: The configurations of checkpoint function in actor
- ``save_contents``: The contents to save in the checkpoint. By default, we save model, optimizer and extra information in the checkpoint.
The extra information currently includes RNG states and the lr_scheduler (supported for FSDP); Megatron opt_param_scheduler support is coming soon.
We do not store hf_model in checkpoint by default, but we provide a tool in ``scripts/model_merge.py`` to convert checkpoint format to hf format.
- ``load_contents``: The contents to load from the checkpoint. You can specify contents to load that differ from what was saved. By default, it is the same as ``save_contents``.
**Reference Model**
Reference model will be enabled when ``actor.use_kl_loss`` or/and ``algorithm.use_kl_in_reward`` is/are True.
- ``actor_rollout_ref.ref``: FSDP config same as actor. **For models
larger than 7B, it's recommended to turn on offload for ref by
default**
- ``actor_rollout_ref.ref.log_prob_micro_batch_size``: [Will be deprecated, use log_prob_micro_batch_size_per_gpu]
The batch size for one forward pass in the computation of ``ref_log_prob``. The value represents the global number across all GPUs.
- ``actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu``: The batch size
for one forward pass in the computation of ``ref_log_prob``. The value represents the local number per GPU.
**Rollout Model**
- ``actor_rollout_ref.rollout.name``: hf/vllm/sglang.
- Rollout (Auto-regressive) parameters. The key should be equal to the
property name in vLLM's ``SamplingParams``.
- ``temperature``, ``top_k``, ``top_p`` and others: Sampling
parameters in ``SamplingParams``.
- ``actor_rollout_ref.rollout.dtype``: Rollout model parameter type. This should be aligned with
the actor model parameter type in the FSDP/Megatron backend.
- ``actor_rollout_ref.rollout.gpu_memory_utilization``:
- For vLLM v0.7.0 and later: The fraction of **total** GPU memory to be used for the vLLM instance.
- For SGLang: Corresponding to ``mem_fraction_static``, the fraction of the free GPU memory used for **static** memory like model weights and KV cache.
- ``actor_rollout_ref.rollout.tensor_model_parallel_size``: TP size for rollout. Only effective
for vllm.
- ``actor_rollout_ref.rollout.log_prob_micro_batch_size``: [Will be deprecated, use log_prob_micro_batch_size_per_gpu]
The batch size for one forward pass in the computation of ``log_prob``. The value represents the global number across all GPUs.
- ``actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu``: Micro batch size per gpu (The batch size for
one forward pass) for recalculating ``log_prob``. The value represent the local num per gpu.
- ``actor_rollout_ref.rollout.do_sample``: Whether to sample during training rollout. If set to False, the rollout model
will perform greedy sampling.
- ``actor_rollout_ref.rollout.val_kwargs``: Sampling parameters used specifically during validation.
- ``top_k``: Top-k sampling parameter. Default to -1 for vLLM rollout or 0 for HF rollout.
- ``top_p``: Top-p sampling parameter. Default is 1.0 (disabled).
- ``temperature``: Sampling temperature. Default is 0 (deterministic greedy).
- ``n``: Number of responses to generate during validation. Default is 1.
- ``do_sample``: Whether to use sampling during validation. Default is False for
deterministic outputs. When set to True, the rollout will use the ``actor_rollout_ref.rollout.val_kwargs`` parameters
(top_k, top_p, temperature) to control the sampling behavior.
- ``actor_rollout_ref.rollout.engine_kwargs.vllm``: extra vllm engine args, please refer vllm official doc for detail
- ``actor_rollout_ref.rollout.engine_kwargs.sglang``: extra sglang engine args, please refer sglang official doc for detail
- ``actor_rollout_ref.rollout.ignore_eos``: Whether to ignore the EOS
token and continue generating tokens after the EOS token is generated.
- ``actor_rollout_ref.rollout.free_cache_engine``: Offload the KVCache
after rollout generation stage. Default is True. When set to True,
for vllm v0.5.4 and v0.6.3, we need to disable the usage of CUDAGraph
(set ``enforce_eager`` to True.)
- ``actor_rollout_ref.rollout.enforce_eager``: Whether to enforce eager mode, i.e. disable CUDAGraph,
in vLLM generation. Default is True, which disables CUDAGraph.
- ``actor_rollout_ref.rollout.load_format``: Which weight loader to use
to load the actor model weights to the rollout model.
- ``auto``: Use Megatron weight loader.
- ``megatron``: Use Megatron weight loader. Deployed with Megatron
backend. The input model ``state_dict()`` is already partitioned
along TP dimension and already gathered along PP dimension. This
weight loader requires that the Rollout model and Actor model's
parameters shape and name should be identical.
- ``dtensor``: Default solution when using Huggingface weight loader.
Deployed with FSDP backend and the state_dict_type is
``StateDictType.SHARDED_STATE_DICT``. Recommend to use this weight
loader
- ``hf``: Use Huggingface weight loader. Deployed with FSDP backend
and the state_dict_type is ``StateDictType.FULL_STATE_DICT``. This
solution doesn't need to rewrite the weight loader for each model
implemented in vLLM but it results in larger peak memory usage.
- ``dummy_hf``, ``dummy_megatron``, ``dummy_dtensor``: Random
initialization.
.. note:: **NOTED**: In this config field, users only need to select from ``dummy_megatron``, ``dummy_dtensor``, ``dummy_hf`` for rollout initialization and our hybrid engine will select the corresponding weight loader (i.e., ``megatron``, ``dtensor``, ``hf``) during actor/rollout weight synchronization.
Megatron Optimizer and Optimizer Parameter Scheduler
____________________________________________________
.. code:: yaml
optim:
optimizer: adam
lr: 1e-6
clip_grad: 1.0
total_training_steps: -1 # must be override by program
lr_warmup_init: 0.0 # initial learning rate for warmup, default to 0.0
lr_warmup_steps: -1 # Prioritized. Negative values mean delegating to lr_warmup_steps_ratio.
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
lr_decay_steps: null
lr_decay_style: constant # select from constant/linear/cosine/inverse_square_root
min_lr: 0.0 # minimum learning rate, default to 0.0
weight_decay: 0.01
weight_decay_incr_style: constant # select from constant/linear/cosine
lr_wsd_decay_style: exponential # select from constant/exponential/cosine
lr_wsd_decay_steps: null
use_checkpoint_opt_param_scheduler: False # use checkpoint optimizer parameter scheduler
Notice that there are some differences in APIs between Megatron optimizer and FSDP optimizer.
- Megatron optimizer scheduler names the period after lr_warmup as lr_decay_steps, so the ``lr_scheduler_type`` actually means the style of lr decay after warmup.
- Megatron optimizer also supports a decay (scheduling) mechanism for weight decay; see ``weight_decay_incr_style``.
- ``use_checkpoint_opt_param_scheduler`` determines whether to use the checkpoint optimizer parameter scheduler. If set to True, the optimizer parameter scheduler will be saved in the checkpoint and loaded from the checkpoint during resuming training.
For learning rate decay, original Megatron pretrain default option of ``lr_decay_style`` is ``linear``,
meaning that the learning rate will be linearly decayed from the initial learning rate to ``min_lr`` within the
``lr_decay_steps``. However, in verl, to align with FSDP's default behavior, we set the default
``lr_decay_style`` to ``constant``, meaning that the learning rate will be kept constant after the warmup stage.
Critic Model
~~~~~~~~~~~~
Most parameters for Critic are similar to Actor Model.
Reward Model
~~~~~~~~~~~~
.. code:: yaml
reward_model:
enable: False
model:
input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
path: ~/models/Anomy-RM-v0.1
external_lib: ${actor_rollout_ref.model.external_lib}
trust_remote_code: False
fsdp_config:
min_num_params: 0
param_offload: False
micro_batch_size_per_gpu: 16
max_length: null
reward_manager: naive
- ``reward_model.enable``: Whether to enable reward model. If False, we
compute the reward only with the user-defined reward functions. In
GSM8K and Math examples, we disable reward model. For RLHF alignment
example using full_hh_rlhf, we utilize reward model to assess the
responses. If False, the following parameters are not effective.
- ``reward_model.model``
- ``input_tokenizer``: Input tokenizer. If the reward model's chat
template is inconsistent with the policy, we need to first decode to
plaintext, then apply the rm's chat_template. Then score with RM. If
chat_templates are consistent, it can be set to null.
- ``path``: RM's HDFS path or local path. Note that RM only supports
AutoModelForSequenceClassification. Other model types need to define
their own RewardModelWorker and pass it from the code.
- ``trust_remote_code``: Whether to enable loading a remote code model,
default to False.
- ``reward_model.reward_manager``: Reward Manager. This defines the mechanism
of computing rule-based reward and handling different reward sources. Default
is ``naive``. If all verification functions are multiprocessing-safe, the reward
manager can be set to ``prime`` for parallel verification.
Customized Reward Function
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code:: yaml
custom_reward_function:
path: null
name: compute_score
- ``custom_reward_function.path``: The path to the file containing your customized reward function. If not specified, pre-implemented reward functions will be used.
- ``custom_reward_function.name`` (Optional) : The name of the reward function within the specified file. Default is 'compute_score'.
Algorithm
~~~~~~~~~
.. code:: yaml
algorithm:
gamma: 1.0
lam: 1.0
adv_estimator: gae
use_kl_in_reward: False
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
kl_coef: 0.005
horizon: 10000
target_kl: 0.1
# Rollout Correction
rollout_correction:
rollout_is: null # IS weights
rollout_is_threshold: 2.0 # Upper threshold for IS weights
rollout_rs: null # Rejection sampling
rollout_rs_threshold: null # RS upper threshold
- ``gamma``: discount factor
- ``lam``: Trade-off between bias and variance in the GAE estimator
- ``adv_estimator``: Support ``gae``, ``grpo``, ``reinforce_plus_plus``, ``reinforce_plus_plus_baseline``, ``rloo``, ``rloo_vectorized``, ``grpo_vectorized``
- ``use_kl_in_reward``: Whether to enable in-reward kl penalty. Default is False.
- ``kl_penalty``: Support ``kl``, ``abs``, ``mse``, ``low_var_kl`` and ``full``. How to
calculate the kl divergence between actor and reference policy. For
specific options, refer to ``kl_penalty()`` in ``core_algos.py``.
- ``kl_ctrl``: Config for in-reward kl_penalty controller
- ``kl_coef``: The (initial) coefficient of in-reward kl_penalty. Default is 0.001.
- ``type``: 'fixed' for FixedKLController and 'adaptive' for AdaptiveKLController.
- ``horizon`` and ``target_kl``: See source code of AdaptiveKLController for details.
- ``rollout_correction``: Rollout Correction configuration (nested dict). Set to ``null`` to disable.
When enabled, contains:
- ``rollout_is``: IS weights aggregation level, ``null`` to disable IS weights.
- ``rollout_is_threshold``: Upper threshold for IS weights (e.g., 2.0).
- ``rollout_rs``: Rejection sampling mode, ``null`` to disable RS.
- ``rollout_rs_threshold``: RS upper threshold.
Note: Rollout Correction requires setting ``actor_rollout_ref.rollout.calculate_log_probs=True``.
Trainer
~~~~~~~
.. code:: yaml
trainer:
total_epochs: 30
project_name: verl_examples
experiment_name: gsm8k
logger: ['console', 'wandb']
log_val_generations: 0
nnodes: 1
n_gpus_per_node: 8
save_freq: -1
val_before_train: True
test_freq: 2
critic_warmup: 0
default_hdfs_dir: null # hdfs checkpoint path
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} # local checkpoint path
resume_mode: auto # or disable or resume_path if resume_from_path is set
resume_from_path: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
ray_wait_register_center_timeout: 300
- ``trainer.total_epochs``: Number of epochs in training.
- ``trainer.project_name``: For wandb, swanlab, mlflow
- ``trainer.experiment_name``: For wandb, swanlab, mlflow
- ``trainer.logger``: Support console and wandb, swanlab, mlflow, tensorboard, trackio
- ``trainer.log_val_generations``: The number of generations logged during validation (default ``0``)
- ``trainer.nnodes``: Number of nodes used in the training.
- ``trainer.n_gpus_per_node``: Number of GPUs per node.
- ``trainer.save_freq``: The frequency (by iteration) to save checkpoint
of the actor and critic model.
- ``trainer.val_before_train``: Whether to run validation before training.
- ``trainer.test_freq``: The validation frequency (by iteration).
- ``trainer.critic_warmup``: The number of iterations to train the critic
model before actual policy learning.
- ``trainer.resume_mode``: The mode of resuming training. Support
``disable``, ``auto`` and ``resume_path``. If set to ``auto`` as default, the
program will automatically resume from the latest checkpoint in the
``default_local_dir``. If set to ``resume_path``, the program will resume
from the path specified in ``resume_from_path``.
- ``trainer.resume_from_path``: The path to resume training from. Only
effective when ``resume_mode`` is set to ``resume_path``.
- ``trainer.remove_previous_ckpt_in_save``: Whether to remove previous
checkpoints in the save directory. Default is False.
- ``trainer.del_local_ckpt_after_load``: Whether to delete local
checkpoints after loading them. Default is False.
- ``trainer.ray_wait_register_center_timeout``: The timeout for waiting
for the ray register center to be ready. Default is 300 seconds.
This figure illustrates how the configurations affect the training.
https://excalidraw.com/#json=pfhkRmiLm1jnnRli9VFhb,Ut4E8peALlgAUpr7E5pPCA
.. image:: https://github.com/user-attachments/assets/16aebad1-0da6-4eb3-806d-54a74e712c2d
evaluation.yaml
---------------
Data
~~~~
.. code:: yaml
data:
path: /tmp/math_Qwen2-7B-Instruct.parquet
prompt_key: prompt
response_key: responses
data_source_key: data_source
reward_model_key: reward_model
- ``data.path``: Path to the dataset file (Parquet format).
- ``data.prompt_key``: The field in the dataset where the prompt is located. Default is 'prompt'.
- ``data.response_key``: The key that holds the generated responses. This should be a list of strings representing the responses. Default is 'responses'.
- ``data.data_source_key``: This is used to separate metric calculations for different data sources, ensuring that metrics are calculated independently for each source.
- ``data.reward_model_key``: The key that holds the reference answers. These reference answers typically serve as the ground truth or test cases for the task.
Customized Reward Function
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code:: yaml
custom_reward_function:
path: null
name: compute_score
- ``custom_reward_function.path``: The path to the file containing your customized reward function. If not specified, pre-implemented reward functions will be used.
- ``custom_reward_function.name`` (Optional) : The name of the reward function within the specified file. Default is 'compute_score'.
sft_trainer.yaml for SFT FSDP Backend
--------------------------------------
Optim
~~~~~~~
.. code:: yaml
optim:
optimizer: AdamW
optimizer_impl: torch.optim
lr: 1e-5
weight_decay: 0.01
lr_warmup_steps_ratio: 0.1
clip_grad: 1.0
lr_scheduler: cosine
override_optimizer_config: null
- ``optimizer``: Optimizer class name (e.g., ``"AdamW"``, ``"AdamW8bit"``, ``"_AdamW"``). The class name as it appears in the module.
- ``optimizer_impl``: Module path to import optimizer from (e.g., ``"torch.optim"``, ``"torchao.optim"``, ``"bitsandbytes.optim"``).
- ``optim.lr``: Learning rate for the optimizer.
- ``optim.weight_decay``: Weight decay for the optimizer.
- ``optim.lr_warmup_steps_ratio``: Ratio of warmup steps to total training steps.
- ``optim.clip_grad``: Gradient clipping value.
- ``optim.lr_scheduler``: Learning rate scheduler type. Options:
- ``cosine``: Cosine learning rate scheduler with warmup (default).
- ``wsd``: Warmup-Stable-Decay scheduler that provides a stable learning rate phase between warmup and decay phases.
- ``override_optimizer_config``: Dictionary of additional optimizer-specific keyword arguments. For example, to use ``torchao.optim``'s ``_AdamW`` with BF16 stochastic rounding: ``{"bf16_stochastic_round": true}``
Model
~~~~~~~~~~~~
Most parameters for Model are similar to Reward Model.
.. code:: yaml
model:
partial_pretrain: ~/models/gemma-1.1-7b-it
fsdp_config:
model_dtype: fp32
wrap_policy:
min_num_params: 0
cpu_offload: False
offload_params: False
external_lib: null
enable_gradient_checkpointing: False
trust_remote_code: False
lora_rank: 0
lora_alpha: 16
target_modules: all-linear
use_liger: False
- ``partial_pretrain``: HDFS path or local path for the pretrained model.
- ``fsdp_config``
- ``model_dtype``: Model parameters type, default to ``fp32``.
Support: ``bf16``, ``fp16``, ``fp32``.
- ``cpu_offload``: Whether to enable CPU offloading for FSDP. If True,
the offload_params will be used as argument.
- ``offload_params``: Whether to offload parameters to CPU
when not involved in computation. If True, then this offloads gradients
to CPU as well, meaning that the optimizer step runs on CPU.
- ``lora_rank``: The rank of the LoRA model, default to 0. If ``lora_rank``>0,
we will train LoRA modules instead of tuning the full model.
- ``lora_alpha``: The alpha parameter for LoRA scaling, default to 16.
- ``target_modules``: The names of the modules to apply the adapter to,
default to ``all-linear``. See the PEFT documentation for details.
- ``use_liger``: Whether to enable Liger kernel, default to False. If True,
we apply Liger kernel to the model (depends on `liger-kernel`).
================================================
FILE: docs/examples/gsm8k_example.rst
================================================
GSM8K Example
=============
Last updated: 03/25/2025.
Introduction
------------
In this example, we train an LLM to tackle the GSM8k task.
Paper: https://arxiv.org/pdf/2110.14168
Dataset: https://huggingface.co/datasets/openai/gsm8k
Note that the original paper mainly focuses on training a verifier (a
reward model) to solve math problems via Best-of-N sampling. In this
example, we train an RLHF agent using a rule-based reward model.
Dataset Introduction
--------------------
GSM8k is a math problem dataset. The prompt is an elementary school
problem. The LLM model is required to answer the math problem.
The training set contains 7473 samples and the test set contains 1319
samples.
**An example**
Prompt
Katy makes coffee using teaspoons of sugar and cups of water in the
ratio of 7:13. If she used a total of 120 teaspoons of sugar and cups
of water, calculate the number of teaspoonfuls of sugar she used.
Solution
The total ratio representing the ingredients she used to make the
coffee is 7+13 = <<7+13=20>>20 Since the fraction representing the
number of teaspoons she used is 7/20, she used 7/20\ *120 =
<<7/20*\ 120=42>>42 #### 42
Step 1: Prepare dataset
-----------------------
.. code:: bash
cd examples/data_preprocess
python3 gsm8k.py --local_save_dir ~/data/gsm8k
Step 2: Download Model
----------------------
There're three ways to prepare the model checkpoints for post-training:
- Download the required models from huggingface or modelscope
.. code:: bash
hf download deepseek-ai/deepseek-math-7b-instruct --local-dir ~/models/deepseek-math-7b-instruct --local-dir-use-symlinks False
# or
modelscope download --model deepseek-ai/deepseek-math-7b-instruct --local_dir ~/models/deepseek-math-7b-instruct
- Store your model in a local directory or HDFS path in advance.
- Also, you can directly use the model name in huggingface (e.g.,
deepseek-ai/deepseek-math-7b-instruct) in
``actor_rollout_ref.model.path`` and ``critic.model.path`` field in
the run script. You can also download models from modelscope by setting environmental variable ``VERL_USE_MODELSCOPE=True``.
See examples/ppo_trainer/run_deepseek7b_llm_modelscope.sh for example.
Note that users should prepare checkpoints for the actor, critic and reward
model.
[Optional] Step 3: SFT your Model
---------------------------------
We provide an SFT Trainer using PyTorch FSDP in
``verl/trainer/sft_trainer.py``.
Users can customize their own SFT
script using our FSDP SFT Trainer.
We also provide various training scripts for SFT on the GSM8K dataset in the GSM8K SFT examples directory.
.. code:: shell
set -x
torchrun -m verl.trainer.sft_trainer \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.messages_key=messages \
data.micro_batch_size_per_gpu=8 \
model.path=deepseek-ai/deepseek-coder-6.7b-instruct \
trainer.project_name=gsm8k-sft \
trainer.experiment_name=gsm8k-sft-deepseek-coder-6.7b-instruct \
trainer.total_epochs=4 \
trainer.logger='["console","wandb"]'
If you use AMD GPUs (ROCm kernel), you need to add the following environment variables into the run script:
.. code-block:: bash
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
export CUDA_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
Step 4: Perform PPO training with your model on GSM8K Dataset
-------------------------------------------------------------
- Prepare your own run.sh script. Here's an example for GSM8k dataset
and deepseek-llm-7b-chat model.
- Users could replace the ``data.train_files`` ,\ ``data.val_files``,
``actor_rollout_ref.model.path`` and ``critic.model.path`` based on
their environment.
- See :doc:`config` for detailed explanation of each config field.
**Reward Model/Function**
We use a rule-based reward model. We force the model to produce a final
answer following 4 “#” as shown in the solution. We extract the final
answer from both the solution and model's output using regular
expression matching. We compare them and assign a reward of 1 to correct
answer, 0.1 to incorrect answer and 0 to no answer.
**Training Script**
Example training scripts for the FSDP and Megatron-LM backends are stored in the examples/ppo_trainer directory.
.. code:: bash
cd ../ppo_trainer
bash run_deepseek7b_llm.sh
The script of run_deepseek7b_llm.sh
.. code:: bash
set -x
python3 -m verl.trainer.main_ppo \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.train_batch_size=1024 \
data.max_prompt_length=512 \
data.max_response_length=512 \
actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding=True \
critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
critic.model.enable_gradient_checkpointing=True \
critic.ppo_micro_batch_size_per_gpu=32 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='verl_example_gsm8k' \
trainer.experiment_name='deepseek_llm_7b_function_rm' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=1 \
trainer.total_epochs=15 $@
If you use AMD GPUs (ROCm kernel), you need to add the following environment variables into the run script:
.. code-block:: bash
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
export CUDA_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
If you encounter any issues in using AMD GPUs running VeRL, feel free to contact me - `Yusheng Su `_.
================================================
FILE: docs/examples/multi_modal_example.rst
================================================
Multi-Modal Example Architecture
=================================
Last updated: 04/28/2025.
Introduction
------------
verl now supports multi-modal training. You can use fsdp and
vllm/sglang to start a multi-modal RL task. Megatron support is also
on the way.
Follow the steps below to quickly start a multi-modal RL task.
Step 1: Prepare dataset
-----------------------
.. code:: bash
# it will be saved in the $HOME/data/geo3k folder
python examples/data_preprocess/geo3k.py
Step 2: Download Model
----------------------
.. code:: bash
# download the model from huggingface
python3 -c "import transformers; transformers.pipeline(model='Qwen/Qwen2.5-VL-7B-Instruct')"
Step 3: Perform GRPO training with multi-modal model on Geo3K Dataset
---------------------------------------------------------------------
.. code:: bash
# run the task
bash examples/grpo_trainer/run_qwen2_5_vl-7b.sh
================================================
FILE: docs/examples/ppo_code_architecture.rst
================================================
PPO Example Architecture
========================
Last updated: 02/17/2025.
Let's start with the Proximal Policy Optimization algorithm, which is
the most widely used algorithm in LLM post-training.
The main entry point of the PPO algorithm example is:
`main_ppo.py `_.
In this tutorial, we will go through the code architecture in `main_ppo.py `_.
Define the data
---------------
Users need to preprocess and store the dataset in parquet files.
And we implement `RLHFDataset` to load and tokenize the parquet files.
For ``RLHFDataset`` (Default), at least one field is required:
- ``prompt``: Contains the string prompt
We already provide some examples of processing the datasets to parquet
files in `data_preprocess directory `_. Currently, we support
preprocessing of GSM8k, MATH, HellaSwag, Full_hh_rlhf datasets. See :doc:`../preparation/prepare_data` for
more information.
Define the reward functions for different datasets
--------------------------------------------------
In this main entry point, the users only need to define their own reward
function based on the datasets (or applications) utilized in PPO
training.
For example, we already provide reward functions for `GSM8k `_
and `MATH `_
datasets in the ``_select_rm_score_fn``. In the ``RewardManager``, we
select the corresponding reward function based on the ``data_source`` and
use it to compute the reward score. For some RLHF datasets (e.g.,
full_hh_rlhf), the reward model is utilized to assess the responses
without any reward functions. In this case, the ``RewardManager`` will
return the ``rm_score`` computed by the reward model directly.
See `reward functions `_ for detailed implementation.
Define worker classes
---------------------
.. code:: python
if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}: # for FSDP backend
assert config.critic.strategy in {"fsdp", "fsdp2"}
from verl.workers.fsdp_workers import ActorRolloutRefWorker, CriticWorker
from verl.single_controller.ray import RayWorkerGroup
ray_worker_group_cls = RayWorkerGroup
elif config.actor_rollout_ref.actor.strategy == 'megatron': # for Megatron backend
assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
from verl.workers.megatron_workers import ActorRolloutRefWorker, CriticWorker
from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
ray_worker_group_cls = NVMegatronRayWorkerGroup # Ray worker class for Megatron-LM
else:
raise NotImplementedError
from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
role_worker_mapping = {
Role.ActorRollout: ActorRolloutRefWorker,
Role.Critic: CriticWorker,
Role.RefPolicy: ActorRolloutRefWorker
}
global_pool_id = 'global_pool'
resource_pool_spec = {
global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
}
mapping = {
Role.ActorRollout: global_pool_id,
Role.Critic: global_pool_id,
Role.RefPolicy: global_pool_id,
}
Step 1: Construct the mapping between roles and workers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A role represents a group of workers in the same process. We have
pre-defined several roles in `ray_trainer.py `_.
.. code:: python
class Role(Enum):
"""
To create more roles dynamically, you can subclass Role and add new members
"""
Actor = 0 # This worker only has Actor
Rollout = 1 # This worker only has Rollout
ActorRollout = 2 # This worker has both actor and rollout, it's a HybridEngine
Critic = 3 # This worker only has critic
RefPolicy = 4 # This worker only has reference policy
RewardModel = 5 # This worker only has reward model
ActorRolloutRef = 6 # This worker contains actor, rollout and reference policy simultaneously
Step 2: Define the worker class corresponding to this role
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- We have pre-implemented the ``ActorRolloutRefWorker``. Through
different configs, it can be a standalone actor, a standalone rollout,
an ActorRollout HybridEngine, or an ActorRolloutRef HybridEngine
- We also pre-implemented workers for ``Actor``, ``Rollout``,
``Critic``, ``Reward Model`` and ``Reference model`` on two different
backends: PyTorch FSDP
and Megatron-LM.
See `FSDP Workers `_
and `Megatron-LM Workers `_
for more information.
Step 3: Define resource pool id and resource pool spec
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Resource pool is a division of global GPU resources,
``resource_pool_spec`` is a dict, mapping from id to # of GPUs
- In the above example, we defined a global resource pool:
global_pool_id, and then put all roles on this one resource pool
with all the GPUs in this post-training task. This refers to
*co-locate* placement where all the models share the same set of
GPUs.
- See resource pool and placement for advanced usage.
Defining reward model/function
------------------------------
.. code:: python
# we should adopt a multi-source reward function here
# - for rule-based rm, we directly call a reward score
# - for model-based rm, we call a model
# - for code related prompt, we send to a sandbox if there are test cases
# - finally, we combine all the rewards together
# - The reward type depends on the tag of the data
if config.reward_model.enable:
from verl.workers.fsdp_workers import RewardModelWorker
role_worker_mapping[Role.RewardModel] = RewardModelWorker
mapping[Role.RewardModel] = global_pool_id
reward_fn = RewardManager(tokenizer=tokenizer, num_examine=0)
# Note that we always use function-based RM for validation
val_reward_fn = RewardManager(tokenizer=tokenizer, num_examine=1)
resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
Since not all tasks use model-based RM, users need to define here
whether it's a model-based RM or a function-based RM
- If it's a model-based RM, directly add the ``RewardModel`` role in the
resource mapping and add it to the resource pool mapping.
- Note that the pre-defined ``RewardModelWorker`` only supports models
with the structure of huggingface
``AutoModelForSequenceClassification``. If it's not this model, you
need to define your own RewardModelWorker in `FSDP Workers `_
and `Megatron-LM Workers `_.
- If it's a function-based RM, users are required to specify the
reward function for each dataset.
.. code:: python
def _select_rm_score_fn(data_source):
if data_source == 'openai/gsm8k':
return gsm8k.compute_score
elif data_source == 'lighteval/MATH':
return math.compute_score
else:
raise NotImplementedError
See reward functions implemented in `directory `_
for more information.
Define, init and run the PPO Trainer
------------------------------------
.. code:: python
trainer = RayPPOTrainer(config=config,
tokenizer=tokenizer,
role_worker_mapping=role_worker_mapping,
resource_pool_manager=resource_pool_manager,
ray_worker_group_cls=ray_worker_group_cls,
reward_fn=reward_fn,
val_reward_fn=val_reward_fn)
trainer.init_workers()
trainer.fit()
- We first initialize the ``RayPPOTrainer`` with user config, tokenizer
and all the above worker mapping, resource pool, worker group and
reward functions
- We then call the ``trainer.init_workers()`` to initialize the models
on the allocated GPUs (in the resource pool)
- The actual PPO training will be executed in ``trainer.fit()``
verl can be easily extended to other RL algorithms by reusing the Ray
model workers, resource pool and reward functions. See :doc:`extension<../advance/dpo_extension>` for
more information.
Details of the ``RayPPOTrainer`` are discussed in :doc:`Ray Trainer<../workers/ray_trainer>`.
================================================
FILE: docs/examples/sandbox_fusion_example.rst
================================================
Sandbox Fusion Example
============================
Last updated: 06/27/2025.
Introduction
------------
Sandbox Fusion is a remote code sandbox service that provides a secure environment for running and evaluating code generated by Large Language Models (LLMs). This example demonstrates how to train an LLM and use Sandbox Fusion to verify generated code, enhancing both security and performance.
By leveraging a remote code sandbox service with greater CPU resources for concurrent code verification, you can reduce the reward stage time by 10-30%, depending on the quality of the generated code.
Step 1: Prepare the Dataset
---------------------------
We use the Eurus-2-RL-Data dataset for training. This dataset combines math and code questions, making it suitable for LLM training tasks. You can download it from HuggingFace: `Eurus-2-RL-Data Dataset `_.
Step 2: Set Up the Sandbox Fusion Service
-----------------------------------------
Sandbox Fusion is a remote code sandbox service designed to securely run and evaluate LLM-generated code. To use it:
1. **Access Full Documentation**: For detailed setup instructions, refer to the `Sandbox Fusion Documentation `_.
2. **Deploy the Service**: Choose one of the following deployment methods:
- **Local Deployment**: Follow the guide `here `_.
- **FaaS Instance (Volcengine)**: Create an instance using the `Volcengine Documentation `_.
After deployment, you will receive an API endpoint in the format: ``https://<your-endpoint>/run_code``.
Step 3: Configure the Training Script
-------------------------------------
To integrate Sandbox Fusion into your training script, configure the following parameters:
**Key Settings for Sandbox Fusion**
- ``reward_model.sandbox_fusion.url='<API-endpoint>'``: Enable Sandbox Fusion by specifying the API endpoint (must end with ``/run_code``).
- ``reward_model.sandbox_fusion.max_concurrent=256``: Set the maximum number of concurrent API requests to the Sandbox Fusion service.
- ``reward_model.sandbox_fusion.memory_limit_mb=1024``: Set the memory limit (in MB) for each sandbox instance. Defaults to 1024MB if not specified.
**Additional Optimization**
To further reduce code verification time, enable parallel processing with:
- ``reward_model.reward_manager=prime``: The Prime reward manager verifies code across multiple subprocesses concurrently.
**Example Script**
For a practical implementation, refer to the example script:
``examples/ppo_trainer/run_deepseek7b_llm_sandbox_fusion.sh``
Once you’ve set your API endpoint in the script, you can start the training job.
================================================
FILE: docs/examples/skypilot_examples.rst
================================================
SkyPilot Examples
=================
Last updated: 09/04/2025.
This guide provides examples of running VERL reinforcement learning training on Kubernetes clusters or cloud platforms with GPU nodes using `SkyPilot `_.
Installation and Configuration
-------------------------------
Step 1: Install SkyPilot
~~~~~~~~~~~~~~~~~~~~~~~~~
Choose the installation based on your target platform:
.. code-block:: bash
# For Kubernetes only
pip install "skypilot[kubernetes]"
# For AWS
pip install "skypilot[aws]"
# For Google Cloud Platform
pip install "skypilot[gcp]"
# For Azure
pip install "skypilot[azure]"
# For multiple platforms
pip install "skypilot[kubernetes,aws,gcp,azure]"
Step 2: Configure Your Platform
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
See https://docs.skypilot.co/en/latest/getting-started/installation.html
Step 3: Set Up Environment Variables
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Export necessary API keys for experiment tracking:
.. code-block:: bash
# For Weights & Biases tracking
export WANDB_API_KEY="your-wandb-api-key"
# For HuggingFace gated models (if needed)
export HF_TOKEN="your-huggingface-token"
Examples
--------
All example configurations are available in the `examples/skypilot/ `_ directory on GitHub. See the `README `_ for additional details.
PPO Training
~~~~~~~~~~~~
.. code-block:: bash
sky launch -c verl-ppo verl-ppo.yaml --secret WANDB_API_KEY -y
Runs PPO training on GSM8K dataset using Qwen2.5-0.5B-Instruct model across 2 nodes with H100 GPUs. Based on examples in ``examples/ppo_trainer/``.
`View verl-ppo.yaml on GitHub `_
GRPO Training
~~~~~~~~~~~~~
.. code-block:: bash
sky launch -c verl-grpo verl-grpo.yaml --secret WANDB_API_KEY -y
Runs GRPO (Group Relative Policy Optimization) training on MATH dataset using Qwen2.5-7B-Instruct model. Memory-optimized configuration for 2 nodes. Based on examples in ``examples/grpo_trainer/``.
`View verl-grpo.yaml on GitHub `_
Multi-turn Tool Usage Training
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
sky launch -c verl-multiturn verl-multiturn-tools.yaml \
--secret WANDB_API_KEY --secret HF_TOKEN -y
Single-node training with 8xH100 GPUs for multi-turn tool usage with Qwen2.5-3B-Instruct. Includes tool and interaction configurations for GSM8K. Based on examples in ``examples/sglang_multiturn/`` but uses vLLM instead of sglang.
`View verl-multiturn-tools.yaml on GitHub `_
Configuration
-------------
The example YAML files are pre-configured with:
- **Infrastructure**: Kubernetes clusters (``infra: k8s``) - can be changed to ``infra: aws`` or ``infra: gcp``, etc.
- **Docker Image**: VERL's official Docker image with CUDA 12.6 support
- **Setup**: Automatically clones and installs VERL from source
- **Datasets**: Downloads required datasets during setup phase
- **Ray Cluster**: Configures distributed training across nodes
- **Logging**: Supports Weights & Biases via ``--secret WANDB_API_KEY``
- **Models**: Supports gated HuggingFace models via ``--secret HF_TOKEN``
Launch Command Options
----------------------
- ``-c <name>``: Cluster name for managing the job
- ``--secret KEY``: Pass secrets for API keys (can be used multiple times)
- ``-y``: Skip confirmation prompt
Monitoring Your Jobs
--------------------
Check Cluster Status
~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
sky status
View Logs
~~~~~~~~~
.. code-block:: bash
sky logs verl-ppo # View logs for the PPO job
SSH into Head Node
~~~~~~~~~~~~~~~~~~
.. code-block:: bash
ssh verl-ppo
Access Ray Dashboard
~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
sky status --endpoint 8265 verl-ppo # Get dashboard URL
Stop a Cluster
~~~~~~~~~~~~~~
.. code-block:: bash
sky down verl-ppo
================================================
FILE: docs/faq/faq.rst
================================================
Frequently Asked Questions
====================================
Last updated: 09/24/2025.
Ray related
------------
How to add breakpoint for debugging with distributed Ray?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Please check out the official debugging guide from Ray: https://docs.ray.io/en/latest/ray-observability/ray-distributed-debugger.html
"Unable to register worker with raylet"
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This issue is caused by certain system settings, e.g., SLURM adding constraints on how CPUs are shared on a node.
While `ray.init()` tries to launch as many worker processes as the number of CPU cores of the machine,
some SLURM constraints prevent the `core-workers` from seeing the `raylet` process, leading to the problem.
To fix this issue, you can set the config term ``ray_init.num_cpus`` to a number allowed by your system.
Distributed training
------------------------
How to run multi-node post-training with Ray?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You can start a ray cluster and submit a ray job, following the official guide from Ray: https://docs.ray.io/en/latest/ray-core/starting-ray.html
Then in the configuration, set the ``trainer.nnodes`` config to the number of machines for your job.
How to use verl on a Slurm-managed cluster?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Ray provides users with `this `_ official
tutorial to start a Ray cluster on top of Slurm. We have verified the :doc:`GSM8K example<../examples/gsm8k_example>`
on a Slurm cluster under a multi-node setting with the following steps.
1. [Optional] If your cluster supports `Apptainer or Singularity `_ and you wish
to use it, convert verl's Docker image to an Apptainer image. Alternatively, set up the environment with the package
manager available on your cluster or use other container runtimes (e.g. through `Slurm's OCI support `_) available to you.
.. code:: bash
apptainer pull /your/dest/dir/vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3.sif docker://verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
2. Follow :doc:`GSM8K example<../examples/gsm8k_example>` to prepare the dataset and model checkpoints.
3. Modify `examples/slurm/ray_on_slurm.slurm `_ with your cluster's own information.
4. Submit the job script to the Slurm cluster with `sbatch`.
Please note that Slurm cluster setup may vary. If you encounter any issues, please refer to Ray's
`Slurm user guide `_ for common caveats.
If you changed Slurm resource specifications, please make sure to update the environment variables in the job script if necessary.
Install related
------------------------
NotImplementedError: TensorDict does not support membership checks with the `in` keyword.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Detail error information:
.. code:: bash
NotImplementedError: TensorDict does not support membership checks with the `in` keyword. If you want to check if a particular key is in your TensorDict, please use `key in tensordict.keys()` instead.
Cause of the problem: There is no suitable version of tensordict package for the linux-arm64 platform. The confirmation method is as follows:
.. code:: bash
pip install tensordict==0.6.2
Output example:
.. code:: bash
ERROR: Could not find a version that satisfies the requirement tensordict==0.6.2 (from versions: 0.0.1a0, 0.0.1b0, 0.0.1rc0, 0.0.2a0, 0.0.2b0, 0.0.3, 0.1.0, 0.1.1, 0.1.2, 0.8.0, 0.8.1, 0.8.2, 0.8.3)
ERROR: No matching distribution found for tensordict==0.6.2
Solution 1:
Install tensordict from source code:
.. code:: bash
pip uninstall tensordict
git clone https://github.com/pytorch/tensordict.git
cd tensordict/
git checkout v0.6.2
python setup.py develop
pip install -v -e .
Solution 2:
Temporarily modify the code where the error occurs: tensordict_var -> tensordict_var.keys()
Illegal memory access
---------------------------------
If you encounter the error message like ``CUDA error: an illegal memory access was encountered`` during rollout, please check the vLLM documentation for troubleshooting steps specific to your vLLM version.
Checkpoints
------------------------
If you want to convert the model checkpoint into huggingface safetensor format, please refer to ``verl/model_merger``.
Triton ``compile_module_from_src`` error
------------------------------------------------
If you encounter triton compilation error similar to the stacktrace below, please set the ``use_torch_compile`` flag according to
https://verl.readthedocs.io/en/latest/examples/config.html to disable just-in-time compilation for fused kernels.
.. code:: bash
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/jit.py", line 345, in
return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 338, in run
return self.fn.run(*args, **kwargs)
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/jit.py", line 607, in run
device = driver.active.get_current_device()
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/driver.py", line 23, in __getattr__
self._initialize_obj()
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/driver.py", line 20, in _initialize_obj
self._obj = self._init_fn()
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/driver.py", line 9, in _create_driver
return actives[0]()
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 371, in __init__
self.utils = CudaUtils() # TODO: make static
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 80, in __init__
mod = compile_module_from_src(Path(os.path.join(dirname, "driver.c")).read_text(), "cuda_utils")
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 57, in compile_module_from_src
so = _build(name, src_path, tmpdir, library_dirs(), include_dir, libraries)
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/build.py", line 48, in _build
ret = subprocess.check_call(cc_cmd)
File "/data/lbh/conda_envs/verl/lib/python3.10/subprocess.py", line 369, in check_call
raise CalledProcessError(retcode, cmd)
What is the meaning of train batch size, mini batch size, and micro batch size?
------------------------------------------------------------------------------------------
This figure illustrates the relationship between different batch size configurations.
https://excalidraw.com/#json=pfhkRmiLm1jnnRli9VFhb,Ut4E8peALlgAUpr7E5pPCA
.. image:: https://github.com/user-attachments/assets/16aebad1-0da6-4eb3-806d-54a74e712c2d
How to generate ray timeline to analyse performance of a training job?
------------------------------------------------------------------------------------------
To generate the ray timeline file, you can set the config term ``ray_init.timeline_json_file`` to a json file path.
For example:
.. code:: bash
ray_init.timeline_json_file=/tmp/ray_timeline.json
The file will be generated in the specified path at the end of a training job.
You can use tools like chrome://tracing or the Perfetto UI and view the ray timeline file.
This figure shows the ray timeline file generated from a training job on 1 node with 4 GPUs:
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray_timeline.png?raw=true
How to set proxy only for wandb?
------------------------------------------------------------------------------------------
If you need a proxy to access wandb, you can add below config in your training job script.
Compared to using the global https_proxy env variable, this approach won't interfere with other http requests, such as those from ChatCompletionScheduler.
.. code:: bash
+trainer.wandb_proxy=http://<your-proxy-host>:<port>
Mismatch between inference and training sequence (high actor/grad_norm)
------------------------------------------------------------------------------------------
If you find that the actor/grad_norm metric keeps increasing during training, it might be caused by a significant precision mismatch between the inference engine and training. You can use the following parameter to confirm this:
.. code:: bash
actor_rollout_ref.rollout.calculate_log_probs=True
This parameter will add metrics like ``training/rollout_probs_diff_mean``, which can be used to verify if there is a precision difference between inference and training.
Under normal circumstances, the value of training/rollout_probs_diff_mean should be below 0.005. If you observe this value to be higher than 0.01, it indicates a precision issue from the inference engine.
The precision issue is known to occur under the following conditions:
1. Using non-Hopper architecture GPUs, such as A100, L20, B200, etc.
2. Using vLLM `with issue 22103 `_ as the inference engine.
3. The input and output texts are long, for example, in multi-turn scenarios using reasoning models like Qwen3 for RL training.
If all three conditions above are met and you observe that rollout_probs_diff_mean is too high, it is recommended to add the following parameter to resolve the precision issue:
.. code:: bash
+actor_rollout_ref.rollout.engine_kwargs.vllm.disable_cascade_attn=True
The root cause of this issue is a bug in the flash attention used by vLLM. Although it has been fixed, the fix has not yet been released in the latest version of vLLM (v0.10.2).
For a more detailed explanation of this issue, please refer to `Fix LSE output error in FA2 kv-split `_.
Until vLLM releases a new version with this fix, it is recommended to use the configuration above to disable cascade attention as a workaround.
================================================
FILE: docs/hybrid_flow.rst
================================================
=========================================================
HybridFlow Programming Guide
=========================================================
Last updated: 06/02/2025.
.. _vermouth: https://github.com/vermouth1992
Author: `Chi Zhang `_
verl is an open source implementation of the paper `HybridFlow `_ [1]_. In this section, we will introduce the basic concepts of HybridFlow, the motivation and how to program with verl APIs.
Motivation and Design
------------------------
We use dataflow to represent RL systems [4]_.
DataFlow
~~~~~~~~~~~~~~~~~~~~
Dataflow is an abstraction of computations. Neural Network training is a typical dataflow. It can be represented by computational graph.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/dataflow.jpeg?raw=true
:alt: The dataflow graph from CS231n 2024 lecture 4
This figure [2]_ represents the computation graph of a polynomial function followed by a sigmoid function. In the data flow of neural network computation, each node represents an operator, and each edge represents the direction of forward/backward propagation. The computation graph determines the architecture of the neural network.
RL as a dataflow problem
++++++++++++++++++++++++++++++++++++++++++++++
Reinforcement learning (RL) training can also be represented as a dataflow. Below is the dataflow graph that represents the PPO algorithm used in RLHF [3]_:
.. image:: https://picx.zhimg.com/70/v2-cb8ab5ee946a105aab6a563e92682ffa_1440w.avis?source=172ae18b&biz_tag=Post
:alt: PPO dataflow graph, credit to Zhihu 低级炼丹师
However, the dataflow of RL has fundamental differences compared with dataflow of neural network training as follows:
+--------------------------+--------------------------------------------------+---------------------+
| Workload | Node | Edge |
+--------------------------+--------------------------------------------------+---------------------+
| Neural Network Training | Operator (+/-/matmul/softmax) | Tensor movement |
+--------------------------+--------------------------------------------------+---------------------+
| Reinforcement Learning | High-level operators (rollout/model forward) | Data Movement |
+--------------------------+--------------------------------------------------+---------------------+
In the case of tabular reinforcement learning, each operator is a simple scalar math operation (e.g., bellman update). In deep reinforcement learning(DRL), each operator is a high-level neural network computation such as model inference/update. This makes RL a two-level dataflow problem:
- Control flow: defines how the high-level operators are executed (e.g., In PPO, we first perform rollout. Then, we perform advantage computation. Finally, we perform training). It expresses the **core logics of RL algorithms**.
- Computation flow: defines the dataflow of **neural network computation** (e.g., model forward/backward/optimizer).
Design Choices
~~~~~~~~~~~~~~~~~~~~
The model size used in DRL before the LLM era is typically small. Thus, the high-level neural network computation can be done in a single process. This enables embedding the computation flow inside the control flow as a single process.
However, in the LLM era, the computation flow (e.g., training neural network) becomes a multi-process program. This naturally leads to two design choices:
1. Convert the control flow into a multi-process program as well. Then colocate with computation flow (unified multi-controller)
- Advantages:
- Achieves the **optimal performance** under fixed computation flow and control flow as the communication overhead in both training and data transfer is minimized.
- Disadvantages:
- The computation and/or control flow is **hard to reuse** from a software perspective, as computation code is coupled with specific controller code. For example, the training loop of PPO is generic. Say we have a PPO training flow implemented with a specific computation flow such as FSDP. Neither the control flow nor the computation flow can be reused if we want to switch the computation flow from FSDP to Megatron, due to the coupling of control and computation flows.
- Requires more effort from the user under flexible and dynamic control flows, due to the multi-process nature of the program.
2. Separate the flows: single process for the control flow and multi-process for computation flow
- Advantages:
- The computation flow defined elsewhere can be **easily reused** after the decoupling.
- The controller runs on a single process. Implementing a new RL algorithm with a **different control flow is simple and easy**.
- Disadvantages:
- Additional **data communication overhead** each time the controller process and computation processes interact. The data has to be sent back and forth.
In verl, the latter strategy with separate control flow and computation flow is adopted. verl is designed to decouple the control flow of RL algorithms, and the implementation of computation engines.
Overall Execution Diagram
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Below is a simplified diagram denoting the execution of a reinforcement learning job. In the diagram, the controller runs on a single process, while the generator/actor workers and critic workers run on multiple processes, placed on specific resource groups. For rollout, the controller passes the data to the generator to perform sample generation. When the rollout is done, the data is passed back to the controller for the next step of the algorithm. Similar execution is done for other workers. With the hybrid controller design, the data flow and computation are decoupled to provide both efficiency in computation and flexibility in defining algorithm training loops.
.. figure:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/driver_worker.png?raw=true
:alt: The execution diagram
Codebase walkthrough (PPO)
------------------------------------------------
Entry function
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Code: https://github.com/volcengine/verl/blob/main/verl/trainer/main_ppo.py
In this file, we define a remote function ``main_task`` that serves as the controller (driver) process as shown in the above figure. We also define a ``RewardManager``, where users can customize their reward function based on the data source in the dataset. Note that ``RewardManager`` should return the final token-level reward that is optimized by RL algorithms. Users can also combine model-based rewards and rule-based rewards.
The ``main_task`` constructs a RayPPOTrainer instance and launches the fit. Note that ``main_task`` **runs as a single process**.
We highly recommend that the ``main_task`` is NOT scheduled on the head of the ray cluster because ``main_task`` will consume a lot of memory but the head usually contains very few resources.
Ray trainer
~~~~~~~~~~~~~~~~~~~~
Code: https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/ray_trainer.py
The RayPPOTrainer manages
- Worker and WorkerGroup construction
- Runs the main loop of PPO algorithm
Note that, the fit function of RayPPOTrainer **runs as a single process**.
Worker and WorkerGroup construction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Each WorkerGroup manages a list of workers that run remotely. Note that the worker group runs in the process of its constructor.
Each worker inside the WorkerGroup runs on a GPU. The worker group serves as a proxy for the controller process to interact with a list of workers, in order to perform certain computations. **In order to do so, we have to bind the methods of the worker into the method of the WorkerGroup and define the data dispatch and data collection**. This is done via simple decoration that will be introduced in the Worker definition section.
For example, in PPO, we define 3 worker groups:
- ActorRolloutRef: manages actor, rollout and reference policy. ActorRolloutRefWorker can be instantiated as a single actor, a single rollout, a single reference policy, a combined actor/rollout or a combined actor/rollout/ref. This design aims for maximum code reuse in various scenarios. The reason for colocating actor and rollout is fast weight transfer using NCCL. The reason for colocating actor and reference is to implement an efficient LoRA PPO, as the reference policy is simply the base model of PPO in LoRA. The colocation is done via ``verl.single_controller.ray.base.create_colocated_worker_cls``, which creates a single ray remote class exposing all class methods from these roles.
- Critic: manages the critic model
- Reward: manages the reward model
The worker group will be constructed on the resource pool it designates. The resource pool is a set of GPUs in the ray cluster.
Worker definition
~~~~~~~~~~~~~~~~~~~~
.. _ActorRolloutRefWorker: https://github.com/volcengine/verl/blob/main/verl/workers/fsdp_workers.py
We take `ActorRolloutRefWorker`_ as an example.
The APIs it should expose to the controller process are:
- init_model: build the underlying model
- generate_sequences: given prompts, generate responses
- compute_log_prob: compute the log-probability of a generated sequence using actor
- compute_ref_log_prob: compute the log-probability of a generated sequence using reference policy
- save_checkpoint: save the checkpoint
Note that these methods are defined in the worker and can only be invoked via remote calls. For example, if the controller process wants to initialize the model, it has to call
.. code-block:: python
for worker in actor_rollout_ref_wg:
worker.init_model.remote()
If the controller process wants to generate sequences, it has to call
.. code-block:: python
data = xxx
# split the data into dp chunks
data_dp_lst = data.split(dp_size)
output_dp_lst = []
for i, worker in enumerate(actor_rollout_ref_wg):
output_future = worker.generate_sequences.remote(data_dp_lst[i])
output_dp_lst.append(output_future)
output = torch.cat(ray.get(output_dp_lst), dim=0)
We observe that controller process calling worker group methods in general can be divided into 3 parts:
- Split the data into data parallel sizes
- Dispatch the corresponding data into each worker
- Collect and concatenate the data when the computation finishes
In verl, we design syntactic sugar to encapsulate these 3 steps into a single call from the controller process.
.. code-block:: python
@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
def generate_sequences(data):
...
# on the driver
output = actor_rollout_ref_wg.generate_sequences(data)
We decorate the method of the worker with a ``register`` that explicitly defines how the input data should be split and dispatched to each worker, and how the output data should be collected and concatenated by the controller. For example, ``Dispatch.DP_COMPUTE_PROTO`` splits the input data into dp chunks, dispatches each chunk to a worker, collects the outputs and concatenates the results. Note that this function requires the input and output to be a DataProto defined here (https://github.com/volcengine/verl/blob/main/verl/protocol.py).
PPO main loop
~~~~~~~~~~~~~~~~~~~~
With the aforementioned APIs, we can implement the main loop of PPO as if it is a single process program
.. code-block:: python
for prompt in dataloader:
output = actor_rollout_ref_wg.generate_sequences(prompt)
old_log_prob = actor_rollout_ref_wg.compute_log_prob(output)
ref_log_prob = actor_rollout_ref_wg.compute_ref_log_prob(output)
values = critic_wg.compute_values(output)
rewards = reward_wg.compute_scores(output)
# compute_advantages is running directly on the control process
advantages = compute_advantages(values, rewards)
output = output.union(old_log_prob)
output = output.union(ref_log_prob)
output = output.union(values)
output = output.union(rewards)
output = output.union(advantages)
# update actor
actor_rollout_ref_wg.update_actor(output)
critic_wg.update_critic(output)
Takeaways
~~~~~~~~~~~~~~~~~~~~
- This programming paradigm enables users to use different computation backends without modification of the control process.
- This programming paradigm enables flexible placement (by changing the mapping of WorkerGroup and ResourcePool) without modification of the control process.
Repository organization
------------------------------------------------
Important code files in the repository are organized as below:
.. code-block:: bash
verl # the verl package
trainer
main_ppo.py # the entrypoint for RL training
ppo
ray_trainer.py # the training loop for RL algorithms such as PPO
sft_trainer.py # the SFT trainer with FSDP backend
config
generation.yaml # configuration template for rollout
ppo_trainer.yaml # configuration template for the RL trainer
workers
protocol.py # the interface of DataProto
fsdp_workers.py # the FSDP worker interfaces: ActorRolloutRefWorker, CriticWorker, RewardModelWorker
megatron_workers.py # the Megatron worker interfaces: ActorRolloutRefWorker, CriticWorker, RewardModelWorker
actor
dp_actor.py # data parallel actor with FSDP backend
megatron_actor.py # nD parallel actor with Megatron backend
critic
dp_critic.py # data parallel critic with FSDP backend
megatron_critic.py # nD parallel critic with Megatron backend
reward_model
megatron
reward_model.py # reward model with Megatron backend
rollout
vllm
vllm_rollout.py # rollout with vllm backend
hf_rollout.py # rollout with huggingface TGI backend
sharding_manager
fsdp_ulysses.py # data and model resharding when using FSDP + ulysses
fsdp_vllm.py # data and model resharding when using FSDP + ulysses + vllm
megatron_vllm.py # data and model resharding when using Megatron + vllm
utils
dataset # datasets for SFT/RM/RL
reward_score # function based reward
gsm8k.py # reward function for gsm8k dataset
math.py # reward function for math dataset
seqlen_balancing.py # the sequence balance optimization
models
llama # Megatron implementation for llama, deepseek, mistral, etc
transformers # ulysses integration with transformer models such as llama, qwen, etc
weight_loader_registery.py # registry of weight loaders for loading hf ckpt into Megatron
third_party
vllm # adaptor for vllm's usage in RL
vllm_spmd # vllm >= v0.7 adaptor
examples # example scripts
tests # integration and unit tests
.github # the configuration of continuous integration tests
.. [1] HybridFlow: A Flexible and Efficient RLHF Framework: https://arxiv.org/abs/2409.19256v2
.. [2] Data flow graph credit to CS231n 2024 lecture 4: https://cs231n.stanford.edu/slides/2024/lecture_4.pdf
.. [3] PPO dataflow graph credit to 低级炼丹师 from Zhihu: https://zhuanlan.zhihu.com/p/635757674
.. [4] RLFlow
================================================
FILE: docs/index.rst
================================================
Welcome to verl's documentation!
================================================
verl is a flexible, efficient and production-ready RL training framework designed for large language models (LLMs) post-training. It is an open source implementation of the `HybridFlow <https://arxiv.org/abs/2409.19256v2>`_ paper.
verl is flexible and easy to use with:
- **Easy extension of diverse RL algorithms**: The hybrid programming model combines the strengths of single-controller and multi-controller paradigms to enable flexible representation and efficient execution of complex post-training dataflows, allowing users to build RL dataflows in a few lines of code.
- **Seamless integration of existing LLM infra with modular APIs**: Decouples computation and data dependencies, enabling seamless integration with existing LLM frameworks, such as PyTorch FSDP, Megatron-LM, vLLM and SGLang. Moreover, users can easily extend to other LLM training and inference frameworks.
- **Flexible device mapping and parallelism**: Supports various placement of models onto different sets of GPUs for efficient resource utilization and scalability across different cluster sizes.
- Ready integration with popular HuggingFace models
verl is fast with:
- **State-of-the-art throughput**: By seamlessly integrating existing SOTA LLM training and inference frameworks, verl achieves high generation and training throughput.
- **Efficient actor model resharding with 3D-HybridEngine**: Eliminates memory redundancy and significantly reduces communication overhead during transitions between training and generation phases.
--------------------------------------------
.. _Contents:
.. toctree::
:maxdepth: 2
:caption: Quickstart
start/install
start/quickstart
start/multinode
start/ray_debug_tutorial
start/more_resources
start/agentic_rl
.. toctree::
:maxdepth: 2
:caption: Programming guide
hybrid_flow
single_controller
.. toctree::
:maxdepth: 1
:caption: Data Preparation
preparation/prepare_data
preparation/reward_function
.. toctree::
:maxdepth: 2
:caption: Configurations
examples/config
.. toctree::
:maxdepth: 1
:caption: PPO Example
examples/ppo_code_architecture
examples/gsm8k_example
examples/multi_modal_example
examples/skypilot_examples
.. toctree::
:maxdepth: 1
:caption: Algorithms
algo/ppo.md
algo/grpo.md
algo/collabllm.md
algo/dapo.md
algo/spin.md
algo/sppo.md
algo/entropy.md
algo/opo.md
algo/baseline.md
algo/gpg.md
algo/rollout_corr.md
algo/rollout_corr_math.md
algo/otb.md
algo/dppo.md
.. toctree::
:maxdepth: 1
:caption: PPO Trainer and Workers
workers/ray_trainer
workers/fsdp_workers
workers/megatron_workers
workers/automodel_workers
workers/sglang_worker
workers/trtllm_worker
workers/model_engine
.. toctree::
:maxdepth: 1
:caption: Performance Tuning Guide
perf/dpsk.md
perf/best_practices
perf/perf_tuning
perf/perf_tuning_on_ascend.rst
README_vllm0.8.md
perf/device_tuning
perf/verl_profiler_system.md
perf/nsight_profiling.md
perf/torch_profiling.md
.. toctree::
:maxdepth: 1
:caption: Adding new models
advance/fsdp_extension
advance/megatron_extension
.. toctree::
:maxdepth: 1
:caption: Advanced Features
advance/checkpoint
advance/rope
advance/attention_implementation
advance/ppo_lora.rst
sglang_multiturn/multiturn.rst
sglang_multiturn/interaction_system.rst
advance/placement
advance/dpo_extension
examples/sandbox_fusion_example
advance/rollout_trace.rst
advance/rollout_skip.rst
advance/one_step_off
advance/agent_loop
advance/reward_loop
advance/fully_async
data/transfer_queue.md
advance/grafana_prometheus.md
advance/fp8.md
advance/async-on-policy-distill
advance/mtp.md
.. toctree::
:maxdepth: 2
:caption: Hardware Support
amd_tutorial/amd_build_dockerfile_page.rst
amd_tutorial/amd_vllm_page.rst
ascend_tutorial/contribution_guide/ascend_ci_guide_zh.rst
ascend_tutorial/quick_start/ascend_quick_start.rst
ascend_tutorial/quick_start/dockerfile_build_guidance.rst
ascend_tutorial/quick_start/ascend_sglang_quick_start.rst
ascend_tutorial/features/ascend_consistency.rst
ascend_tutorial/features/ascend_backend_features.md
ascend_tutorial/profiling/ascend_profiling_zh.rst
ascend_tutorial/profiling/ascend_profiling_en.rst
ascend_tutorial/examples/gspo_optimization_practice.md
ascend_tutorial/examples/ascend_performance_analysis_guide.md
ascend_tutorial/examples/dapo_multi_model_optimization_practice.md
ascend_tutorial/examples/ascend_sglang_best_practices.rst
ascend_tutorial/examples/ascend_retool_best_pratice.rst
ascend_tutorial/examples/run_qwen3_32B_megatron_1k_256k_npu.md
ascend_tutorial/faq/faq.rst
.. toctree::
:maxdepth: 1
:caption: API References
api/data
api/single_controller.rst
api/trainer.rst
api/utils.rst
.. toctree::
:maxdepth: 1
:caption: Blog
blog/v0.7.md
.. toctree::
:maxdepth: 2
:caption: FAQ
faq/faq
.. toctree::
:maxdepth: 1
:caption: Development Notes
sglang_multiturn/sandbox_fusion.rst
Contribution
-------------
verl is free software; you can redistribute it and/or modify it under the terms
of the Apache License 2.0. We welcome contributions.
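Join us on `GitHub <https://github.com/volcengine/verl>`_, `Slack <https://join.slack.com/t/verlgroup/shared_invite/zt-2w5p9o4c3-yy0x2Q56s_VlGLsJ93A6vA>`_ and `Wechat <https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/WeChat.JPG>`_ for discussions.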
Contributions from the community are welcome! Please check out our `project roadmap `_ and `good first issues `_ to see where you can contribute.
Code Linting and Formatting
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
We use pre-commit to help improve code quality. To initialize pre-commit, run:
.. code-block:: bash
pip install pre-commit
pre-commit install
To resolve CI errors locally, you can also manually run pre-commit by:
.. code-block:: bash
pre-commit run
Adding CI tests
^^^^^^^^^^^^^^^^^^^^^^^^
If possible, please add CI test(s) for your new feature:
1. Find the most relevant workflow yml file, which usually corresponds to a ``hydra`` default config (e.g. ``ppo_trainer``, ``ppo_megatron_trainer``, ``sft_trainer``, etc).
2. Add related path patterns to the ``paths`` section if not already included.
3. Minimize the workload of the test script(s) (see existing scripts for examples).
We are HIRING! Send us an `email `_ if you are interested in internship/FTE opportunities in MLSys/LLM reasoning/multimodal alignment.
================================================
FILE: docs/perf/best_practices.rst
================================================
Verl LLM Best Practices (DAPO + Qwen3-235B)
===========================================
Last updated: 11/03/2025.
Purpose
-------
This guide uses DAPO training on Qwen3-235B as a concrete example. We unpack every parameter that appears in the optimization objective, map it to Verl configuration entries, and share field-tested recommendations so you can derive sensible settings for your own workloads.
.. note::
1. The guide only covers the subset of parameters required to reproduce the DAPO experiments discussed here. For the full list, refer to the ``config`` components in the Verl source tree: https://github.com/volcengine/verl/tree/main/verl/trainer/config
2. PPO and GRPO introduce KL-constrained policies. We therefore include that setup in the explanations below. You can treat all configurations mentioned here as a DAPO pipeline augmented with a KL penalty.
Optimization Objectives
-----------------------
DAPO objective
~~~~~~~~~~~~~~
.. math::
\begin{aligned}
\mathcal{J}_{\mathrm{DAPO}}(\theta)= & \mathbb{E}_{(q, a) \sim \mathcal{D},\left\{o_i\right\}_{i=1}^G \sim \pi_{\theta_{\text {old }}}(\cdot \mid q)} \\
& {\left[\frac{1}{\sum_{i=1}^G\left|o_i\right|} \sum_{i=1}^G \sum_{t=1}^{\left|o_i\right|} \min \left(r_{i, t}(\theta) \hat{A}_{i, t}, \operatorname{clip}\left(r_{i, t}(\theta), 1-\varepsilon_{\text {low }}, 1+\varepsilon_{\text {high }}\right) \hat{A}_{i, t}\right)\right] } \\
\end{aligned}
.. math::
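\text { s.t. } \quad 0<\mid\left\{o_i \mid \text { is_equivalent }\left(a, o_i\right)\right\} \mid<G
- ``actor_rollout_ref.rollout.tensor_model_parallel_size``:
Tensor-parallel (TP) degree for rollout. Make sure ``memory_per_gpu * gpu_memory_utilization * TP > 2 * model_parameters`` (bf16/fp16). Increase TP gradually to expand KV cache capacity while watching communication cost—especially once TP > 8.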
- ``actor_rollout_ref.rollout.temperature`` / ``top_p`` / ``top_k``:
Sampling knobs for rollout. Keep enough randomness; ``temperature=1.0``, ``top_p=1.0``, ``top_k=-1`` are good defaults.
- ``actor_rollout_ref.rollout.val_kwargs.temperature`` / ``top_p`` / ``top_k`` / ``do_sample`` / ``n``:
Sampling options for validation. Set ``temperature > 0`` to prevent repetitive thinking chains. For small test sets (e.g., AIME24) raise ``n`` (64 is a common choice) to reduce variance. A practical starting point is ``temperature=1.0``, ``top_p=0.7``, ``top_k=-1``, ``do_sample=True``, ``n=1`` and then increase ``n`` as needed.
- ``+actor_rollout_ref.rollout.engine_kwargs.vllm.*`` / ``+actor_rollout_ref.rollout.engine_kwargs.sglang.*``:
Extra backend options injected via the ``+`` syntax. Consult backend docs for exact semantics. Some switches (for example ``pipeline_parallel_size``) may not be supported yet; when TP=32, ``enable_expert_parallel=True`` can even slow down DeepSeek-V3 rollout, so benchmark carefully.
:math:`\pi_\theta`
- ``data.train_batch_size``:
Total batch size per training iteration. Each rollout produces ``train_batch_size * n`` samples. Larger values reduce the number of rollouts but increase off-policy drift.
- ``actor_rollout_ref.actor.ppo_mini_batch_size``:
Mini-batch size per optimization step. Tune it the same way you would for standard deep learning workloads.
- ``actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu``:
Samples processed per forward pass on one GPU group (a Megatron group contains TP * PP * CP GPUs). Keep it ≤ ``ppo_mini_batch_size`` and as large as memory allows.
- ``actor_rollout_ref.actor.use_dynamic_bsz``:
Enable dynamic batch sizing to adapt to sequence length and improve throughput.
- ``actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu``:
Maximum tokens per GPU when computing log probabilities under dynamic batching. Set it to at least a multiple of ``max_prompt_length + max_response_length`` to prevent truncation.
- Megatron parallelism parameters (``pipeline_model_parallel_size`` / ``tensor_model_parallel_size`` / ``expert_model_parallel_size`` / ``expert_tensor_parallel_size`` / ``context_parallel_size``):
Balance PP/TP/EP/ETP/CP to match memory and network constraints. In bf16/fp16, each parameter consumes roughly ``2 / TP`` bytes; if you keep FP32 master weights or skip optimizer offload, reserve another 4–8 bytes for Adam. Activations scale with ``micro_batch_size × sequence_length × hidden_size`` and can be mitigated with gradient checkpointing, dynamic batches, or offload. Prefer increasing TP first, add PP when necessary, extend sequence capacity with CP, align EP/ETP with TP for MoE models, and keep DP minimal on constrained clusters while combining with offload. Always align the setup with hardware topology and communication cost.
- ``actor_rollout_ref.model.use_fused_kernels``:
Enable Verl’s fused kernels for supported models to squeeze out additional performance.
:math:`\hat{A}_{i,t}`
- ``algorithm.adv_estimator``:
Advantage estimator. Set to ``grpo`` for DAPO/GRPO.
:math:`R_i`
- ``reward_model.reward_manager``:
Reward aggregation strategy. Use ``dapo`` for DAPO and ``naive`` for GRPO.
:math:`D_{KL}`
- ``algorithm.use_kl_in_reward``:
Whether to add a KL term to the reward. ``True`` for PPO, ``False`` for GRPO and DAPO.
- ``actor_rollout_ref.actor.use_kl_loss``:
Whether to include a KL loss term. ``False`` for PPO, ``True`` for GRPO, ``False`` for DAPO.
:math:`\beta`
- ``actor_rollout_ref.actor.kl_loss_coef``:
Weight of the KL loss. Start around 0.001. Larger values curb reward hacking but reduce exploration.
- ``algorithm.kl_ctrl.kl_coef``:
KL coefficient applied within the reward. Adjust to match your tolerance for divergence.
:math:`\pi_{old}`
- ``actor_rollout_ref.rollout.log_prob_use_dynamic_bsz``:
Enable dynamic batching when the old policy computes log-probabilities. Recommended.
:math:`\pi_{ref}`
- ``actor_rollout_ref.ref.log_prob_use_dynamic_bsz``:
Enable dynamic batching for the reference policy. Recommended.
- Reference Megatron parallelism:
Keep ``pipeline_model_parallel_size``, ``tensor_model_parallel_size``, ``expert_model_parallel_size``, ``expert_tensor_parallel_size``, and ``context_parallel_size`` in sync with the actor.
- ``actor_rollout_ref.ref.megatron.param_offload``:
Offload reference parameters to CPU when the actor does so. Even without gradients or optimizer states, parity helps with capacity planning.
:math:`o_i` / :math:`|o_i|`
- ``actor_rollout_ref.actor.loss_agg_mode``:
Loss aggregation mode. Token-level ``token-mean`` matches the recommendations from Dr.GRPO and DAPO; use ``seq-mean-token-mean`` to reproduce the original GRPO behavior.
:math:`\pi_\theta(o_{i,t} \mid q_i,o_{i,<t})`
- `SimonHuang `_
1.5B
~~~
.. list-table::
:widths: auto
:header-rows: 1
* - Tag
- Model
- Task
- Resource
- MaxBatch
- Train
- Infer
- Link
- Contributor
* - MIN
- Qwen2.5-1.5B
- GRPO-LoRA
- 1*H100
- 128
- fsdp
- vllm0.8.3
- `qwen2-1.5b_grpo-lora_1_h100_fsdp_vllm.sh `_
- `SimonHuang `_
3B
~~~
.. list-table::
:widths: auto
:header-rows: 1
* - Tag
- Model
- Task
- Resource
- MaxBatch
- Train
- Infer
- Link
- Contributor
* - MIN
- Qwen2.5-3B
- GRPO-LoRA
- 1*H100
- 62
- fsdp
- vllm0.8.3
- `qwen2-3b_grpo-lora_1_h100_fsdp_vllm.sh `_
- `SimonHuang `_
7B
~~~
.. list-table::
:widths: auto
:header-rows: 1
* - Tag
- Model
- Task
- Resource
- MaxBatch
- Train
- Infer
- Link
- Contributor
* - MIN
- Qwen2-7B
- GRPO
- 2*H800
- \
- fsdp
- vllm0.8.2
- `qwen2-7b_grpo_2_h800_fsdp_vllm `_
- `Xiangyongan `_
* - MIN
- Qwen2.5-7B
- GRPO-LoRA
- 1*H100
- 16
- fsdp
- vllm0.8.3
- `qwen2-7b_grpo-lora_1_h100_fsdp_vllm.sh `_
- `SimonHuang `_
14B
~~~
.. list-table::
:widths: auto
:header-rows: 1
* - Tag
- Model
- Task
- Resource
- MaxBatch
- Train
- Infer
- Link
- Contributor
* - MIN
- Qwen2-14B
- GRPO
- 4*H800
- \
- fsdp
- vllm0.8.2
- `qwen2-14b_grpo_4_h800_fsdp_vllm `_
- `Xiangyongan `_
* - MIN
- Qwen2.5-14B
- GRPO-LoRA
- 2*H100
- 116
- fsdp
- vllm0.8.3
- `qwen2-14b_grpo-lora_2_h100_fsdp_vllm.sh `_
- `SimonHuang `_
32B
~~~
.. list-table::
:widths: auto
:header-rows: 1
* - Tag
- Model
- Task
- Resource
- MaxBatch
- Train
- Infer
- Link
- Contributor
* - MIN
- Qwen2-32B
- GRPO
- 8*H20
- \
- megatron
- vllm0.8.2
- `qwen2-32b_grpo_8_h20_megatron_vllm `_
- `Xiangyongan `_
* - MIN
- Qwen2.5-32B
- GRPO-LoRA
- 4*H100
- 180
- fsdp
- vllm0.8.3
- `qwen2-32b_grpo-lora_4_h100_fsdp_vllm.sh `_
- `SimonHuang `_
70B
~~~
.. list-table::
:widths: auto
:header-rows: 1
* - Tag
- Model
- Task
- Resource
- MaxBatch
- Train
- Infer
- Link
- Contributor
* - MIN
- Qwen2-70B
- GRPO
- 32*H20
- \
- fsdp
- vllm0.8.2
- `qwen2-70b_grpo_32_h20_fsdp_vllm `_
- `Xiangyongan `_
* - MIN
- Qwen2-70B
- GRPO
- 32*H800
- \
- fsdp
- vllm0.8.3
- `qwen2-70b_grpo_32_h800_fsdp_vllm `_
- `Xiangyongan `_
* - MIN
- Qwen2.5-72B
- GRPO-LoRA
- 8*H100
- 176
- fsdp
- vllm0.8.3
- `qwen2-72b_grpo-lora_8_h100_fsdp_vllm.sh `_
- `SimonHuang `_
405B
~~~~
.. table::
:widths: auto
====== ====== ====== ======== ======== ====== ====== ======
tag model task resource MaxBatch train infer link
====== ====== ====== ======== ======== ====== ====== ======
\ \ \ \ \ \ \
====== ====== ====== ======== ======== ====== ====== ======
671B
~~~~
.. table::
:widths: auto
====== ====== ====== ======== ======== ====== ====== ======
tag model task resource MaxBatch train infer link
====== ====== ====== ======== ======== ====== ====== ======
\ \ \ \ \ \ \
====== ====== ====== ======== ======== ====== ====== ======
================================================
FILE: docs/perf/dpsk.md
================================================
# Training DeepSeek 671b
Last updated: 08/20/2025.
verl integrates Megatron to support large MoE models such as `Qwen3-235B-A22B` and `deepseek-ai/DeepSeek-V3`. This is an ongoing community effort.
In the journey the community added the following features and optimizations that enable verl with larger models:
- per tensor weight resharding between rollout and training
- context parallelism and expert parallelism enabled via megatron
- dynamic batch size (sequence balance) for megatron
- reduced ray-related serialization overhead
- optimizer offloading, recomputation, and efficient kernels
- various debugging metrics and utils
- hybrid optimizer
and the megatron backend now has a wider list of models supported:
- DeepSeek-V3
- Moonlight
- Qwen3
- Qwen2.5-VL (to be merged soon)
- Qwen2
- Mixtral
## Getting Started
### preparation
The recommended image with pre-built Megatron dependency is `verlai/verl:app-verl0.4-vllm0.8.5-mcore0.13.0-preview`, which is built using the Dockerfile at [docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.vllm.mcore0.13.preview](https://github.com/volcengine/verl/blob/main/docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.vllm.mcore0.13.preview).
The image is built on Hopper GPUs with DeepEP. It does not support non-Hopper GPUs, such as A100. You may need to reinstall DeepEP to work with A100.
With `OFFLOAD_FRACTION=1`, the system's minimum requirements are lowered. It can run on as few as 96 H20 (96GB) GPUs for DeepSeek-V3, and on as few as 32 H20 (96GB) GPUs for Qwen3-235B-A22B. However, this configuration will use 1.6TB CPU memory per node. If you run out of CPU memory or require faster training speed, you can add more nodes.
### DeepSeek 671b
For DeepSeek-V3 671b, please refer to [examples/grpo_trainer/run_deepseek671b_math_megatron_96gb.sh](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_deepseek671b_math_megatron_96gb.sh).
MTP and quantization are disabled during RL training.
To train your project, configure the following environment variables based on the number of available GPUs. These are recommended settings and can be adjusted based on your specific hardware.
| num gpus | NNODES | TP | PP | EP | OFFLOAD_FRACTION | OFFLOAD_OPTIM | LAST_LAYER |
| -- | -- | -- | -- | -- | -- | -- | -- |
| 96 | 12 | 8 | 12 | 8 | 1. | False | 6 |
| 128 | 16 | 8 | 16 | 8 | 0.5 | True | 1 |
| 256 | 32 | 8 | 16 | 8 | 0. | True | 1 |
| 512 | 64 | 1 | 16 | 32 | 0 | True | 1 |
### Qwen3 235b
For Qwen3-235b, please refer to [examples/grpo_trainer/run_qwen3-235b_megatron_96gb.sh](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen3-235b_megatron_96gb.sh).
To train your project, configure the following environment variables based on the number of available GPUs. These are recommended settings and can be adjusted based on your specific hardware.
| num gpus | NNODES | TP | PP | EP | OFFLOAD_FRACTION | OFFLOAD_OPTIM | LAST_LAYER |
| -- | -- | -- | -- | -- | -- | -- | -- |
| 32 | 4 | 4 | 8 | 4 | 1. | False | 6 |
| 64 | 8 | 4 | 8 | 4 | 0.5 | True | 6 |
| 128 | 16 | 4 | 8 | 4 | 0 | True | 6 |
| 256 | 32 | 4 | 8 | 4 | 0 | True | 6 |
### Benchmark
Here are some benchmark results for DeepSeek / Qwen3-235B. All configurations match the recommended settings based on the number of GPUs.
| model | num gpus | mean response length | rollout time(s) | GPU memory(GB) | CPU memory(GB) | MFU | step time(s) |
| -- | -- | -- | -- | -- | -- | -- | -- |
| DeepSeek 671b | 96 | 1960 | 1050 | 66 | 1500 | 0.19 | 1700 |
### Qwen3-30B-A3B MOE
For Qwen3-30b, please refer to [examples/grpo_trainer/run_qwen3moe-30b_megatron_96gb.sh](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen3moe-30b_megatron_96gb.sh).
To train your project, configure the following environment variables based on the number of available GPUs. These are recommended settings and can be adjusted based on your specific hardware.
| num gpus | NNODES | TP | PP | EP | OFFLOAD_FRACTION | OFFLOAD_OPTIM | MFU |
| -- | -- | -- | -- | -- | -- | -- | -- |
| 8 | 1 | 1 | 1 | 8 | 1. | True | 0.4 |
| 16 | 2 | 1 | 1 | 8 | 1. | True | 0.37 |
| 32 | 4 | 1 | 1 | 8 | 1. | True | 0.31 |
## Upcoming Optimizations
The community continues to optimize large MoE models further. Ongoing efforts include:
- further optimizing memory consumption, and providing recommended/tuned configurations for various machine types
- optimizing long context RL training performance
- performance improvement with SGLang x Megatron
We invite the community to try and improve verl together. Get connected with us on [slack](https://join.slack.com/t/verlgroup/shared_invite/zt-2w5p9o4c3-yy0x2Q56s_VlGLsJ93A6vA)/[wechat](https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/WeChat.JPG)/[Github issues](https://github.com/volcengine/verl/issues/708)!
## Acknowledgement
@vermouth1992 @ISEEKYAN @ETOgaosion @yzlnew @ShareLer @BearBiscuit05 @ccclyu @ann-qin-lu @SwordFaith @zzong2006 @zhaochenyang20 @ocss884 @eric-haibin-lin @chenhaiq @techkang
================================================
FILE: docs/perf/nsight_profiling.md
================================================
# NVIDIA Nsight Systems profiling in verl
Last updated: 06/20/2025.
This guide explains how to use NVIDIA Nsight Systems for profiling verl training runs.
## Configuration
Profiling in verl can be configured through several parameters in the trainer configuration file (ppo_trainer.yaml or other files like dapo_trainer.yaml):
### Prerequisites
Nsight Systems version is important, please reference `docker/Dockerfile.vllm.sglang.megatron` for the version we used.
### Global profiling control
verl has one single controller process and multiple worker processes. Both controller and worker processes can be profiled. Since the controller process can be executed in any nodes in the cluster, there is a message printed in the logging to indicate the controller process node hostname and process id.
In `global_profiler`, the following config entries control the profiler behavior:
* **`global_profiler.steps`**. List of step numbers at which profiling should be performed. For example: [1, 2, 5] will profile steps 1, 2, and 5. And ``null`` means no profiling.
* **`global_profiler.profile_continuous_steps`**. If true, and the following `global_profiler.discrete==False`, then the continuous steps in `global_profiler.steps` will be combined into one database. For example the above step 1 and 2 are in one database, and 5 in another. If false, every step occupies at least one database. The reason for this config is to observe the program behaviors between steps.
Nsys options in controller nodes and worker nodes are configured in `global_profiler.global_tool_config.nsys`:
* **`global_profiler.global_tool_config.nsys.controller_nsight_options`**. This config group is for the single controller. All fields in this config group will be just sent to Nsight Systems when Ray starts the controller process. `ppo_trainer.yaml` provides a workable example. Users can reference [Nsight Systems manual](https://docs.nvidia.com/nsight-systems/UserGuide/index.html) and [Ray user guide](https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html) for more details.
* **`global_profiler.global_tool_config.nsys.worker_nsight_options`**. This config group is for the worker processes. Similarly, all fields in this config group are passed directly to Nsight Systems when Ray starts the worker processes. The capture range controls when the profiler starts and stops, so `capture-range: "cudaProfilerApi"` is fixed and should not be changed. Users can set `capture-range-end` based on an accurate calculation or just leave it `null`.
### Worker process profiling
Verl manages multiple RL roles, _Actor_, _Ref_, _Rollout_, _Critic_, _Reward_, which are implemented in different Worker classes. These workers can be combined into one Ray Actor, running in a process group. Each RL role has its own profiling config group, `profiler`, which consists of three fields:
* **`all_ranks` and `ranks`**. When `all_ranks` is set `True` then all ranks will be profiled; when set `False`, `ranks` will be profiled. By default, verl profiles the whole training process in a series of `worker_process_<PID>.<RID>.nsys-rep` files for each process rank. PID is the process ID; RID is the capture range ID.
* **`discrete`**. When set `False`, all the roles' actions in one training step will be dumped in one database. When set `True`, the actions annotated by `DistProfiler.annotate` will be dumped into a discrete database. In this case, each role's action occupies one `<RID>`.
* **Verl collocate mode**. Verl can combine two Worker subclasses into one Worker Actor. In this case, the user should take care that the combined Workers have consistent `discrete` settings. The Nsight Systems profiler uses a `torch.cuda.profiler.start()` and `stop()` pair to dump a `<step>` database anyway.
### where to find the profiling data
By default the `*.nsys-rep` files are saved in the directory `/tmp/ray/session_latest/logs/nsight/` at each node. According to the Ray manual, this default directory is not changeable. ["however, Ray preserves the `--output` option of the default config"](https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html).
Some users may think it is not convenient, but it is understandable that Ray may start hundreds of processes and it would be a big network file system pressure if we save the files in one central place.
## Usage Example
To enable profiling for specific components and steps, modify your ppo_trainer.yaml like this:
### Disable profiler
```yaml
profiler:
steps: null # disable profile
```
### Enable profiler and one database for one training step
```yaml
global_profiler:
steps: [1, 2, 5]
discrete: False
actor_rollout_ref:
actor:
profiler:
enable: True
all_ranks: True
# rollout & ref follow actor settings
critic:
profiler:
enable: True
all_ranks: True
reward_model:
profiler:
enable: True
all_ranks: True
```
### Enable profiler and multiple databases for one training step
```yaml
profiler:
steps: [1, 2, 5]
discrete: True
```
## Profiling Output
When profiling is enabled, verl will generate Nsight Systems profiles for the specified components and steps. The profiles will include:
- CUDA kernel execution
- Memory operations
- CPU-GPU synchronization
- NVTX markers for key operations
Nsight Systems supports multi-report view, to open multiple databases together. In this mode, different processes and steps can be aligned in one time line for better analysis.
================================================
FILE: docs/perf/perf_tuning.rst
================================================
Performance Tuning Guide
==============================
Last updated: 07/17/2025.
Author: `Guangming Sheng `_, `Jiali Zheng `_
In this section, we will discuss how to tune the performance of all the stages in verl, including:
1. Rollout generation throughput.
2. Enable ``use_remove_padding=True`` for sequence packing (i.e., data packing and remove padding).
3. Batch size tuning for forward and backward computation
4. Enable ``use_dynamic_bsz=True`` for higher throughput.
5. Utilize Ulysses Sequence Parallel for Long Context Training
6. LigerKernel for SFT performance optimization
7. Forward prefetch in FSDP training backend
8. Memory optimization for entropy calculation from logits
Rollout Generation Tuning
--------------------------
verl currently supports two rollout backends: vLLM and TGI (with SGLang support coming soon).
Below are key factors for tuning vLLM-based rollout. Before tuning, we recommend setting ``actor_rollout_ref.rollout.disable_log_stats=False`` so that rollout statistics are logged.
- Increase ``gpu_memory_utilization``.
- For vLLM v0.7.0 and later, the vLLM instance will only use gpu_memory_utilization of the **total** memory.
- For SGLang, it's the fraction of the free GPU memory used for **static** memory like model weights and KV cache. However, the remaining (1-gpu_memory_utilization) will also be used during inference.
However, if model parameters and optimizer states are not offloaded, using too high a fraction can lead to OOM.
A value between 0.5 and 0.7 often strikes a good balance between high throughput and avoiding OOM.
Note: since the definition of ``gpu_memory_utilization`` varies across inference engines, a value that works well for one engine may cause OOM for another.
- Adjust ``max_num_seqs`` or ``max_num_batched_tokens``.
If the GPU cache utilization is relatively low in the log, increasing ``max_num_seqs`` or ``max_num_batched_tokens``
can enlarge the effective batch size in the decoding stage, allowing more concurrent requests per batch.
We recommend setting ``max_num_batched_tokens > 2048`` for higher throughput.
- Use a smaller ``tensor_parallel_size``.
When GPU resources allow, a smaller tensor parallel size spawns more vLLM replicas.
Data parallelism (DP) can yield higher throughput than tensor parallelism (TP), but also increases KVCache consumption.
Carefully balance the trade-off between more replicas and higher memory usage.
Our experiment in Sec. 8.4 of the `HybridFlow paper <https://arxiv.org/abs/2409.19256>`_ evaluates this trade-off.
- Balance performance and memory using ``cudagraph_capture_sizes``.
If ``cudagraph_capture_sizes`` is set, vLLM will try to capture the model execution graph for different batch sizes.
Since cudagraph memory cannot be offloaded to CPU, it stays on the GPU while the actor update is running.
Using smaller batch sizes can avoid OOM but slightly reduces throughput.
You must set ``enforce_eager=False`` to use ``cudagraph_capture_sizes``.
More tuning details such as dealing with Preemption and Chunked-prefill
can be found in `vLLM official tuning guide `_
For optimal performance, we recommend using vLLM v0.8.3 or later. See https://github.com/volcengine/verl/blob/main/docs/README_vllm0.8.md for details.
Enable remove padding (sequence packing)
-----------------------------------------
Currently, for llama, mistral, gemma1 and qwen based models, users can enable `use_remove_padding=True` to utilize the
sequence packing implementation provided by transformers library.
For other models, transformers library may also support it but we haven't tested it yet.
Users can add the desired model config to the `test_transformer.py `_ file.
And test its functionality by running the following command:
.. code-block:: bash
pytest -s tests/models/test_transformer.py
If the test passes, you can add your desired model into the model `registry.py `_ file.
Then, you can enjoy the performance boost of sequence packing
and welcome to PR your tested model to verl!
Batch Size Tuning
-----------------
To achieve higher throughput in experience preparation (i.e., model fwd) and model update (i.e., actor/critic fwd/bwd),
users may need to tune the ``*micro_batch_size_per_gpu`` for different computation.
In verl, the core principle for setting batch sizes is:
- **Algorithmic metrics** (train batch size, PPO mini-batch size) are *global* (from a single-controller perspective),
normalized in each worker. See the `normalization code `_.
- **Performance-related parameters** (micro batch size, max token length for dynamic batch size) are *local* parameters that define the per-GPU data allocations.
See the `normalization code `_.
.. note:: In your training script, please use ``*micro_batch_size_per_gpu`` instead of ``*micro_batch_size``.
So that you don't need to consider the normalization of the ``micro_batch_size`` and ``micro_batch_size`` will be deprecated.
Batch Size Tuning tips
""""""""""""""""""""""
Therefore, users may need to tune the ``*micro_batch_size_per_gpu`` to accelerate training. Here're some tips:
1. **Enable gradient checkpointing**:
Set ``actor_rollout_ref.model.enable_gradient_checkpointing=True`` and ``critic.model.enable_gradient_checkpointing=True``.
This often allows for larger micro-batch sizes and will be beneficial for large mini-batch training.
2. Increase the ``*micro_batch_size_per_gpu`` as much as possible until it equals the normalized ``mini_batch_size``.
3. **Use larger forward-only parameters**:
Forward only parameter, such as ``actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu``,
``actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu``, ``critic.forward_micro_batch_size_per_gpu`` could be larger (e.g., 2x) than training related micro batch sizes,
such as ``actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu``, ``critic.ppo_micro_batch_size_per_gpu``.
4. **Allow larger micro-batch sizes for Critic and Reward models**:
micro batch size of Critic and Reward model could be larger than Actor model. This is because the actor model has much larger vocab size in the final layer.
5. **Enable activation offloading**:
Set ``actor_rollout_ref.model.enable_activation_offload=True`` and ``critic.model.enable_activation_offload=True``.
This often works together with gradient checkpointing to get larger micro-batch sizes and it's only available in FSDP backend now.
Tuning for Dynamic Batch Size
-----------------------------
Dynamic batch size is a technique that allows the model to process similar number of tokens in a single forward pass (with different actual batch sizes).
This can significantly improve the training efficiency and reduce the memory usage.
To utilize this technique, users can set ``use_dynamic_bsz=True`` in actor, ref, critic and reward models.
With ``use_dynamic_bsz=True``, users don't need to tune ``*micro_batch_size_per_gpu``.
Instead, users should tune the following parameters:
- ``actor_rollout_ref.actor.ppo_max_token_len_per_gpu``, ``critic.ppo_max_token_len_per_gpu``:
The maximum number of tokens to be processed in fwd and bwd of ``update_policy`` and ``update_critic``.
- ``actor_rollout_ref.ref.log_prob_max_token_len_per_gpu`` and ``actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu``:
The maximum number of tokens to be processed in the fwd computation of ``compute_log_prob`` and ``compute_ref_log_prob``.
- ``critic.forward_max_token_len_per_gpu``, ``reward_model.forward_max_token_len_per_gpu``:
The maximum number of tokens to be processed in the fwd computation of ``compute_values`` and ``compute_rm_score``.
Dynamic Batch Size Tuning tips
""""""""""""""""""""""""""""""
Here're some tips to tune the above parameters:
1. **Increase** ``actor_rollout_ref.actor.ppo_max_token_len_per_gpu``
Make it at least 2 x (max_prompt_length + max_response_length). We set it to 3x in `run_qwen2-7b_rm_seq_balance.sh `_.
Try to increase it to get higher throughput.
2. **Forward-only parameters can be larger**:
Similar to the non-dynamic-batch scenario, forward-only token limits can exceed those used in forward/backward operations.
3. **Use larger limits for Critic and Reward models**:
Critic and Reward parameters can be set at least 2× the Actor’s limits. For instance, we set them to 4× here:
`run_qwen2-7b_rm_seq_balance.sh `_
.. :math:`\text{critic.ppo_max_token_len_per_gpu} = 2 \times \text{actor.ppo_max_token_len_per_gpu})`.
Ulysses Sequence Parallel for Long Context Training
----------------------------------------------------
To utilize this technique, users can set ``ulysses_sequence_parallel_size>1`` in actor, ref, critic and reward models.
Different models can use different ``ulysses_sequence_parallel_size`` values.
To train long sequence (>32k), users may need to decrease the ``*micro_batch_size_per_gpu`` and ``*max_token_len_per_gpu`` to avoid OOM.
LigerKernel for SFT
----------------------
LigerKernel is a high-performance kernel for Supervised Fine-Tuning (SFT) that can improve training efficiency. To enable LigerKernel in your SFT training:
1. Install liger-kernel via ``pip3 install liger-kernel``. In your SFT configuration file (e.g., ``verl/trainer/config/sft_trainer.yaml``), set the ``use_liger`` parameter:
.. code-block:: yaml
model:
use_liger: True # Enable LigerKernel for SFT
2. The default value is ``False``. Enable it only when you want to use LigerKernel's optimizations.
3. LigerKernel is particularly useful for improving training performance in SFT scenarios.
Forward prefetch in FSDP training backend
-----------------------------------------
During the training phase, users can enable forward prefetching in FSDP by setting ``fsdp_config.forward_prefetch=True``. For example, ``actor_rollout_ref.actor.fsdp_config.forward_prefetch=True``. This configuration prefetches the next forward-pass all-gather operation before completing the current forward computation, overlapping communication with computation and improving efficiency. For further details, refer to the `FSDP forward_prefetch `_ documentation.
.. note::
Backward prefetch is unsupported because the ``BACKWARD_POST`` policy may prefetch incorrectly in nested-module cases. For details, see the `FSDP documentation `_
Migrating to FSDP2
----------------------
FSDP2 offers notable improvements over FSDP1. According to `PyTorch TorchTitan benchmarks `_:
- 7% lower GPU memory usage on average
- 1.5% throughput improvement with BF16 training
- Better composability with DTensor and per-parameter sharding
**Enabling FSDP2 in VERL:**
.. code-block:: python
# Enable FSDP2 in actor configuration
actor_rollout_ref.actor.strategy="fsdp2"
.. note::
FSDP2 requires PyTorch 2.1+ and is recommended for models with transformer architecture.
Memory optimization for entropy calculation from logits
-------------------------------------------------------
The ``logits`` tensor (typically of shape ``[bsz*seq_len, voc]``) can consume significant memory. When using ``compute_entropy_from_logits``, memory usage reaches approximately ``[bsz*seq_len, voc] × (4 bytes (float32) + 2 bytes (autocast for softmax+logsumexp) + 1 byte (softmax output))``.
To reduce this memory peak, enable chunked computation by setting:
``actor_rollout_ref.ref.entropy_from_logits_with_chunking = True``
This processes the tensor in chunks of shape ``[chunk_size, voc]`` (e.g., 2048) rather than the full sequence length, exclusively during the model's forward pass.
Additionally, during training, standard gradient checkpointing (``enable_gradient_checkpointing=True``) does not apply to entropy calculations. To reduce memory peaks in this context, set:
``actor_rollout_ref.actor.entropy_checkpointing = True``
This enables entropy recomputation specifically for the entropy calculation, lowering memory usage during training.
================================================
FILE: docs/perf/perf_tuning_on_ascend.rst
================================================
Performance Tuning Guide on Ascend
====================================
Last updated: 01/29/2026.
Author: `Xiaobo Hu `_, `Haozhe Li `_
`Perf Tuning `_ 中介绍的性能调优方法在昇腾设备中同样适用。本文重点介绍了昇腾特有的一些调优手段,包括融合算子优化、特定硬件配置和昇腾亲和特性等。
融合算子
--------------------------
常用融合算子列表
**********************************
融合算子的优化原理为,通过数学意义上的等价替换,将多个算子融为一个算子的计算,减少冗余计算,同时减少下发次数,从而提高性能。几个典型的NPU融合算子列举如下,目前均已在 npu_patch.py 中对 Qwen2、Qwen3 系列模型完成替换。
当前verl中使用的全量融合算子请查阅 `npu_patch.py `_
Matrix Computation-Communication operator fusion (MC2)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
MC2 是 CANN 中一系列计算通信融合算子的统称,这些算子将原本串行的通信和计算操作融合在一起,通过内部的切分和流水线并行执行来优化性能。
在 vllm-ascend 中,可以通过指定环境变量:
.. code-block:: sh
export VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE=1
在前向计算的 ``RowParallelLinear`` 中使能 ``torch_npu.npu_mm_all_reduce_base`` ,将分离的 ``matmul`` 和 ``allreduce`` 合并为一个融合算子。
`RotaryMul&RotaryMulGrad `_
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
torch_npu 接口: ``torch_npu.npu_rotary_mul(x, r1, r2)``
参数说明:
- x: q,k,shape要求输入为4维,一般为 ``[B, N, S, D]`` 或 ``[B, S, N, D]`` 或 ``[S, B, N, D]`` 。
- r1: cos值 ,shape要求输入为4维,一般为 ``[1, 1, S, D]`` 或 ``[1, S, 1, D]`` 或 ``[S, 1, 1, D]`` 。
- r2: sin 值,shape要求输入为4维,一般为 ``[1, 1, S, D]`` 或 ``[1, S, 1, D]`` 或 ``[S, 1, 1, D]`` 。
`RmsNorm&RmsNormGrad `_
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
torch_npu 接口: ``torch_npu.npu_rms_norm(self, gamma, epsilon=1e-06) -> (Tensor, Tensor)``
参数说明:
- self: Tensor 类型,shape 支持 1-8 维。
- gamma: Tensor 类型,通常为weight,shape 要求与 self 的后几维保持一致。
- epsilon: Float 数据类型,用于防止除 0 错误。
输出说明:
- 第 1 个输出为 Tensor,计算公式的最终输出y。
- 第 2 个输出为 Tensor, rms_norm 的中间结果 rstd ,用于反向计算。
`Swiglu `_
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
torch_npu 接口: ``torch_npu.npu_swiglu(Tensor self, int dim=-1) -> (Tensor)``
参数说明:
- self: Tensor 类型,shape支持 1-8 维。
- dim: Int 类型,默认为 -1。
输出说明:
- 输出为 Tensor,计算公式的最终输出 y。
`GroupMatMul `_
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
函数原型:
.. code:: python
npu_grouped_matmul(
x,
weight,
*,
bias=None,
scale=None,
offset=None,
antiquant_scale=None,
antiquant_offset=None,
per_token_scale=None,
group_list=None,
activation_input=None,
activation_quant_scale=None,
activation_quant_offset=None,
split_item=0, group_type=None,
group_list_type=0,
act_type=0,
output_dtype=None,
tuning_config=None
) -> List[Tensor]
详细使用方法见标题文档链接
FSDP后端融合算子使用方法
**********************************
在 ``verl/models/transformers/npu_patch.py`` 目录中,已经把可用的融合算子通过 patch 的形式进行替换,无需进行其他操作即可默认进行使用
Megatron后端融合算子使用方法
**********************************
Megatron 的融合算子集成在 MindSpeed 中,需要添加特定参数开启:
1. **Flash Attention(必须开启)**
::
+actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True
++actor_rollout_ref.ref.megatron.override_transformer_config.use_flash_attn=True
2. **RotaryMul**
::
+actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True
+actor_rollout_ref.actor.megatron.override_transformer_config.use_fused_rotary_pos_emb=True
3. **RMSNorm**
::
+actor_rollout_ref.actor.megatron.override_transformer_config.use_fused_rmsnorm=True
4. **GroupMatMul**
::
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_grouped_gemm=True
5. **Swiglu**
::
+actor_rollout_ref.actor.megatron.override_transformer_config.use_fused_swiglu=True
6. **Permute/Unpermute**
::
+actor_rollout_ref.actor.megatron.override_transformer_config.fused_permute_unpermute=True
7. **MC2**
::
+actor_rollout_ref.actor.megatron.override_transformer_config.use_ascend_mc2=True
昇腾通用配置
--------------------------
`算子下发 `_
************************************************************************************************************************************************************************************************************
通过 ``TASK_QUEUE_ENABLE`` 可配置 task_queue 算子下发队列优化等级,默认为 Level 1 优化。该配置可以减少host下发时间,可用于缓解由下发导致的整体free过大问题。
.. image :: https://github.com/verl-project/verl-data/blob/main/images/ascend/perf_tuning_task_queue.png
:width: 500px
Level 0 : 不开启下发流水优化。
Level 1 : \ 将算子下发任务分为两段,一部分任务(主要是 aclnn 算子的调用)放在新增的二级流水上,一、二级流水通过算子队列传递任务,相互并行,通过部分掩盖减少整体的下发耗时,提升端到端性能。
Level 2 : \ 基于 Level 1 的优化进一步平衡了一、二级流水的任务负载,主要是将 workspace 相关任务迁移至二级流水,掩盖效果更好,性能收益更大。该配置仅在二进制场景生效,建议配置值为 Level 2 优化。
`通讯算法编排展开 `_
************************************************************************************************************************************************************************************************************
使用环境变量 ``HCCL_OP_EXPANSION_MODE=AIV`` 用于配置通信算法的编排展开位置,支持如下取值:
- **AI_CPU:** 代表通信算法的编排展开位置在 Device 侧的 AI CPU,Device 侧根据硬件型号自动选择相应的调度器。
- **AIV:** 代表通信算法的编排展开位置在 Device 侧的 Vector Core,执行也在 Vector Core。
- **HOST:** 代表通信算法的编排展开位置为 Host 侧 CPU,Device 侧根据硬件型号自动选择相应的调度器。
- **HOST_TS:** 代表通信算法的编排展开位置为 Host 侧 CPU,Host 向 Device 的 Task Scheduler 下发任务,Device 的 Task Scheduler 进行任务调度执行。
推理阶段调优
--------------------------
Chunked Prefill in V1
***************************
VLLM 当前版本已默认启用 VLLM V1,使用以下配置启用 Chunked Prefill:
.. code-block:: sh
actor_rollout_ref.rollout.enable_chunked_prefill=True
原理参考 `VLLM 官方文档 `_。
Graph Mode
***************************
与 CUDA 类似,NPU 通过以下配置启用 **ACL Graph**:
.. code-block:: sh
actor_rollout_ref.rollout.enforce_eager=False
文档:`ACL Graph `_
.. note::
ACL Graph 与 ``taskqueue Level 2`` 原理冲突,**二者无法同时开启**。
训练阶段调优
--------------------------
FSDP
**********************************
.. csv-table::
:header: "FSDP", "说明"
:widths: 30, 60
"/","仅切分优化器(Zero-1)"
SHARD_GRAD_OP,切分梯度和优化器(Zero-2)
"HYBRID_SHARD","切分权重、梯度和优化器(Zero-3)"
"2D device_mesh+HYBRID_SHARD","又称HSDP(FSDP+DDP)例如device_mesh=[2,8], 每8个rank为一个FSDP组,组内进行FSDP切分,共有两个组,两个组间进行DDP,通过allreduce同步梯度。"
"2D device_mesh+HYBRID_SHARD_ZERO2","HSDP的Zero2版本"
NO_SHARD,DDP
FSDP 不支持 Zero-1, VeRL中会根据卡数和 ``actor_rollout_ref.actor.fsdp_config.fsdp_size`` 来决定 device mesh 的取值,默认使用 Zero-3 进行切分;如果模型较小(建议小于 7B 时),可以通过控制参数 ``actor_rollout_ref.actor.fsdp_config.reshard_after_forward`` 为 ``False`` 在 FSDP/FSDP2 上使用 Zero-2 来优化性能。
Megatron
**********************************
在模型较大时,使用 Megatron 作为训练后端可以更灵活的进行性能调优。
当 DP 并行显存无法容纳模型时,优先开启 TP 来切分模型权重,如果模型仍然过大,再开启 PP 来进一步切分;如果序列过长导致激活太大,则可以开启 CP 和 SP 来进行优化;在 MoE 模型中则可以额外开启 EP 来控制对专家的切分,如果专家过小,为了避免将权重切得过于细碎,则可以开启 ETP 来避免 MoE 部分的 TP 切分,而将多个完整的专家分布到 DP 和 TP 上。
TP、PP、EP、ETP和 Megatron 使用方式一样,CP 和 SP 在 NPU 上开启方式:
- SP: ``Sequence Parallel`` 在 Tensor Parallel 的基础上进一步提高计算效率,是一种通过将输入数据的序列维度进行切分的并行计算方式。在 NPU 上通过 MindSpeed 来调用SP:
::
actor_rollout_ref.actor.megatron.override_transformer_config.sequence_parallel=True
- CP: ``Context Parallel`` 是一种在多个 GPU/NPU 上并行处理神经网络激活值的方法,他通过在序列维度上对输入张量进行划分来实现。在 NPU 上通过 MindSpeed 来调用 CP (两个参数必须同时添加):
::
actor_rollout_ref.actor.megatron.context_parallel_size
actor_rollout_ref.actor.megatron.override_transformer_config.context_parallel_size
Megatron-distributed optimizer
**********************************
在面对较大尺寸模型时,通常需要将优化器分片到一个 DP 域内的每张卡上来节省显存。Megatron 后端下在 NPU 上开启分布式优化器:
::
+actor_rollout_ref.actor.megatron.override_transformer_config.use_distributed_optimizer=True
================================================
FILE: docs/perf/torch_profiling.md
================================================
# PyTorch Profiling in verl
Last updated: 01/13/2026.
This guide explains how to use the native [PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) for profiling verl training runs.
## Configuration
Profiling in verl can be configured through parameters in the trainer configuration file (e.g., `ppo_trainer.yaml`).
### Global Profiling Control
In `global_profiler`, you can control when and how profiling occurs globally:
* **`global_profiler.steps`**: List of step numbers to profile. E.g., `[1, 2, 5]` profiles steps 1, 2, and 5. Set to `null` to disable.
* **`global_profiler.save_path`**: Directory to save the profiling results. Default is `outputs/profile`.
### Role Profiling Control
Each RL role (Actor, Critic, etc.) has its own `profiler` configuration:
* **`enable`**: Whether to enable profiling for this role.
* **`all_ranks`**: If `True`, profiles all ranks.
* **`ranks`**: List of specific ranks to profile if `all_ranks` is `False`.
* **`tool_config.torch`**: Configuration specific to the PyTorch Profiler.
#### PyTorch Profiler Options (`tool_config.torch`)
You can customize the PyTorch Profiler behavior using the following fields under `tool_config.torch`:
* **`contents`**: List of contents to profile.
* **`cpu`**: Profile CPU activities.
* **`cuda`**: Profile CUDA activities.
* **`memory`**: Track tensor memory allocation/free.
* **`shapes`**: Record shapes of operator inputs.
* **`stack`**: Record source code file and line number.
* **`schedule`**: (Advanced) configuration for `wait`, `warmup`, `active`, `repeat` cycles.
## Examples
### 1. End-to-End Collection
Collects performance data for all steps in a single trace file.
```yaml
global_profiler:
steps: [1, 2, 5]
save_path: ./outputs/profile
actor_rollout_ref:
actor:
profiler:
enable: True
all_ranks: True
tool_config:
torch:
discrete: False
contents: [cpu, cuda]
# rollout & ref follow actor settings
```
### 2. Discrete Mode Collection
Discrete mode saves separate trace files for each step. This is useful for detailed analysis and is **mandatory** when using Agent Loop.
**Configuration Example**
This configuration supports profiling both Training (Actor) and Inference (Rollout). You can enable/disable them independently.
```yaml
actor_rollout_ref:
actor:
profiler:
enable: True # Set to True to profile training
all_ranks: False
ranks: [0] # Global Rank 0
tool_config:
torch:
discrete: True
contents: [cpu, cuda]
rollout:
profiler:
enable: True # Set to True to profile inference
all_ranks: False
ranks: [0] # In Agent Loop, this is the Replica Rank (e.g. 0-th instance)
tool_config:
torch:
discrete: True # REQUIRED
# ref follow actor settings
```
**Agent Loop Mode Description**
When Rollout runs in [Agent Loop](../advance/agent_loop.rst) mode, performance data for the Rollout phase **must be collected using discrete mode**. In this case, the Profiler is triggered by the inference engine backend.
1. Rank Definition: ranks in the Rollout configuration refers to Replica Rank (inference instance index), not Global Rank.
2. Inference Engine Support: Currently, vLLM and SGLang engines are supported without additional settings. Specific details are as follows:
* **vLLM Engine**: Automatically collects AsyncLLM scheduling stacks and inference process performance data.
* **SGLang Engine**: Automatically collects inference process performance data. Does not support the memory option in contents.
## Visualization
Collected trace files (usually `.json` or `.json.gz`) are stored in the configured `save_path`.
You can visualize them using:
1. **Chrome Tracing**: Open `chrome://tracing` in a Chrome browser and load the JSON file.
2. **Perfetto**: Open [ui.perfetto.dev](https://ui.perfetto.dev/) and load the file (recommended for large traces).
3. **TensorBoard**: If using the TensorBoard plugin for PyTorch Profiler.
================================================
FILE: docs/perf/verl_profiler_system.md
================================================
# verl Profiler System
Last updated: 08/18/2025.
## Architecture
The architecture of verl profiler system is like below:

There is a global profiler and tool configuration to set some common config in single controller level, deciding
- `tool`: which tool to use
- `steps`: which steps to profile
- `save_path`: results saving path
When a tool needs to profile the behavior of each role, role-level configurations are needed:
- `tool`: which tool to use
- `enable`: whether enable profiling on this role
- rank info: `all_ranks` and `ranks` to decide which ranks to profile or log output
For tool config in role-level, there are some detailed behavior needed to control, like the `discrete` mode in nsys profiler.
Every role has a profiler config, and by default, rollout/ref/reward models follow the Actor's behavior.
## To Add a new profiling tool
A newly added profiling tool should reuse the current APIs as much as possible.
1. The logic of **whether to use the tool**: `tool == [new tool]`.
2. Add the global and local tool config to `ppo_trainer.yaml`/`ppo_megatron_trainer.yaml` and each `[role].yaml`, under `global_tool_config.[new tool]` and `tool_config.[new tool]`
3. The tool config should be implemented in `verl/utils/profiler/config.py`, inherit the `BaseConfig` class.
4. Implement profiling tool initialization logic using configurations in `global_profiler.global_tool_config.[new tool]` and the results saving logics (can also save in role-level profile)
5. For role function-level profiling, please follow the nsys profiler way in `nvtx_profiler.py`, implement a profiler class inherit `DistProfiler` and import new profiler in `verl/utils/profiler/__init__.py`
6. Add unit tests and examples so that others can use the tool conveniently.
================================================
FILE: docs/preparation/prepare_data.rst
================================================
Prepare Data for Post-Training
========================================
Last updated: 02/09/2025.
Before starting the post-training job, we need to prepare the data for
the policy training. The data should be stored in the parquet format.
We provide several data preprocess scripts for different datasets,
including GSM8K, MATH, HellaSwag, Full_hh_rlhf. To prepare other datasets,
follow the steps below. The data preprocess script can be divided
into two parts:
1. The first part is the common part, which loads the dataset from
huggingface's ``datasets`` package. Then preprocess the datasets with
the ``make_map_fn`` and then store in the parquet format.
.. code:: python
import re
import os
import datasets
from verl.utils.hdfs_io import copy, makedirs
import argparse
# To extract the solution for each prompts in the dataset
# def extract_solution(solution_str):
# ...
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--local_dir', default='/opt/tiger/gsm8k')
parser.add_argument('--hdfs_dir', default=None)
args = parser.parse_args()
num_few_shot = 5
data_source = 'openai/gsm8k'
dataset = datasets.load_dataset(data_source, 'main')
train_dataset = dataset['train']
test_dataset = dataset['test']
# Construct a `def make_map_fn(split)` for the corresponding datasets.
# ...
train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)
local_dir = args.local_dir
hdfs_dir = args.hdfs_dir
train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))
makedirs(hdfs_dir)
copy(src=local_dir, dst=hdfs_dir)
2. The users are required to implement the ``make_map_fn()`` function
(as well as the ``extract_solution``) on their own to support
different datasets or tasks.
We already implemented the data preprocess of GSM8k, MATH, Hellaswag and Full_hh_rlhf
datasets. And we take the GSM8k dataset as an example:
**GSM8K**
In the ``make_map_fn``, each data item should consist of the following
5 fields:
1. ``data_source``: The name of the dataset. To index the corresponding
reward function in the ``RewardModel``
2. ``prompt``: This field should be constructed in the format of
huggingface chat_template. The tokenizer in ``RLHFDataset`` will
apply chat template and tokenize the prompt.
3. ``ability``: Define the task category.
4. ``reward_model``: Currently, we only utilize the ``ground_truth``
field during evaluation. The ``ground_truth`` is computed by the
``extract_solution`` function. **Note** that the implementation of
the corresponding reward function should align with this extracted
``ground_truth``.
5. ``extra_info``: Records some information about the current prompt. Not
used for now.
.. code:: python
def extract_solution(solution_str):
solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str) # extract the solution after ####
assert solution is not None
final_solution = solution.group(0)
final_solution = final_solution.split('#### ')[1].replace(',', '')
return final_solution
instruction_following = "Let's think step by step and output the final answer after \"####\"."
# add a row to each data item that represents a unique id
def make_map_fn(split):
def process_fn(example, idx):
question = example.pop('question')
question = question + ' ' + instruction_following
answer = example.pop('answer')
solution = extract_solution(answer)
data = {
"data_source": data_source,
"prompt": [{
"role": "user",
"content": question
}],
"ability": "math",
"reward_model": {
"style": "rule",
"ground_truth": solution
},
"extra_info": {
'split': split,
'index': idx
}
}
return data
return process_fn
================================================
FILE: docs/preparation/reward_function.rst
================================================
Implement Reward Function for Dataset
======================================
Last updated: 06/02/2025.
For each dataset, we need to implement a reward function or utilize a reward model to compute the rewards for the generated responses.
We already pre-implemented some reward functions in `reward_score directory `_.
You can also use customized reward functions.
Currently, we support reward functions for the GSM8k and MATH datasets. For RLHF datasets (e.g.,
full_hh_rlhf) and Code Generation (e.g., APPS), we utilize a reward model
and SandBox (to be open-sourced soon) for evaluation, respectively.
RewardManager
-------------
In the entrypoint of the PPO Post-Training script `main_ppo.py `_,
we implement a ``RewardManager`` that utilizes pre-implemented reward functions to compute the scores for each response.
In the ``RewardManager``, we implemented a ``__call__`` function to
compute the score for each response.
All the reward functions are executed by ``compute_score_fn``.
The input is a ``DataProto``, which includes:
- ``input_ids``, ``attention_mask``: ``input_ids`` and ``attention_mask`` after applying
chat_template, including prompt and response
- ``responses``: response tokens
- ``ground_truth``: The ground truth string of the current prompt.
Stored in ``non_tensor_batch`` in the ``DataProto``, which should be
preprocessed in the parquet files.
- ``data_source``: The dataset name of the current prompt. Stored in
``non_tensor_batch`` in the ``DataProto``, which should be
preprocessed in the parquet files.
After detokenizing the responses, the response string and the ground
truth string are passed to ``compute_score_fn`` to compute the
score for each response.
Reward Functions
----------------
Pre-implemented
~~~~~~~~~~~~~~~
We already pre-implemented some reward functions in `reward_score directory `_.
- In the `GSM8k example `_, we
require the response to output the final answer after ``####``, then
use string matching to compare with the ground truth. If completely
correct, score 1 point; if the format is correct, score 0.1 points; if
the format is incorrect, score 0 points.
- In the `MATH example `_, we follow
the implementation in `lm-evaluation-harness repository `_.
Customized
~~~~~~~~~~
You can implement customized reward functions in a separate file and specify them using ``custom_reward_function.path`` and ``custom_reward_function.name``. For how to set them, please refer to :ref:`config-explain-page`.
The parameters of your reward function should be ``data_source``, ``solution_str``, ``ground_truth``, and ``extra_info``.
For example:
.. code:: python
def my_reward_fn(data_source, solution_str, ground_truth, extra_info=None):
return len(solution_str)/100
If you are testing only a single customized reward function, you can simply name it 'compute_score' and leave ``custom_reward_function.name`` unset.
To run multiple tests with different customized reward functions, you can modify both ``custom_reward_function.path`` and ``custom_reward_function.name`` for each trial.
For instance, you might create a single ``my_reward.py`` file and implement multiple reward functions within it. This way, for different trials, you only need to adjust ``custom_reward_function.name``, making it more convenient to conduct multiple tests within scripts.
================================================
FILE: docs/requirements-docs.txt
================================================
# markdown support
recommonmark
myst_parser
# markdown table support
sphinx-markdown-tables
# theme default rtd
# crate-docs-theme
sphinx-rtd-theme
# pin tokenizers version to avoid env_logger version req
tokenizers==0.21
================================================
FILE: docs/sglang_multiturn/interaction_system.rst
================================================
Interaction System for Multi-turn RL Training
=============================================
Last updated: 06/25/2025.
Overview
--------
The verl interaction system enables dynamic, multi-turn conversational feedback during reinforcement learning training. This system allows models to engage in iterative problem-solving scenarios where interaction agents can provide corrective feedback, guidance, or evaluation based on the model's responses.
**New in Multi-Interaction Support**: The system now supports multiple named interactions within a single training session, enabling sophisticated training scenarios where different samples can use different interaction strategies. This allows for curriculum learning, domain-specific feedback, and flexible agent switching at the sample level.
Key features:
- **Async-based Architecture**: Non-blocking interaction processing for distributed training
- **Instance Management**: Stateful session handling with unique instance IDs for concurrent interactions
- **SGLang Integration**: Seamless integration with SGLang rollout system for multi-turn conversations
- **Configuration-driven**: Dynamic agent loading via YAML configuration files
- **Multi-Interaction Support**: Registry system enabling multiple named interactions per rollout
- **Sample-Level Selection**: Each sample can specify which interaction to use via configuration
- **Reward Integration**: Turn-level scoring mechanism integrated with verl's reward system
Architecture
------------
The interaction system follows a plugin-based architecture with clear separation of concerns:
.. code-block::
Interaction Registry System
↓
BaseInteraction (Abstract Interface)
↓
Multiple Named Interactions (e.g., Gsm8kInteraction, CustomInteraction)
↓
SGLang Rollout Integration (interaction_map)
↓
Sample-Level Interaction Selection
↓
Async Request Lifecycle Management
Core Components
~~~~~~~~~~~~~~~
**Interaction Registry System**
The interaction registry system allows loading and managing multiple named interactions:
.. code-block:: python
from verl.interactions.utils.interaction_registry import initialize_interactions_from_config
# Load multiple interactions from config
interaction_map = initialize_interactions_from_config("config.yaml")
# Access specific interaction by name
gsm8k_interaction = interaction_map["gsm8k"]
custom_interaction = interaction_map["custom_solver"]
**BaseInteraction Interface**
All interaction agents must implement the ``BaseInteraction`` abstract class:
.. code-block:: python
from verl.interactions.base import BaseInteraction
from typing import Dict, Any, List, Tuple, Optional
class BaseInteraction:
def __init__(self, config: Dict[str, Any]):
self.config = config
self.name: str = config.get("name", "interaction_agent")
async def start_interaction(self, instance_id: Optional[str] = None, **kwargs) -> str:
"""Initialize interaction session, return instance_id"""
async def generate_response(self, instance_id: str, messages: List[Dict[str, Any]], **kwargs) -> Tuple[bool, str, float, Dict[str, Any]]:
"""Generate response, return (should_terminate, response, score, metadata)"""
async def calculate_score(self, instance_id: str, **kwargs) -> float:
"""Calculate turn-level score for RL training"""
async def finalize_interaction(self, instance_id: str, **kwargs) -> None:
"""Clean up resources"""
**Request Lifecycle**
The interaction system integrates with SGLang's async rollout via state management:
1. ``PENDING`` → Initialize interaction via ``start_interaction()``
2. ``GENERATING`` → Model generates response
3. ``INTERACTING`` → Process response via ``generate_response()``
4. ``GENERATING`` → Continue if not terminated, otherwise ``COMPLETED``
Configuration
-------------
**Basic Setup**
Enable interaction in your rollout configuration:
.. code-block:: yaml
actor_rollout_ref:
rollout:
multi_turn:
enable: true
interaction_config_path: "path/to/interaction_config.yaml"
max_user_turns: 10
max_assistant_turns: 10
**Interaction Configuration File**
Create an interaction configuration file (e.g., ``interaction_config.yaml``):
**Single Interaction (Legacy Format)**
.. code-block:: yaml
interaction:
- name: "gsm8k"
class_name: "verl.interactions.gsm8k_interaction.Gsm8kInteraction"
config: {}
**Multiple Interactions (New Format)**
.. code-block:: yaml
interaction:
- name: "gsm8k"
class_name: "verl.interactions.gsm8k_interaction.Gsm8kInteraction"
config: {}
- name: "custom_solver"
class_name: "custom.interactions.CustomInteraction"
config:
solver_type: "advanced"
timeout: 30
- name: "code_verifier"
class_name: "verl.interactions.base.BaseInteraction"
config:
verification_mode: "strict"
**Automatic Name Generation**
If no ``name`` field is provided, the system will automatically generate one from the class name:
.. code-block:: yaml
interaction:
- class_name: "verl.interactions.gsm8k_interaction.Gsm8kInteraction"
config: {}
# Automatically generates name: "gsm8k"
The system will dynamically load all specified interaction classes and make them available by name.
Implementation Example: GSM8K
-----------------------------
The GSM8K interaction demonstrates a complete implementation for math problem-solving scenarios:
.. code-block:: python
from verl.interactions.base import BaseInteraction
from verl.utils.reward_score import gsm8k
from uuid import uuid4
class Gsm8kInteraction(BaseInteraction):
def __init__(self, config: dict):
super().__init__(config)
self._instance_dict = {}
async def start_interaction(self, instance_id=None, ground_truth=None, **kwargs):
if instance_id is None:
instance_id = str(uuid4())
self._instance_dict[instance_id] = {
"response": "",
"ground_truth": ground_truth,
"reward": 0.0,
}
return instance_id
async def generate_response(self, instance_id, messages, **kwargs):
# Extract last assistant message content
content = ""
for item in reversed(messages):
if item.get("role") == "assistant":
content = item.get("content", "")
break
# Ensure GSM8K format (#### prefix)
self._instance_dict[instance_id]["response"] = content
reward = await self.calculate_score(instance_id)
if reward == 1.0:
return True, "Your response is correct!", 1.0, {}
else:
return False, "Your response is incorrect! You need to reflect on your answer and try again.", 0.0, {}
async def calculate_score(self, instance_id, **kwargs):
return gsm8k.compute_score(
self._instance_dict[instance_id]["response"],
self._instance_dict[instance_id]["ground_truth"],
method="strict", format_score=0.0, score=1.0,
)
async def finalize_interaction(self, instance_id, **kwargs):
del self._instance_dict[instance_id]
Training Integration
--------------------
**Training Script Configuration**
Include interaction configuration in your training command:
.. code-block:: bash
python3 -m verl.trainer.main_ppo \\
--config-path="$CONFIG_PATH" \\
--config-name='gsm8k_multiturn_grpo_w_interaction' \\
algorithm.adv_estimator=grpo \\
data.train_batch_size=512 \\
data.return_raw_chat=True \\
actor_rollout_ref.rollout.name=sglang \\
actor_rollout_ref.rollout.multi_turn.interaction_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/interaction_config/gsm8k_interaction_config.yaml" \\
trainer.total_epochs=15
**Data Requirements**
Ensure your dataset includes interaction parameters with the ``name`` field for interaction selection:
.. code-block:: python
# Dataset should include interaction_kwargs in non_tensor_batch
interaction_kwargs = [
{"name": "gsm8k", "query": "What is 2+2?", "ground_truth": "4"},
{"name": "custom_solver", "query": "Solve: x^2 + 5x + 6 = 0", "ground_truth": "x = -2, -3"},
{"name": "gsm8k", "query": "What is 3+3?", "ground_truth": "6"},
]
**Sample-Level Interaction Selection**
Each sample can specify which interaction to use via the ``name`` field. This enables flexible training scenarios where different samples use different interaction strategies:
.. code-block:: python
# Example: Math problems use GSM8K interaction, code problems use code verifier
data_samples = [
{
"prompt": "What is 15% of 200?",
"interaction_kwargs": {
"name": "gsm8k",
"query": "What is 15% of 200?",
"ground_truth": "30"
}
},
{
"prompt": "Write a function to check if a number is prime",
"interaction_kwargs": {
"name": "code_verifier",
"code_type": "python",
"expected_behavior": "return True for prime numbers"
}
}
]
**Backward Compatibility**
If no ``name`` field is provided in ``interaction_kwargs``, the system defaults to ``"gsm8k"`` for backward compatibility.
Best Practices
--------------
**Resource Management**
- Always implement proper cleanup in ``finalize_interaction()``
- Use unique instance IDs to avoid conflicts in concurrent training
- Handle edge cases like empty messages or malformed content
**Performance Optimization**
- Keep interaction logic lightweight to avoid blocking training
- Use async/await properly to maintain non-blocking behavior
- Consider caching expensive computations within interaction instances
**Testing**
Comprehensive testing is essential for interaction systems:
.. code-block:: python
import pytest
from unittest.mock import patch
@pytest.mark.asyncio
async def test_interaction_workflow():
interaction = YourInteraction({})
# Test complete workflow
instance_id = await interaction.start_interaction(ground_truth="expected_answer")
messages = [{"role": "user", "content": "user_content"}, {"role": "assistant", "content": "assistant_content"}]
should_terminate, response, reward, metadata = await interaction.generate_response(instance_id, messages)
assert should_terminate in [True, False]
assert isinstance(reward, float)
await interaction.finalize_interaction(instance_id)
Advanced Usage
--------------
**Multi-Interaction Training Strategies**
You can design sophisticated training scenarios using multiple interactions:
.. code-block:: python
# Example: Progressive difficulty with different interaction agents
class MathTrainingPipeline:
def create_interaction_config(self):
return {
"interaction": [
{
"name": "basic_math",
"class_name": "verl.interactions.gsm8k_interaction.Gsm8kInteraction",
"config": {"difficulty": "easy"}
},
{
"name": "advanced_math",
"class_name": "custom.interactions.AdvancedMathInteraction",
"config": {"difficulty": "hard", "allow_hints": True}
},
{
"name": "competition_math",
"class_name": "custom.interactions.CompetitionMathInteraction",
"config": {"time_limit": 300, "show_steps": False}
}
]
}
def create_curriculum_data(self, epoch):
if epoch < 5:
return [{"name": "basic_math", ...} for _ in samples]
elif epoch < 10:
return [{"name": "advanced_math", ...} for _ in samples]
else:
return [{"name": "competition_math", ...} for _ in samples]
**Custom Scoring Functions**
You can integrate custom reward functions:
.. code-block:: python
async def calculate_score(self, instance_id, **kwargs):
response = self._instance_dict[instance_id]["response"]
ground_truth = self._instance_dict[instance_id]["ground_truth"]
# Custom evaluation logic
if custom_evaluation_function(response, ground_truth):
return 1.0
else:
return 0.0
**Multi-step Interactions**
For complex scenarios requiring multiple feedback rounds:
.. code-block:: python
async def generate_response(self, instance_id, messages, **kwargs):
instance = self._instance_dict[instance_id]
instance["attempts"] += 1
# Evaluate current response
reward = await self.calculate_score(instance_id)
if reward > 0.8:
return True, "Excellent work!", reward, {}
elif instance["attempts"] < 3:
return False, "Good attempt, but try to improve...", reward, {}
else:
return True, "Maximum attempts reached.", reward, {}
Troubleshooting
---------------
**Common Issues**
1. **Instance ID Conflicts**: Ensure unique instance IDs across concurrent sessions
2. **Memory Leaks**: Always call ``finalize_interaction()`` to clean up resources
3. **Blocking Operations**: Keep interaction logic async and non-blocking
4. **Configuration Errors**: Verify interaction config path and class name are correct
5. **Interaction Name Conflicts**: Ensure all interactions have unique names in the configuration
6. **Missing Interaction**: Verify the ``name`` field in ``interaction_kwargs`` matches available interactions
7. **Backward Compatibility**: When migrating from single to multi-interaction, add ``name`` fields to existing data
**Debugging**
Enable debug logging to trace interaction flow:
.. code-block:: bash
export VERL_LOGGING_LEVEL=DEBUG
**Performance Monitoring**
Monitor interaction performance impact on training throughput and adjust accordingly.
Related Documentation
---------------------
- :doc:`multiturn`: Basic multi-turn rollout configuration
- :doc:`sandbox_fusion`: Tool integration with SGLang
- :doc:`search_tool_example`: Search tool implementation example
================================================
FILE: docs/sglang_multiturn/multiturn.rst
================================================
Multi-turn Rollout Support
==========================
Last updated: 06/27/2025.
Basic Configuration
~~~~~~~~~~~~~~~~~~~
To enable multi-turn rollout, make sure to configure the following fields in your rollout configuration:
.. code-block:: yaml
actor_rollout_ref:
rollout:
multi_turn: True
name: "sglang"
This configuration activates the sglang engine for multi-turn interaction during rollout.
Custom Tool Configuration
~~~~~~~~~~~~~~~~~~~~~~~~~
For custom environment interaction tools, you can implement your own tools based on ``verl.tools.base_tool.BaseTool``. Then, specify your tool configurations in a YAML file:
.. code-block:: yaml
tools:
- class_name: ""
config:
type: native
tool_schema:
You may refer to GSM8KTool_example_configuration_, which is one example of the tool configurations. Its implementation can be found in gsm8k_tool.py_.
Finally, set the ``tools_config_file`` in your rollout config:
.. code-block:: yaml
actor_rollout_ref:
rollout:
tool_kwargs:
tools_config_file:
This allows integration of customized tool behaviors during actor rollout steps.
If you want to run rollouts with simulated interaction, you can set the ``interaction_config_file`` in your rollout config:
.. code-block:: yaml
interaction:
- class_name: ""
config: {}
.. code-block:: yaml
actor_rollout_ref:
rollout:
interaction_config_file:
If your tool creates multi-modal inputs, you should return a list of multi-modal inputs in your tool.execute() implementation.
Image and video should be processed before returning. For example, if you are using Qwen2.5-VL, you can use the following code to get the representations:
.. code-block:: python
async def create(self, ...) -> tuple[str, ToolResponse]:
...
from verl.utils.dataset.vision_utils import process_image, process_video
img1 = process_image(img1)
video1 = process_video(video1)
# due to the (image | video) key is ("image" | "video") instead of ("images" | "videos") in vllm, we need to use ("image" | "video") to specify list of images/videos
# link: https://github.com/vllm-project/vllm/blob/3c545c0c3b98ee642373a308197d750d0e449403/vllm/multimodal/parse.py#L205
return instance_id, ToolResponse(image=[img1, ...], video=[video1, ...], text="...")
async def execute(self, ...) -> Tuple[str | Dict[str, Any], float, dict]:
...
from verl.utils.dataset.vision_utils import process_image, process_video
img1 = process_image(img1)
video1 = process_video(video1)
# due to the (image | video) key is ("image" | "video") instead of ("images" | "videos") in vllm, we need to use ("image" | "video") to specify list of images/videos
# link: https://github.com/vllm-project/vllm/blob/3c545c0c3b98ee642373a308197d750d0e449403/vllm/multimodal/parse.py#L205
return ToolResponse(image=[img1, ...], video=[video1, ...], text="..."), 0, {}
Remember to set ``return_multi_modal_inputs: False`` in your dataset config in order to process the multi-modal inputs in the rollout correctly.
Refer to the `Handling Multi-Modal Inputs in Datasets`_ section for more details.
MCP Tool Configuration
~~~~~~~~~~~~~~~~~~~~~~
For MCP interaction tools, you can flexibly configure them using a YAML file. The typical setup is as follows:
.. code-block:: yaml
tools:
- class_name: ""
config:
type: mcp
mcp:
mcp_servers_config_path: ./mcp_server.json
tool_selected_list: {}
The ``tool_selected_list`` field is optional and specifies which tools to use from the servers. If you want to enable all available tools, simply omit this attribute. Besides, ``mcp_servers_config_path`` points to a JSON file containing the MCP server configurations. For example:
.. code-block:: json
{
"mcpServers": {
"SSE Server": {
"url": "your_server_url",
"auth_token": "your_server_api_token"
},
"STDIO Server": {
"command": "npx",
"args": ["-y", "server-mcp@0.2.1"],
"env": {
"SERVER_API_KEY": "your_server_api_token"
}
}
}
}
Since the content formats returned by the MCP server may vary, users can inherit from ``MCPBaseTool`` and override the ``_parse_tool_result`` method to implement custom parsing logic.
.. code-block:: python
class MCPYourTool(MCPBaseTool):
def __init__(self, config: dict, tool_schema: OpenAIFunctionToolSchema):
super().__init__(config, tool_schema)
def _parse_tool_result(self, content: list) -> Tuple[str, dict]:
...
Overall, you may refer to mcp_search_tool.py_ and mcp_tool_config.yaml_ for custom implementation and configuration.
Multi-turn Tokenization
~~~~~~~~~~~~~~~~~~~~~~~
Tokenizing multi-turn rollouts poses a challenge: after applying the chat template and tokenizing the full message list, it's hard to identify which tokens belong to assistant messages. Since the token list is flat, it lacks direct alignment with the message roles.
To address this, we adopt a **delta-based tokenization** strategy. Each time the LLM generates a new message, we:
1. Apply the chat template to all prior messages (`messages[:i]`).
2. Apply the chat template again including the latest message (`messages[:i+1]`).
3. Tokenize only the *delta* between these two serialized message strings.
This ensures that only tokens generated by the assistant are included in the loss mask.
.. code-block:: python
# When using tokenizer
# Exclude the assistant prompt (e.g., "<|im_start|>assistant") from the loss by setting add_generation_prompt=True
prev = tokenizer.apply_chat_template(messages[:i], add_generation_prompt=True, tokenize=False)
curr = tokenizer.apply_chat_template(messages[:i+1], add_generation_prompt=False, tokenize=False)
token_ids += tokenizer.encode(curr[len(prev):], add_special_tokens=False)
loss_mask += [1] * len(token_ids) # Mask only the new assistant tokens
.. code-block:: python
# When using processor
# Exclude the assistant prompt (e.g., "<|im_start|>assistant") from the loss by setting add_generation_prompt=True
prev = processor.apply_chat_template(messages[:i], add_generation_prompt=True, tokenize=False)
prev_model_inputs = processor(text=prev, images=images, videos=videos, return_tensors="pt")[0].tolist()
curr = processor.apply_chat_template(messages[:i+1], add_generation_prompt=False, tokenize=False)
curr_model_inputs = processor(text=curr, images=images, videos=videos, return_tensors="pt")[0].tolist()
token_ids += curr_model_inputs["input_ids"][len(prev_model_inputs["input_ids"]):]
loss_mask += [1] * len(token_ids) # Mask only the new assistant tokens
While we've validated that this produces results consistent with full message tokenization, future models' chat templates could break compatibility. To guard against silent inconsistencies, we compare the delta-based tokenization with full-tokenization results by default at the end of each rollout.
If you see the following warning, you can check the mismatched substring in the log:
.. code-block::
Inconsistent training and inference tokenization detected. This may lead to unexpected behavior during training. Please review your chat template to determine if this is intentional. For more information, refer to the multiturn README.md.
The tokenization sanity check mode can be configured using the ``actor_rollout_ref.rollout.multi_turn.tokenization_sanity_check_mode`` parameter, which accepts the following values:
- ``strict`` (default): Performs strict comparison between delta-based and full tokenization results, raising warnings for any differences.
- ``ignore_strippable``: Ignores differences in whitespace characters (``\n``, ``\t``, ``\r``, spaces) while still checking for meaningful text mismatches. This is useful when debugging chat template issues where whitespace variations are expected and acceptable.
- ``disable``: Completely disables the tokenization sanity check. Only use this if you have thoroughly validated that tokenization discrepancies are expected and won't impact training.
Example configuration:
.. code-block:: yaml
actor_rollout_ref:
rollout:
multi_turn:
tokenization_sanity_check_mode: "ignore_strippable" # Choose from: "disable", "ignore_strippable", "strict"
Handling Multi-Modal Inputs in Datasets
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If your dataset includes multi-modal inputs (such as images or videos), you can control whether these are pre-processed and included in each sample by setting the ``return_multi_modal_inputs`` flag in your dataset config (used by ``RLHFDataset``).
- ``return_multi_modal_inputs: True`` (default): The dataset will pre-process and include a multi_modal_inputs dictionary for each sample. This dict contains the model-ready representations (e.g., image tensors, video tensors, etc.) as produced by your processor. This is useful for single-turn or SFT-style training, where the model expects all modalities to be present in the batch.
- ``return_multi_modal_inputs: False``: The dataset will not include the multi_modal_inputs field. This is recommended for multi-turn RL or tool-augmented rollouts, where the model may generate new multi-modal inputs dynamically during rollout, and you want to avoid conflicts or redundant data in the batch.
Special Cases
^^^^^^^^^^^^^
Some models (e.g., Qwen/QwQ-32B and Qwen3 series) remove internal reasoning content during chat template rendering. As a result, the message content can vary across turns, making the delta-based tokenization inaccurate.
For example, for the following conversation:
.. code-block:: python
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is 2 + 2?"},
{"role": "assistant", "content": "user asked about a simple math question. 2 + 2 = 4."},
{"role": "user", "content": "Explain why."},
{"role": "assistant", "content": "user wants to know the reasoning behind the answer. Search for a good explanation",
"tool_calls": [{"id": "tool1", "type": "search", "arguments": {"query": "Why is 2 + 2 = 4?"}}]},
{"role": "tool", "content": "The sum of two and two is four because it is a basic arithmetic operation."},
{"role": "assistant", "content": "The tool provided a good explanation.The sum of two and two is four because it is a basic arithmetic operation."}
]
1. Qwen/QwQ-32B will remove all reasoning content except the last assistant message after applying the chat template.
.. code-block:: text
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
What is 2 + 2?<|im_end|>
<|im_start|>assistant
2 + 2 = 4.<|im_end|>
<|im_start|>user
Explain why.<|im_end|>
<|im_start|>assistant
{"name": "", "arguments": {"query": "Why is 2 + 2 = 4?"}}
<|im_end|>
<|im_start|>user
The sum of two and two is four because it is a basic arithmetic operation.
<|im_end|>
<|im_start|>assistant
The tool provided a good explanation. The sum of two and two is four because it is a basic arithmetic operation.<|im_end|>
2. Qwen3 series will remove all reasoning content before the last user message.
.. code-block:: text
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
What is 2 + 2?<|im_end|>
<|im_start|>assistant
2 + 2 = 4.<|im_end|>
<|im_start|>user
Explain why.<|im_end|>
<|im_start|>assistant
user wants to know the reasoning behind the answer. Search for a good explanation
{"name": "", "arguments": {"query": "Why is 2 + 2 = 4?"}}
<|im_end|>
<|im_start|>user
The sum of two and two is four because it is a basic arithmetic operation.
<|im_end|>
<|im_start|>assistant
The tool provided a good explanation.
The sum of two and two is four because it is a basic arithmetic operation.<|im_end|>
To handle this, we fall back to a **fixed base conversation** containing only a single system and user message. Since this base doesn't include assistant messages or reasoning content, it remains consistent across turns.
.. code-block:: python
BASE_CHAT_HISTORY = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "I am a user."}
]
prev = tokenizer.apply_chat_template(BASE_CHAT_HISTORY, add_generation_prompt=True, tokenize=False)
curr = tokenizer.apply_chat_template([*BASE_CHAT_HISTORY, messages[i]], add_generation_prompt=False, tokenize=False)
token_ids += tokenizer.encode(curr[len(prev):], add_special_tokens=False)
loss_mask += [1] * len(token_ids)
This method works well for Qwen3 series. However, Qwen/QwQ-32B currently has a bug in its chat template. A fix_ has been proposed but not yet adopted. Until then, use the following command to download the fixed model revision:
.. code-block:: bash
pip install huggingface_hub
hf download Qwen/QwQ-32B --revision refs/pr/81
.. _fix: https://huggingface.co/Qwen/QwQ-32B/discussions/81
Discrepancy Between Training and Inference Templates
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Although the above approach fixes the delta mismatch issue, the removal of reasoning content in the inference-time chat template introduces a new discrepancy: training uses the full reasoning content, while inference does not.
This mismatch can affect model performance in unpredictable ways. To avoid it, we default to using the full response (including reasoning) for both training and rollout.
However, this approach comes with trade-offs:
1. Long reasoning contents can easily exceed the model's context window, especially in multi-turn rollout.
2. There's a mismatch between rollout and production environment now—models will not have reasoning content from past turns if you use the default chat template in production.
We are still evaluating the impact of these issues. If you experience context length problems or prefer rollouts that match production (i.e., exclude reasoning), you can enable:
``actor_rollout_ref.rollout.multi_turn.use_inference_chat_template = True``
GSM8K Multi-turn Training Performance
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
See the training performance of multi-turn rollout on the GSM8K task HERE_.
.. _HERE: https://wandb.ai/zhaochenyang20/gsm8k_async_rl/runs/1ro1r7om?nw=nwuserzhaochenyang20
.. _GSM8KTool_example_configuration: https://github.com/volcengine/verl/blob/main/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml
.. _gsm8k_tool.py: https://github.com/volcengine/verl/blob/main/verl/tools/gsm8k_tool.py
.. _mcp_search_tool.py: https://github.com/volcengine/verl/blob/main/verl/tools/mcp_search_tool.py
.. _mcp_tool_config.yaml: https://github.com/volcengine/verl/blob/main/examples/sglang_multiturn/config/tool_config/mcp_tool_config.yaml
Interaction System
~~~~~~~~~~~~~~~~~~
For dynamic conversational feedback during RL training, see:
.. toctree::
:maxdepth: 1
interaction_system
Search Tool Integration
~~~~~~~~~~~~~~~~~~~~~~~
.. toctree::
:maxdepth: 1
search_tool_example
Code Walkthrough
~~~~~~~~~~~~~~~~~~~~~~~
If you want to learn more in depth about the code execution flow, please read https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/tree/main/rlhf/verl/multi-turn/code-walk-through
================================================
FILE: docs/sglang_multiturn/sandbox_fusion.rst
================================================
===============================
Sandbox Fusion Tool Integration
===============================
Last updated: 06/10/2025.
Motivations
===========
- As users of verl, we want to allow the model to call certain tools during Actor rollout, incorporating the results into the training process.
- A colleague from ByteDance published a paper aimed at enhancing model capability through code execution tools.
- We aim to support tool-calling capabilities of inference engines using `sandbox-fusion` as the code execution system, providing the community with a reimplementation of `retools`.
Reward Compute with Sandbox Fusion + FaaS Integration
=====================================================
- In current datasets and tasks, similar work already exists (e.g., Prime), which uses local processes as runners to execute model-generated code for reward computation.
- On this basis, #1429 has advanced the design by integrating FaaS as the runner for reward computation.
Goals
=====
- Adapt to the `sglang` tool-calling protocol and define tools for sandbox fusion.
- Integrate with the `async-rollout` process, ensuring sandbox fusion tools follow asyncIO conventions.
- Design and implement a basic rate limiter to prevent issues such as 429 errors.
Non-Goals
=========
- Training effectiveness is out of scope.
- Observability metrics are not considered.
- Distributed failover and component fault tolerance are not addressed.
Design Details
==============
Tool Schema Definition
----------------------
- Currently, only code execution is considered, requiring a `code` field in the JSON from the model.
- Only Python code is supported for now, so no `language` parameter is defined.
.. code-block:: python
OpenAIFunctionToolSchema(
type="function",
function=OpenAIFunctionSchema(
name="code_interpreter",
description="A tool for executing code.",
parameters=OpenAIFunctionParametersSchema(
type="object",
properties={
"code": OpenAIFunctionPropertySchema(
type="string",
description="The code to execute.",
enum=None,
)
},
required=["code"],
),
strict=False,
)
)
Configuration Parameters
--------------------------
+----------------------------+--------------------------------------------------------------+
| Parameter Name | Description |
+============================+==============================================================+
| `num_workers` | Number of worker threads/processes per DP to request runner. |
+----------------------------+--------------------------------------------------------------+
| `rate_limit` | Global limit of concurrent code executions. Default: 10 |
+----------------------------+--------------------------------------------------------------+
| `default_timeout` | Timeout (in seconds) for each code execution. Default: 30 |
+----------------------------+--------------------------------------------------------------+
| `default_language` | Default programming language. Default: "python" |
+----------------------------+--------------------------------------------------------------+
| `enable_global_rate_limit` | Whether to enable global rate limiting. Default: True |
+----------------------------+--------------------------------------------------------------+
| `sandbox_fusion_url` | URL for the veFaas sandbox execution service |
+----------------------------+--------------------------------------------------------------+
Rate Limiting Design
-----------------------
Objective:
- Limit the number of inflight requests using a token bucket model.
- Ensure ordered submission to code runners to avoid starvation due to backoff.
Design Highlights:
- Use Ray Global Actor as a singleton distributed counter at cluster level.
- Semaphore used for counting, with `acquire` and `release` in separate thread pools to preserve order.
- Use Ray’s cloud-pickle to serialize functions for decoupled `ExecutionWorker`.
.. code-block:: python
@ray.remote(concurrency_groups={"acquire": 1,"release": 10})
class TokenBucketWorker:
def __init__(self, rate_limit: int):
self.rate_limit = rate_limit
self.current_count = 0
self._semaphore = threading.Semaphore(rate_limit)
@ray.method(concurrency_group="acquire")
def acquire(self):
self._semaphore.acquire()
self.current_count += 1
@ray.method(concurrency_group="release")
def release(self):
self._semaphore.release()
self.current_count -= 1
def get_current_count(self):
return self.current_count
class ExecutionWorker:
def __init__(self, enable_global_rate_limit=True, rate_limit=10):
self.rate_limit_worker = self._init_rate_limit(rate_limit) if enable_global_rate_limit else None
def _init_rate_limit(self, rate_limit):
return TokenBucketWorker.options(name="rate-limiter", get_if_exists=True).remote(rate_limit)
def execute(self, fn: Callable[..., T], *fn_args, **fn_kwargs) -> T:
with ExitStack() as stack:
stack.callback(self.rate_limit_worker.release.remote)
ray.get(self.rate_limit_worker.acquire.remote())
try:
return fn(*fn_args, **fn_kwargs)
except Exception as e:
logger.warning(f"Error when executing code: {e}")
def init_execution_pool(num_workers: int, enable_global_rate_limit=True, rate_limit=10, mode: PoolMode=PoolMode.ThreadMode):
if mode == PoolMode.ThreadMode:
return ray.remote(ExecutionWorker).options(max_concurrency=num_workers).remote(
enable_global_rate_limit=enable_global_rate_limit,
rate_limit=rate_limit
)
else:
raise NotImplementedError("Process mode is not implemented yet")
Tool Implementation
-------------------
- Use `instance_id` to identify requests across multiple dialogue rounds.
- Use `execution_pool` to implement async invocation.
- Cleanup state after rollout completion.
.. code-block:: python
class SandboxFusionTool(BaseTool):
def __init__(self, config: dict, tool_schema: OpenAIFunctionToolSchema):
...
self.execution_pool = init_execution_pool(...)
...
async def create(self, instance_id: Optional[str] = None, ...):
...
async def execute(self, instance_id: str, parameters: dict[str, Any], **kwargs) -> Tuple[str, float, dict]:
code = parameters.get("code", "")
timeout = parameters.get("timeout", self.default_timeout)
language = parameters.get("language", self.default_language)
if not isinstance(code, str):
code = str(code)
result = await self.execution_pool.execute.remote(self.execute_code,instance_id,code,timeout,language)
self._instance_dict[instance_id]["reward"].append(result.strip())
return result, result, {}
def execute_code(self,instance_id,code,timeout=30,language="python"):
result_status, metadata = _process_single_case(0, None, None,self.sandbox_fusion_url, code, timeout, language)
# we should always expect this since we don't have correct answer
if metadata["run_status"] == "Finished":
actual_output = metadata["stdout"] if metadata["stdout"] is not None else ""
return actual_output
else:
return "no stdout here"
async def calc_reward(self, instance_id: str, ...):
...
async def release(self, instance_id: str, ...):
...
Test Plan
=========
Unit Tests
----------
- **test_tools_registration**: Test tool registration and initialization.
- **test_rollout_req_creation**: Validate that `AsyncRolloutReq` is built correctly.
- **test_over_size_case**: Ensure rollout terminates early when exceeding `max_seq_len`.
- **test_tool_call_basic_case**: Mock `sglang` output, validate tool call and result.
- **test_tool_call_batch_case**: Test batch processing of tool calls.
- **test_basic_multi_process_init**: Validate Ray global actor behaves as singleton.
- **TestSingleNodeRateLimiterCase**: Verify rate limiter works in single-node mode.
- **test_rotten_execution**: Ensure rate limiter recovers from function errors.
- **TestMultiNodeRateLimiterCase**: Verify behavior in multi-node environments.
e2e Tests
----------
We provide an e2e test script in the `tests/special_e2e` folder: `tests/special_e2e/run_gsm8k_fsdp_sgl_multiturn_sf_tool.sh`.
By setting ``trainer.rollout_data_dir``, you can dump the rollout data to local disk. Here is a sample taken from the rollout data:
.. code-block:: python
{
"input": "
system\nYou are a math expert. You are given a question and you need to solve it step by step. Reasoning step by step before any tool call. You should use the `calc_gsm8k_reward` tool after step by step solving the question, before generate final answer at least once and refine your answer if necessary. Put your final answer in the format of `#### `.\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n\n{\"type\": \"function\", \"function\": {\"name\": \"code_interpreter\", \"description\": \"A tool for executing code.\", \"parameters\": {\"type\": \"object\", \"properties\": {\"code\": {\"type\": \"string\", \"description\": \"The code to execute.\", \"enum\": null}}, \"required\": [\"code\"]}, \"strict\": false}}\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n\n
user\nJohn gets a bonus that's the same percentage every year. Last year he made $100,000 and got a $10,000 bonus. This year he makes $200,000. How much will John make this year when adding both his total pay and bonus together? Let's think step by step and output the final answer after `####`.\n
assistant\n",
"output": "assistant\n\nOkay, let's see. John gets a bonus that's the same percentage every year. Last year, he made $100,000 and got a $10,000 bonus. This year, he makes $200,000. The question is asking how much he'll make this year when adding both his total pay and bonus together.\n\nFirst, I need to figure out the percentage bonus he gets. Since the bonus is the same percentage each year, if I can find that percentage based on last year's details, I can apply it to this year's earnings to find this year's bonus. Then, I'll add that bonus to his total pay this year.\n\nLast year, he earned $100,000 and got a $10,000 bonus. To find the percentage, I can divide the bonus by his total earnings. So, 10,000 divided by 100,000. Let me calculate that.\n\n10,000 / 100,000 equals 0.1, which is 10% as a percentage. So, his bonus is 10% of his pay each year.\n\nThis year, he makes $200,000. So, the bonus this year would be 10% of 200,000. Let me compute that using the code interpreter to ensure accuracy.\n
\n{\"name\": \"code_interpreter\", \"arguments\": {\"code\": \"total_pay_this_year = 200000\\nbonus_percentage = 10 / 100\\nbonus_this_year = total_pay_this_year * bonus_percentage\\ntotal_income_this_year = total_pay_this_year + bonus_this_year\\nprint(total_income_this_year)\", \"executes\": \"True\"}}\n\n
tool\n220000.0\n\n
assistant\n\nSo the code shows that John's total income this year would be $220,000. This is consistent with the 10% bonus on last year's pay of $100,000, which was $10,000. Therefore, this year's bonus of $20,000 added to his total pay of $200,000 results in a total income of $220,000.\n\nI need to verify that there are no other factors at play. The problem states it's a constant percentage bonus each year, so there's no mention of changing percentages or other variables. Therefore, applying a straightforward percentage of 10% on this year's partial salary amount would result in $20,000. Adding that to his total pay gives $220,000. The calculation is correct and aligns with the question's parameters.\n\n\n#### 220000.0",
"score": 0,
"step": 1
}
Here is the same sample in a readable format:
.. code-block:: python
[system]
You are a math expert. You are given a question and you need to solve it step by step. Reasoning step by step before any tool call. You should use the `calc_gsm8k_reward` tool after step by step solving the question, before generate final answer at least once and refine your answer if necessary. Put your final answer in the format of `#### `.
# Tools
You may call one or more functions to assist with the user query.
You are provided with function signatures within XML tags:
{"type": "function", "function": {"name": "code_interpreter", "description": "A tool for executing code.", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": "The code to execute.", "enum": null}}, "required": ["code"]}, "strict": false}}
For each function call, return a json object with function name and arguments within XML tags:
{"name": , "arguments": }
[user]
John gets a bonus that's the same percentage every year. Last year he made $100,000 and got a $10,000 bonus. This year he makes $200,000. How much will John make this year when adding both his total pay and bonus together? Let's think step by step and output the final answer after `####`.
[assistant]
Okay, let's see. John gets a bonus that's the same percentage every year. Last year, he made $100,000 and got a $10,000 bonus. This year, he makes $200,000. The question is asking how much he'll make this year when adding both his total pay and bonus together.
First, I need to figure out the percentage bonus he gets. Since the bonus is the same percentage each year, if I can find that percentage based on last year's details, I can apply it to this year's earnings to find this year's bonus. Then, I'll add that bonus to his total pay this year.
Last year, he earned $100,000 and got a $10,000 bonus. To find the percentage, I can divide the bonus by his total earnings. So, 10,000 divided by 100,000. Let me calculate that.
10,000 / 100,000 equals 0.1, which is 10% as a percentage. So, his bonus is 10% of his pay each year.
This year, he makes $200,000. So, the bonus this year would be 10% of 200,000. Let me compute that using the code interpreter to ensure accuracy.
{"name": "code_interpreter", "arguments": {"code": "total_pay_this_year = 200000\nbonus_percentage = 10 / 100\nbonus_this_year = total_pay_this_year * bonus_percentage\ntotal_income_this_year = total_pay_this_year + bonus_this_year\nprint(total_income_this_year)", "executes": "True"}}
[tool]
220000.0
[assistant]
So the code shows that John's total income this year would be $220,000. This is consistent with the 10% bonus on last year's pay of $100,000, which was $10,000. Therefore, this year's bonus of $20,000 added to his total pay of $200,000 results in a total income of $220,000.
I need to verify that there are no other factors at play. The problem states it's a constant percentage bonus each year, so there's no mention of changing percentages or other variables. Therefore, applying a straightforward percentage of 10% on this year's partial salary amount would result in $20,000. Adding that to his total pay gives $220,000. The calculation is correct and aligns with the question's parameters.
#### 220000.0
You can also use the `RolloutViewer` TUI tool to view the dumped rollout data:
.. code-block:: bash
python scripts/rollout_viewer.py ${trainer.rollout_data_dir}
.. image:: https://github.com/user-attachments/assets/e34e5157-2880-4a21-afb2-73885d0dfb11
:alt: RolloutViewer screenshot
================================================
FILE: docs/sglang_multiturn/search_tool_example.rst
================================================
=======================
Search Tool Integration
=======================
Last updated: 05/30/2025.
Introduction
------------
- We have added a search tool calling function to Multi-Turn RL, enabling the model to initiate retrieval requests during Actor rollout and directly use retrieval results for training. **We support using a local dense retriever as the retrieval tool, as well as integrating with your own local retrieval engine.**
Quick Reproduction
------------------
Create a New Docker Container
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code:: bash
docker run \
-it \
--shm-size 32g \
--gpus all \
-v {Huggingface-Cache-Path}:/root/.cache \
--ipc=host \
--network=host \
--privileged \
--name sglang_{your-name} \
lmsysorg/sglang:dev \
/bin/zsh
If you need to restart after exiting the container:
.. code:: bash
docker start -i sglang_{your-name}
Update Python and Configure the Virtual Environment using uv
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code:: bash
apt update
apt install -y python3.10 python3.10-venv
# Create a virtual environment
python3 -m venv ~/.python/verl-multiturn-rollout
# Activate the virtual environment
source ~/.python/verl-multiturn-rollout/bin/activate
# Install uv
python3 -m pip install uv
Install verl Upstream
~~~~~~~~~~~~~~~~~~~~~
.. code:: bash
cd ~
git clone https://github.com/volcengine/verl.git
cd verl
# Install verl
python3 -m uv pip install .
python3 -m uv pip install -r ./requirements_sglang.txt
# Manually install flash-attn
python3 -m uv pip install wheel
python3 -m uv pip install packaging
python3 -m uv pip install flash-attn --no-build-isolation --no-deps
Set Up a Local Retrieval Engine
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If you are using your own local retrieval service, you can skip this
step. We chose the local dense retriever provided in the search-R1
example; detailed instructions are in the `searchR1
docs <https://github.com/PeterGriffinJin/Search-R1/blob/main/docs/retriever.md>`__.
In brief:
- The GPU version offers higher accuracy and speed; each GPU uses about
5–7 GB of memory.
- The CPU version can be used for simple testing but has lower
retrieval precision, which will degrade training performance. See the
`retriever
documentation `__
in search-R1 for details.
- We recommend using Conda to install ``faiss-gpu=1.8.0``; installing it in a venv may cause errors.
**Note**: To start both the training process and the local retrieval
service, we launch two separate Python environments. The training uses
uv in the verl-multiturn-rollout environment, while the retriever uses
conda to install ``faiss-gpu``.
.. code:: bash
# Download the Miniconda installer script
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
# Install to $HOME/miniconda3 in batch mode
bash ~/miniconda.sh -b -p $HOME/miniconda3
# Activate conda (only in the current shell)
eval "$($HOME/miniconda3/bin/conda shell.bash hook)"
# (Optional) Add conda to your default shell startup
conda init
# Reload shell config
source ~/.bashrc
# Create and activate the retriever environment with Python 3.10
conda create -n retriever python=3.10 -y
conda activate retriever
# Install PyTorch (with GPU support) and related libraries
conda install pytorch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 pytorch-cuda=12.1 -c pytorch -c nvidia -y
# Install other Python packages
pip install transformers datasets pyserini huggingface_hub
# Install the GPU version of faiss
conda install faiss-gpu=1.8.0 -c pytorch -c nvidia -y
# Install the API service framework
pip install uvicorn fastapi
Download the Indexing and Corpus
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The local retrieval files are large—prepare sufficient disk space.
Downloading is about 60–70 GB, and uncompressed takes about 132 GB:
.. code:: bash
conda activate retriever
save_path=/the/path/to/save
python examples/sglang_multiturn/search_r1_like/local_dense_retriever/download.py --save_path $save_path
cat $save_path/part_* > $save_path/e5_Flat.index
gzip -d $save_path/wiki-18.jsonl.gz
Start the Local flat e5 Retrieval Server
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1. The first startup will download models and load the index.
2. Apart from the download, startup takes about 1–2 minutes.
3. After startup, each GPU uses about 5–7 GB of memory, leaving the rest
for multi-turn RL training.
.. code:: bash
conda activate retriever
index_file=$save_path/e5_Flat.index
corpus_file=$save_path/wiki-18.jsonl
retriever_name=e5
retriever_path=intfloat/e5-base-v2
python examples/sglang_multiturn/search_r1_like/local_dense_retriever/retrieval_server.py \
--index_path $index_file \
--corpus_path $corpus_file \
--topk 3 \
--retriever_name $retriever_name \
--retriever_model $retriever_path \
--faiss_gpu
Set Up WANDB_API_KEY
~~~~~~~~~~~~~~~~~~~~
.. code:: bash
export WANDB_API_KEY={YOUR_WANDB_API_KEY}
# Define a timestamp function
function now() {
date '+%Y-%m-%d-%H-%M'
}
**Preprocess the Dataset**
~~~~~~~~~~~~~~~~~~~~~~~~~~
**Note:** The following data processing and training commands must be
run in the verl-multiturn-rollout environment.
.. code:: bash
python3 examples/data_preprocess/preprocess_search_r1_dataset.py
Testing on 8 x H20
~~~~~~~~~~~~~~~~~~
.. code:: bash
# Ensure the now() function is defined
# Create a logs directory
mkdir -p logs
# Set GPUs and run with a suitable log path
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
nohup bash examples/sglang_multiturn/search_r1_like/run_qwen2.5-3b_instruct_search_multiturn.sh \
trainer.experiment_name=qwen2.5-3b-it_rm-searchR1-like-sgl-multiturn-$(now) \
> logs/searchR1-like$(now).log 2>&1 &
Custom Search Configuration
---------------------------
To enable multi-turn reasoning, set the following fields in your config:
.. code:: yaml
actor_rollout_ref:
rollout:
name: "sglang"
multi_turn:
enable: True
You must specify ``retrieval_service_url`` in ``examples/sglang_multiturn/config/tool_config/search_tool_config.yaml``, and properly configure concurrency. For more details on concurrency, refer to the Sandbox Fusion example:
.. code:: yaml
tools:
- class_name: verl.tools.search_tool.SearchTool
config:
retrieval_service_url: http://127.0.0.1:8000/retrieve
num_workers: 120
rate_limit: 120
timeout: 30
The retriever input/output formats are as follows. If your service
parameters match, only modify ``retrieval_service_url``. You can also
customize in ``search_r1_like_utils.py``.
.. code:: python
Input format:
{
"queries": ["What is Python?", "Tell me about neural networks."],
"topk": 3,
"return_scores": true
}
Output format (when return_scores=True, similarity scores are returned):
{
"result": [
[ # Results for each query
{
"document": doc, "score": score
},
# ... more documents
],
# ... results for other queries
]
}
Notes
-----
1. The total training time is about 27 hours; meanwhile, the validation
dataset is very large (51 k), and each validation takes about 6000 s.
(Therefore, ``val_before_train=False`` by default)
================================================
FILE: docs/single_controller.rst
================================================
The Design of ``verl.single_controller``
==============================================
Last updated: 05/21/2025.
**Author:**\ `Wang Zhang `__
Preface
-------
We prepared this document for developers of ``verl``, particularly those
interested in understanding or contributing to the
``verl.single_controller`` module. It is not intended for end users, but
for contributors seeking to understand the architectural rationale and
internal mechanics.
--------------
Origin
------
The ``single_controller`` module originated from a request I received —
to adapt a toy single-process RLHF script into a distributed system with
minimal changes, while maintaining ease of debugging.
Common practice — such as using PyTorch’s Distributed Data Parallel
(DDP) — typically involves wrapping ``nn.Module`` and launching multiple
processes that execute the same function under different ranks. However,
this approach presents two main limitations in the context of
distributed RLHF:

- Difficulty representing multiple DAGs as required by PPO;
- Difficulty inspecting intermediate tensors during training.
To maintain debuggability, we opted for a different approach — breaking
the training loop into well-defined stages like ``generate_sequences``,
``compute_advantages``, and so on.
We selected `Ray `__ as the initial backend for
``verl`` due to its ability to expose Python class methods as RPC
endpoints. However, Ray’s default model only supports **one method call,
one RPC**, while training LLMs typically requires coordination across
multiple processes.
To hide from users the fact that a single method call fans out to
multiple Ray actors, we introduced the following components:
- ``WorkerGroup`` – manages a group of remote workers and provides
a unified interface for multi-process distributed computation;
- ``ResourcePool`` – binds computational resources to worker
processes;
- ``ClassWithArgs`` – enables delayed remote instantiation with
specified initialization arguments.
--------------
A Running Example: ``generate_sequences``
-----------------------------------------
To illustrate the design, we walk through how the ``generate_sequences``
method in the ``ActorRolloutRefWorker`` class is registered and invoked
across distributed workers.
--------------
Step 1: Register with a Decorator
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The first step is to define ``generate_sequences`` and decorate it
with ``@register``, since it will be called from the driver script.
**Source:**
`fsdp_workers.py `__
.. code:: python
class ActorRolloutRefWorker(Worker):
...
@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
def generate_sequences(self, prompts: DataProto):
prompts = prompts.to(torch.cuda.current_device())
...
The ``@register`` decorator adds metadata to the ``generate_sequences``
method. Currently, it doesn’t alter functionality, but attaches
attributes via a magic key (``MAGIC_ATTR``):
**Source:**
`decorator.py `__
.. code:: python
def register(dispatch_mode=Dispatch.ALL_TO_ALL, execute_mode=Execute.ALL, blocking=True, materialize_futures=True):
...
def decorator(func):
@wraps(func)
def inner(*args, **kwargs):
if materialize_futures:
args, kwargs = _materialize_futures(*args, **kwargs)
return func(*args, **kwargs)
attrs = {"dispatch_mode": dispatch_mode, "execute_mode": execute_mode, "blocking": blocking}
setattr(inner, MAGIC_ATTR, attrs)
return inner
return decorator
As the code shows, the values of ``dispatch_mode``, ``execute_mode`` and
``blocking`` are attached to the ``generate_sequences`` method.
--------------
Step 2: Binding During Initialization
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
These attached attributes are extracted and utilized when
``ActorRolloutRefWorker``, wrapped in a ``RayClassWithInitArgs``, is passed
into a ``RayWorkerGroup``.
**Source:**
`main_generation.py `__
.. code:: python
ray_cls_with_init = RayClassWithInitArgs(cls=ray.remote(ActorRolloutRefWorker), config=config, role="rollout")
resource_pool = RayResourcePool(process_on_nodes=[config.trainer.n_gpus_per_node] * config.trainer.nnodes)
wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init)
During the
`initialization `__
of ``RayWorkerGroup``, two key steps occur:
1. Worker instances (Ray actors) are created:
`RayWorkerGroup._init_with_resource_pool `__
2. Methods decorated with ``@register`` are bound to ``RayWorkerGroup``:
`RayWorkerGroup._bind_worker_method `__
.. figure:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/worker_group_init.png?raw=true
:alt: initialization_and_binding_of_worker_group
initialization_and_binding_of_worker_group
The binding procedure is the heart of ``verl.single_controller``.
**Key function:**
`WorkerGroup._bind_worker_method `__
.. code:: python
def _bind_worker_method(self, user_defined_cls, func_generator):
...
for method_name in dir(user_defined_cls):
try:
method = getattr(user_defined_cls, method_name)
assert callable(method)
except Exception:
continue # Skip properties
<<>>
When a method has the ``MAGIC_ATTR``, the attributes set by
``@register`` are extracted:
.. code:: python
<<>>
if hasattr(method, MAGIC_ATTR):
attribute = getattr(method, MAGIC_ATTR)
dispatch_mode = attribute["dispatch_mode"]
execute_mode = attribute["execute_mode"]
blocking = attribute["blocking"]
<<>>
As shown in the flow chart above, these attributes are fed into
``func_generator``. However, ``func_generator`` takes ``method_name``,
``dispatch_fn``, ``collect_fn``, ``execute_fn``, ``blocking``. We need
to find the corresponding ``dispatch_fn`` and ``collect_fn`` associated
with the ``dispatch_mode`` (``DP_COMPUTE_PROTO``) from
`DISPATCH_MODE_FN_REGISTRY `__:
.. code:: python3
DISPATCH_MODE_FN_REGISTRY = {
Dispatch.ONE_TO_ALL: {
"dispatch_fn": dispatch_one_to_all,
"collect_fn": collect_all_to_all,
},
...
Dispatch.DP_COMPUTE_PROTO: {
"dispatch_fn": dispatch_dp_compute_data_proto,
"collect_fn": collect_dp_compute_data_proto,
},
...
}
Similarly, the ``execute_fn`` is selected by ``execute_mode`` and
extracted by:
.. code:: python
<<>>
# get execute_fn_name
execute_mode = get_predefined_execute_fn(execute_mode=execute_mode)
wg_execute_fn_name = execute_mode["execute_fn_name"]
# get execute_fn from string
try:
execute_fn = getattr(self, wg_execute_fn_name)
assert callable(execute_fn), "execute_fn must be callable"
except Exception:
print(f"execute_fn {wg_execute_fn_name} is invalid")
raise
<<>>
In the ``generate_sequences`` case:

- ``dispatch_mode = Dispatch.DP_COMPUTE_PROTO``
- ``dispatch_fn = dispatch_dp_compute_data_proto``
- ``collect_fn = collect_dp_compute_data_proto``
- ``execute_fn = RayWorkerGroup.execute_all``
ONE_TO_ALL v.s. DP_COMPUTE_PROTO
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
``dispatch_mode`` is associated with a ``dispatch_fn`` and a
``collect_fn``. As the name implies, ``dispatch_fn`` processes the input
arguments in ``WorkerGroup`` and generates a batch (list) of input
arguments, each of which will be fed into a worker attached to the
``WorkerGroup``.
``dispatch_fn`` of ``ONE_TO_ALL`` is
`dispatch_one_to_all `__,
which just duplicates all the input arguments into N replicas, where N
equals the number of Workers attached to the ``worker_group``:
.. code:: python
def dispatch_one_to_all(worker_group, *args, **kwargs):
args = tuple([arg] * worker_group.world_size for arg in args)
kwargs = {k: [v] * worker_group.world_size for k, v in kwargs.items()}
return args, kwargs
``dispatch_fn`` of ``DP_COMPUTE_PROTO`` is
`dispatch_dp_compute_data_proto `__,
which uses ``DataProto.chunk`` to split a large ``DataProto`` into N
smaller ``DataProto``, where N equals the world_size (number of the
workers) of the ``worker_group``:
.. code:: python
def dispatch_dp_compute_data_proto(worker_group, *args, **kwargs):
from verl.single_controller.base.worker_group import WorkerGroup
assert isinstance(worker_group, WorkerGroup)
# Note: enable auto padding for dp compute DatapProto
splitted_args, splitted_kwargs = _split_args_kwargs_data_proto_with_auto_padding(
worker_group.world_size,
*args,
**kwargs,
)
return splitted_args, splitted_kwargs
The ``collect_fn`` follows the same pattern: it processes a batch (list)
of returned values from all workers of a ``WorkerGroup`` and merges them
into a list, as ``collect_all_to_all`` does, or into a large ``DataProto``,
as ``collect_dp_compute_data_proto`` does.
Finally, a new method is dynamically generated using ``func_generator``
and added to the ``WorkerGroup`` instance:
.. code:: python
<<>>
# bind a new method to the RayWorkerGroup
func = func_generator(
self,
method_name,
dispatch_fn=dispatch_fn,
collect_fn=collect_fn,
execute_fn=execute_fn,
blocking=blocking,
)
try:
setattr(self, method_name, func)
method_names.append(method_name)
except Exception as e:
raise ValueError(f"Fail to set method_name {method_name}") from e
This makes the method invocable via the ``WorkerGroup`` interface.
--------------
Step 3: Call Chain
~~~~~~~~~~~~~~~~~~
All the machinery above ensures that distributed calls feel identical to
single-process ones. In the original single-process script, the code
looks like:
.. code:: python
rollout = Rollout()
rollout.generate_sequences(batch)
With ``verl``, the multiprocess program becomes:
.. code:: python
rollout = RayWorkerGroup(resource_pool=[4], ray_cls_with_init=RayClassWithInitArgs(Rollout))
rollout.generate_sequences(batch)
.. figure:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/call_generate_sequences.png?raw=true
:alt: call_chain_of_generate_sequences
call_chain_of_generate_sequences
Behind this simple call:

- ``dispatch_fn`` splits input across workers
- ``execute_fn`` performs the actual remote invocation
- ``collect_fn`` gathers the results
All of this is abstracted away, enabling developers to write distributed
code with minimal changes to their existing logic.
--------------
Beyond RL Post-Training: Generalizing ``verl.single_controller``
----------------------------------------------------------------
The ``verl.single_controller`` module generalizes well beyond
reinforcement learning. It provides a clean abstraction to batch-process
remote method calls, with automatic input/output handling.
By minimizing the gap between single-process and multi-process scripts,
``verl.single_controller`` opens the door to distributed computing in
broader domains — not limited to RL post-training.
We hope this design inspires more examples and extensions from the
community.
================================================
FILE: docs/start/agentic_rl.rst
================================================
Agentic RL Training
===================
Last updated: 07/15/2025.
Overview
----------
The goal of Agentic RL is to use reinforcement learning to improve the performance of the backend models that drive Agents. To support this training process, a series of features have been developed:
1. Server-based asynchronous rollout
2. Multi-turn conversations and tool calls
3. LangGraph-based Agent
This document explains the system principles and usage involved to help users implement Agentic RL.
Server-based Asynchronous Rollout
---------------------------------
Since Agents need to interact with the environment through various tool calls, an asyncio-based coroutine mechanism is used to execute each rollout request asynchronously, which avoids GPU idling while waiting for tool call results and thereby improves training performance. To support asynchronous rollout, the inference engine (server) and the agent (client) are architecturally separated, implementing a server-based system with the following objectives:
1. Enabling load balancing mechanisms to balance loads across multiple GPUs and reduce the impact of long-tail requests on performance. For this purpose, scheduling capabilities in stream mode (``recipe/stream_mode``) are implemented as a recipe.
2. Preventing agent specific features such as tracing from affecting the inference engine.
System Architecture
~~~~~~~~~~~~~~~~~~~
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop.png?raw=true
For more detail on internal design, please refer to :doc:`Agent Loop<../advance/agent_loop>`.
System Components
~~~~~~~~~~~~~~~~~
+--------------------------+----------------------------------------------------------------------------+
| Component | Role |
+==========================+============================================================================+
| AgentLoop | Client, implements Agent functions |
+--------------------------+----------------------------------------------------------------------------+
| AsyncLLMServerManager | Inference gateway, provides generate interface for AgentLoop |
+--------------------------+----------------------------------------------------------------------------+
| AsyncServer | Server, each instance is connected to one DP group of the inference engine |
+--------------------------+----------------------------------------------------------------------------+
**"generate" Interface**
The "generate" function based on ray actor is used between the Client and Server instead of the standard chat completion API. This is because the conversion between tokens and text can be irreversible. For example, the token converted from "" will be different from that generated by the LLM. During the training phase, it is necessary to strictly use the tokens generated by LLM inference to avoid inaccuracies in computing the advantage, which may affect model performance. Having the Server provide a token-based API helps the Client maintain the relationship between the text generated by tool calls and the tokens returned by the LLM, so as to output correct tokens for training.
**Inference Engine Adaptation**
AsyncServer uniformly provides a generate function to the upper layer, with separate implementations for SGLang and vLLM to hide underlying differences:
1. The SGLang AsyncServer uses the async_generate interface of the SGLang engine, which is located on the first GPU of each TP group. Therefore, AsyncServer needs to remotely call async_generate through ray actor.
2. The vLLM AsyncServer uses the generate interface of the vLLM engine, which can communicate with the GPUs in the TP group through ZMQ and can be directly called in AsyncServer.
Usage Example
~~~~~~~~~~~~~
Follow :doc:`GSM8K example<../examples/gsm8k_example>` to prepare the dataset and model checkpoints.
There are two options required to use agent loop:
- ``data.return_raw_chat=True``
- ``actor_rollout_ref.rollout.mode=async``
This example uses the sglang inference engine by default, and you can also modify rollout_name to use vllm.
.. code-block:: bash
bash examples/grpo_trainer/run_qwen2-7b_seq_balance.sh
Multi-turn Conversations and Tool Calls
---------------------------------------
Follow :doc:`Multi-turn Rollout Support<../sglang_multiturn/multiturn>` to prepare tool and configuration files.
The Tool Agent Loop has an additional requirement: adding an "agent_name" field to the dataset. During rollout, it will choose to use tool_agent_loop or single_turn_agent (default) based on this field.
Usage Example
~~~~~~~~~~~~~
.. code-block:: bash
# install mlflow to view toolcall and llm trace
pip install mlflow
# This will download and preprocess the GSM8K dataset into ~/data/gsm8k/ and add the "agent_name" field.
python examples/data_preprocess/gsm8k_tool_agent_loop.py
# Start training with tool calls and enabled mlflow based trace helping to debug the rollout details
bash examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_tool_agent_mlflow.sh
# When training is done, start a mlflow server to view trace
mlflow ui -h 0.0.0.0 -p 5000 --backend-store-uri sqlite:////tmp/mlruns.db
# then you can open http://<your-server-ip>:5000 in a browser to view the trace
Note: During training, because the model may sometimes fail to generate correct toolcall tags, an error message "Failed to decode tool call" will be output to the console, which does not indicate an abnormality in training.
Follow :doc:`Rollout trace<../advance/rollout_trace>` to learn more about the trace feature.
Agent Framework
---------------
System Architecture
~~~~~~~~~~~~~~~~~~~
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/langgraph_agent.png?raw=true
System Components
~~~~~~~~~~~~~~~~~
+--------------------------+-----------------------------------------------------------------------------------------------+
| Component | Role |
+==========================+===============================================================================================+
| ChatModel | LLM object of LangChain, used to adapt to the “generate” api provided by AsyncLLMServerManager|
+--------------------------+-----------------------------------------------------------------------------------------------+
| ReactAgentLoop | Agent adaptation layer, which by default supports a naive LangGraph Agentic. |
| | New classes can be derived to support user-defined Agents, and the run function needs to be |
| | implemented to complete Agent calls. |
+--------------------------+-----------------------------------------------------------------------------------------------+
| AsyncServer | Server, each instance is connected to one DP group of the inference engine. |
+--------------------------+-----------------------------------------------------------------------------------------------+
Follow doc "recipe/langgraph_agent/example/README.md" for more details.
================================================
FILE: docs/start/install.rst
================================================
Installation
============
Requirements
------------
- **Python**: Version >= 3.10
- **CUDA**: Version >= 12.8
verl supports various backends. Currently, the following configurations are available:
- **FSDP** and **Megatron-LM** (optional) for training.
- **SGLang**, **vLLM** and **TGI** for rollout generation.
Choices of Backend Engines
----------------------------
1. Training:
We recommend using **FSDP** backend to investigate, research and prototype different models, datasets and RL algorithms. The guide for using FSDP backend can be found in :doc:`FSDP Workers<../workers/fsdp_workers>`.
For users who pursue better scalability, we recommend using **Megatron-LM** backend. Currently, we support `Megatron-LM v0.13.1 `_. The guide for using Megatron-LM backend can be found in :doc:`Megatron-LM Workers<../workers/megatron_workers>`.
2. Inference:
For inference, vllm 0.8.3 and later versions have been tested for stability. We recommend turning on env var ``VLLM_USE_V1=1`` for optimal performance.
For SGLang, refer to the :doc:`SGLang Backend<../workers/sglang_worker>` for detailed installation and usage instructions. SGLang rollout is under extensive development and offers many advanced features and optimizations. We encourage users to report any issues or provide feedback via the `SGLang Issue Tracker `_.
For huggingface TGI integration, it is usually used for debugging and single GPU exploration.
Install from docker image
-------------------------
Starting from v0.6.0, we use the vLLM and SGLang release images as our base images.
Base Image
::::::::::
- vLLM: https://hub.docker.com/r/vllm/vllm-openai
- SGLang: https://hub.docker.com/r/lmsysorg/sglang
Application Image
:::::::::::::::::
Upon base image, the following packages are added:
- flash_attn
- Megatron-LM
- Apex
- TransformerEngine
- DeepEP
Latest docker file:
- `Dockerfile.stable.vllm `_
- `Dockerfile.stable.sglang `_
All pre-built images are available in dockerhub: `verlai/verl `_. For example, ``verlai/verl:sgl055.latest``, ``verlai/verl:vllm011.latest``.
You can find the latest images used for development and ci in our github workflows:
- `.github/workflows/vllm.yml `_
- `.github/workflows/sgl.yml `_
Installation from Docker
::::::::::::::::::::::::
After pulling the desired Docker image and installing desired inference and training frameworks, you can run it with the following steps:
1. Launch the desired Docker image and attach into it:
.. code:: bash
docker create --runtime=nvidia --gpus all --net=host --shm-size="10g" --cap-add=SYS_ADMIN -v .:/workspace/verl --name verl <image:tag> sleep infinity
docker start verl
docker exec -it verl bash
2. If you use the images provided, you only need to install verl itself without dependencies:
.. code:: bash
# install the nightly version (recommended)
git clone https://github.com/volcengine/verl && cd verl
pip3 install --no-deps -e .
[Optional] If you hope to switch between different frameworks, you can install verl with the following command:
.. code:: bash
# install the nightly version (recommended)
git clone https://github.com/volcengine/verl && cd verl
pip3 install -e ".[vllm]"
pip3 install -e ".[sglang]"
Install from custom environment
---------------------------------------------
We recommend using Docker images for convenience. However, if your environment is not compatible with the Docker image, you can also install verl in a Python environment.
.. note::
- The Dockerfiles provide more details than these installation instructions. You can find examples in each Dockerfile, for example `verl0.6-cu128-torch2.8.0-fa2.7.4 Dockerfile.base `_ .
Pre-requisites
::::::::::::::
For training and inference engines to utilize better and faster hardware support, CUDA/cuDNN and other dependencies are required,
and some of the dependencies are easy to be overridden when installing other packages,
so we put them in the :ref:`Post-installation` step.
.. note::
- The installation steps below are recommended configurations for the latest version of verl.
If you are trying to customize your own environment, please ignore the strict constraints.
We need to install the following pre-requisites:
- **CUDA**: Version >= 12.8
- **cuDNN**: Version >= 9.10.0
- **Apex**
CUDA 12.8 or above is recommended, to match the Docker image;
please refer to `NVIDIA's official website `_ for other versions of CUDA.
.. code:: bash
# change directory to anywhere you like; the verl source code directory is not recommended
wget https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda-repo-ubuntu2204-12-8-local_12.8.1-570.124.06-1_amd64.deb
dpkg -i cuda-repo-ubuntu2204-12-8-local_12.8.1-570.124.06-1_amd64.deb
cp /var/cuda-repo-ubuntu2204-12-8-local/cuda-*-keyring.gpg /usr/share/keyrings/
apt-get update
apt-get -y install cuda-toolkit-12-8
update-alternatives --set cuda /usr/local/cuda-12.8
cuDNN can be installed via the following command,
please refer to `NVIDIA's official website `_ for other versions of cuDNN.
.. code:: bash
# change directory to anywhere you like; the verl source code directory is not recommended
wget https://developer.download.nvidia.com/compute/cudnn/9.10.2/local_installers/cudnn-local-repo-ubuntu2204-9.10.2_1.0-1_amd64.deb
dpkg -i cudnn-local-repo-ubuntu2204-9.10.2_1.0-1_amd64.deb
cp /var/cudnn-local-repo-ubuntu2204-9.10.2/cudnn-*-keyring.gpg /usr/share/keyrings/
apt-get update
apt-get -y install cudnn-cuda-12
Install dependencies
::::::::::::::::::::
.. note::
We recommend to use a fresh new conda environment to install verl and its dependencies.
**Notice that the inference frameworks often strictly limit your pytorch version and will directly override your installed pytorch if not paying enough attention.**
As a countermeasure, it is recommended to install inference frameworks first with the pytorch version they need. For vLLM, if you hope to use your existing pytorch,
please follow their official instructions
`Use an existing PyTorch installation `_ .
1. First of all, to manage environment, we recommend using conda:
.. code:: bash
conda create -n verl python==3.12
conda activate verl
2. Then, execute the ``install.sh`` script that we provided in verl:
.. code:: bash
# Make sure you have activated verl conda env
# If you need to run with megatron
bash scripts/install_vllm_sglang_mcore.sh
# Or if you simply need to run with FSDP
USE_MEGATRON=0 bash scripts/install_vllm_sglang_mcore.sh
If you encounter errors in this step, please check the script and manually follow the steps in the script.
[Optional] NVIDIA Apex is recommended for Megatron-LM training, but it's not needed if you only use FSDP backend.
You can install it via the following command, but note that this step can take a very long time.
It is recommended to set the ``MAX_JOBS`` environment variable to accelerate the installation process,
but do not set it too large, otherwise the memory will be overloaded and your machines may hang.
.. code:: bash
# change directory to anywhere you like; the verl source code directory is not recommended
git clone https://github.com/NVIDIA/apex.git && \
cd apex && \
MAX_JOBS=32 pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
Install verl
::::::::::::
For installing the latest version of verl, the best way is to clone and
install it from source. Then you can modify our code to customize your
own post-training jobs.
.. code:: bash
git clone https://github.com/volcengine/verl.git
cd verl
pip install --no-deps -e .
Post-installation
:::::::::::::::::
Please make sure that the installed packages are not overridden during the installation of other packages.
The packages worth checking are:
- **torch** and torch series
- **vLLM**
- **SGLang**
- **pyarrow**
- **tensordict**
- **nvidia-cudnn-cu12**: For Megatron backend
If you encounter issues about package versions during running verl, please update the outdated ones.
Install with AMD GPUs - ROCM kernel support
------------------------------------------------------------------
When you run on AMD GPUs (MI300) with the ROCm platform, you cannot use the previous quickstart to run verl. Instead, follow the steps below to build a Docker image and run it.
If you encounter any issues in using AMD GPUs running verl, feel free to contact me - `Yusheng Su `_.
Find the docker for AMD ROCm: `docker/Dockerfile.rocm `_
::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
.. code-block:: bash
# Build the docker in the repo dir:
# docker build -f docker/Dockerfile.rocm -t verl-rocm:03.04.2015 .
# docker images # you can find your built docker
FROM rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
# Set working directory
# WORKDIR $PWD/app
# Set environment variables
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
# Install vllm
RUN pip uninstall -y vllm && \
rm -rf vllm && \
git clone -b v0.6.3 https://github.com/vllm-project/vllm.git && \
cd vllm && \
MAX_JOBS=$(nproc) python3 setup.py install && \
cd .. && \
rm -rf vllm
# Copy the entire project directory
COPY . .
# Install dependencies
RUN pip install "tensordict<0.6" --no-deps && \
pip install accelerate \
codetiming \
datasets \
dill \
hydra-core \
liger-kernel \
numpy \
pandas \
datasets \
peft \
"pyarrow>=15.0.0" \
pylatexenc \
"ray[data,train,tune,serve]" \
torchdata \
transformers \
wandb \
orjson \
pybind11 && \
pip install -e . --no-deps
Build the image
::::::::::::::::::::::::
.. code-block:: bash
docker build -t verl-rocm .
Launch the container
::::::::::::::::::::::::::::
.. code-block:: bash
docker run --rm -it \
--device /dev/dri \
--device /dev/kfd \
-p 8265:8265 \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME/.ssh:/root/.ssh \
-v $HOME:$HOME \
--shm-size 128G \
-w $PWD \
verl-rocm \
/bin/bash
If you do not want to run in root mode and need to run as your own user,
please add ``-e HOST_UID=$(id -u)`` and ``-e HOST_GID=$(id -g)`` to the above docker launch script.
verl with AMD GPUs currently supports FSDP as the training engine, vLLM and SGLang as the inference engine. We will support Megatron in the future.
================================================
FILE: docs/start/more_resources.rst
================================================
More Resources
==============
Last updated: 06/30/2025.
- Introduction to verl (`Slides `_)
- verl Code Walkthrough (`Slides `_, `Talk in Chinese `_)
================================================
FILE: docs/start/multinode.rst
================================================
Multinode Training
==================
Last updated: 06/10/2025.
.. _wuxibin89: https://github.com/wuxibin89
Author: `Xibin Wu `_, `Yusheng Su `_.
Option 1: Launch Manually
------------------------------
Set up multinode ray cluster
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1. Start the head node with ``ray start --head --dashboard-host=0.0.0.0``. There are two addresses you should care about:
- GCS address: ``ray start --address=<address>``, which the worker nodes should connect to.
- Dashboard address: ``<address>:8265``, where you should submit jobs to the cluster.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/head.png?raw=true
2. Start each worker node with the ``ray start --address=<address>`` command you got above.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/worker.png?raw=true
3. Now you should see that the cluster has 2 nodes with ``ray status``.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/status.png?raw=true
4. Additionally, you can access dashboard in the browser with the address you get above.
*Firewall rules may need to be configured to access the dashboard. If you have any trouble, please contact your network administrator.*
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/overview.png?raw=true
Submit job to ray cluster
~~~~~~~~~~~~~~~~~~~~~~~~~
1. Submit ray job to cluster with the dashboard address you get above.
.. code-block:: bash
ray job submit --address="http://127.0.0.1:8265" \
--runtime-env=verl/trainer/runtime_env.yaml \
--no-wait \
-- \
python3 -m verl.trainer.main_ppo \
trainer.n_gpus_per_node=8 \
trainer.nnodes=2 \
...
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/submit.png?raw=true
2. Then you can check the job status with the following commands:
- ray job list: list all jobs submitted to the cluster.
- ray job logs <Submission ID>: query the logs of the job.
- ray job status <Submission ID>: query the status of the job.
- ray job stop