gitextract_yy4wnsy3/

├── .gitignore
├── README.md
├── benchmarks/
│   ├── LICENSE
│   ├── README.md
│   ├── api/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── claude.py
│   │   ├── config/
│   │   │   └── llm_config.json
│   │   ├── deepseek.py
│   │   ├── example.py
│   │   └── gemini.py
│   ├── benchmark/
│   │   ├── __init__.py
│   │   ├── base_generator.py
│   │   ├── benchmark.py
│   │   ├── checkpoint_utils.py
│   │   ├── console.py
│   │   ├── generation_runner.py
│   │   ├── gpu_utils.py
│   │   └── tasks/
│   │       ├── __init__.py
│   │       ├── tasks.py
│   │       └── v1_0/
│   │           ├── __init__.py
│   │           ├── base_evaluator.py
│   │           ├── base_loader.py
│   │           ├── item_understand/
│   │           │   ├── __init__.py
│   │           │   ├── config.py
│   │           │   ├── evaluator.py
│   │           │   └── utils.py
│   │           ├── label_pred/
│   │           │   ├── __init__.py
│   │           │   ├── config.py
│   │           │   ├── evaluator.py
│   │           │   └── utils.py
│   │           ├── mfu_evaluator.py
│   │           ├── qwen3.jinja2
│   │           ├── qwen3_soft_switch.jinja2
│   │           ├── rec_reason/
│   │           │   ├── __init__.py
│   │           │   ├── config.py
│   │           │   ├── evaluator.py
│   │           │   └── utils.py
│   │           ├── recommendation/
│   │           │   ├── __init__.py
│   │           │   ├── config.py
│   │           │   ├── evaluator.py
│   │           │   ├── utils.py
│   │           │   └── utils_by_pid.py
│   │           └── registry.py
│   ├── eval_script.sh
│   ├── pyproject.toml
│   ├── requirements.txt
│   └── scripts/
│       ├── __init__.py
│       ├── eval_dev_results.py
│       ├── init_ray.sh
│       ├── init_ray_cluster.sh
│       └── ray-vllm/
│           ├── evaluate.py
│           └── utils/
│               ├── __init__.py
│               ├── arguments.py
│               └── generator.py
├── data/
│   ├── README.md
│   ├── general_text/
│   │   ├── pretrain.csv
│   │   └── sft.csv
│   ├── onerec_data/
│   │   ├── README.md
│   │   ├── pretrain/
│   │   │   ├── item_understand.py
│   │   │   ├── user_profile.py
│   │   │   └── video_rec.py
│   │   ├── run.sh
│   │   └── sft/
│   │       ├── ad_rec.py
│   │       ├── interactive_rec.py
│   │       ├── item_understand.py
│   │       ├── label_cond_rec.py
│   │       ├── label_pred.py
│   │       ├── product_rec.py
│   │       ├── rec_reason.py
│   │       └── video_rec.py
│   ├── prepare_distillation.sh
│   ├── prepare_pretrain.sh
│   ├── prepare_rl.sh
│   ├── prepare_sft.sh
│   └── scripts/
│       ├── parquet_unicode_fix.py
│       ├── sample_data.py
│       ├── split_data.py
│       └── train_test_split.py
├── pretrain/
│   ├── .gitignore
│   ├── README.md
│   ├── examples/
│   │   ├── dataset_config/
│   │   │   ├── pretrain.json
│   │   │   └── sft.json
│   │   ├── posttrain_sft.sh
│   │   ├── pretrain_stg1.sh
│   │   └── pretrain_stg2.sh
│   ├── onerec_llm/
│   │   ├── __init__.py
│   │   ├── data/
│   │   │   ├── __init__.py
│   │   │   ├── dataloaders.py
│   │   │   ├── local_shuffle_buffer.py
│   │   │   └── qwen3_dataset.py
│   │   ├── losses/
│   │   │   ├── __init__.py
│   │   │   └── ce.py
│   │   ├── models/
│   │   │   └── qwen3/
│   │   │       ├── __init__.py
│   │   │       ├── configuration_qwen3.py
│   │   │       ├── modeling_qwen3.py
│   │   │       └── modular_qwen3.py
│   │   ├── training/
│   │   │   ├── __init__.py
│   │   │   ├── activations.py
│   │   │   ├── checkpoint.py
│   │   │   ├── common.py
│   │   │   ├── distributed.py
│   │   │   ├── gradients.py
│   │   │   └── lr_schedulers.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── common.py
│   │       ├── data_utils.py
│   │       ├── distributed.py
│   │       ├── ds_utils.py
│   │       ├── mfu_stats.py
│   │       ├── time_tracker.py
│   │       └── worker_utils.py
│   ├── recipes/
│   │   └── train_qwen3.py
│   ├── scripts/
│   │   ├── convert_checkpoint_to_hf.sh
│   │   ├── expand_qwen3_vocab.sh
│   │   ├── killall.sh
│   │   ├── numa_runner.sh
│   │   ├── test_cases_example.json
│   │   └── test_hf_model.sh
│   ├── set_env.sh
│   ├── tests/
│   │   └── test_qwen3_dataset_file_distribution.py
│   └── tools/
│       ├── model_converter/
│       │   ├── convert_checkpoint_to_hf.py
│       │   └── expand_qwen3_vocab.py
│       └── model_test/
│           └── test_hf_model.py
├── tokenizer/
│   ├── README.md
│   ├── infer_res_kmeans.py
│   ├── res_kmeans.py
│   └── train_res_kmeans.py
├── verl_distillation/
│   ├── LICENSE
│   ├── README.md
│   ├── README_ORIGINAL.md
│   ├── deploy_env.sh
│   ├── docker/
│   │   ├── Apptainerfile.rocm
│   │   ├── Dockerfile.extention.awsefa
│   │   ├── Dockerfile.ngc.vllm
│   │   ├── Dockerfile.ngc.vllm0.8
│   │   ├── Dockerfile.ngc.vllm0.8.sagemaker
│   │   ├── Dockerfile.rocm
│   │   ├── Dockerfile.rocm7
│   │   ├── Dockerfile.rocm_verl-0.3.0.post1
│   │   ├── Dockerfile.rocm_verl-0.4.1
│   │   ├── Dockerfile.sglang
│   │   ├── Dockerfile.vemlp.vllm.te
│   │   ├── Dockerfile.vllm.sglang.megatron.deepseek
│   │   ├── README.md
│   │   ├── ascend/
│   │   │   ├── Dockerfile.ascend_8.2.rc1_a2
│   │   │   └── Dockerfile.ascend_8.2.rc1_a3
│   │   ├── verl0.4-cu124-torch2.6-fa2.7.4/
│   │   │   ├── Dockerfile.app.sglang.vllm.mcore0.12
│   │   │   ├── Dockerfile.app.sglang.vllm.mcore0.12.deepep
│   │   │   ├── Dockerfile.app.sglang.vllm.mcore0.13.preview
│   │   │   ├── Dockerfile.app.vllm.mcore0.12
│   │   │   ├── Dockerfile.app.vllm.mcore0.12.deepep
│   │   │   ├── Dockerfile.app.vllm.mcore0.13.preview
│   │   │   ├── Dockerfile.base
│   │   │   └── README.md
│   │   ├── verl0.5-cu126-torch2.7-fa2.7.4/
│   │   │   ├── Dockerfile.app.sglang0.4.10.post2.mcore0.13
│   │   │   ├── Dockerfile.app.sglang0.4.9.post6.mcore0.13
│   │   │   ├── Dockerfile.app.vllm.mcore0.13
│   │   │   ├── Dockerfile.app.vllm.mcore0.15
│   │   │   ├── Dockerfile.base.torch2.7.1
│   │   │   └── README.md
│   │   ├── verl0.5-cu126-torch2.7.1-fa2.8.0/
│   │   │   ├── Dockerfile.app.sglang.mcore0.12
│   │   │   ├── Dockerfile.app.sglang.mcore0.13.preview
│   │   │   ├── Dockerfile.base
│   │   │   └── README.md
│   │   ├── verl0.5-preview-cu128-torch2.7.1-fa2.8.0/
│   │   │   ├── Dockerfile.app.sglang.megatron
│   │   │   ├── Dockerfile.base
│   │   │   └── README.md
│   │   └── verl0.6-cu128-torch2.8.0-fa2.7.4/
│   │       ├── Dockerfile.app.sglang
│   │       ├── Dockerfile.base
│   │       └── Dockerfile.vllm011.mcore_gpt-oss
│   ├── docs/
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── README_vllm0.7.md
│   │   ├── README_vllm0.8.md
│   │   ├── _static/
│   │   │   ├── custom.css
│   │   │   └── js/
│   │   │       ├── resizable-sidebar.js
│   │   │       └── runllm-widget.js
│   │   ├── advance/
│   │   │   ├── agent_loop.rst
│   │   │   ├── attention_implementation.rst
│   │   │   ├── checkpoint.rst
│   │   │   ├── dpo_extension.rst
│   │   │   ├── fsdp_extension.rst
│   │   │   ├── fully_async.md
│   │   │   ├── megatron_extension.rst
│   │   │   ├── one_step_off.md
│   │   │   ├── placement.rst
│   │   │   ├── ppo_lora.rst
│   │   │   ├── reward_loop.rst
│   │   │   ├── rollout_is.md
│   │   │   ├── rollout_skip.rst
│   │   │   ├── rollout_trace.rst
│   │   │   └── rope.rst
│   │   ├── algo/
│   │   │   ├── baseline.md
│   │   │   ├── collabllm.md
│   │   │   ├── dapo.md
│   │   │   ├── entropy.md
│   │   │   ├── gpg.md
│   │   │   ├── grpo.md
│   │   │   ├── opo.md
│   │   │   ├── ppo.md
│   │   │   ├── spin.md
│   │   │   └── sppo.md
│   │   ├── amd_tutorial/
│   │   │   ├── amd_build_dockerfile_page.rst
│   │   │   └── amd_vllm_page.rst
│   │   ├── api/
│   │   │   ├── data.rst
│   │   │   ├── single_controller.rst
│   │   │   ├── trainer.rst
│   │   │   └── utils.rst
│   │   ├── ascend_tutorial/
│   │   │   ├── ascend_profiling_en.rst
│   │   │   ├── ascend_profiling_zh.rst
│   │   │   ├── ascend_quick_start.rst
│   │   │   ├── ascend_sglang_quick_start.rst
│   │   │   └── dockerfile_build_guidance.rst
│   │   ├── conf.py
│   │   ├── data/
│   │   │   └── transfer_queue.md
│   │   ├── examples/
│   │   │   ├── config.rst
│   │   │   ├── gsm8k_example.rst
│   │   │   ├── multi_modal_example.rst
│   │   │   ├── ppo_code_architecture.rst
│   │   │   ├── sandbox_fusion_example.rst
│   │   │   └── skypilot_examples.rst
│   │   ├── faq/
│   │   │   └── faq.rst
│   │   ├── hybrid_flow.rst
│   │   ├── index.rst
│   │   ├── perf/
│   │   │   ├── best_practices.rst
│   │   │   ├── device_tuning.rst
│   │   │   ├── dpsk.md
│   │   │   ├── nsight_profiling.md
│   │   │   ├── perf_tuning.rst
│   │   │   └── verl_profiler_system.md
│   │   ├── preparation/
│   │   │   ├── prepare_data.rst
│   │   │   └── reward_function.rst
│   │   ├── requirements-docs.txt
│   │   ├── sglang_multiturn/
│   │   │   ├── interaction_system.rst
│   │   │   ├── multiturn.rst
│   │   │   ├── sandbox_fusion.rst
│   │   │   └── search_tool_example.rst
│   │   ├── single_controller.rst
│   │   ├── start/
│   │   │   ├── agentic_rl.rst
│   │   │   ├── install.rst
│   │   │   ├── more_resources.rst
│   │   │   ├── multinode.rst
│   │   │   ├── quickstart.rst
│   │   │   └── ray_debug_tutorial.rst
│   │   └── workers/
│   │       ├── fsdp_workers.rst
│   │       ├── megatron_workers.rst
│   │       ├── model_engine.rst
│   │       ├── ray_trainer.rst
│   │       └── sglang_worker.rst
│   ├── examples/
│   │   ├── data_preprocess/
│   │   │   ├── aime2024_multiturn_w_tool.py
│   │   │   ├── dapo_multiturn_w_tool.py
│   │   │   ├── full_hh_rlhf.py
│   │   │   ├── geo3k.py
│   │   │   ├── geo3k_multiturn_w_tool.py
│   │   │   ├── gsm8k.py
│   │   │   ├── gsm8k_multiturn_sft.py
│   │   │   ├── gsm8k_multiturn_w_interaction.py
│   │   │   ├── gsm8k_multiturn_w_tool.py
│   │   │   ├── gsm8k_tool_agent_loop.py
│   │   │   ├── hellaswag.py
│   │   │   ├── math_dataset.py
│   │   │   ├── multiturn.py
│   │   │   └── preprocess_search_r1_dataset.py
│   │   ├── generation/
│   │   │   ├── run_deepseek7b_mutli_node.sh
│   │   │   └── run_deepseek_v2_lite_math.sh
│   │   ├── gmpo_trainer/
│   │   │   ├── README.md
│   │   │   ├── run_qwen2_5-7b_math.sh
│   │   │   ├── test_dapo_7b_math.sh
│   │   │   └── test_dapo_qwen3_30b_math.sh
│   │   ├── gpg_trainer/
│   │   │   ├── gpg.md
│   │   │   ├── run_qwen2-7b_math.sh
│   │   │   └── run_qwen2-7b_math_megatron.sh
│   │   ├── grpo_trainer/
│   │   │   ├── README.md
│   │   │   ├── run_deepseek671b_math_megatron_80gb.sh
│   │   │   ├── run_deepseek671b_math_megatron_96gb.sh
│   │   │   ├── run_deepseek7b_llm.sh
│   │   │   ├── run_deepseek7b_llm_math.sh
│   │   │   ├── run_deepseek7b_llm_math_megatron.sh
│   │   │   ├── run_deepseek7b_llm_seq_balance.sh
│   │   │   ├── run_glm41v_9b.sh
│   │   │   ├── run_gptoss_20b.sh
│   │   │   ├── run_minicpmo2_6.sh
│   │   │   ├── run_mistral13b_skyworkrm_hhrlhf.sh
│   │   │   ├── run_moonlight16b_math_megatron.sh
│   │   │   ├── run_qwen2-7b.sh
│   │   │   ├── run_qwen2-7b_math.sh
│   │   │   ├── run_qwen2-7b_math_megatron.sh
│   │   │   ├── run_qwen2-7b_seq_balance.sh
│   │   │   ├── run_qwen2-7b_seq_balance_math_megatron.sh
│   │   │   ├── run_qwen2-7b_sgl_megatron.sh
│   │   │   ├── run_qwen2_5-3b_gsm8k_grpo_lora.sh
│   │   │   ├── run_qwen2_5-3b_gsm8k_grpo_lora_from_adapter.sh
│   │   │   ├── run_qwen2_5-7b_math_megatron_diff_tp.sh
│   │   │   ├── run_qwen2_5_32b_grpo_npu.sh
│   │   │   ├── run_qwen2_5_7b_grpo_discrete_prof_npu.sh
│   │   │   ├── run_qwen2_5_7b_grpo_e2e_prof_npu.sh
│   │   │   ├── run_qwen2_5_7b_grpo_npu.sh
│   │   │   ├── run_qwen2_5_vl-7b-megatron.sh
│   │   │   ├── run_qwen2_5_vl-7b-sglang.sh
│   │   │   ├── run_qwen2_5_vl-7b.sh
│   │   │   ├── run_qwen2_5_vl-7b_freeze_vision.sh
│   │   │   ├── run_qwen2_5_vl-7b_lora.sh
│   │   │   ├── run_qwen2_5_vl-7b_seq_balance.sh
│   │   │   ├── run_qwen2_5_vl_32b_npu.sh
│   │   │   ├── run_qwen2_5_vl_3b_npu.sh
│   │   │   ├── run_qwen2_5_vl_7b_npu.sh
│   │   │   ├── run_qwen3-235b_megatron_96gb.sh
│   │   │   ├── run_qwen3-32b_npu.sh
│   │   │   ├── run_qwen3-8b.sh
│   │   │   ├── run_qwen3-8b_npu.sh
│   │   │   ├── run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh
│   │   │   ├── run_qwen3_8b_grpo_sglang_32k_spmd_npu.sh
│   │   │   ├── run_qwen3_vl-235b-megatron.sh
│   │   │   ├── run_qwen3_vl-30b-megatron.sh
│   │   │   ├── run_qwen3_vl-8b-megatron.sh
│   │   │   ├── run_qwen3moe-30b_megatron_96gb.sh
│   │   │   └── run_seed_oss_36b.sh
│   │   ├── ppo_trainer/
│   │   │   ├── README.md
│   │   │   ├── run_deepseek7b_llm.sh
│   │   │   ├── run_deepseek7b_llm_modelscope.sh
│   │   │   ├── run_deepseek7b_llm_pfppo.sh
│   │   │   ├── run_deepseek7b_llm_sandbox_fusion.sh
│   │   │   ├── run_deepseek7b_llm_sp2.sh
│   │   │   ├── run_deepseek_full_hh_rlhf.sh
│   │   │   ├── run_deepseek_math_gsm8k_megatron.sh
│   │   │   ├── run_deepseek_math_gsm8k_megatron_nsys.sh
│   │   │   ├── run_gemma.sh
│   │   │   ├── run_moonlight16b_a3b_gsm8k_megatron.sh
│   │   │   ├── run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh
│   │   │   ├── run_qwen2-7b_math_gsm8k_megatron.sh
│   │   │   ├── run_qwen2-7b_rm.sh
│   │   │   ├── run_qwen2-7b_rm_seq_balance.sh
│   │   │   ├── run_qwen2-7b_rm_seq_balance_fused_kernels.sh
│   │   │   ├── run_qwen2-7b_rm_seq_balance_nsys.sh
│   │   │   ├── run_qwen2-7b_seq_balance.sh
│   │   │   ├── run_qwen2-7b_sglang_seq_balance.sh
│   │   │   ├── run_qwen2.5-32b.sh
│   │   │   └── run_qwen3-8b_npu.sh
│   │   ├── ray/
│   │   │   └── tutorial.ipynb
│   │   ├── reinforce_plus_plus_trainer/
│   │   │   ├── run_qwen2-7b_math_rf.sh
│   │   │   └── run_qwen2-7b_math_rf_baseline.sh
│   │   ├── remax_trainer/
│   │   │   ├── run_qwen2.5-3b_seq_balance.sh
│   │   │   └── run_qwen2.5-7b_seq_balance.sh
│   │   ├── rloo_trainer/
│   │   │   └── run_qwen2-7b.sh
│   │   ├── rollout_importance_sampling/
│   │   │   ├── README.md
│   │   │   └── run_with_rollout_is.sh
│   │   ├── sft/
│   │   │   ├── gsm8k/
│   │   │   │   ├── run_deepseek_6b7.sh
│   │   │   │   ├── run_gemma_2b.sh
│   │   │   │   ├── run_gemma_7b.sh
│   │   │   │   ├── run_qwen3_8b_sft_peft_sp2_npu.sh
│   │   │   │   ├── run_qwen_05_peft.sh
│   │   │   │   ├── run_qwen_05_sp2.sh
│   │   │   │   ├── run_qwen_05_sp2_liger.sh
│   │   │   │   └── run_seed_oss_36b_sft.sh
│   │   │   └── multiturn/
│   │   │       └── run_qwen_05_sp2.sh
│   │   ├── sglang_multiturn/
│   │   │   ├── README.md
│   │   │   ├── config/
│   │   │   │   ├── geo3k_multiturn_grpo.yaml
│   │   │   │   ├── geo3k_multiturn_megatron_grpo.yaml
│   │   │   │   ├── gsm8k_multiturn_grpo.yaml
│   │   │   │   ├── gsm8k_multiturn_grpo_server.yaml
│   │   │   │   ├── gsm8k_multiturn_grpo_w_interaction.yaml
│   │   │   │   ├── gsm8k_multiturn_megatron_grpo.yaml
│   │   │   │   ├── interaction_config/
│   │   │   │   │   └── gsm8k_interaction_config.yaml
│   │   │   │   ├── retool_multiturn_grpo.yaml
│   │   │   │   ├── search_multiturn_grpo.yaml
│   │   │   │   ├── search_multiturn_grpo_one_step_off.yaml
│   │   │   │   └── tool_config/
│   │   │   │       ├── geo3k_tool_config.yaml
│   │   │   │       ├── gsm8k_tool_config.yaml
│   │   │   │       ├── mcp_server.json
│   │   │   │       ├── mcp_tool_config.yaml
│   │   │   │       ├── sandbox_fusion_tool_config.yaml
│   │   │   │       └── search_tool_config.yaml
│   │   │   ├── geo3k/
│   │   │   │   ├── run_qwen2.5-3b_geo3k_multiturn.sh
│   │   │   │   ├── run_qwen2.5-3b_geo3k_multiturn_4xgpu.sh
│   │   │   │   └── run_qwen2.5-3b_megatron_geo3k_multiturn.sh
│   │   │   ├── run_qwen0.5b_gsm8k_multiturn_curriculum.sh
│   │   │   ├── run_qwen2.5-0.5b_gsm8k_multiturn_w_interaction.sh
│   │   │   ├── run_qwen2.5-3b_gsm8k_multiturn.sh
│   │   │   ├── run_qwen2.5-3b_gsm8k_multiturn_4xgpu.sh
│   │   │   ├── run_qwen2.5-3b_gsm8k_multiturn_4xgpu_server.sh
│   │   │   ├── run_qwen2.5-3b_gsm8k_multiturn_server.sh
│   │   │   ├── run_qwen2.5-3b_gsm8k_multiturn_vllm_fsdp.sh
│   │   │   ├── run_qwen2.5-3b_gsm8k_tool_agent_mlflow.sh
│   │   │   ├── run_qwen2.5-3b_megatron_gsm8k_multiturn.sh
│   │   │   ├── run_qwen3-4b_gsm8k_multiturn.sh
│   │   │   ├── run_qwen3_4b_dapo_multiturn.sh
│   │   │   └── search_r1_like/
│   │   │       ├── local_dense_retriever/
│   │   │       │   ├── download.py
│   │   │       │   └── retrieval_server.py
│   │   │       └── run_qwen2.5-3b_instruct_search_multiturn.sh
│   │   ├── skypilot/
│   │   │   ├── README.md
│   │   │   ├── verl-grpo.yaml
│   │   │   ├── verl-multiturn-tools.yaml
│   │   │   └── verl-ppo.yaml
│   │   ├── slurm/
│   │   │   └── ray_on_slurm.slurm
│   │   ├── split_placement/
│   │   │   ├── README.md
│   │   │   ├── config/
│   │   │   │   └── ppo_trainer_split.yaml
│   │   │   ├── main_ppo_split.py
│   │   │   ├── run_deepseek7b_llm.sh
│   │   │   └── split_monkey_patch.py
│   │   ├── tuning/
│   │   │   ├── 0.5b/
│   │   │   │   └── qwen2-0.5b_grpo-lora_1_h100_fsdp_vllm.sh
│   │   │   ├── 1.5b/
│   │   │   │   └── qwen2-1.5b_grpo-lora_1_h100_fsdp_vllm.sh
│   │   │   ├── 14b/
│   │   │   │   ├── qwen2-14b_grpo-lora_2_h100_fsdp_vllm.sh
│   │   │   │   └── qwen2_14b_grpo_4_h800_fsdp_vllm.sh
│   │   │   ├── 32b/
│   │   │   │   ├── qwen2-32b_grpo-lora_4_h100_fsdp_vllm.sh
│   │   │   │   └── qwen2_32B_grpo_8_h20_megatron_vllm.sh
│   │   │   ├── 3b/
│   │   │   │   └── qwen2-3b_grpo-lora_1_h100_fsdp_vllm.sh
│   │   │   ├── 70b/
│   │   │   │   ├── qwen2-70b_grpo_32_h20_fsdp_vllm.sh
│   │   │   │   ├── qwen2-70b_grpo_32_h800_fsdp_vllm.sh
│   │   │   │   └── qwen2-72b_grpo-lora_8_h100_fsdp_vllm.sh
│   │   │   └── 7b/
│   │   │       ├── qwen2-7b_grpo-lora_1_h100_fsdp_vllm.sh
│   │   │       └── qwen2-7b_grpo_2_h800_fsdp_vllm.sh
│   │   └── tutorial/
│   │       └── agent_loop_get_started/
│   │           ├── agent_loop_tutorial.ipynb
│   │           └── sandbox.py
│   ├── init_ray.sh
│   ├── init_ray_cluster.sh
│   ├── pyproject.toml
│   ├── recipe/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── char_count/
│   │   │   ├── README.md
│   │   │   ├── create_dataset.py
│   │   │   ├── reward_function.py
│   │   │   ├── train_grpo.sh
│   │   │   └── train_sft.sh
│   │   ├── collabllm/
│   │   │   ├── README.md
│   │   │   ├── collabllm_agent_loop.py
│   │   │   ├── collabllm_interation.py
│   │   │   ├── config/
│   │   │   │   ├── agent.yaml
│   │   │   │   └── collabllm_interaction_config.yaml
│   │   │   ├── metrics/
│   │   │   │   ├── accuracy.py
│   │   │   │   ├── bleu_score.py
│   │   │   │   ├── interactivity.py
│   │   │   │   ├── pass_rate.py
│   │   │   │   └── token_amount.py
│   │   │   ├── process_dataset.py
│   │   │   ├── reward_function.py
│   │   │   ├── train_rl_collabllm.sh
│   │   │   ├── train_sft_collabllm.sh
│   │   │   └── utils.py
│   │   ├── dapo/
│   │   │   ├── README.md
│   │   │   ├── config/
│   │   │   │   ├── dapo_megatron_trainer.yaml
│   │   │   │   └── dapo_trainer.yaml
│   │   │   ├── dapo_ray_trainer.py
│   │   │   ├── main_dapo.py
│   │   │   ├── prepare_dapo_data.sh
│   │   │   ├── run_dapo_early_qwen2.5_32b.sh
│   │   │   ├── run_dapo_qwen2.5_32b.sh
│   │   │   ├── run_dapo_qwen2.5_32b_npu.sh
│   │   │   ├── run_dapo_qwen2.5_32b_rollout_is.sh
│   │   │   ├── run_dapo_qwen2.5_7b_npu.sh
│   │   │   ├── run_dapo_qwen3_14b_base_npu.sh
│   │   │   ├── run_dapo_qwen3_8b_base_npu.sh
│   │   │   ├── run_dapo_qwen3_moe_30b_base_fsdp_npu.sh
│   │   │   ├── run_dapo_qwen3_moe_30b_megatron_npu.sh
│   │   │   ├── run_dapo_wo_ds_qwen2.5_32b.sh
│   │   │   ├── runtime_env.yaml
│   │   │   ├── test_dapo_7b.sh
│   │   │   ├── test_dapo_7b_math.sh
│   │   │   ├── test_dapo_7b_math_lora.sh
│   │   │   ├── test_dapo_7b_math_megatron.sh
│   │   │   ├── test_dapo_dspk_671b_megatron_96gb.sh
│   │   │   ├── test_dapo_glm_air_megatron.sh
│   │   │   ├── test_dapo_qwen3_30b_math.sh
│   │   │   └── test_dapo_qwen3_30b_math_single_node.sh
│   │   ├── deepeyes/
│   │   │   ├── README.md
│   │   │   ├── configs/
│   │   │   │   ├── deepeyes_multiturn_grpo.yaml
│   │   │   │   └── image_zoom_in_tool_config.yaml
│   │   │   ├── deepeyes.py
│   │   │   └── run_deepeyes_grpo.sh
│   │   ├── entropy/
│   │   │   ├── 32b_clip_cov.sh
│   │   │   ├── 32b_kl_cov.sh
│   │   │   ├── 32b_kl_cov_mininbsz.sh
│   │   │   ├── 7b_clip_cov.sh
│   │   │   ├── 7b_kl_cov.sh
│   │   │   ├── README.md
│   │   │   ├── config/
│   │   │   │   └── entropy_trainer.yaml
│   │   │   ├── entropy_ray_trainer.py
│   │   │   ├── main_entropy.py
│   │   │   ├── reward.py
│   │   │   └── reward_score/
│   │   │       ├── __init__.py
│   │   │       └── entropy_math/
│   │   │           ├── __init__.py
│   │   │           ├── grader.py
│   │   │           └── math_normalize.py
│   │   ├── fapo/
│   │   │   ├── README.md
│   │   │   ├── config/
│   │   │   │   └── rm_config.yaml
│   │   │   ├── prepare_fapo_data.py
│   │   │   ├── reward_fn_genrm.py
│   │   │   ├── reward_fn_reasoning.py
│   │   │   ├── reward_fn_reasoning_remote.py
│   │   │   ├── run_baseline_32b.sh
│   │   │   ├── run_baseline_7b.sh
│   │   │   ├── run_fapo_32b.sh
│   │   │   ├── run_fapo_32b_remote.sh
│   │   │   ├── run_fapo_7b.sh
│   │   │   ├── run_fapo_7b_remote.sh
│   │   │   ├── run_fapo_genrm_train.sh
│   │   │   └── runtime_env.yaml
│   │   ├── fully_async_policy/
│   │   │   ├── README.md
│   │   │   ├── README_zh.md
│   │   │   ├── agent_loop/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── agent_loop.py
│   │   │   │   └── partial_single_turn_agent_loop.py
│   │   │   ├── config/
│   │   │   │   ├── fully_async_ppo_megatron_trainer.yaml
│   │   │   │   └── fully_async_ppo_trainer.yaml
│   │   │   ├── detach_utils.py
│   │   │   ├── fsdp2_utils.py
│   │   │   ├── fsdp_workers.py
│   │   │   ├── fully_async_main.py
│   │   │   ├── fully_async_rollouter.py
│   │   │   ├── fully_async_trainer.py
│   │   │   ├── megatron_worker.py
│   │   │   ├── message_queue.py
│   │   │   ├── param_sync.py
│   │   │   ├── ray_trainer.py
│   │   │   ├── shell/
│   │   │   │   ├── dapo_7b_math_fsdp2_16_16.sh
│   │   │   │   ├── dapo_7b_math_fsdp2_32_32.sh
│   │   │   │   ├── dapo_7b_math_fsdp2_4_12.sh
│   │   │   │   ├── dapo_7b_math_fsdp2_4_4.sh
│   │   │   │   ├── dapo_7b_math_fsdp2_64_64.sh
│   │   │   │   ├── dapo_7b_math_fsdp2_64_64_mis.sh
│   │   │   │   ├── dapo_7b_math_fsdp2_8_8.sh
│   │   │   │   ├── geo3k_qwen25vl_7b_megatron_4_4.sh
│   │   │   │   └── runtime_env.yaml
│   │   │   ├── unittest/
│   │   │   │   └── simple_streaming_demo.py
│   │   │   └── vllm_rollout/
│   │   │       ├── __init__.py
│   │   │       └── vllm_async_server.py
│   │   ├── genrm_remote/
│   │   │   ├── README.md
│   │   │   ├── reward_function.py
│   │   │   └── run_genrm_remote.sh
│   │   ├── gspo/
│   │   │   ├── test_gspo_3b_math.sh
│   │   │   ├── test_gspo_3b_math_slurm.sh
│   │   │   └── test_gspo_qwen30b_a3b_ep.sh
│   │   ├── infigui-g1/
│   │   │   ├── README.md
│   │   │   ├── reward_fn.py
│   │   │   ├── run_3b.sh
│   │   │   └── run_7b.sh
│   │   ├── langgraph_agent/
│   │   │   ├── __init__.py
│   │   │   ├── chat_model.py
│   │   │   ├── example/
│   │   │   │   ├── README.md
│   │   │   │   ├── agent.yaml
│   │   │   │   ├── create_dataset.py
│   │   │   │   ├── math_expression.py
│   │   │   │   ├── run_gpt_oss_20b_bf16.sh
│   │   │   │   └── run_qwen2.5_3b.sh
│   │   │   ├── react_agent_loop.py
│   │   │   └── test_react_agent_loop.py
│   │   ├── minicpmo/
│   │   │   └── rl_dataset.py
│   │   ├── one_step_off_policy/
│   │   │   ├── README.md
│   │   │   ├── config/
│   │   │   │   ├── one_step_off_ppo_megatron_trainer.yaml
│   │   │   │   └── one_step_off_ppo_trainer.yaml
│   │   │   ├── dapo_7b_math_fsdp2_4_12.sh
│   │   │   ├── dapo_7b_math_fsdp2_colocate.sh
│   │   │   ├── dapo_7b_math_fsdp2_sglang_4_12.sh
│   │   │   ├── dapo_7b_math_fsdp2_sglang_colocate.sh
│   │   │   ├── dapo_7b_math_megatron_4_12.sh
│   │   │   ├── dapo_7b_math_megatron_colocate.sh
│   │   │   ├── distributed_util.py
│   │   │   ├── fsdp_workers.py
│   │   │   ├── grpo_0.6b_gsm8k_fsdp2_2_6.sh
│   │   │   ├── grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh
│   │   │   ├── grpo_3b_gsm8k_fsdp2_2_6.sh
│   │   │   ├── main_ppo.py
│   │   │   ├── megatron_workers.py
│   │   │   ├── ray_trainer.py
│   │   │   ├── sglang_sharding_manager.py
│   │   │   ├── utils.py
│   │   │   └── vllm_sharding_manager.py
│   │   ├── onpolicy_distill/
│   │   │   ├── __init__.py
│   │   │   ├── config/
│   │   │   │   └── onpolicy_distill_trainer.yaml
│   │   │   ├── main_onpolicy_distill.py
│   │   │   ├── onpolicy_distill_trainer.py
│   │   │   └── run_qwen3_distill.sh
│   │   ├── open_math_reasoning/
│   │   │   ├── README.md
│   │   │   ├── compute_score.py
│   │   │   ├── prepare_eval_dataset.py
│   │   │   ├── prepare_nvidia-OpenMathReasoning_sft.py
│   │   │   ├── run_eval.sh
│   │   │   ├── run_generation.sh
│   │   │   └── run_sft_qwen3_8b.sh
│   │   ├── prime/
│   │   │   ├── __init__.py
│   │   │   ├── config/
│   │   │   │   └── prime_trainer.yaml
│   │   │   ├── main_prime.py
│   │   │   ├── prime_core_algos.py
│   │   │   ├── prime_dp_rm.py
│   │   │   ├── prime_fsdp_workers.py
│   │   │   ├── prime_ray_trainer.py
│   │   │   ├── run_prime_qwen.sh
│   │   │   └── run_prime_qwen_code.sh
│   │   ├── r1/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── config/
│   │   │   │   └── evaluation.yaml
│   │   │   ├── data_process.py
│   │   │   ├── main_eval.py
│   │   │   ├── reward_score.py
│   │   │   ├── run_r1_distill_qwen.sh
│   │   │   └── tasks/
│   │   │       ├── __init__.py
│   │   │       ├── gpqa.py
│   │   │       ├── livecodebench.py
│   │   │       └── math_reward.py
│   │   ├── retool/
│   │   │   ├── README.md
│   │   │   ├── retool.py
│   │   │   ├── retool_sft_preprocess.py
│   │   │   ├── run_gpt_oss_ppo.sh
│   │   │   ├── run_qwen2-32b_dapo.sh
│   │   │   ├── run_qwen2-32b_ppo.sh
│   │   │   ├── run_qwen2-32b_sft.sh
│   │   │   ├── run_qwen2_7b_dapo.sh
│   │   │   ├── run_qwen2_7b_sft.sh
│   │   │   ├── run_qwen2_7b_sft_npu.sh
│   │   │   └── sandbox_fusion_tool_config.yaml
│   │   ├── spin/
│   │   │   ├── README.md
│   │   │   ├── config/
│   │   │   │   └── spin_trainer.yaml
│   │   │   ├── core_algos.py
│   │   │   ├── dp_actor.py
│   │   │   ├── fsdp_workers.py
│   │   │   ├── main_spin.py
│   │   │   ├── run_spin.sh
│   │   │   ├── spin_trainer.py
│   │   │   └── utils.py
│   │   ├── sppo/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── config/
│   │   │   │   └── sppo_trainer.yaml
│   │   │   ├── config.py
│   │   │   ├── dp_actor.py
│   │   │   ├── main_sppo.py
│   │   │   ├── run_qwen2.5-7b_rm.sh
│   │   │   ├── sppo_ray_trainer.py
│   │   │   └── sppo_worker.py
│   │   └── transfer_queue/
│   │       ├── agent_loop.py
│   │       ├── config/
│   │       │   └── transfer_queue_ppo_trainer.yaml
│   │       ├── main_ppo.py
│   │       ├── ray_trainer.py
│   │       └── run_qwen3-8b_transferqueue_npu.sh
│   ├── requirements-cuda.txt
│   ├── requirements-npu.txt
│   ├── requirements.txt
│   ├── requirements_sglang.txt
│   ├── requirements_transferqueue.txt
│   ├── scripts/
│   │   ├── __init__.py
│   │   ├── converter_hf_to_mcore.py
│   │   ├── diagnose.py
│   │   ├── generate_trainer_config.sh
│   │   ├── init_random_model.py
│   │   ├── install_vllm_sglang_mcore.sh
│   │   ├── legacy_model_merger.py
│   │   ├── print_cfg.py
│   │   └── rollout_viewer.py
│   ├── setup.py
│   ├── tests/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── experimental/
│   │   │   ├── agent_loop/
│   │   │   │   ├── agent_utils.py
│   │   │   │   ├── qwen_vl_tool_chat_template.jinja2
│   │   │   │   ├── test_agent_loop_reward.py
│   │   │   │   ├── test_agent_loop_reward_model.py
│   │   │   │   ├── test_basic_agent_loop.py
│   │   │   │   ├── test_gpt_oss_tool_parser.py
│   │   │   │   ├── test_multi_modal.py
│   │   │   │   └── test_standalone_rollout.py
│   │   │   └── reward/
│   │   │       ├── reward_fn.py
│   │   │       ├── test_agent_loop_reward_manager.py
│   │   │       └── test_reward_model.py
│   │   ├── interactions/
│   │   │   ├── __init__.py
│   │   │   ├── test_gsm8k_interaction.py
│   │   │   └── test_interaction_registry.py
│   │   ├── kill_github_tests.sh
│   │   ├── models/
│   │   │   ├── test_engine.py
│   │   │   ├── test_transformer.py
│   │   │   └── test_transformers_ulysses.py
│   │   ├── single_controller/
│   │   │   ├── __init__.py
│   │   │   ├── base/
│   │   │   │   └── test_decorator.py
│   │   │   ├── check_worker_alive/
│   │   │   │   └── main.py
│   │   │   ├── detached_worker/
│   │   │   │   ├── README.md
│   │   │   │   ├── client.py
│   │   │   │   ├── run.sh
│   │   │   │   └── server.py
│   │   │   ├── test_auto_padding_on_cpu.py
│   │   │   ├── test_colocated_workers.py
│   │   │   ├── test_colocated_workers_fused.py
│   │   │   ├── test_data_transfer.py
│   │   │   ├── test_decorator_on_cpu.py
│   │   │   ├── test_device_mesh_register.py
│   │   │   ├── test_driverfunc_to_worker.py
│   │   │   ├── test_fused_workers_on_cpu.py
│   │   │   ├── test_high_level_scheduling_api.py
│   │   │   ├── test_nested_worker.py
│   │   │   ├── test_ray_collectives.py
│   │   │   ├── test_ray_local_envs_on_cpu.py
│   │   │   ├── test_ray_utils_on_cpu.py
│   │   │   ├── test_rvdz.py
│   │   │   ├── test_worker_group_basics.py
│   │   │   └── test_worker_group_torch.py
│   │   ├── special_distributed/
│   │   │   ├── README.md
│   │   │   ├── run_all.sh
│   │   │   ├── test_fsdp_ckpt.py
│   │   │   ├── test_mcore_config_converter.py
│   │   │   └── test_tensor_dict.py
│   │   ├── special_e2e/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── check_custom_rwd_fn.py
│   │   │   ├── check_results.py
│   │   │   ├── envs/
│   │   │   │   ├── __init__.py
│   │   │   │   └── digit_completion/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── task.py
│   │   │   │       └── tokenizer.py
│   │   │   ├── generation/
│   │   │   │   ├── run_gen_qwen05.sh
│   │   │   │   └── run_gen_qwen05_server.sh
│   │   │   ├── ppo_trainer/
│   │   │   │   ├── expert_parallel/
│   │   │   │   │   └── qwen2moe_minimal.json
│   │   │   │   ├── run_function_reward.sh
│   │   │   │   ├── run_model_reward.sh
│   │   │   │   ├── run_single_gpu.sh
│   │   │   │   └── run_single_gpu_with_engine.sh
│   │   │   ├── run_dapo.sh
│   │   │   ├── run_fully_async_policy.sh
│   │   │   ├── run_genrm_remote.sh
│   │   │   ├── run_geo3k_fsdp_sgl_multiturn_w_tool.sh
│   │   │   ├── run_grpo_lora_with_merge.sh
│   │   │   ├── run_gsm8k_fsdp_sgl_multiturn_sf_tool.sh
│   │   │   ├── run_gsm8k_fsdp_sgl_multiturn_w_tool.sh
│   │   │   ├── run_one_step_off_policy.sh
│   │   │   ├── run_ppo_trainer_megatron.sh
│   │   │   ├── run_prime.sh
│   │   │   ├── run_r1_distill_qwen_aime24_eval.sh
│   │   │   ├── run_spin.sh
│   │   │   ├── run_sppo.sh
│   │   │   ├── run_test.sh
│   │   │   └── sft/
│   │   │       ├── compare_sft_engine_results.py
│   │   │       ├── run_sft.sh
│   │   │       ├── run_sft_engine_gsm8k.sh
│   │   │       ├── test_sft_engine_all.sh
│   │   │       └── test_sp_loss_match.py
│   │   ├── special_npu/
│   │   │   ├── run_qwen2_5_05b_dapo.sh
│   │   │   ├── run_qwen2_5_05b_grpo.sh
│   │   │   ├── run_qwen2_5_05b_grpo_mindspeed.sh
│   │   │   ├── run_qwen2_5_05b_sft_peft_sp2.sh
│   │   │   ├── run_qwen2_5_vl_3b_npu.sh
│   │   │   └── run_qwen3_06b_ppo.sh
│   │   ├── special_sanity/
│   │   │   ├── check_api_docs.py
│   │   │   ├── check_dataproto_usage.py
│   │   │   ├── check_device_api_usage.py
│   │   │   ├── check_docs_time_info.py
│   │   │   ├── check_docstrings.py
│   │   │   ├── check_license.py
│   │   │   ├── check_pr_description.py
│   │   │   ├── check_pr_title.py
│   │   │   ├── test_config_docs.py
│   │   │   ├── test_import.py
│   │   │   ├── type_coverage_check.py
│   │   │   ├── validate_imported_docs.py
│   │   │   └── validate_structure.py
│   │   ├── special_standalone/
│   │   │   ├── README.md
│   │   │   └── test_memory_buffers.py
│   │   ├── test_base_config_on_cpu.py
│   │   ├── test_protocol_on_cpu.py
│   │   ├── test_protocol_v2_on_cpu.py
│   │   ├── trainer/
│   │   │   ├── __init__.py
│   │   │   ├── config/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── legacy_ppo_megatron_trainer.yaml
│   │   │   │   ├── legacy_ppo_trainer.yaml
│   │   │   │   ├── test_algo_config_on_cpu.py
│   │   │   │   └── test_legacy_config_on_cpu.py
│   │   │   └── ppo/
│   │   │       ├── __init__.py
│   │   │       ├── test_core_algos_on_cpu.py
│   │   │       ├── test_metric_utils_on_cpu.py
│   │   │       ├── test_rollout_is.py
│   │   │       └── test_rollout_is_integration.py
│   │   ├── utils/
│   │   │   ├── _test_module.py
│   │   │   ├── dataset/
│   │   │   │   ├── test_create_rl_sampler_on_cpu.py
│   │   │   │   ├── test_multiturn_sft_dataset_on_cpu.py
│   │   │   │   ├── test_rl_collate_fn_on_cpu.py
│   │   │   │   ├── test_rl_dataset_on_cpu.py
│   │   │   │   └── test_sft_dataset_on_cpu.py
│   │   │   ├── debug/
│   │   │   │   └── test_metrics.py
│   │   │   ├── megatron/
│   │   │   │   └── test_pipeline_parallel.py
│   │   │   ├── reward_score/
│   │   │   │   ├── reward_score/
│   │   │   │   │   └── test_sandbox_fusion_on_cpu.py
│   │   │   │   └── test_sandbox_on_cpu.py
│   │   │   ├── test_activation_offload.py
│   │   │   ├── test_config_on_cpu.py
│   │   │   ├── test_flops_counter.py
│   │   │   ├── test_fs_on_cpu.py
│   │   │   ├── test_groupwise.py
│   │   │   ├── test_import_utils_on_cpu.py
│   │   │   ├── test_linear_cross_entropy.py
│   │   │   ├── test_mlflow_key_sanitization.py
│   │   │   ├── test_model_on_cpu.py
│   │   │   ├── test_nvtx_profile.py
│   │   │   ├── test_rollout_skip_on_cpu.py
│   │   │   ├── test_rollout_trace_on_cpu.py
│   │   │   ├── test_seqlen_balancing.py
│   │   │   ├── test_special_linear_cross_entropy_tp.py
│   │   │   ├── test_special_mstx_profile.py
│   │   │   ├── test_temp_env_on_cpu.py
│   │   │   ├── test_timeout_decorator_cpu.py
│   │   │   └── test_torch_functional.py
│   │   └── workers/
│   │       ├── actor/
│   │       │   └── test_special_dp_actor.py
│   │       ├── config/
│   │       │   ├── test_actor_config_on_cpu.py
│   │       │   ├── test_critic_config_on_cpu.py
│   │       │   ├── test_engine_config_on_cpu.py
│   │       │   └── test_optim_config_on_cpu.py
│   │       ├── critic/
│   │       │   └── test_special_dp_critic.py
│   │       ├── reward_manager/
│   │       │   └── test_registry_on_cpu.py
│   │       ├── rollout/
│   │       │   ├── perf/
│   │       │   │   └── vllm_async_rollout.py
│   │       │   ├── resource/
│   │       │   │   └── tool_configs/
│   │       │   │       ├── mcp_server.json
│   │       │   │       ├── mcp_tool_config
│   │       │   │       ├── sandbox_fusion_tool_config
│   │       │   │       └── search_tool_config
│   │       │   ├── rollout_sglang/
│   │       │   │   └── test_http_server_engine.py
│   │       │   ├── rollout_vllm/
│   │       │   │   ├── run_fsdp_vllm.py
│   │       │   │   ├── test_vllm_model_rope_scaling.py
│   │       │   │   └── test_vllm_spmd.py
│   │       │   ├── test_hf_rollout.py
│   │       │   ├── test_sglang_async_rollout_mcp_tools.py
│   │       │   ├── test_sglang_async_rollout_multimodal_delta.py
│   │       │   ├── test_sglang_async_rollout_search_tools.py
│   │       │   ├── test_sglang_async_rollout_sf_tools.py
│   │       │   ├── test_sglang_async_rollout_w_interaction.py
│   │       │   ├── test_sglang_async_rollout_w_tools.py
│   │       │   ├── test_sglang_async_rollout_w_tools_token_out.py
│   │       │   ├── test_sglang_multi_interaction.py
│   │       │   ├── test_sglang_rollout_sharding_manager.py
│   │       │   ├── test_sglang_spmd.py
│   │       │   └── utils_sglang.py
│   │       ├── test_fsdp_attn_implementation.py
│   │       └── test_fsdp_workers.py
│   └── verl/
│       ├── __init__.py
│       ├── base_config.py
│       ├── experimental/
│       │   ├── __init__.py
│       │   ├── agent_loop/
│       │   │   ├── __init__.py
│       │   │   ├── agent_loop.py
│       │   │   ├── single_turn_agent_loop.py
│       │   │   ├── tool_agent_loop.py
│       │   │   ├── tool_parser.py
│       │   │   └── utils.py
│       │   ├── dataset/
│       │   │   ├── __init__.py
│       │   │   └── sampler.py
│       │   ├── dynamic_dataset/
│       │   │   ├── __init__.py
│       │   │   └── dynamicgen_dataset.py
│       │   └── reward/
│       │       ├── __init__.py
│       │       ├── reward_loop/
│       │       │   ├── __init__.py
│       │       │   ├── base.py
│       │       │   ├── dapo.py
│       │       │   ├── naive.py
│       │       │   └── registry.py
│       │       ├── reward_manager.py
│       │       ├── reward_model.py
│       │       └── router/
│       │           ├── naive_router.py
│       │           └── sglang_router.py
│       ├── interactions/
│       │   ├── __init__.py
│       │   ├── base.py
│       │   ├── gsm8k_interaction.py
│       │   ├── utils/
│       │   │   ├── __init__.py
│       │   │   └── interaction_registry.py
│       │   └── weather_interaction.py
│       ├── model_merger/
│       │   ├── __init__.py
│       │   ├── __main__.py
│       │   ├── base_model_merger.py
│       │   ├── fsdp_model_merger.py
│       │   └── megatron_model_merger.py
│       ├── models/
│       │   ├── README.md
│       │   ├── __init__.py
│       │   ├── llama/
│       │   │   ├── __init__.py
│       │   │   └── megatron/
│       │   │       ├── __init__.py
│       │   │       ├── checkpoint_utils/
│       │   │       │   ├── __init__.py
│       │   │       │   ├── llama_loader.py
│       │   │       │   ├── llama_loader_depracated.py
│       │   │       │   └── llama_saver.py
│       │   │       ├── layers/
│       │   │       │   ├── __init__.py
│       │   │       │   ├── parallel_attention.py
│       │   │       │   ├── parallel_decoder.py
│       │   │       │   ├── parallel_linear.py
│       │   │       │   ├── parallel_mlp.py
│       │   │       │   └── parallel_rmsnorm.py
│       │   │       └── modeling_llama_megatron.py
│       │   ├── mcore/
│       │   │   ├── __init__.py
│       │   │   ├── config_converter.py
│       │   │   ├── loader.py
│       │   │   ├── mbridge.py
│       │   │   ├── model_forward.py
│       │   │   ├── model_forward_1f1b_overlap.py
│       │   │   ├── model_forward_fused.py
│       │   │   ├── model_initializer.py
│       │   │   ├── patch_v012.py
│       │   │   ├── qwen2_5_vl/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── attention.py
│       │   │   │   ├── model.py
│       │   │   │   ├── rope_utils.py
│       │   │   │   ├── vision_config.py
│       │   │   │   ├── vision_model.py
│       │   │   │   └── vision_transformer_block.py
│       │   │   ├── readme.md
│       │   │   ├── registry.py
│       │   │   ├── saver.py
│       │   │   ├── util.py
│       │   │   └── weight_converter.py
│       │   ├── qwen2/
│       │   │   ├── __init__.py
│       │   │   └── megatron/
│       │   │       ├── __init__.py
│       │   │       ├── checkpoint_utils/
│       │   │       │   ├── __init__.py
│       │   │       │   ├── qwen2_loader.py
│       │   │       │   ├── qwen2_loader_depracated.py
│       │   │       │   └── qwen2_saver.py
│       │   │       ├── layers/
│       │   │       │   ├── __init__.py
│       │   │       │   ├── parallel_attention.py
│       │   │       │   ├── parallel_decoder.py
│       │   │       │   ├── parallel_linear.py
│       │   │       │   ├── parallel_mlp.py
│       │   │       │   └── parallel_rmsnorm.py
│       │   │       └── modeling_qwen2_megatron.py
│       │   ├── registry.py
│       │   ├── transformers/
│       │   │   ├── __init__.py
│       │   │   ├── apertus.py
│       │   │   ├── dense_common.py
│       │   │   ├── glm4v.py
│       │   │   ├── kimi_vl.py
│       │   │   ├── llama.py
│       │   │   ├── monkey_patch.py
│       │   │   ├── npu_patch.py
│       │   │   ├── qwen2.py
│       │   │   ├── qwen2_vl.py
│       │   │   └── qwen3_vl.py
│       │   └── weight_loader_registry.py
│       ├── protocol.py
│       ├── py.typed
│       ├── single_controller/
│       │   ├── __init__.py
│       │   ├── base/
│       │   │   ├── __init__.py
│       │   │   ├── decorator.py
│       │   │   ├── worker.py
│       │   │   └── worker_group.py
│       │   └── ray/
│       │       ├── __init__.py
│       │       └── base.py
│       ├── third_party/
│       │   ├── __init__.py
│       │   ├── sglang/
│       │   │   ├── __init__.py
│       │   │   └── parallel_state.py
│       │   ├── torch/
│       │   │   ├── __init__.py
│       │   │   └── distributed/
│       │   │       ├── __init__.py
│       │   │       ├── _state_dict_utils.py
│       │   │       └── checkpoint/
│       │   │           ├── __init__.py
│       │   │           └── state_dict.py
│       │   └── vllm/
│       │       └── __init__.py
│       ├── tools/
│       │   ├── __init__.py
│       │   ├── base_tool.py
│       │   ├── geo3k_tool.py
│       │   ├── gsm8k_tool.py
│       │   ├── image_zoom_in_tool.py
│       │   ├── mcp_base_tool.py
│       │   ├── mcp_search_tool.py
│       │   ├── sandbox_fusion_tools.py
│       │   ├── schemas.py
│       │   ├── search_tool.py
│       │   └── utils/
│       │       ├── __init__.py
│       │       ├── mcp_clients/
│       │       │   ├── McpClientManager.py
│       │       │   └── utils.py
│       │       ├── search_r1_like_utils.py
│       │       └── tool_registry.py
│       ├── trainer/
│       │   ├── __init__.py
│       │   ├── config/
│       │   │   ├── __init__.py
│       │   │   ├── _generated_ppo_megatron_trainer.yaml
│       │   │   ├── _generated_ppo_trainer.yaml
│       │   │   ├── actor/
│       │   │   │   ├── actor.yaml
│       │   │   │   ├── dp_actor.yaml
│       │   │   │   └── megatron_actor.yaml
│       │   │   ├── algorithm.py
│       │   │   ├── config.py
│       │   │   ├── critic/
│       │   │   │   ├── critic.yaml
│       │   │   │   ├── dp_critic.yaml
│       │   │   │   └── megatron_critic.yaml
│       │   │   ├── data/
│       │   │   │   └── legacy_data.yaml
│       │   │   ├── engine/
│       │   │   │   ├── fsdp.yaml
│       │   │   │   └── megatron.yaml
│       │   │   ├── evaluation.yaml
│       │   │   ├── generation.yaml
│       │   │   ├── model/
│       │   │   │   └── hf_model.yaml
│       │   │   ├── npu_profile/
│       │   │   │   └── npu_profile.yaml
│       │   │   ├── optim/
│       │   │   │   ├── fsdp.yaml
│       │   │   │   └── megatron.yaml
│       │   │   ├── ppo_megatron_trainer.yaml
│       │   │   ├── ppo_trainer.yaml
│       │   │   ├── ref/
│       │   │   │   ├── dp_ref.yaml
│       │   │   │   ├── megatron_ref.yaml
│       │   │   │   └── ref.yaml
│       │   │   ├── reward_model/
│       │   │   │   ├── dp_reward_model.yaml
│       │   │   │   ├── megatron_reward_model.yaml
│       │   │   │   └── reward_model.yaml
│       │   │   ├── rollout/
│       │   │   │   └── rollout.yaml
│       │   │   ├── sft_trainer.yaml
│       │   │   └── sft_trainer_engine.yaml
│       │   ├── constants_ppo.py
│       │   ├── fsdp_sft_trainer.py
│       │   ├── main_eval.py
│       │   ├── main_generation.py
│       │   ├── main_generation_server.py
│       │   ├── main_ppo.py
│       │   ├── ppo/
│       │   │   ├── __init__.py
│       │   │   ├── core_algos.py
│       │   │   ├── metric_utils.py
│       │   │   ├── mismatch_helper.py
│       │   │   ├── ray_trainer.py
│       │   │   ├── reward.py
│       │   │   └── utils.py
│       │   ├── runtime_env.yaml
│       │   └── sft_trainer.py
│       ├── utils/
│       │   ├── __init__.py
│       │   ├── activation_offload.py
│       │   ├── attention_utils.py
│       │   ├── checkpoint/
│       │   │   ├── __init__.py
│       │   │   ├── checkpoint_handler.py
│       │   │   ├── checkpoint_manager.py
│       │   │   ├── fsdp_checkpoint_manager.py
│       │   │   └── megatron_checkpoint_manager.py
│       │   ├── config.py
│       │   ├── dataset/
│       │   │   ├── README.md
│       │   │   ├── __init__.py
│       │   │   ├── dataset_utils.py
│       │   │   ├── multiturn_sft_dataset.py
│       │   │   ├── onerec_dataset.py
│       │   │   ├── rl_dataset.py
│       │   │   ├── rm_dataset.py
│       │   │   ├── sft_dataset.py
│       │   │   └── vision_utils.py
│       │   ├── debug/
│       │   │   ├── __init__.py
│       │   │   ├── metrics.py
│       │   │   ├── performance.py
│       │   │   └── trajectory_tracker.py
│       │   ├── device.py
│       │   ├── distributed.py
│       │   ├── experimental/
│       │   │   ├── __init__.py
│       │   │   └── torch_functional.py
│       │   ├── flops_counter.py
│       │   ├── fs.py
│       │   ├── fsdp_utils.py
│       │   ├── groupwise.py
│       │   ├── hdfs_io.py
│       │   ├── import_utils.py
│       │   ├── kernel/
│       │   │   ├── __init__.py
│       │   │   ├── kernels.py
│       │   │   └── linear_cross_entropy.py
│       │   ├── logger/
│       │   │   ├── __init__.py
│       │   │   └── aggregate_logger.py
│       │   ├── logging_utils.py
│       │   ├── megatron/
│       │   │   ├── __init__.py
│       │   │   ├── dist_checkpointing.py
│       │   │   ├── memory.py
│       │   │   ├── optimizer.py
│       │   │   ├── pipeline_parallel.py
│       │   │   ├── sequence_parallel.py
│       │   │   └── tensor_parallel.py
│       │   ├── megatron_utils.py
│       │   ├── memory_buffer.py
│       │   ├── memory_utils.py
│       │   ├── metric/
│       │   │   ├── __init__.py
│       │   │   └── utils.py
│       │   ├── model.py
│       │   ├── net_utils.py
│       │   ├── npu_utils.py
│       │   ├── profiler/
│       │   │   ├── __init__.py
│       │   │   ├── config.py
│       │   │   ├── empty_annotations.py
│       │   │   ├── mstx_profile.py
│       │   │   ├── nvtx_profile.py
│       │   │   ├── performance.py
│       │   │   └── profile.py
│       │   ├── py_functional.py
│       │   ├── ray_utils.py
│       │   ├── rendezvous/
│       │   │   ├── __init__.py
│       │   │   └── ray_backend.py
│       │   ├── reward_score/
│       │   │   ├── __init__.py
│       │   │   ├── geo3k.py
│       │   │   ├── gsm8k.py
│       │   │   ├── math_batch.py
│       │   │   ├── math_dapo.py
│       │   │   ├── math_reward.py
│       │   │   ├── math_verify.py
│       │   │   ├── prime_code/
│       │   │   │   ├── README.md
│       │   │   │   ├── __init__.py
│       │   │   │   ├── testing_util.py
│       │   │   │   └── utils.py
│       │   │   ├── prime_math/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── grader.py
│       │   │   │   └── math_normalize.py
│       │   │   ├── sandbox_fusion/
│       │   │   │   ├── __init__.py
│       │   │   │   └── utils.py
│       │   │   └── search_r1_like_qa_em.py
│       │   ├── rollout_skip.py
│       │   ├── rollout_trace.py
│       │   ├── seqlen_balancing.py
│       │   ├── tensordict_utils.py
│       │   ├── tokenizer.py
│       │   ├── torch_dtypes.py
│       │   ├── torch_functional.py
│       │   ├── tracking.py
│       │   ├── transferqueue_utils.py
│       │   ├── transformers_compat.py
│       │   ├── ulysses.py
│       │   └── vllm/
│       │       ├── __init__.py
│       │       ├── patch.py
│       │       └── utils.py
│       ├── version/
│       │   └── version
│       └── workers/
│           ├── __init__.py
│           ├── actor/
│           │   ├── __init__.py
│           │   ├── base.py
│           │   ├── dp_actor.py
│           │   └── megatron_actor.py
│           ├── config/
│           │   ├── __init__.py
│           │   ├── actor.py
│           │   ├── critic.py
│           │   ├── engine.py
│           │   ├── model.py
│           │   ├── optimizer.py
│           │   ├── reward_model.py
│           │   └── rollout.py
│           ├── critic/
│           │   ├── __init__.py
│           │   ├── base.py
│           │   ├── dp_critic.py
│           │   └── megatron_critic.py
│           ├── engine/
│           │   ├── __init__.py
│           │   ├── base.py
│           │   ├── fsdp/
│           │   │   ├── __init__.py
│           │   │   ├── transformer_impl.py
│           │   │   └── utils.py
│           │   ├── megatron/
│           │   │   ├── __init__.py
│           │   │   ├── transformer_impl.py
│           │   │   └── utils.py
│           │   ├── mindspeed/
│           │   │   ├── __init__.py
│           │   │   └── transformer_impl.py
│           │   └── utils.py
│           ├── fsdp_workers.py
│           ├── megatron_workers.py
│           ├── reward_manager/
│           │   ├── __init__.py
│           │   ├── abstract.py
│           │   ├── batch.py
│           │   ├── dapo.py
│           │   ├── naive.py
│           │   ├── prime.py
│           │   └── registry.py
│           ├── reward_model/
│           │   ├── __init__.py
│           │   ├── base.py
│           │   └── megatron/
│           │       ├── __init__.py
│           │       └── reward_model.py
│           ├── roles/
│           │   ├── __init__.py
│           │   ├── actor.py
│           │   ├── critic.py
│           │   ├── hybrid_engine.py
│           │   └── utils/
│           │       ├── __init__.py
│           │       ├── losses.py
│           │       └── padding.py
│           ├── rollout/
│           │   ├── __init__.py
│           │   ├── base.py
│           │   ├── hf_rollout.py
│           │   ├── naive/
│           │   │   ├── __init__.py
│           │   │   └── naive_rollout.py
│           │   ├── replica.py
│           │   ├── schemas.py
│           │   ├── sglang_rollout/
│           │   │   ├── __init__.py
│           │   │   ├── async_sglang_server.py
│           │   │   ├── http_server_engine.py
│           │   │   ├── sglang_rollout.py
│           │   │   └── utils.py
│           │   ├── tokenizer.py
│           │   ├── utils.py
│           │   └── vllm_rollout/
│           │       ├── __init__.py
│           │       ├── utils.py
│           │       ├── vllm_async_server.py
│           │       └── vllm_rollout_spmd.py
│           └── sharding_manager/
│               ├── __init__.py
│               ├── base.py
│               ├── fsdp_sglang.py
│               ├── fsdp_ulysses.py
│               ├── fsdp_vllm.py
│               ├── megatron_sglang.py
│               └── megatron_vllm.py
└── verl_rl/
    ├── CONTRIBUTING.md
    ├── LICENSE
    ├── README.md
    ├── README_ORIGINAL.md
    ├── deploy_env.sh
    ├── docker/
    │   ├── Apptainerfile.rocm
    │   ├── Dockerfile.extention.awsefa
    │   ├── Dockerfile.ngc.vllm
    │   ├── Dockerfile.ngc.vllm0.8
    │   ├── Dockerfile.ngc.vllm0.8.sagemaker
    │   ├── Dockerfile.rocm
    │   ├── Dockerfile.rocm_verl-0.3.0.post1
    │   ├── Dockerfile.rocm_verl-0.4.1
    │   ├── Dockerfile.sglang
    │   ├── Dockerfile.vemlp.vllm.te
    │   ├── Dockerfile.vllm.sglang.megatron.deepseek
    │   ├── README.md
    │   ├── verl0.4-cu124-torch2.6-fa2.7.4/
    │   │   ├── Dockerfile.app.sglang.vllm.mcore0.12
    │   │   ├── Dockerfile.app.sglang.vllm.mcore0.12.deepep
    │   │   ├── Dockerfile.app.sglang.vllm.mcore0.13.preview
    │   │   ├── Dockerfile.app.vllm.mcore0.12
    │   │   ├── Dockerfile.app.vllm.mcore0.12.deepep
    │   │   ├── Dockerfile.app.vllm.mcore0.13.preview
    │   │   ├── Dockerfile.base
    │   │   └── README.md
    │   ├── verl0.5-cu126-torch2.7-fa2.7.4/
    │   │   ├── Dockerfile.app.sglang.mcore0.12
    │   │   ├── Dockerfile.app.vllm.mcore0.12
    │   │   ├── Dockerfile.base.torch2.7.0
    │   │   ├── Dockerfile.base.torch2.7.1
    │   │   └── README.md
    │   ├── verl0.5-cu126-torch2.7.1-fa2.8.0/
    │   │   ├── Dockerfile.app.sglang.mcore0.12
    │   │   ├── Dockerfile.app.sglang.mcore0.13.preview
    │   │   ├── Dockerfile.base
    │   │   └── README.md
    │   └── verl0.5-preview-cu128-torch2.7.1-fa2.8.0/
    │       ├── Dockerfile.app.sglang.megatron
    │       ├── Dockerfile.base
    │       └── README.md
    ├── docs/
    │   ├── Makefile
    │   ├── README.md
    │   ├── README_vllm0.7.md
    │   ├── README_vllm0.8.md
    │   ├── _static/
    │   │   └── js/
    │   │       └── runllm-widget.js
    │   ├── advance/
    │   │   ├── agent_loop.rst
    │   │   ├── checkpoint.rst
    │   │   ├── dpo_extension.rst
    │   │   ├── fsdp_extension.rst
    │   │   ├── megatron_extension.rst
    │   │   ├── one_step_off.md
    │   │   ├── placement.rst
    │   │   ├── ppo_lora.rst
    │   │   ├── rollout_trace.rst
    │   │   └── rope.rst
    │   ├── algo/
    │   │   ├── baseline.md
    │   │   ├── dapo.md
    │   │   ├── entropy.md
    │   │   ├── gpg.md
    │   │   ├── grpo.md
    │   │   ├── opo.md
    │   │   ├── ppo.md
    │   │   ├── spin.md
    │   │   └── sppo.md
    │   ├── amd_tutorial/
    │   │   ├── amd_build_dockerfile_page.rst
    │   │   └── amd_vllm_page.rst
    │   ├── api/
    │   │   ├── data.rst
    │   │   ├── single_controller.rst
    │   │   ├── trainer.rst
    │   │   └── utils.rst
    │   ├── ascend_tutorial/
    │   │   ├── ascend_profiling.rst
    │   │   ├── ascend_profiling_en.rst
    │   │   └── ascend_quick_start.rst
    │   ├── conf.py
    │   ├── examples/
    │   │   ├── config.rst
    │   │   ├── gsm8k_example.rst
    │   │   ├── multi_modal_example.rst
    │   │   ├── ppo_code_architecture.rst
    │   │   └── sandbox_fusion_example.rst
    │   ├── faq/
    │   │   └── faq.rst
    │   ├── hybrid_flow.rst
    │   ├── index.rst
    │   ├── perf/
    │   │   ├── device_tuning.rst
    │   │   ├── dpsk.md
    │   │   ├── nsight_profiling.md
    │   │   └── perf_tuning.rst
    │   ├── preparation/
    │   │   ├── prepare_data.rst
    │   │   └── reward_function.rst
    │   ├── requirements-docs.txt
    │   ├── sglang_multiturn/
    │   │   ├── interaction_system.rst
    │   │   ├── multiturn.rst
    │   │   ├── sandbox_fusion.rst
    │   │   └── search_tool_example.rst
    │   ├── single_controller.rst
    │   ├── start/
    │   │   ├── agentic_rl.rst
    │   │   ├── install.rst
    │   │   ├── more_resources.rst
    │   │   ├── multinode.rst
    │   │   ├── quickstart.rst
    │   │   └── ray_debug_tutorial.rst
    │   └── workers/
    │       ├── fsdp_workers.rst
    │       ├── megatron_workers.rst
    │       ├── ray_trainer.rst
    │       └── sglang_worker.rst
    ├── examples/
    │   ├── data_preprocess/
    │   │   ├── aime2024_multiturn_w_tool.py
    │   │   ├── dapo_multiturn_w_tool.py
    │   │   ├── full_hh_rlhf.py
    │   │   ├── geo3k.py
    │   │   ├── geo3k_multiturn_w_tool.py
    │   │   ├── gsm8k.py
    │   │   ├── gsm8k_multiturn_w_interaction.py
    │   │   ├── gsm8k_multiturn_w_tool.py
    │   │   ├── gsm8k_tool_agent_loop.py
    │   │   ├── hellaswag.py
    │   │   ├── math_dataset.py
    │   │   ├── multiturn.py
    │   │   └── preprocess_search_r1_dataset.py
    │   ├── generation/
    │   │   ├── run_deepseek7b_mutli_node.sh
    │   │   └── run_deepseek_v2_lite_math.sh
    │   ├── gpg_trainer/
    │   │   ├── gpg.md
    │   │   ├── run_qwen2-7b_math.sh
    │   │   └── run_qwen2-7b_math_megatron.sh
    │   ├── grpo_trainer/
    │   │   ├── README.md
    │   │   ├── run_deepseek671b_math_megatron.sh
    │   │   ├── run_deepseek7b_llm.sh
    │   │   ├── run_deepseek7b_llm_math.sh
    │   │   ├── run_deepseek7b_llm_math_megatron.sh
    │   │   ├── run_deepseek7b_llm_seq_balance.sh
    │   │   ├── run_minicpmo2_6.sh
    │   │   ├── run_moonlight16b_math_megatron.sh
    │   │   ├── run_qwen2-7b.sh
    │   │   ├── run_qwen2-7b_math.sh
    │   │   ├── run_qwen2-7b_math_megatron.sh
    │   │   ├── run_qwen2-7b_seq_balance.sh
    │   │   ├── run_qwen2-7b_seq_balance_math_megatron.sh
    │   │   ├── run_qwen2-7b_sgl_megatron.sh
    │   │   ├── run_qwen2_5-3b_gsm8k_grpo_lora.sh
    │   │   ├── run_qwen2_5-7b_math_megatron_diff_tp.sh
    │   │   ├── run_qwen2_5_32b_grpo_npu.sh
    │   │   ├── run_qwen2_5_7b_grpo_discrete_prof_npu.sh
    │   │   ├── run_qwen2_5_7b_grpo_e2e_prof_npu.sh
    │   │   ├── run_qwen2_5_7b_grpo_npu.sh
    │   │   ├── run_qwen2_5_vl-7b-megatron.sh
    │   │   ├── run_qwen2_5_vl-7b.sh
    │   │   ├── run_qwen2_5_vl-7b_lora.sh
    │   │   ├── run_qwen2_5_vl-7b_seq_balance.sh
    │   │   ├── run_qwen2_5_vl_32b_npu.sh
    │   │   ├── run_qwen2_5_vl_3b_npu.sh
    │   │   ├── run_qwen2_5_vl_7b_npu.sh
    │   │   ├── run_qwen3-236b_megatron.sh
    │   │   ├── run_qwen3-8b.sh
    │   │   └── run_qwen3moe-30b_megatron.sh
    │   ├── ppo_trainer/
    │   │   ├── README.md
    │   │   ├── run_deepseek7b_llm.sh
    │   │   ├── run_deepseek7b_llm_modelscope.sh
    │   │   ├── run_deepseek7b_llm_pfppo.sh
    │   │   ├── run_deepseek7b_llm_sandbox_fusion.sh
    │   │   ├── run_deepseek7b_llm_sp2.sh
    │   │   ├── run_deepseek_full_hh_rlhf.sh
    │   │   ├── run_deepseek_math_gsm8k_megatron.sh
    │   │   ├── run_deepseek_math_gsm8k_megatron_nsys.sh
    │   │   ├── run_gemma.sh
    │   │   ├── run_moonlight16b_a3b_gsm8k_megatron.sh
    │   │   ├── run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh
    │   │   ├── run_qwen2-7b_math_gsm8k_megatron.sh
    │   │   ├── run_qwen2-7b_rm.sh
    │   │   ├── run_qwen2-7b_rm_seq_balance.sh
    │   │   ├── run_qwen2-7b_rm_seq_balance_fused_kernels.sh
    │   │   ├── run_qwen2-7b_rm_seq_balance_nsys.sh
    │   │   ├── run_qwen2-7b_seq_balance.sh
    │   │   ├── run_qwen2-7b_sglang_seq_balance.sh
    │   │   └── run_qwen2.5-32b.sh
    │   ├── ray/
    │   │   └── tutorial.ipynb
    │   ├── reinforce_plus_plus_trainer/
    │   │   ├── run_qwen2-7b_math_rf.sh
    │   │   └── run_qwen2-7b_math_rf_baseline.sh
    │   ├── remax_trainer/
    │   │   ├── run_qwen2.5-3b_seq_balance.sh
    │   │   └── run_qwen2.5-7b_seq_balance.sh
    │   ├── rloo_trainer/
    │   │   └── run_qwen2-7b.sh
    │   ├── sft/
    │   │   ├── gsm8k/
    │   │   │   ├── run_deepseek_6b7.sh
    │   │   │   ├── run_gemma_2b.sh
    │   │   │   ├── run_gemma_7b.sh
    │   │   │   ├── run_qwen2_5_05b_sft_peft_sp2_npu.sh
    │   │   │   ├── run_qwen_05_peft.sh
    │   │   │   ├── run_qwen_05_sp2.sh
    │   │   │   └── run_qwen_05_sp2_liger.sh
    │   │   └── multiturn/
    │   │       └── run_qwen_05_sp2.sh
    │   ├── sglang_multiturn/
    │   │   ├── README.md
    │   │   ├── config/
    │   │   │   ├── geo3k_multiturn_grpo.yaml
    │   │   │   ├── geo3k_multiturn_megatron_grpo.yaml
    │   │   │   ├── gsm8k_multiturn_grpo.yaml
    │   │   │   ├── gsm8k_multiturn_grpo_w_interaction.yaml
    │   │   │   ├── gsm8k_multiturn_megatron_grpo.yaml
    │   │   │   ├── interaction_config/
    │   │   │   │   └── gsm8k_interaction_config.yaml
    │   │   │   ├── retool_multiturn_grpo.yaml
    │   │   │   ├── search_multiturn_grpo.yaml
    │   │   │   └── tool_config/
    │   │   │       ├── geo3k_tool_config.yaml
    │   │   │       ├── gsm8k_tool_config.yaml
    │   │   │       ├── mcp_server.json
    │   │   │       ├── mcp_tool_config.yaml
    │   │   │       ├── sandbox_fusion_tool_config.yaml
    │   │   │       └── search_tool_config.yaml
    │   │   ├── geo3k/
    │   │   │   ├── run_qwen2.5-3b_geo3k_multiturn.sh
    │   │   │   ├── run_qwen2.5-3b_geo3k_multiturn_4xgpu.sh
    │   │   │   └── run_qwen2.5-3b_megatron_geo3k_multiturn.sh
    │   │   ├── run_qwen0.5b_gsm8k_multiturn_curriculum.sh
    │   │   ├── run_qwen2.5-0.5b_gsm8k_multiturn_w_interaction.sh
    │   │   ├── run_qwen2.5-3b_gsm8k_multiturn.sh
    │   │   ├── run_qwen2.5-3b_gsm8k_multiturn_4xgpu.sh
    │   │   ├── run_qwen2.5-3b_gsm8k_tool_agent_mlflow.sh
    │   │   ├── run_qwen2.5-3b_megatron_gsm8k_multiturn.sh
    │   │   ├── run_qwen3-4b_gsm8k_multiturn.sh
    │   │   └── search_r1_like/
    │   │       ├── local_dense_retriever/
    │   │       │   ├── download.py
    │   │       │   └── retrieval_server.py
    │   │       └── run_qwen2.5-3b_instruct_search_multiturn.sh
    │   ├── slurm/
    │   │   └── ray_on_slurm.slurm
    │   ├── split_placement/
    │   │   ├── README.md
    │   │   ├── config/
    │   │   │   └── ppo_trainer_split.yaml
    │   │   ├── main_ppo_split.py
    │   │   ├── run_deepseek7b_llm.sh
    │   │   └── split_monkey_patch.py
    │   └── tuning/
    │       ├── 0.5b/
    │       │   └── qwen2-0.5b_grpo-lora_1_h100_fsdp_vllm.sh
    │       ├── 1.5b/
    │       │   └── qwen2-1.5b_grpo-lora_1_h100_fsdp_vllm.sh
    │       ├── 14b/
    │       │   ├── qwen2-14b_grpo-lora_2_h100_fsdp_vllm.sh
    │       │   └── qwen2_14b_grpo_4_h800_fsdp_vllm.sh
    │       ├── 32b/
    │       │   ├── qwen2-32b_grpo-lora_4_h100_fsdp_vllm.sh
    │       │   └── qwen2_32B_grpo_8_h20_megatron_vllm.sh
    │       ├── 3b/
    │       │   └── qwen2-3b_grpo-lora_1_h100_fsdp_vllm.sh
    │       ├── 70b/
    │       │   ├── qwen2-70b_grpo_32_h20_fsdp_vllm.sh
    │       │   ├── qwen2-70b_grpo_32_h800_fsdp_vllm.sh
    │       │   └── qwen2-72b_grpo-lora_8_h100_fsdp_vllm.sh
    │       └── 7b/
    │           ├── qwen2-7b_grpo-lora_1_h100_fsdp_vllm.sh
    │           └── qwen2-7b_grpo_2_h800_fsdp_vllm.sh
    ├── init_ray.sh
    ├── init_ray_cluster.sh
    ├── pyproject.toml
    ├── recipe/
    │   ├── README.md
    │   ├── char_count/
    │   │   ├── README.md
    │   │   ├── create_dataset.py
    │   │   ├── reward_function.py
    │   │   ├── train_grpo.sh
    │   │   └── train_sft.sh
    │   ├── dapo/
    │   │   ├── README.md
    │   │   ├── config/
    │   │   │   └── dapo_trainer.yaml
    │   │   ├── dapo_ray_trainer.py
    │   │   ├── main_dapo.py
    │   │   ├── prepare_dapo_data.sh
    │   │   ├── run_dapo_early_qwen2.5_32b.sh
    │   │   ├── run_dapo_qwen2.5_32b.sh
    │   │   ├── run_dapo_wo_ds_qwen2.5_32b.sh
    │   │   ├── runtime_env.yaml
    │   │   ├── test_dapo_7b.sh
    │   │   ├── test_dapo_7b_math.sh
    │   │   ├── test_dapo_7b_math_lora.sh
    │   │   ├── test_dapo_7b_math_megatron.sh
    │   │   ├── test_dapo_dspk_671b_megatron.sh
    │   │   ├── test_dapo_qwen3_30b_math.sh
    │   │   └── test_dapo_qwen3_30b_math_single_node.sh
    │   ├── entropy/
    │   │   ├── 32b_clip_cov.sh
    │   │   ├── 32b_kl_cov.sh
    │   │   ├── 32b_kl_cov_mininbsz.sh
    │   │   ├── 7b_clip_cov.sh
    │   │   ├── 7b_kl_cov.sh
    │   │   ├── README.md
    │   │   ├── config/
    │   │   │   └── entropy_trainer.yaml
    │   │   ├── entropy_ray_trainer.py
    │   │   ├── main_entropy.py
    │   │   ├── reward.py
    │   │   └── reward_score/
    │   │       ├── __init__.py
    │   │       └── entropy_math/
    │   │           ├── __init__.py
    │   │           ├── grader.py
    │   │           └── math_normalize.py
    │   ├── genrm_remote/
    │   │   ├── README.md
    │   │   ├── reward_function.py
    │   │   └── run_genrm_remote.sh
    │   ├── langgraph_agent/
    │   │   ├── __init__.py
    │   │   ├── chat_model.py
    │   │   ├── example/
    │   │   │   ├── README.md
    │   │   │   ├── agent.yaml
    │   │   │   ├── create_dataset.py
    │   │   │   ├── math_expression.py
    │   │   │   └── run_qwen2.5_3b.sh
    │   │   ├── react_agent_loop.py
    │   │   └── test_react_agent_loop.py
    │   ├── minicpmo/
    │   │   └── rl_dataset.py
    │   ├── one_step_off_policy/
    │   │   ├── README.md
    │   │   ├── config/
    │   │   │   ├── one_step_off_ppo_megatron_trainer.yaml
    │   │   │   └── one_step_off_ppo_trainer.yaml
    │   │   ├── dapo_7b_math_fsdp2_4_12.sh
    │   │   ├── dapo_7b_math_fsdp2_colocate.sh
    │   │   ├── dapo_7b_math_megatron_4_12.sh
    │   │   ├── dapo_7b_math_megatron_colocate.sh
    │   │   ├── fsdp_workers.py
    │   │   ├── grpo_0.6b_gsm8k_fsdp2_2_6.sh
    │   │   ├── grpo_3b_gsm8k_fsdp2_2_6.sh
    │   │   ├── main_ppo.py
    │   │   ├── megatron_workers.py
    │   │   ├── ray_trainer.py
    │   │   └── vllm_sharding_manager.py
    │   ├── onerec/
    │   │   ├── main_onerec_ppo.py
    │   │   ├── onerec_fsdp_workers.py
    │   │   ├── onerec_ray_trainer.py
    │   │   ├── onerec_recipe.py
    │   │   ├── onerec_vllm_rollout.py
    │   │   └── run_grpo.sh
    │   ├── prime/
    │   │   ├── __init__.py
    │   │   ├── config/
    │   │   │   └── prime_trainer.yaml
    │   │   ├── main_prime.py
    │   │   ├── prime_core_algos.py
    │   │   ├── prime_dp_rm.py
    │   │   ├── prime_fsdp_workers.py
    │   │   ├── prime_ray_trainer.py
    │   │   ├── run_prime_qwen.sh
    │   │   └── run_prime_qwen_code.sh
    │   ├── r1/
    │   │   ├── README.md
    │   │   ├── __init__.py
    │   │   ├── config/
    │   │   │   └── evaluation.yaml
    │   │   ├── data_process.py
    │   │   ├── main_eval.py
    │   │   ├── reward_score.py
    │   │   ├── run_r1_distill_qwen.sh
    │   │   └── tasks/
    │   │       ├── __init__.py
    │   │       ├── gpqa.py
    │   │       ├── livecodebench.py
    │   │       └── math.py
    │   ├── retool/
    │   │   ├── retool.py
    │   │   ├── retool_multi_turn_sft_preprocess.py
    │   │   ├── retool_sft_preprocess.py
    │   │   ├── run_qwen2-32b_sft.sh
    │   │   ├── run_qwen2.5_32b_sp8.sh
    │   │   ├── run_qwen2.5_7b_sp4.sh
    │   │   ├── run_qwen3_4b_sp4.sh
    │   │   └── sandbox_fusion_tool_config.yaml
    │   ├── spin/
    │   │   ├── README.md
    │   │   ├── config/
    │   │   │   └── spin_trainer.yaml
    │   │   ├── core_algos.py
    │   │   ├── dp_actor.py
    │   │   ├── fsdp_workers.py
    │   │   ├── main_spin.py
    │   │   ├── run_spin.sh
    │   │   └── spin_trainer.py
    │   └── sppo/
    │       ├── README.md
    │       ├── __init__.py
    │       ├── config/
    │       │   └── sppo_trainer.yaml
    │       ├── dp_actor.py
    │       ├── main_sppo.py
    │       ├── run_qwen2.5-7b_rm.sh
    │       ├── sppo_ray_trainer.py
    │       └── sppo_worker.py
    ├── requirements-npu.txt
    ├── requirements.txt
    ├── requirements_sglang.txt
    ├── scripts/
    │   ├── __init__.py
    │   ├── converter_hf_to_mcore.py
    │   ├── diagnose.py
    │   ├── generate_trainer_config.sh
    │   ├── init_random_model.py
    │   ├── install_vllm_sglang_mcore.sh
    │   ├── legacy_model_merger.py
    │   ├── print_cfg.py
    │   └── rollout_viewer.py
    ├── setup.py
    ├── tests/
    │   ├── README.md
    │   ├── __init__.py
    │   ├── experimental/
    │   │   └── agent_loop/
    │   │       ├── agent_utils.py
    │   │       └── test_basic_agent_loop.py
    │   ├── interactions/
    │   │   ├── __init__.py
    │   │   ├── test_gsm8k_interaction.py
    │   │   └── test_interaction_registry.py
    │   ├── kill_github_tests.sh
    │   ├── models/
    │   │   ├── test_transformer.py
    │   │   └── test_transformers_ulysses.py
    │   ├── single_controller/
    │   │   ├── __init__.py
    │   │   ├── base/
    │   │   │   └── test_decorator.py
    │   │   ├── check_worker_alive/
    │   │   │   └── main.py
    │   │   ├── detached_worker/
    │   │   │   ├── README.md
    │   │   │   ├── client.py
    │   │   │   ├── run.sh
    │   │   │   └── server.py
    │   │   ├── test_auto_padding_on_cpu.py
    │   │   ├── test_colocated_workers.py
    │   │   ├── test_colocated_workers_fused.py
    │   │   ├── test_data_transfer.py
    │   │   ├── test_decorator_on_cpu.py
    │   │   ├── test_driverfunc_to_worker.py
    │   │   ├── test_fused_workers_on_cpu.py
    │   │   ├── test_high_level_scheduling_api.py
    │   │   ├── test_ray_collectives.py
    │   │   ├── test_ray_local_envs_on_cpu.py
    │   │   ├── test_ray_utils_on_cpu.py
    │   │   ├── test_rvdz.py
    │   │   ├── test_worker_group_basics.py
    │   │   └── test_worker_group_torch.py
    │   ├── special_distributed/
    │   │   ├── README.md
    │   │   ├── run_all.sh
    │   │   ├── test_fsdp_ckpt.py
    │   │   └── test_tensor_dict.py
    │   ├── special_e2e/
    │   │   ├── README.md
    │   │   ├── __init__.py
    │   │   ├── check_custom_rwd_fn.py
    │   │   ├── check_results.py
    │   │   ├── envs/
    │   │   │   ├── __init__.py
    │   │   │   └── digit_completion/
    │   │   │       ├── __init__.py
    │   │   │       ├── task.py
    │   │   │       └── tokenizer.py
    │   │   ├── generation/
    │   │   │   └── run_gen_qwen05.sh
    │   │   ├── ppo_trainer/
    │   │   │   ├── expert_parallel/
    │   │   │   │   └── qwen2moe_minimal.json
    │   │   │   ├── run_function_reward.sh
    │   │   │   ├── run_model_reward.sh
    │   │   │   ├── run_single_gpu.sh
    │   │   │   └── run_single_gpu_with_engine.sh
    │   │   ├── run_dapo.sh
    │   │   ├── run_genrm_remote.sh
    │   │   ├── run_geo3k_fsdp_sgl_multiturn_w_tool.sh
    │   │   ├── run_grpo_lora_with_merge.sh
    │   │   ├── run_gsm8k_fsdp_sgl_multiturn_sf_tool.sh
    │   │   ├── run_gsm8k_fsdp_sgl_multiturn_w_tool.sh
    │   │   ├── run_one_step_off_policy.sh
    │   │   ├── run_ppo_trainer_megatron.sh
    │   │   ├── run_prime.sh
    │   │   ├── run_r1_distill_qwen_aime24_eval.sh
    │   │   ├── run_spin.sh
    │   │   ├── run_sppo.sh
    │   │   ├── run_test.sh
    │   │   └── sft/
    │   │       ├── run_sft.sh
    │   │       └── test_sp_loss_match.py
    │   ├── special_npu/
    │   │   ├── run_qwen2_5_05b_dapo.sh
    │   │   ├── run_qwen2_5_05b_grpo.sh
    │   │   ├── run_qwen2_5_05b_sft_peft_sp2.sh
    │   │   └── run_qwen2_5_vl_3b_npu.sh
    │   ├── special_sanity/
    │   │   ├── check_api_docs.py
    │   │   ├── check_device_api_usage.py
    │   │   ├── check_docs_time_info.py
    │   │   ├── check_docstrings.py
    │   │   ├── check_license.py
    │   │   ├── check_pr_description.py
    │   │   ├── check_pr_title.py
    │   │   ├── test_config_docs.py
    │   │   ├── test_import.py
    │   │   ├── type_coverage_check.py
    │   │   ├── validate_imported_docs.py
    │   │   └── validate_structure.py
    │   ├── special_standalone/
    │   │   ├── README.md
    │   │   └── test_memory_buffers.py
    │   ├── test_base_config_on_cpu.py
    │   ├── test_protocol_on_cpu.py
    │   ├── tools/
    │   │   └── test_base_tool_on_cpu.py
    │   ├── trainer/
    │   │   ├── __init__.py
    │   │   ├── config/
    │   │   │   ├── __init__.py
    │   │   │   ├── legacy_ppo_megatron_trainer.yaml
    │   │   │   ├── legacy_ppo_trainer.yaml
    │   │   │   ├── test_algo_config_on_cpu.py
    │   │   │   ├── test_critic_config_on_cpu.py
    │   │   │   └── test_legacy_config_on_cpu.py
    │   │   └── ppo/
    │   │       ├── __init__.py
    │   │       ├── test_core_algos_on_cpu.py
    │   │       └── test_metric_utils_on_cpu.py
    │   ├── utils/
    │   │   ├── _test_module.py
    │   │   ├── dataset/
    │   │   │   ├── test_create_rl_sampler_on_cpu.py
    │   │   │   ├── test_multiturn_sft_dataset_on_cpu.py
    │   │   │   ├── test_rl_dataset_on_cpu.py
    │   │   │   └── test_sft_dataset_on_cpu.py
    │   │   ├── megatron/
    │   │   │   └── test_pipeline_parallel.py
    │   │   ├── reward_score/
    │   │   │   ├── reward_score/
    │   │   │   │   └── test_sandbox_fusion_on_cpu.py
    │   │   │   └── test_sandbox_on_cpu.py
    │   │   ├── test_activation_offload.py
    │   │   ├── test_config_on_cpu.py
    │   │   ├── test_flops_counter.py
    │   │   ├── test_fs_on_cpu.py
    │   │   ├── test_import_utils_on_cpu.py
    │   │   ├── test_linear_cross_entropy.py
    │   │   ├── test_linear_cross_entropy_tp.py
    │   │   ├── test_model_on_cpu.py
    │   │   ├── test_nvtx_profile.py
    │   │   ├── test_rollout_trace_on_cpu.py
    │   │   ├── test_seqlen_balancing.py
    │   │   ├── test_temp_env_on_cpu.py
    │   │   ├── test_timeout_decorator_cpu.py
    │   │   └── test_torch_functional.py
    │   └── workers/
    │       ├── reward_manager/
    │       │   └── test_registry_on_cpu.py
    │       └── rollout/
    │           ├── async_rollout_utils.py
    │           ├── perf/
    │           │   └── vllm_async_rollout.py
    │           ├── resource/
    │           │   └── tool_configs/
    │           │       ├── mcp_server.json
    │           │       ├── mcp_tool_config
    │           │       ├── sandbox_fusion_tool_config
    │           │       └── search_tool_config
    │           ├── rollout_vllm/
    │           │   ├── run_fsdp_vllm.py
    │           │   ├── test_vllm_chat_scheduler.py
    │           │   ├── test_vllm_model_rope_scaling.py
    │           │   └── test_vllm_spmd.py
    │           ├── test_async_sglang_server_on_cpu.py
    │           ├── test_custom_completion_callback.py
    │           ├── test_hf_rollout.py
    │           ├── test_sglang_async_rollout_mcp_tools.py
    │           ├── test_sglang_async_rollout_multimodal_delta.py
    │           ├── test_sglang_async_rollout_search_tools.py
    │           ├── test_sglang_async_rollout_sf_tools.py
    │           ├── test_sglang_async_rollout_w_interaction.py
    │           ├── test_sglang_async_rollout_w_tools.py
    │           ├── test_sglang_multi_interaction.py
    │           ├── test_sglang_rollout_sharding_manager.py
    │           ├── test_sglang_spmd.py
    │           └── utils_sglang.py
    └── verl/
        ├── __init__.py
        ├── base_config.py
        ├── experimental/
        │   ├── __init__.py
        │   ├── agent_loop/
        │   │   ├── __init__.py
        │   │   ├── agent_loop.py
        │   │   ├── single_turn_agent_loop.py
        │   │   ├── tool_agent_loop.py
        │   │   └── tool_parser.py
        │   ├── dataset/
        │   │   ├── __init__.py
        │   │   └── sampler.py
        │   └── dynamic_dataset/
        │       ├── __init__.py
        │       └── dynamicgen_dataset.py
        ├── interactions/
        │   ├── __init__.py
        │   ├── base.py
        │   ├── gsm8k_interaction.py
        │   └── utils/
        │       ├── __init__.py
        │       └── interaction_registry.py
        ├── model_merger/
        │   ├── __init__.py
        │   ├── __main__.py
        │   ├── base_model_merger.py
        │   ├── fsdp_model_merger.py
        │   └── megatron_model_merger.py
        ├── models/
        │   ├── README.md
        │   ├── __init__.py
        │   ├── llama/
        │   │   ├── __init__.py
        │   │   └── megatron/
        │   │       ├── __init__.py
        │   │       ├── checkpoint_utils/
        │   │       │   ├── __init__.py
        │   │       │   ├── llama_loader.py
        │   │       │   ├── llama_loader_depracated.py
        │   │       │   └── llama_saver.py
        │   │       ├── layers/
        │   │       │   ├── __init__.py
        │   │       │   ├── parallel_attention.py
        │   │       │   ├── parallel_decoder.py
        │   │       │   ├── parallel_linear.py
        │   │       │   ├── parallel_mlp.py
        │   │       │   └── parallel_rmsnorm.py
        │   │       └── modeling_llama_megatron.py
        │   ├── mcore/
        │   │   ├── __init__.py
        │   │   ├── config_converter.py
        │   │   ├── loader.py
        │   │   ├── mbridge.py
        │   │   ├── model_forward.py
        │   │   ├── model_forward_fused.py
        │   │   ├── model_initializer.py
        │   │   ├── patch_v012.py
        │   │   ├── qwen2_5_vl/
        │   │   │   ├── __init__.py
        │   │   │   ├── attention.py
        │   │   │   ├── model.py
        │   │   │   ├── rope_utils.py
        │   │   │   ├── vision_config.py
        │   │   │   ├── vision_model.py
        │   │   │   └── vision_transformer_block.py
        │   │   ├── readme.md
        │   │   ├── registry.py
        │   │   ├── saver.py
        │   │   ├── util.py
        │   │   └── weight_converter.py
        │   ├── qwen2/
        │   │   ├── __init__.py
        │   │   └── megatron/
        │   │       ├── __init__.py
        │   │       ├── checkpoint_utils/
        │   │       │   ├── __init__.py
        │   │       │   ├── qwen2_loader.py
        │   │       │   ├── qwen2_loader_depracated.py
        │   │       │   └── qwen2_saver.py
        │   │       ├── layers/
        │   │       │   ├── __init__.py
        │   │       │   ├── parallel_attention.py
        │   │       │   ├── parallel_decoder.py
        │   │       │   ├── parallel_linear.py
        │   │       │   ├── parallel_mlp.py
        │   │       │   └── parallel_rmsnorm.py
        │   │       └── modeling_qwen2_megatron.py
        │   ├── registry.py
        │   ├── transformers/
        │   │   ├── __init__.py
        │   │   ├── dense_common.py
        │   │   ├── kimi_vl.py
        │   │   ├── llama.py
        │   │   ├── monkey_patch.py
        │   │   ├── npu_patch.py
        │   │   ├── qwen2.py
        │   │   ├── qwen2_5_vl.py
        │   │   └── qwen2_vl.py
        │   └── weight_loader_registry.py
        ├── protocol.py
        ├── py.typed
        ├── single_controller/
        │   ├── __init__.py
        │   ├── base/
        │   │   ├── __init__.py
        │   │   ├── decorator.py
        │   │   ├── megatron/
        │   │   │   ├── __init__.py
        │   │   │   ├── worker.py
        │   │   │   └── worker_group.py
        │   │   ├── register_center/
        │   │   │   ├── __init__.py
        │   │   │   └── ray.py
        │   │   ├── worker.py
        │   │   └── worker_group.py
        │   └── ray/
        │       ├── __init__.py
        │       ├── base.py
        │       └── megatron.py
        ├── third_party/
        │   ├── __init__.py
        │   ├── sglang/
        │   │   ├── __init__.py
        │   │   └── parallel_state.py
        │   ├── torch/
        │   │   ├── __init__.py
        │   │   └── distributed/
        │   │       ├── __init__.py
        │   │       ├── _state_dict_utils.py
        │   │       └── checkpoint/
        │   │           ├── __init__.py
        │   │           └── state_dict.py
        │   └── vllm/
        │       └── __init__.py
        ├── tools/
        │   ├── __init__.py
        │   ├── base_tool.py
        │   ├── geo3k_tool.py
        │   ├── gsm8k_tool.py
        │   ├── mcp_base_tool.py
        │   ├── mcp_search_tool.py
        │   ├── sandbox_fusion_tools.py
        │   ├── schemas.py
        │   ├── search_tool.py
        │   └── utils/
        │       ├── __init__.py
        │       ├── mcp_clients/
        │       │   ├── McpClientManager.py
        │       │   └── utils.py
        │       ├── search_r1_like_utils.py
        │       └── tool_registry.py
        ├── trainer/
        │   ├── __init__.py
        │   ├── config/
        │   │   ├── __init__.py
        │   │   ├── _generated_ppo_megatron_trainer.yaml
        │   │   ├── _generated_ppo_trainer.yaml
        │   │   ├── actor/
        │   │   │   ├── actor.yaml
        │   │   │   ├── dp_actor.yaml
        │   │   │   └── megatron_actor.yaml
        │   │   ├── algorithm.py
        │   │   ├── config.py
        │   │   ├── critic/
        │   │   │   ├── critic.yaml
        │   │   │   ├── dp_critic.yaml
        │   │   │   └── megatron_critic.yaml
        │   │   ├── data/
        │   │   │   └── legacy_data.yaml
        │   │   ├── evaluation.yaml
        │   │   ├── generation.yaml
        │   │   ├── npu_profile/
        │   │   │   └── npu_profile.yaml
        │   │   ├── ppo_megatron_trainer.yaml
        │   │   ├── ppo_trainer.yaml
        │   │   ├── ref/
        │   │   │   ├── dp_ref.yaml
        │   │   │   ├── megatron_ref.yaml
        │   │   │   └── ref.yaml
        │   │   ├── reward_model/
        │   │   │   ├── dp_reward_model.yaml
        │   │   │   ├── megatron_reward_model.yaml
        │   │   │   └── reward_model.yaml
        │   │   ├── rollout/
        │   │   │   └── rollout.yaml
        │   │   └── sft_trainer.yaml
        │   ├── constants_ppo.py
        │   ├── fsdp_sft_trainer.py
        │   ├── main_eval.py
        │   ├── main_generation.py
        │   ├── main_ppo.py
        │   ├── ppo/
        │   │   ├── __init__.py
        │   │   ├── core_algos.py
        │   │   ├── metric_utils.py
        │   │   ├── ray_trainer.py
        │   │   └── reward.py
        │   └── runtime_env.yaml
        ├── utils/
        │   ├── __init__.py
        │   ├── activation_offload.py
        │   ├── checkpoint/
        │   │   ├── __init__.py
        │   │   ├── checkpoint_manager.py
        │   │   ├── fsdp_checkpoint_manager.py
        │   │   └── megatron_checkpoint_manager.py
        │   ├── config.py
        │   ├── dataset/
        │   │   ├── README.md
        │   │   ├── __init__.py
        │   │   ├── multiturn_sft_dataset.py
        │   │   ├── rl_dataset.py
        │   │   ├── rm_dataset.py
        │   │   ├── sft_dataset.py
        │   │   └── vision_utils.py
        │   ├── debug/
        │   │   ├── __init__.py
        │   │   ├── performance.py
        │   │   └── trajectory_tracker.py
        │   ├── device.py
        │   ├── distributed.py
        │   ├── experimental/
        │   │   ├── __init__.py
        │   │   └── torch_functional.py
        │   ├── flops_counter.py
        │   ├── fs.py
        │   ├── fsdp_utils.py
        │   ├── hdfs_io.py
        │   ├── import_utils.py
        │   ├── kernel/
        │   │   ├── __init__.py
        │   │   ├── kernels.py
        │   │   └── linear_cross_entropy.py
        │   ├── logger/
        │   │   ├── __init__.py
        │   │   └── aggregate_logger.py
        │   ├── logging_utils.py
        │   ├── megatron/
        │   │   ├── __init__.py
        │   │   ├── dist_checkpointing.py
        │   │   ├── memory.py
        │   │   ├── optimizer.py
        │   │   ├── pipeline_parallel.py
        │   │   ├── sequence_parallel.py
        │   │   └── tensor_parallel.py
        │   ├── megatron_utils.py
        │   ├── memory_buffer.py
        │   ├── metric/
        │   │   ├── __init__.py
        │   │   └── utils.py
        │   ├── model.py
        │   ├── net_utils.py
        │   ├── profiler/
        │   │   ├── __init__.py
        │   │   ├── config.py
        │   │   ├── empty_annotations.py
        │   │   ├── mstx_profile.py
        │   │   ├── nvtx_profile.py
        │   │   ├── performance.py
        │   │   └── profile.py
        │   ├── py_functional.py
        │   ├── ray_utils.py
        │   ├── rendezvous/
        │   │   ├── __init__.py
        │   │   └── ray_backend.py
        │   ├── reward_score/
        │   │   ├── __init__.py
        │   │   ├── geo3k.py
        │   │   ├── gsm8k.py
        │   │   ├── math.py
        │   │   ├── math_batch.py
        │   │   ├── math_dapo.py
        │   │   ├── math_verify.py
        │   │   ├── prime_code/
        │   │   │   ├── README.md
        │   │   │   ├── __init__.py
        │   │   │   ├── testing_util.py
        │   │   │   └── utils.py
        │   │   ├── prime_math/
        │   │   │   ├── __init__.py
        │   │   │   ├── grader.py
        │   │   │   └── math_normalize.py
        │   │   ├── sandbox_fusion/
        │   │   │   ├── __init__.py
        │   │   │   └── utils.py
        │   │   └── search_r1_like_qa_em.py
        │   ├── rollout_trace.py
        │   ├── seqlen_balancing.py
        │   ├── tokenizer.py
        │   ├── torch_dtypes.py
        │   ├── torch_functional.py
        │   ├── tracking.py
        │   ├── ulysses.py
        │   └── vllm_utils.py
        ├── version/
        │   └── version
        └── workers/
            ├── __init__.py
            ├── actor/
            │   ├── __init__.py
            │   ├── base.py
            │   ├── dp_actor.py
            │   └── megatron_actor.py
            ├── critic/
            │   ├── __init__.py
            │   ├── base.py
            │   ├── dp_critic.py
            │   └── megatron_critic.py
            ├── engine/
            │   ├── __init__.py
            │   ├── base.py
            │   ├── fsdp/
            │   │   ├── __init__.py
            │   │   ├── engine_impl.py
            │   │   └── utils.py
            │   └── megatron/
            │       ├── __init__.py
            │       └── engine_impl.py
            ├── fsdp_workers.py
            ├── megatron_workers.py
            ├── reward_manager/
            │   ├── __init__.py
            │   ├── batch.py
            │   ├── dapo.py
            │   ├── naive.py
            │   ├── prime.py
            │   └── registry.py
            ├── reward_model/
            │   ├── __init__.py
            │   ├── base.py
            │   └── megatron/
            │       ├── __init__.py
            │       └── reward_model.py
            ├── roles/
            │   ├── __init__.py
            │   ├── actor.py
            │   └── critic.py
            ├── rollout/
            │   ├── __init__.py
            │   ├── async_server.py
            │   ├── base.py
            │   ├── chat_scheduler.py
            │   ├── hf_rollout.py
            │   ├── naive/
            │   │   ├── __init__.py
            │   │   └── naive_rollout.py
            │   ├── schemas.py
            │   ├── sglang_rollout/
            │   │   ├── __init__.py
            │   │   ├── async_sglang_server.py
            │   │   ├── sglang_rollout.py
            │   │   └── utils.py
            │   ├── tokenizer.py
            │   └── vllm_rollout/
            │       ├── __init__.py
            │       ├── vllm_async_server.py
            │       └── vllm_rollout_spmd.py
            └── sharding_manager/
                ├── __init__.py
                ├── base.py
                ├── fsdp_sglang.py
                ├── fsdp_ulysses.py
                ├── fsdp_vllm.py
                ├── megatron_sglang.py
                └── megatron_vllm.py