Full Code of PaddlePaddle/PaddleFormers for AI

develop 505415631247 cached

1045 files

23.2 MB

4.0M tokens

6547 symbols

1 request

Copy disabled (too large) Download .txt

Showing preview only (16,047K chars total). Download the full file to get everything.

Repository: PaddlePaddle/PaddleFormers
Branch: develop
Commit: 505415631247
Files: 1045
Total size: 23.2 MB

Directory structure:
gitextract__9f9_ucr/

├── .copyright.hook
├── .flake8
├── .github/
│   ├── CODE_OF_CONDUCT.md
│   ├── CODE_OF_CONDUCT_en.md
│   ├── CONTRIBUTING_en.md
│   ├── ISSUE_TEMPLATE/
│   │   ├── ask-question.yml
│   │   ├── bug-report.yml
│   │   ├── docs-report.yml
│   │   ├── feature-request.yml
│   │   ├── new-model.yaml
│   │   └── others.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── actions/
│   │   └── rerun-workflow/
│   │       ├── action.yml
│   │       └── rerun.sh
│   ├── codecov.yml
│   └── workflows/
│       ├── _clone_linux.yml
│       ├── _xpu_ci_test.yml
│       ├── ce-build-ci-workflow.yml
│       ├── ce-build-images.yml
│       ├── ce-build-whl.yml
│       ├── ce-deadlink.yml
│       ├── ce-unittest-gpu.yml
│       ├── check-release-pr.yaml
│       ├── cherry-pick.yml
│       ├── ci_iluvatar.yml
│       ├── ci_xpu.yml
│       ├── debug-unittest-gpu.yml
│       ├── fleet-model-test.yml
│       ├── lint.yml
│       ├── model-unittest-gpu.yml
│       ├── requirements-review.yml
│       ├── rerun.yml
│       ├── stale.yml
│       ├── unittest-gpu.yml
│       └── update-precision.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── CONTRIBUTING.md
├── LICENSE
├── Makefile
├── README.md
├── docs/
│   ├── en/
│   │   ├── cli_usage.md
│   │   ├── datasets.md
│   │   ├── datasets_format.md
│   │   ├── image_processors.md
│   │   ├── processors.md
│   │   └── video_processors.md
│   └── zh/
│       ├── ILUVATAR-GPU_installation_guide.md
│       ├── ILUVATAR-GPU_usage_guide.md
│       ├── Metax-GPU_installation_guide.md
│       ├── Metax-GPU_usage_guide.md
│       ├── XPU_installation_guide.md
│       ├── XPU_usage_guide.md
│       ├── chat_template_guide.md
│       ├── cli_usage.md
│       ├── custom_datasets_format_zh.md
│       ├── data_processing_guide.md
│       ├── dataset_format.md
│       ├── deployment_guide.md
│       ├── dpo_and_lora_guide.md
│       ├── ernie4.5_pretraining.md
│       ├── how_to_download_model.md
│       ├── image_processors_zh.md
│       ├── model_capability.md
│       ├── processors_zh.md
│       ├── pt_and_cpt_guide.md
│       ├── sft_and_lora_guide.md
│       ├── template.md
│       ├── template_zh.md
│       ├── training_arguments.md
│       └── video_processors_zh.md
├── examples/
│   ├── FAQ.md
│   ├── README.md
│   ├── best_practices/
│   │   ├── DeepSeek-V3/
│   │   │   ├── README.md
│   │   │   ├── SFT-Practice.md
│   │   │   ├── dsv3_128k_config.yaml
│   │   │   ├── dsv3_32k_config.yaml
│   │   │   ├── dsv3_4k_config.yaml
│   │   │   ├── pretrain/
│   │   │   │   ├── config/
│   │   │   │   │   ├── config.json
│   │   │   │   │   ├── pretrain_argument.yaml
│   │   │   │   │   ├── tokenizer.json
│   │   │   │   │   └── tokenizer_config.json
│   │   │   │   ├── run.sh
│   │   │   │   └── train_gpu.sh
│   │   │   ├── run_dsv3_128k.sh
│   │   │   ├── run_dsv3_32k.sh
│   │   │   └── run_dsv3_4k.sh
│   │   ├── ERNIE-4.5/
│   │   │   └── README.md
│   │   ├── ERNIE-4.5-VL/
│   │   │   ├── README.md
│   │   │   ├── ernie45vl_32k_config.yaml
│   │   │   ├── ernie45vl_8k_config.yaml
│   │   │   └── ernie45vl_8k_lora_config.yaml
│   │   ├── PaddleOCR-VL/
│   │   │   ├── README.md
│   │   │   ├── paddleocr-vl_full_16k_config.yaml
│   │   │   ├── paddleocr-vl_lora_16k_config.yaml
│   │   │   ├── paddleocr-vl_lora_export.yaml
│   │   │   ├── run_paddleocr-vl_full_16k.sh
│   │   │   ├── run_paddleocr-vl_full_16k_4090D.sh
│   │   │   ├── run_paddleocr-vl_lora_16k.sh
│   │   │   ├── run_paddleocr-vl_lora_16k_4090D.sh
│   │   │   └── run_paddleocr-vl_lora_export.sh
│   │   ├── PaddleOCR-VL-1.5/
│   │   │   ├── README.md
│   │   │   ├── paddleocr-vl-v15_full_16k_region_config.yaml
│   │   │   ├── paddleocr-vl-v15_full_16k_table_config.yaml
│   │   │   ├── paddleocr-vl-v15_lora_16k_region_config.yaml
│   │   │   ├── paddleocr-vl-v15_lora_16k_table_config.yaml
│   │   │   ├── region_ocr.md
│   │   │   └── table_ocr.md
│   │   ├── function_call.md
│   │   └── tutorials/
│   │       ├── how_to_train_a_function_call_model.md
│   │       ├── how_to_train_a_reasoning_model.md
│   │       ├── how_to_train_a_visual_grounding_model.md
│   │       └── how_to_train_an_emoji_model.md
│   ├── config/
│   │   ├── dpo/
│   │   │   ├── full.yaml
│   │   │   ├── full_function_call.yaml
│   │   │   ├── full_tp_pp.yaml
│   │   │   ├── full_tp_pp_ep.yaml
│   │   │   ├── lora.yaml
│   │   │   ├── lora_tp_pp.yaml
│   │   │   └── lora_tp_pp_ep.yaml
│   │   ├── dpo-vl/
│   │   │   ├── full.yaml
│   │   │   ├── full_fsdp.yaml
│   │   │   ├── full_tp.yaml
│   │   │   ├── lora.yaml
│   │   │   ├── lora_fsdp.yaml
│   │   │   └── lora_tp.yaml
│   │   ├── iluvatar/
│   │   │   ├── ERNIE-4.5-0.3B-PT/
│   │   │   │   └── sft/
│   │   │   │       ├── full_8k.yaml
│   │   │   │       ├── lora_8k.yaml
│   │   │   │       ├── lora_export.yaml
│   │   │   │       ├── run_full_8k.sh
│   │   │   │       ├── run_lora_8k.sh
│   │   │   │       └── run_lora_export.sh
│   │   │   ├── ERNIE-4.5-21B-A3B-PT/
│   │   │   │   └── sft/
│   │   │   │       ├── full_8k.yaml
│   │   │   │       ├── lora_8k.yaml
│   │   │   │       ├── lora_export.yaml
│   │   │   │       ├── run_full_8k.sh
│   │   │   │       ├── run_lora_8k.sh
│   │   │   │       └── run_lora_export.sh
│   │   │   └── PaddleOCR-VL/
│   │   │       └── sft/
│   │   │           ├── paddleocr-vl_full_16k_config.yaml
│   │   │           ├── paddleocr-vl_lora_16k_config.yaml
│   │   │           ├── paddleocr-vl_lora_export.yaml
│   │   │           ├── run_paddleocr-vl_full_16k.sh
│   │   │           ├── run_paddleocr-vl_lora_16k.sh
│   │   │           └── run_paddleocr-vl_lora_export.sh
│   │   ├── metax/
│   │   │   ├── ERNIE-4.5-0.3B/
│   │   │   │   └── sft/
│   │   │   │       ├── lora.yaml
│   │   │   │       ├── run_lora.sh
│   │   │   │       ├── run_sft.sh
│   │   │   │       └── sft.yaml
│   │   │   └── ERNIE-4.5-21B-A3B/
│   │   │       └── sft/
│   │   │           ├── lora.yaml
│   │   │           ├── run_lora.sh
│   │   │           ├── run_sft.sh
│   │   │           └── sft.yaml
│   │   ├── pt/
│   │   │   ├── eb45_pretrain/
│   │   │   │   ├── 21b_8_gpus.yaml
│   │   │   │   ├── 300b_2016_gpus.yaml
│   │   │   │   ├── 300b_4_nodes_ce.yaml
│   │   │   │   ├── 300b_8_gpus_ci.yaml
│   │   │   │   ├── 300b_96gpus.yaml
│   │   │   │   └── 300b_96gpus_small_acc.yaml
│   │   │   ├── full.yaml
│   │   │   ├── full_offline_data.yaml
│   │   │   ├── full_tp_pp.yaml
│   │   │   ├── full_tp_pp_ep.yaml
│   │   │   ├── lora.yaml
│   │   │   ├── lora_tp_pp.yaml
│   │   │   └── lora_tp_pp_ep.yaml
│   │   ├── run_export.yaml
│   │   ├── sft/
│   │   │   ├── full.yaml
│   │   │   ├── full_function_call.yaml
│   │   │   ├── full_tp_pp.yaml
│   │   │   ├── full_tp_pp_ep.yaml
│   │   │   ├── lora.yaml
│   │   │   ├── lora_tp_pp.yaml
│   │   │   └── lora_tp_pp_ep.yaml
│   │   ├── sft-vl/
│   │   │   ├── full.yaml
│   │   │   ├── full_fsdp.yaml
│   │   │   ├── full_tp.yaml
│   │   │   ├── lora.yaml
│   │   │   ├── lora_fsdp.yaml
│   │   │   └── lora_tp.yaml
│   │   └── xpu/
│   │       ├── DeepseekV3/
│   │       │   └── sft/
│   │       │       ├── full_32k_config.yaml
│   │       │       ├── full_4k_config.yaml
│   │       │       ├── run_full_32k.sh
│   │       │       └── run_full_4k.sh
│   │       ├── ERNIE-4.5-0.3B/
│   │       │   └── sft/
│   │       │       ├── full_8k.yaml
│   │       │       ├── lora_8k.yaml
│   │       │       └── lora_8k_export.yaml
│   │       ├── ERNIE-4.5-21B-A3B/
│   │       │   └── sft/
│   │       │       ├── full_32k.yaml
│   │       │       ├── lora_32k.yaml
│   │       │       ├── lora_32k_export.yaml
│   │       │       └── run_lora_32k.sh
│   │       ├── ERNIE-4.5-21B-A3B-Thinking/
│   │       │   └── sft/
│   │       │       └── full_8k.yaml
│   │       ├── ERNIE-4.5-VL-28B-A3B-Thinking/
│   │       │   └── sft/
│   │       │       └── full_32k.yaml
│   │       └── PaddleOCR-VL/
│   │           └── sft/
│   │               ├── paddleocr-vl_full_16k_config.yaml
│   │               ├── paddleocr-vl_lora_16k_config.yaml
│   │               ├── paddleocr-vl_lora_export.yaml
│   │               ├── run_paddleocr-vl_full_16k.sh
│   │               ├── run_paddleocr-vl_lora_16k.sh
│   │               └── run_paddleocr-vl_lora_export.sh
│   ├── experiments/
│   │   ├── deepseek_v3_pretrain/
│   │   │   ├── README.md
│   │   │   ├── config/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── config.json
│   │   │   │   ├── configuration.py
│   │   │   │   ├── pretrain_argument.json
│   │   │   │   └── pretrain_argument.yaml
│   │   │   ├── convert_ckpt_to_sft.py
│   │   │   ├── fp8_linear.py
│   │   │   ├── kernel.py
│   │   │   ├── load_hf_ckpt.py
│   │   │   ├── modeling.py
│   │   │   ├── modeling_pp.py
│   │   │   ├── moe_gate.py
│   │   │   ├── moe_layer.py
│   │   │   ├── moe_utils.py
│   │   │   ├── run.sh
│   │   │   ├── run_pretrain.py
│   │   │   ├── script/
│   │   │   │   └── train_gpu.sh
│   │   │   └── token_dispatcher.py
│   │   ├── ernie_pretrain/
│   │   │   ├── README.md
│   │   │   ├── README_zh.md
│   │   │   ├── demo_data/
│   │   │   │   ├── data-1-part0.idx
│   │   │   │   └── data-1-part1.idx
│   │   │   ├── ernie/
│   │   │   │   ├── config.py
│   │   │   │   ├── model_config.py
│   │   │   │   ├── pretrain.py
│   │   │   │   └── src/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── callbacks/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   ├── fp8_quant_weight_callback.py
│   │   │   │       │   ├── gc_callback.py
│   │   │   │       │   ├── logging_callback.py
│   │   │   │       │   ├── moe_correction_bias_adjust_callback.py
│   │   │   │       │   ├── moe_logging_callback.py
│   │   │   │       │   ├── ortho_loss_callback.py
│   │   │   │       │   ├── sp_grad_sync_callback.py
│   │   │   │       │   └── tensorboard_callback.py
│   │   │   │       ├── clip/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   └── moe_clip.py
│   │   │   │       ├── lr_schedulers/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   ├── cosine_lr.py
│   │   │   │       │   └── wsd_lr.py
│   │   │   │       ├── tokenizers/
│   │   │   │       │   ├── tokenization_eb_v2.py
│   │   │   │       │   └── tokenizer_model/
│   │   │   │       │       ├── added_tokens.json
│   │   │   │       │       ├── special_tokens_map.json
│   │   │   │       │       ├── tokenizer.model
│   │   │   │       │       └── tokenizer_config.json
│   │   │   │       ├── trainers/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   ├── data_parallel.py
│   │   │   │       │   ├── dygraph_optimizer/
│   │   │   │       │   │   └── hybrid_parallel_optimizer.py
│   │   │   │       │   └── pretraining_trainer.py
│   │   │   │       └── utils/
│   │   │   │           ├── __init__.py
│   │   │   │           ├── logging.py
│   │   │   │           ├── misc.py
│   │   │   │           ├── seed_utils.py
│   │   │   │           └── training_utils.py
│   │   │   ├── model_configs/
│   │   │   │   ├── ERNIE-4p5-21B-A3B/
│   │   │   │   │   └── model_config.json
│   │   │   │   └── ERNIE-4p5-300B-A47B/
│   │   │   │       └── model_config.json
│   │   │   ├── models/
│   │   │   │   ├── comm_utils.py
│   │   │   │   ├── ernie/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── configuration.py
│   │   │   │   │   ├── modeling.py
│   │   │   │   │   ├── modeling_moe.py
│   │   │   │   │   └── modeling_pp.py
│   │   │   │   ├── fp8_linear.py
│   │   │   │   ├── moe/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── moe_layer.py
│   │   │   │   │   ├── token_dispatcher/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── fp8_utils.py
│   │   │   │   │   │   └── moe_utils.py
│   │   │   │   │   └── top2_gate.py
│   │   │   │   ├── sequence_parallel_utils.py
│   │   │   │   └── utils.py
│   │   │   ├── requirements.txt
│   │   │   ├── scripts/
│   │   │   │   └── ERNIE-4p5-300B-A47B/
│   │   │   │       ├── ci_ce/
│   │   │   │       │   ├── train_4_nodes_ce.sh
│   │   │   │       │   └── train_8_gpus_ci.sh
│   │   │   │       ├── train_2016_gpus.sh
│   │   │   │       └── train_96_gpus.sh
│   │   │   ├── tools/
│   │   │   │   ├── sharded_to_uc/
│   │   │   │   │   ├── README_zh.md
│   │   │   │   │   ├── convert_multi_nodes_sharded_to_single_uc.sh
│   │   │   │   │   ├── convert_sharded_to_uc.py
│   │   │   │   │   ├── gather_all_ckpt.py
│   │   │   │   │   └── merge_sharding_ep.py
│   │   │   │   └── uc_to_sharded/
│   │   │   │       ├── README.md
│   │   │   │       ├── README_zh.md
│   │   │   │       └── convert_uc_to_sharded.py
│   │   │   └── yamls/
│   │   │       ├── ERNIE-4p5-21B-A3B/
│   │   │       │   └── pretrain_8_gpus.yaml
│   │   │       └── ERNIE-4p5-300B-A47B/
│   │   │           ├── ci_ce/
│   │   │           │   ├── pretrain_4_nodes_ce.yaml
│   │   │           │   └── pretrain_8_gpus_ci.yaml
│   │   │           ├── pretrain_2016_gpus.yaml
│   │   │           ├── pretrain_96_gpus.yaml
│   │   │           └── pretrain_96_gpus_small_acc.yaml
│   │   ├── glm_pretrain/
│   │   │   └── GLM4.5-Air.yaml
│   │   └── paddlefleet/
│   │       ├── glm45.json
│   │       ├── glm45_provider.py
│   │       ├── glm45_single_card.json
│   │       ├── qwen_provider.py
│   │       ├── qwen_single_card.json
│   │       ├── run_glm45.sh
│   │       └── run_pretrain.py
│   └── tools/
│       ├── create_pretraining_data.py
│       ├── gpt-oss_weight_change/
│       │   ├── README.md
│       │   └── change_weight_dtype.py
│       ├── merge.py
│       └── trans_paddlenlp2hf.py
├── paddleformers/
│   ├── __init__.py
│   ├── cli/
│   │   ├── __init__.py
│   │   ├── cli.py
│   │   ├── export/
│   │   │   ├── __init__.py
│   │   │   └── export.py
│   │   ├── hparams/
│   │   │   ├── __init__.py
│   │   │   ├── data_args.py
│   │   │   ├── export_args.py
│   │   │   ├── finetuning_args.py
│   │   │   ├── generating_args.py
│   │   │   ├── model_args.py
│   │   │   ├── parser.py
│   │   │   ├── preprocess_args.py
│   │   │   └── server_args.py
│   │   ├── launcher.py
│   │   ├── train/
│   │   │   ├── __init__.py
│   │   │   ├── auto_parallel/
│   │   │   │   ├── __init__.py
│   │   │   │   └── workflow.py
│   │   │   ├── deepseek_v3_pretrain/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── configuration.py
│   │   │   │   ├── fp8_linear.py
│   │   │   │   ├── kernel.py
│   │   │   │   ├── modeling.py
│   │   │   │   ├── modeling_pp.py
│   │   │   │   ├── moe_gate.py
│   │   │   │   ├── moe_layer.py
│   │   │   │   ├── moe_utils.py
│   │   │   │   ├── token_dispatcher.py
│   │   │   │   ├── utils/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── convert_ckpt_to_sft.py
│   │   │   │   │   └── load_hf_ckpt.py
│   │   │   │   └── workflow.py
│   │   │   ├── dpo/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── data_config.py
│   │   │   │   ├── dpo_argument.py
│   │   │   │   ├── dpo_estimate_training.py
│   │   │   │   ├── dpo_trainer.py
│   │   │   │   └── workflow.py
│   │   │   ├── ernie_pretrain/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── model_config.py
│   │   │   │   ├── models/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── comm_utils.py
│   │   │   │   │   ├── ernie/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── configuration.py
│   │   │   │   │   │   ├── modeling.py
│   │   │   │   │   │   ├── modeling_moe.py
│   │   │   │   │   │   └── modeling_pp.py
│   │   │   │   │   ├── fp8_linear.py
│   │   │   │   │   ├── moe/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── moe_layer.py
│   │   │   │   │   │   ├── token_dispatcher/
│   │   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   │   ├── fp8_utils.py
│   │   │   │   │   │   │   └── moe_utils.py
│   │   │   │   │   │   └── top2_gate.py
│   │   │   │   │   ├── sequence_parallel_utils.py
│   │   │   │   │   └── utils.py
│   │   │   │   ├── src/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── callbacks/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── fp8_quant_weight_callback.py
│   │   │   │   │   │   ├── gc_callback.py
│   │   │   │   │   │   ├── logging_callback.py
│   │   │   │   │   │   ├── moe_correction_bias_adjust_callback.py
│   │   │   │   │   │   ├── moe_logging_callback.py
│   │   │   │   │   │   ├── ortho_loss_callback.py
│   │   │   │   │   │   ├── sp_grad_sync_callback.py
│   │   │   │   │   │   └── tensorboard_callback.py
│   │   │   │   │   ├── clip/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── moe_clip.py
│   │   │   │   │   ├── lr_schedulers/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── cosine_lr.py
│   │   │   │   │   │   └── wsd_lr.py
│   │   │   │   │   ├── tokenizers/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── tokenization_eb_v2.py
│   │   │   │   │   ├── trainers/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── data_parallel.py
│   │   │   │   │   │   ├── dygraph_optimizer/
│   │   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   │   └── hybrid_parallel_optimizer.py
│   │   │   │   │   │   └── pretraining_trainer.py
│   │   │   │   │   └── utils/
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── logging.py
│   │   │   │   │       ├── misc.py
│   │   │   │   │       ├── seed_utils.py
│   │   │   │   │       └── training_utils.py
│   │   │   │   └── workflow.py
│   │   │   ├── sft/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── dataset_formatting.py
│   │   │   │   ├── make_data_utils.py
│   │   │   │   ├── sft_config.py
│   │   │   │   ├── sft_trainer.py
│   │   │   │   └── workflow.py
│   │   │   └── tuner.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── llm_utils.py
│   │       ├── mllm_utils.py
│   │       └── process.py
│   ├── data/
│   │   ├── __init__.py
│   │   ├── blendable_dataset.py
│   │   ├── causal_dataset.py
│   │   ├── collate.py
│   │   ├── data_collator.py
│   │   ├── dist_dataloader.py
│   │   ├── indexed_dataset.py
│   │   ├── iterator.py
│   │   ├── sampler.py
│   │   ├── tokenizer.py
│   │   └── vocab.py
│   ├── datasets/
│   │   ├── DPODataset.py
│   │   ├── SFTDataset.py
│   │   ├── __init__.py
│   │   ├── collate.py
│   │   ├── data_utils.py
│   │   ├── dataset.py
│   │   ├── loader.py
│   │   ├── reader/
│   │   │   ├── __init__.py
│   │   │   ├── convertor.py
│   │   │   ├── data_info.json
│   │   │   ├── download_manager.py
│   │   │   ├── file_reader.py
│   │   │   ├── io.py
│   │   │   ├── mix_datasets.py
│   │   │   └── multi_source_datasets.py
│   │   ├── rlhf_datasets/
│   │   │   ├── __init__.py
│   │   │   ├── protocol.py
│   │   │   └── rl_dataset.py
│   │   ├── sampler/
│   │   │   └── __init__.py
│   │   └── template/
│   │       ├── __init__.py
│   │       ├── augment_utils.py
│   │       ├── formatter.py
│   │       ├── grounding_plugin.py
│   │       ├── mm_plugin.py
│   │       ├── template.py
│   │       └── tool_utils.py
│   ├── generation/
│   │   ├── __init__.py
│   │   ├── configuration_utils.py
│   │   ├── logits_process.py
│   │   ├── stopping_criteria.py
│   │   ├── streamers.py
│   │   └── utils.py
│   ├── mergekit/
│   │   ├── __init__.py
│   │   ├── merge_config.py
│   │   ├── merge_method.py
│   │   ├── merge_model.py
│   │   ├── merge_utils.py
│   │   └── sparsify_method.py
│   ├── nn/
│   │   ├── __init__.py
│   │   ├── activation.py
│   │   ├── attention/
│   │   │   ├── __init__.py
│   │   │   ├── eager_attention.py
│   │   │   ├── flashmask_attention.py
│   │   │   ├── interface.py
│   │   │   ├── sdpa_attention.py
│   │   │   ├── sink_impl.py
│   │   │   └── utils.py
│   │   ├── criterion/
│   │   │   ├── __init__.py
│   │   │   ├── dpo_loss.py
│   │   │   ├── interface.py
│   │   │   ├── kto_loss.py
│   │   │   ├── loss_utils.py
│   │   │   └── sft_loss.py
│   │   ├── embedding.py
│   │   ├── general.py
│   │   ├── linear.py
│   │   ├── lm_head.py
│   │   ├── mlp.py
│   │   ├── moe/
│   │   │   ├── __init__.py
│   │   │   ├── abstract.py
│   │   │   ├── all_gather.py
│   │   │   ├── all_to_all.py
│   │   │   ├── moe_allgather_layer.py
│   │   │   ├── moe_alltoall_layer.py
│   │   │   ├── moe_block.py
│   │   │   ├── topk_gate.py
│   │   │   └── utils.py
│   │   ├── moe_deepep/
│   │   │   ├── __init__.py
│   │   │   ├── modular_moe_layer.py
│   │   │   ├── moe_communication.py
│   │   │   ├── moe_expert.py
│   │   │   ├── moe_factory.py
│   │   │   ├── moe_gate.py
│   │   │   ├── moe_loss.py
│   │   │   └── moe_loss_instance.py
│   │   ├── norm.py
│   │   └── pp_model.py
│   ├── peft/
│   │   ├── __init__.py
│   │   └── lora/
│   │       ├── __init__.py
│   │       ├── auto_lora_model.py
│   │       ├── lora_config.py
│   │       ├── lora_layers.py
│   │       ├── lora_model.py
│   │       ├── lora_quant_layers.py
│   │       ├── lora_quantization_layers.py
│   │       ├── loraga_utils.py
│   │       └── utils.py
│   ├── quantization/
│   │   ├── __init__.py
│   │   ├── checkpoint_quantization_utils.py
│   │   ├── hadamard_utils.py
│   │   ├── qat_utils.py
│   │   ├── qlora.py
│   │   ├── quantization_config.py
│   │   ├── quantization_linear.py
│   │   ├── quantization_utils.py
│   │   └── unified_checkpoint_quantization.py
│   ├── trainer/
│   │   ├── __init__.py
│   │   ├── argparser.py
│   │   ├── integrations.py
│   │   ├── plugins/
│   │   │   ├── __init__.py
│   │   │   ├── npu_plugin.py
│   │   │   └── timer.py
│   │   ├── trainer.py
│   │   ├── trainer_callback.py
│   │   ├── trainer_utils.py
│   │   ├── training_args.py
│   │   ├── unified_checkpoint/
│   │   │   ├── __init__.py
│   │   │   ├── async_handler.py
│   │   │   ├── check_completion.py
│   │   │   ├── load_dynamic.py
│   │   │   ├── load_local.py
│   │   │   ├── load_save_single_card.py
│   │   │   ├── sharding_split_param_utils.py
│   │   │   ├── shared_memory_utils.py
│   │   │   ├── unified_checkpoint.py
│   │   │   └── utils.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── async_save.py
│   │       ├── ckpt_converter.py
│   │       ├── doc.py
│   │       ├── helper.py
│   │       ├── offload_optimizer.py
│   │       ├── reshard/
│   │       │   ├── __init__.py
│   │       │   ├── common.py
│   │       │   ├── pp_reshard.py
│   │       │   ├── sharding_v1.py
│   │       │   └── sharding_v2.py
│   │       ├── sharding_io.py
│   │       └── zero_cost_checkpoint.py
│   ├── transformers/
│   │   ├── __init__.py
│   │   ├── activations.py
│   │   ├── aistudio_utils.py
│   │   ├── attention_utils.py
│   │   ├── audio_processing_utils.py
│   │   ├── audio_utils.py
│   │   ├── auto/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── factory.py
│   │   │   ├── feature_extraction.py
│   │   │   ├── image_processing.py
│   │   │   ├── modeling.py
│   │   │   ├── processing.py
│   │   │   ├── tokenizer.py
│   │   │   └── video_processing.py
│   │   ├── auto_utils.py
│   │   ├── cache_utils.py
│   │   ├── configuration_utils.py
│   │   ├── context_parallel_utils.py
│   │   ├── contrastive_loss.py
│   │   ├── conversion_utils.py
│   │   ├── deepseek_v3/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── mfu_utils.py
│   │   │   └── modeling.py
│   │   ├── download_utils.py
│   │   ├── dpo_criterion.py
│   │   ├── embedding_utils.py
│   │   ├── ernie4_5/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── modeling.py
│   │   │   └── tokenizer.py
│   │   ├── ernie4_5_moe/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   └── modeling.py
│   │   ├── ernie4_5_moe_vl/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── image_processor.py
│   │   │   ├── model/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── comm_utils.py
│   │   │   │   ├── configuration.py
│   │   │   │   ├── dfnrope/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── activation.py
│   │   │   │   │   ├── configuration.py
│   │   │   │   │   ├── modeling.py
│   │   │   │   │   └── modeling_pp.py
│   │   │   │   ├── distributed/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── common_dist_utils.py
│   │   │   │   │   └── xpu_dist_utils.py
│   │   │   │   ├── fusion_ops/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── common_fusion_ops.py
│   │   │   │   │   └── npu_fusion_ops.py
│   │   │   │   ├── longcontext_ops.py
│   │   │   │   ├── loss/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── dpo.py
│   │   │   │   ├── modeling.py
│   │   │   │   ├── modeling_moe.py
│   │   │   │   ├── modeling_moe_pp.py
│   │   │   │   ├── modeling_moe_vl.py
│   │   │   │   ├── modeling_moe_vl_pp.py
│   │   │   │   ├── moe/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── moe_all_gather_layer.py
│   │   │   │   │   ├── moe_layer.py
│   │   │   │   │   └── topk_gate.py
│   │   │   │   ├── refined_recompute/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── utils.py
│   │   │   │   ├── sequence_parallel_utils.py
│   │   │   │   └── utils/
│   │   │   │       ├── __init__.py
│   │   │   │       └── misc.py
│   │   │   ├── modeling.py
│   │   │   ├── processor.py
│   │   │   ├── tokenizer.py
│   │   │   └── vision_process.py
│   │   ├── feature_extraction_utils.py
│   │   ├── fp8_utils.py
│   │   ├── fused_a2a.py
│   │   ├── gemma3_text/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   └── modeling.py
│   │   ├── glm4_moe/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   └── modeling.py
│   │   ├── glm4v_moe/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── image_processor.py
│   │   │   ├── image_processor_fast.py
│   │   │   ├── modeling.py
│   │   │   ├── processor.py
│   │   │   └── video_processor.py
│   │   ├── glm_ocr/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── image_processor.py
│   │   │   ├── modeling.py
│   │   │   └── processor.py
│   │   ├── gpt_oss/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   └── modeling.py
│   │   ├── gpt_provider.py
│   │   ├── image_processing_utils.py
│   │   ├── image_processing_utils_fast.py
│   │   ├── image_transforms.py
│   │   ├── image_utils.py
│   │   ├── kimi_k2/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── modeling.py
│   │   │   └── tokenizer.py
│   │   ├── kimi_k25/
│   │   │   ├── __init__.py
│   │   │   ├── media_utils.py
│   │   │   ├── processor.py
│   │   │   ├── tokenizer.py
│   │   │   ├── tool_declaration_ts.py
│   │   │   └── vision_processor.py
│   │   ├── kto_criterion.py
│   │   ├── legacy/
│   │   │   ├── __init__.py
│   │   │   ├── tokenizer_utils.py
│   │   │   └── tokenizer_utils_base.py
│   │   ├── linear_utils.py
│   │   ├── llama/
│   │   │   ├── __init__.py
│   │   │   ├── auto_dist_config.py
│   │   │   ├── configuration.py
│   │   │   ├── modeling.py
│   │   │   ├── tokenizer.py
│   │   │   └── tokenizer_fast.py
│   │   ├── masking_utils.py
│   │   ├── mc2_parallel_linear.py
│   │   ├── model_outputs.py
│   │   ├── model_provider.py
│   │   ├── model_utils.py
│   │   ├── modeling_rope_utils.py
│   │   ├── modelscope_utils.py
│   │   ├── moe_gate.py
│   │   ├── moe_gate_auto.py
│   │   ├── moe_layer.py
│   │   ├── moe_layer_auto.py
│   │   ├── moe_utils.py
│   │   ├── ofa_utils.py
│   │   ├── optimization.py
│   │   ├── paddle_vision_utils.py
│   │   ├── paddleocr_vl/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── image_processor.py
│   │   │   ├── modeling.py
│   │   │   └── processor.py
│   │   ├── phi3/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── modeling.py
│   │   │   └── tokenizer.py
│   │   ├── processing_utils.py
│   │   ├── qwen2/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── modeling.py
│   │   │   ├── tokenizer.py
│   │   │   └── tokenizer_fast.py
│   │   ├── qwen2_5_vl/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── modeling.py
│   │   │   └── processor.py
│   │   ├── qwen2_moe/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   └── modeling.py
│   │   ├── qwen2_vl/
│   │   │   ├── __init__.py
│   │   │   ├── image_processor.py
│   │   │   ├── image_processor_fast.py
│   │   │   ├── processor.py
│   │   │   ├── video_processor.py
│   │   │   └── vision_process.py
│   │   ├── qwen3/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   └── modeling.py
│   │   ├── qwen3_5/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   └── modeling.py
│   │   ├── qwen3_moe/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   └── modeling.py
│   │   ├── qwen3_next/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   └── modeling.py
│   │   ├── qwen3_omni_moe/
│   │   │   ├── __init__.py
│   │   │   └── processor.py
│   │   ├── qwen3_vl/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── modeling.py
│   │   │   ├── modeling_fleet.py
│   │   │   ├── processor.py
│   │   │   └── video_processor.py
│   │   ├── qwen3_vl_moe/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   └── modeling.py
│   │   ├── refined_recompute.py
│   │   ├── ring_flash_attention.py
│   │   ├── segment_parallel_utils.py
│   │   ├── sequence_parallel_utils.py
│   │   ├── tensor_parallel_utils.py
│   │   ├── token_dispatcher.py
│   │   ├── tokenizer_utils.py
│   │   ├── tokenizer_utils_base.py
│   │   ├── utils.py
│   │   ├── video_processing_utils.py
│   │   ├── video_utils.py
│   │   ├── vocab_utils.py
│   │   └── whisper/
│   │       ├── __init__.py
│   │       └── processor.py
│   ├── triton_kernels/
│   │   ├── __init__.py
│   │   └── rope_triton.py
│   ├── utils/
│   │   ├── __init__.py
│   │   ├── adamw_triton.py
│   │   ├── batch_sampler.py
│   │   ├── converter.py
│   │   ├── distributed.py
│   │   ├── doc_parser.py
│   │   ├── download/
│   │   │   ├── __init__.py
│   │   │   ├── aistudio_hub_download.py
│   │   │   ├── common.py
│   │   │   └── download.py
│   │   ├── downloader.py
│   │   ├── env.py
│   │   ├── fault_tolerance.py
│   │   ├── ie_utils.py
│   │   ├── image_utils.py
│   │   ├── import_utils.py
│   │   ├── infohub.py
│   │   ├── initializer.py
│   │   ├── lazy_import.py
│   │   ├── log.py
│   │   ├── masking_utils.py
│   │   ├── memory_utils.py
│   │   ├── moe_hybrid_parallel_optimizer.py
│   │   ├── nested.py
│   │   ├── optimizer.py
│   │   ├── paddle_patch.py
│   │   ├── pdc_sdk.py
│   │   ├── perf_utils.py
│   │   ├── profiler.py
│   │   ├── safetensors.py
│   │   ├── serialization.py
│   │   ├── tools.py
│   │   ├── type_validators.py
│   │   └── upcast_downcast_triton.py
│   └── version/
│       ├── __init__.py
│       └── git.py
├── pyproject.toml
├── requirements.txt
├── scripts/
│   ├── ci_utils/
│   │   ├── __init__.py
│   │   ├── log_analyzer.py
│   │   └── training_utils.py
│   ├── codestyle/
│   │   ├── check_dead_links.py
│   │   ├── check_spaces.py
│   │   └── get_modified_files.py
│   ├── dependence/
│   │   └── build.sh
│   ├── iluvatar_ci/
│   │   ├── base_value/
│   │   │   └── ERNIE-21B-SFT-LOSS.json
│   │   ├── config/
│   │   │   └── ERNIE-21B-SFT.yaml
│   │   ├── conftest.py
│   │   └── test_ernie_21b_sft.py
│   ├── regression/
│   │   ├── ci_model_unittest.sh
│   │   ├── test_dpo_tiny-random-glm4moe.py
│   │   ├── test_pt_tiny-random-glm4moe.py
│   │   └── test_sft_tiny-random-glm4moe.py
│   ├── unit_test/
│   │   ├── ci_unittest.sh
│   │   └── gen_allure_report.py
│   └── xpu_ci/
│       ├── README.md
│       ├── base_value/
│       │   ├── ernie_21b_sft_loss.json
│       │   └── ernie_28b_thinking_sft_loss.json
│       ├── config/
│       │   ├── ernie_21b_sft.yaml
│       │   └── ernie_vl_28b_sft.yaml
│       ├── conftest.py
│       ├── test_ernie_21b_sft.py
│       ├── test_ernie_28b_thinking_sft.py
│       └── test_example_template.py.template
├── setup.py
└── tests/
    ├── README.md
    ├── __init__.py
    ├── check_log_for_exitcode.py
    ├── common_test.py
    ├── config/
    │   ├── benchmark/
    │   │   └── config/
    │   │       ├── pt/
    │   │       │   ├── DeepSeek-V3.yaml
    │   │       │   ├── ERNIE45-21B.yaml
    │   │       │   ├── ERNIE45-300B.yaml
    │   │       │   ├── GLM4.5-Air.yaml
    │   │       │   ├── GLM4.5-Air_64k.yaml
    │   │       │   ├── GLM4.5-Air_FP8.yaml
    │   │       │   ├── Qwen3-30B-A3B-Base-64k.yaml
    │   │       │   └── Qwen3-30B-A3B-Base.yaml
    │   │       └── sft/
    │   │           ├── GLM4.5-Air.yaml
    │   │           ├── GLM4.5-Air_128k.yaml
    │   │           ├── GLM4.5-Air_64k.yaml
    │   │           ├── Qwen3-30B-A3B-Base-64k.yaml
    │   │           ├── Qwen3-30B-A3B-Base.yaml
    │   │           ├── Qwen3-VL-30B-A3B-Instruct.yaml
    │   │           └── Qwen3-VL-8B-Instruct.yaml
    │   └── ci/
    │       ├── glm45_dpo.yaml
    │       ├── glm45_dpo_lora.yaml
    │       ├── glm45_lora.yaml
    │       ├── glm45_pt.yaml
    │       ├── glm45_pt_fp8.yaml
    │       ├── glm45_pt_grouped_gemm.yaml
    │       ├── glm45_sft.yaml
    │       ├── glm45_single_pt-test.yaml
    │       ├── qwen3_multicard_lora.yaml
    │       ├── qwen3_multicard_pt.yaml
    │       ├── qwen3_multicard_sft.yaml
    │       ├── qwen3_pt.yaml
    │       ├── qwen3vl_lora.yaml
    │       ├── qwen3vl_sft.yaml
    │       ├── qwen3vl_sft_fsdp.yaml
    │       ├── qwen3vl_sft_moe.yaml
    │       ├── qwen3vl_sft_moe_a100.yaml
    │       └── qwen3vl_sft_single.yaml
    ├── conftest.py
    ├── data/
    │   ├── __init__.py
    │   ├── test_blendable_dataset.py
    │   ├── test_collate.py
    │   ├── test_data_collator.py
    │   ├── test_sampler.py
    │   └── test_vocab.py
    ├── dataset/
    │   ├── __init__.py
    │   ├── test_convertor.py
    │   ├── test_ernie_datasets.py
    │   ├── test_file_reader.py
    │   ├── test_io.py
    │   └── test_iter_datasets.py
    ├── fixtures/
    │   ├── chat_template.json
    │   ├── chat_template_with_context.json
    │   ├── dummy/
    │   │   ├── dpo/
    │   │   │   ├── eval.jsonl
    │   │   │   ├── function-call-eval.jsonl
    │   │   │   ├── function-call-train.jsonl
    │   │   │   └── train.jsonl
    │   │   ├── dpo-vl/
    │   │   │   ├── eval.jsonl
    │   │   │   └── train.jsonl
    │   │   ├── io/
    │   │   │   ├── train.jsonl
    │   │   │   └── train.parquet
    │   │   ├── pt/
    │   │   │   ├── eval.jsonl
    │   │   │   └── train.jsonl
    │   │   ├── sft/
    │   │   │   ├── eval.jsonl
    │   │   │   ├── function-call-eval.jsonl
    │   │   │   ├── function-call-train.jsonl
    │   │   │   └── train.jsonl
    │   │   ├── sft-vl/
    │   │   │   ├── thinking_safety_demo.jsonl
    │   │   │   └── train.jsonl
    │   │   └── tnews/
    │   │       ├── dev.json
    │   │       └── train.json
    │   └── sample_text.txt
    ├── generation/
    │   ├── __init__.py
    │   ├── test_logits_process.py
    │   ├── test_stopping_criteria.py
    │   ├── test_streamers.py
    │   └── test_synced_gpus.py
    ├── integration_test/
    │   ├── check_loss.py
    │   ├── check_pr_approval.py
    │   ├── check_precision_approval.sh
    │   ├── glm45_a100.sh
    │   ├── glm45_dpo.sh
    │   ├── glm45_dpo_lora.sh
    │   ├── glm45_lora.sh
    │   ├── glm45_pt.sh
    │   ├── glm45_pt_ep4.sh
    │   ├── glm45_pt_fp8.sh
    │   ├── glm45_pt_grouped_gemm.sh
    │   ├── glm45_pt_single_card.sh
    │   ├── glm45_sft.sh
    │   ├── preprocess.sh
    │   ├── qwen.sh
    │   ├── qwen3_a100.sh
    │   ├── qwen3_single_card.sh
    │   ├── qwen3vl_lora.sh
    │   ├── qwen3vl_sft.sh
    │   ├── qwen3vl_sft_single_card.sh
    │   └── update_precision.sh
    ├── mergekit/
    │   ├── __init__.py
    │   ├── test_merge_config.py
    │   ├── test_merge_method.py
    │   ├── test_merge_model.py
    │   └── test_sparsify_method.py
    ├── nn/
    │   ├── __init__.py
    │   ├── test_activation.py
    │   ├── test_attention.py
    │   ├── test_criterion.py
    │   ├── test_embedding.py
    │   ├── test_linear.py
    │   ├── test_lm_head.py
    │   ├── test_mlp.py
    │   └── test_norm.py
    ├── parallel_launch.py
    ├── peft/
    │   ├── __init__.py
    │   ├── test_lora.py
    │   └── test_quant_lora.py
    ├── quantization/
    │   ├── __init__.py
    │   └── test_quant.py
    ├── requirements.txt
    ├── testing_utils.py
    ├── trainer/
    │   ├── test_argparser.py
    │   ├── test_hf_format_saver_tp4_sharding2.py
    │   ├── test_lora_unified_checkpoint.py
    │   ├── test_moe_unified_checkpoint.py
    │   ├── test_trainer_callback.py
    │   ├── test_trainer_visualization.py
    │   ├── test_unified_checkpoint.py
    │   ├── trainer_utils.py
    │   └── unified-ckpt-llama-170m/
    │       └── config.json
    ├── transformers/
    │   ├── __init__.py
    │   ├── auto/
    │   │   ├── __init__.py
    │   │   ├── test_configuration.py
    │   │   ├── test_feature_extraction.py
    │   │   ├── test_image_processor.py
    │   │   ├── test_modeling.py
    │   │   ├── test_processor.py
    │   │   ├── test_tokenizer.py
    │   │   ├── test_tokenizer_without_paddle.py
    │   │   └── test_video_processor.py
    │   ├── deepseek_v3/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── ernie4_5/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── ernie4_5_moe/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── ernie4_5_moe_vl/
    │   │   ├── __init__.py
    │   │   ├── test_modeling.py
    │   │   ├── test_processor.py
    │   │   ├── test_tokenizer.py
    │   │   └── test_vision_process.py
    │   ├── gemma3_text/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── glm4_moe/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── glm4v_moe/
    │   │   ├── __init__.py
    │   │   ├── test_image_processor.py
    │   │   ├── test_modeling.py
    │   │   └── test_processor.py
    │   ├── glm_ocr/
    │   │   ├── __init__.py
    │   │   ├── test_image_processor.py
    │   │   ├── test_modeling.py
    │   │   └── test_processor.py
    │   ├── gpt_oss/
    │   │   ├── __init__.py
    │   │   ├── test_fp4_to_bf16.py
    │   │   └── test_modeling.py
    │   ├── kimi_k2/
    │   │   └── test_modeling.py
    │   ├── kimi_k25/
    │   │   ├── __init__.py
    │   │   └── test_processor.py
    │   ├── llama/
    │   │   ├── __init__.py
    │   │   ├── test_modeling.py
    │   │   └── test_tokenizer.py
    │   ├── paddleocr_vl/
    │   │   ├── __init__.py
    │   │   ├── test_modeling.py
    │   │   └── test_processor.py
    │   ├── phi3/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── qwen2/
    │   │   ├── __init__.py
    │   │   ├── test_modeling.py
    │   │   └── test_tokenizer.py
    │   ├── qwen2_5_vl/
    │   │   ├── __init__.py
    │   │   ├── test_modeling.py
    │   │   └── test_processor.py
    │   ├── qwen2_vl/
    │   │   ├── __init__.py
    │   │   ├── test_image_processor.py
    │   │   ├── test_processor.py
    │   │   ├── test_video_processor.py
    │   │   └── test_vision_process.py
    │   ├── qwen2moe/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── qwen3/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── qwen3_omni_moe/
    │   │   ├── __init__.py
    │   │   └── test_processor.py
    │   ├── qwen3_vl/
    │   │   ├── __init__.py
    │   │   ├── test_modeling.py
    │   │   ├── test_processor.py
    │   │   └── test_video_processor.py
    │   ├── qwen3_vl_moe/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── qwen3moe/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── qwen3next/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── test_cache_utils.py
    │   ├── test_configuration_common.py
    │   ├── test_configuration_utils.py
    │   ├── test_conversion_common.py
    │   ├── test_conversion_tp_split_merge.py
    │   ├── test_generation_utils.py
    │   ├── test_hf_feature_extractor.py
    │   ├── test_hf_image_processor.py
    │   ├── test_hf_processor.py
    │   ├── test_hf_tokenizer.py
    │   ├── test_hf_video_processor.py
    │   ├── test_image_processing_common.py
    │   ├── test_masking_utils.py
    │   ├── test_modeling_common.py
    │   ├── test_modeling_rope_utils.py
    │   ├── test_modeling_utils.py
    │   ├── test_processing_common.py
    │   ├── test_ring_flash_attention.py
    │   ├── test_safetensors.py
    │   ├── test_segment_parallel_utils.py
    │   ├── test_tensor_parallel.py
    │   ├── test_utils.py
    │   └── test_video_processing_common.py
    ├── triton/
    │   └── test_rope_triton.py
    └── utils/
        ├── __init__.py
        ├── test_aistudio_download.py
        ├── test_downloader.py
        ├── test_import_utils.py
        ├── test_module/
        │   ├── __init__.py
        │   ├── custom_configuration.py
        │   ├── custom_model.py
        │   ├── custom_tokenizer.py
        │   └── custom_tokenizer_fast.py
        ├── test_serialization.py
        └── test_set_nccl_config.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .copyright.hook
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import io
import re
import sys
import os
import datetime

COPYRIGHT = '''Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.'''

def _generate_copyright(comment_mark):
    copyright=COPYRIGHT.split(os.linesep)
    header = copyright[0].rstrip()

    p = re.search('(\d{4})', header).group(0)
    now = datetime.datetime.now()

    header = header.replace(p,str(now.year))

    ans=[comment_mark + " " + header + os.linesep]
    for idx, line in enumerate(copyright[1:]):
        ans.append(comment_mark + " " + line.rstrip() + os.linesep)

    return ans

def _get_comment_mark(path):
    lang_type=re.compile(r"\.(py|sh)$")
    if lang_type.search(path) is not None:
        return "#"

    lang_type=re.compile(r"\.(h|c|hpp|cc|cpp|cu|go|cuh|proto)$")
    if lang_type.search(path) is not None:
        return "//"

    return None


RE_ENCODE = re.compile(r"^[ \t\v]*#.*?coding[:=]", re.IGNORECASE)
RE_COPYRIGHT = re.compile(r".*Copyright( \(c\))* \d{4}", re.IGNORECASE)
RE_SHEBANG = re.compile(r"^[ \t\v]*#[ \t]?\!")

def _check_copyright(path):
    head=[]
    try:
        with open(path, encoding="utf-8") as f:
            head = [next(f) for x in range(4)]
    except StopIteration:
        pass

    for idx, line in enumerate(head):
        if RE_COPYRIGHT.search(line) is not None:
            return True

    return False

def generate_copyright(path, comment_mark):
    """Insert the license header into the file at ``path``, in place.

    The header is placed at the top of the file, or directly after the last
    shebang / encoding-declaration line found among the first four lines.
    A blank line separates the header from following non-blank content, and
    from the preceding shebang/encoding line when there is one.

    Args:
        path: Path of the UTF-8 text file to rewrite.
        comment_mark: Line-comment prefix passed to _generate_copyright().
    """
    # Read inside a context manager so the handle is closed before rewriting.
    with open(path, encoding="utf-8") as source_file:
        original_contents = source_file.readlines()

    # Index of the first line that may follow the header: one past the last
    # shebang or coding line within the first four lines, else 0.
    insert_line_no = 0
    for i, line in enumerate(original_contents[0:4]):
        if RE_ENCODE.search(line) or RE_SHEBANG.search(line):
            insert_line_no = i + 1

    header_lines = _generate_copyright(comment_mark)
    # Lines read in text mode end with "\n" and the text-mode write below
    # translates "\n" to the platform separator, so separators added here
    # must be "\n" as well.
    blank_line = "\n"

    if insert_line_no == 0:
        # Copy so the list returned by the helper is not mutated.
        new_contents = list(header_lines)
        if len(original_contents) > 0 and len(original_contents[0].strip()) != 0:
            new_contents.append(blank_line)
        new_contents.extend(original_contents)
    else:
        new_contents = original_contents[0:insert_line_no]
        new_contents.append(blank_line)
        new_contents.extend(header_lines)
        if len(original_contents) > insert_line_no and len(original_contents[insert_line_no].strip()) != 0:
            new_contents.append(blank_line)
        new_contents.extend(original_contents[insert_line_no:])

    # Write back with the same encoding used for reading; relying on the
    # locale default can fail or corrupt non-ASCII sources.
    with open(path, 'w', encoding="utf-8") as output_file:
        output_file.write("".join(new_contents))



def main(argv=None):
    """Add the license header to every supported file that lacks one.

    Args:
        argv: Argument list to parse; when None, argparse falls back to the
            process command line.

    Returns:
        The exit status, always 0. Files with an unsupported extension are
        reported on stderr and skipped.
    """
    parser = argparse.ArgumentParser(
        description='Checker for copyright declaration.')
    parser.add_argument('filenames', nargs='*', help='Filenames to check')
    args = parser.parse_args(argv)

    retv = 0
    for path in args.filenames:
        comment_mark = _get_comment_mark(path)
        if comment_mark is None:
            print("warning:Unsupported file", path, file=sys.stderr)
            continue

        # Nothing to do when a notice is already present near the top.
        if _check_copyright(path):
            continue

        generate_copyright(path, comment_mark)

    # Return the status explicitly instead of falling off the end with None.
    return retv


if __name__ == '__main__':
    # Use sys.exit rather than the bare exit() helper, which is installed by
    # the site module and is not guaranteed to exist (e.g. under python -S).
    sys.exit(main())


================================================
FILE: .flake8
================================================
[flake8]
ignore = E203, E402, E501, E731, E741, W503, W605, E722
max-line-length = 119

# E402: module level import not at top of file
per-file-ignores =
    __init__.py:F401,F403,E402

================================================
FILE: .github/CODE_OF_CONDUCT.md
================================================
**简体中文**🀄 | [English🌎](./CODE_OF_CONDUCT_en.md)

# 贡献者公约

## 我们的承诺

身为社区成员、贡献者和领袖，我们承诺使社区参与者不受骚扰，无论其年龄、体型、可见或不可见的缺陷、族裔、性征、性别认同和表达、经验水平、教育程度、社会与经济地位、国籍、相貌、种族、种姓、肤色、宗教信仰、性倾向或性取向如何。

我们承诺以有助于建立开放、友善、多样化、包容、健康社区的方式行事和互动。

## 我们的准则

有助于为我们的社区创造积极环境的行为例子包括但不限于：

* 表现出对他人的同情和善意
* 尊重不同的主张、观点和感受
* 提出和大方接受建设性意见
* 承担责任并向受我们错误影响的人道歉
* 注重社区共同诉求，而非个人得失

不当行为例子包括：

* 使用情色化的语言或图像，及性引诱或挑逗
* 嘲弄、侮辱或诋毁性评论，以及人身或政治攻击
* 公开或私下的骚扰行为
* 未经他人明确许可，公布他人的私人信息，如物理或电子邮件地址
* 其他有理由认定为违反职业操守的不当行为

## 责任和权力

社区领袖有责任解释和落实我们所认可的行为准则，并妥善公正地对他们认为不当、威胁、冒犯或有害的任何行为采取纠正措施。

社区领袖有权力和责任删除、编辑或拒绝与本行为准则不相符的评论（comment）、提交（commits）、代码、维基（wiki）编辑、议题（issues）或其他贡献，并在适当时机告知采取措施的理由。

## 适用范围

本行为准则适用于所有社区场合，也适用于在公共场所代表社区时的个人。

代表社区的情形包括使用官方电子邮件地址、通过官方社交媒体帐户发帖或在线上或线下活动中担任指定代表。

## 监督

辱骂、骚扰或其他不可接受的行为可通过 paddlenlp@baidu.com 向负责监督的社区领袖报告。
所有投诉都将得到及时和公平的审查和调查。

所有社区领袖都有义务尊重任何事件报告者的隐私和安全。

## 处理方针

社区领袖将遵循下列社区处理方针来明确他们所认定违反本行为准则的行为的处理方式：

### 1. 纠正

**社区影响**：使用不恰当的语言或其他在社区中被认定为不符合职业道德或不受欢迎的行为。

**处理意见**：由社区领袖发出非公开的书面警告，明确说明违规行为的性质，并解释举止如何不妥。或将要求公开道歉。

### 2. 警告

**社区影响**：单个或一系列违规行为。

**处理意见**：警告并对连续性行为进行处理。在指定时间内，不得与相关人员互动，包括主动与行为准则执行者互动。这包括避免在社区场所和外部渠道中的互动。违反这些条款可能会导致临时或永久封禁。

### 3. 临时封禁

**社区影响**: 严重违反社区准则，包括持续的不当行为。

**处理意见**: 在指定时间内，暂时禁止与社区进行任何形式的互动或公开交流。在此期间，不得与相关人员进行公开或私下互动，包括主动与行为准则执行者互动。违反这些条款可能会导致永久封禁。

### 4. 永久封禁

**社区影响**：行为模式表现出违反社区准则，包括持续的不当行为、骚扰个人或攻击或贬低某个类别的个体。

**处理意见**：永久禁止在社区内进行任何形式的公开互动。

## 参见

本行为准则改编自 [Contributor Covenant][homepage] 2.1 版, 参见 [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]。

社区处理方针灵感来源于 [Mozilla's code of conduct enforcement ladder][Mozilla CoC]。

有关本行为准则的常见问题的答案，参见 [https://www.contributor-covenant.org/faq][FAQ]。
其他语言翻译参见 [https://www.contributor-covenant.org/translations][translations]。

[homepage]: https://www.contributor-covenant.org
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
[Mozilla CoC]: https://github.com/mozilla/diversity
[FAQ]: https://www.contributor-covenant.org/faq
[translations]: https://www.contributor-covenant.org/translations


================================================
FILE: .github/CODE_OF_CONDUCT_en.md
================================================
[简体中文🀄](./CODE_OF_CONDUCT.md) |  **English**🌎

# Contributor Covenant Code of Conduct

## Our Pledge

We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.

We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.

## Our Standards

Examples of behavior that contributes to a positive environment for our
community include:

* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
  and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
  community

Examples of unacceptable behavior include:

* The use of sexualized language or imagery, and sexual attention or advances of
  any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
  without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Enforcement Responsibilities

Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.

Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.

## Scope

This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
paddlenlp@baidu.com.
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
reporter of any incident.

## Enforcement Guidelines

Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:

### 1. Correction

**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.

**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.

### 2. Warning

**Community Impact**: A violation through a single incident or series of
actions.

**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.

### 3. Temporary Ban

**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.

**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.

### 4. Permanent Ban

**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.

**Consequence**: A permanent ban from any sort of public interaction within the
community.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.1, available at
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].

Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].

For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
[https://www.contributor-covenant.org/translations][translations].

[homepage]: https://www.contributor-covenant.org
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
[Mozilla CoC]: https://github.com/mozilla/diversity
[FAQ]: https://www.contributor-covenant.org/faq
[translations]: https://www.contributor-covenant.org/translations


================================================
FILE: .github/CONTRIBUTING_en.md
================================================
[简体中文🀄](../CONTRIBUTING.md) |  **English**🌎

# Contributing to PaddleFormers

We highly welcome and value your contributions to `PaddleFormers`. The first step to start your contribution is to sign the [PaddlePaddle Contributor License Agreement](https://cla-assistant.io/PaddlePaddle/PaddleFormers).

This document explains our workflow and work style:

## Finding out what to work on Workflow

## Development Workflow

PaddleFormers uses the [Git branching model](http://nvie.com/posts/a-successful-git-branching-model/).  The following steps guide usual contributions.

#### 1. Fork

   Our development community has been growing fast; it doesn't make sense for everyone to write into the official repo.  So, please file Pull Requests from your fork.  To make a fork,  just head over to the GitHub page and click the ["Fork" button](https://help.github.com/articles/fork-a-repo/).

#### 2. Clone

   To make a copy of your fork to your local computers, please run

   ```bash
   git clone https://github.com/<your-github-account>/PaddleFormers
   cd PaddleFormers
   ```

#### 3. Create the local feature branch

   For daily works like adding a new feature or fixing a bug, please open your feature branch before coding:

   ```bash
   git checkout -b my-cool-feature
   ```

#### 4. Set up the development environment

   Before you start coding, you need to set up the development environment. We highly recommend doing all your development in a virtual environment such as
   [venv](https://docs.python.org/3/library/venv.html) or [conda](https://docs.conda.io/en/latest/). After you set up and activate your virtual environment,
   run the following command:

   ```bash
   make install
   ```

   This will set up all the dependencies of `PaddleFormers` as well as the [`pre-commit`](http://pre-commit.com/) tool.

   If you are working on the `examples` or `applications` module and require importing from `PaddleFormers`, make sure you install `PaddleFormers` in editable mode.
   If `PaddleFormers` is already installed in the virtual environment, remove it with `pip uninstall PaddleFormers` before reinstalling it in editable mode with
   `pip install -e .`

#### 5. Develop

   As you develop your new exciting feature, keep in mind that it should be covered by unit tests. All of our unit tests can be found under the `tests` directory.
   You can either modify an existing unit test to cover the new feature, or create a new test from scratch.
   As you finish up your code, you should make sure the test suite passes. You can run the tests impacted by your changes like this:

   ```bash
   pytest tests/<test_to_run>.py
   ```

#### 6. Commit

   We use [`pre-commit`](http://pre-commit.com/) (with [black](https://black.readthedocs.io/en/stable/), [isort](https://pycqa.github.io/isort/) and
   [flake8](https://flake8.pycqa.org/en/latest/) under the hood) to check the style of code and documentation in every commit. When you run `git commit`, you will see
   something like the following:

   ```
    ➜  (my-virtual-env) git commit -m "commiting my cool feature"
    black....................................................................Passed
    isort....................................................................Passed
    flake8...................................................................Passed
    check for merge conflicts................................................Passed
    check for broken symlinks............................(no files to check)Skipped
    detect private key.......................................................Passed
    fix end of files.....................................(no files to check)Skipped
    trim trailing whitespace.............................(no files to check)Skipped
    CRLF end-lines checker...............................(no files to check)Skipped
    CRLF end-lines remover...............................(no files to check)Skipped
    No-tabs checker......................................(no files to check)Skipped
    Tabs remover.........................................(no files to check)Skipped
    copyright_checker........................................................Passed
   ```

   But most of the time things don't go so smoothly. When your code or documentation doesn't meet the standard, the `pre-commit` check will fail.
   ```
    ➜  (my-virtual-env) git commit -m "commiting my cool feature"
    black....................................................................Passed
    isort....................................................................Failed
    - hook id: isort
    - files were modified by this hook

    Fixing examples/information_extraction/waybill_ie/run_ernie_crf.py

    flake8...................................................................Passed
    check for merge conflicts................................................Passed
    check for broken symlinks............................(no files to check)Skipped
    detect private key.......................................................Passed
    fix end of files.....................................(no files to check)Skipped
    trim trailing whitespace.............................(no files to check)Skipped
    CRLF end-lines checker...............................(no files to check)Skipped
    CRLF end-lines remover...............................(no files to check)Skipped
    No-tabs checker......................................(no files to check)Skipped
    Tabs remover.........................................(no files to check)Skipped
    copyright_checker........................................................Passed
   ```

   But **don't panic**!
   Our tooling will fix most of the style errors automatically. Some errors will need to be addressed manually. Fortunately, the error messages are straight forward and
   the errors are usually simple to fix. After addressing the errors, you can run `git add <files>` and `git commit` again, which will trigger `pre-commit` again.
   Once the `pre-commit` checks pass, you are ready to push the code.

   [Google](http://google.com/) or [StackOverflow](https://stackoverflow.com/) are great tools to help you understand the code style errors.
   Don't worry if you still can't figure it out. You can commit with `git commit -m "style error" --no-verify` and we are happy to help you once you create a Pull Request.

#### 7. Keep pulling

   An experienced Git user pulls from the official repo often -- daily or even hourly, so they notice conflicts with others' work early, and it's easier to resolve smaller conflicts.

   ```bash
   git remote add upstream https://github.com/PaddlePaddle/PaddleFormers
   git pull upstream develop
   ```

#### 8. Push and file a pull request

   You can "push" your local work into your forked repo:

   ```bash
   git push origin my-cool-feature
   ```

   The push allows you to create a pull request, requesting owners of this [official repo](https://github.com/PaddlePaddle/PaddleFormers) to pull your change into the official one.

   To create a pull request, please follow [these steps](https://help.github.com/articles/creating-a-pull-request/).

#### 9. Delete local and remote branches

   To keep your local workspace and your fork clean, you might want to remove merged branches:

   ```bash
   git push origin --delete my-cool-feature
   git checkout develop
   git pull upstream develop
   git branch -d my-cool-feature
   ```

## Code Review

-  Please feel free to ping your reviewers by @-mentioning them in the Pull Request.  Please do this after your pull request passes the CI.

- Please answer reviewers' every comment.  If you are to follow the comment, please write "Done"; Otherwise, please start a discussion under the comment.

- If you don't want your reviewers to get overwhelmed by email notifications, you might reply to their comments [in a batch](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/).


================================================
FILE: .github/ISSUE_TEMPLATE/ask-question.yml
================================================
name: 🐛 Ask Question
description: 请描述您使用PaddleFormers时遇到的问题
title: "[Question]: "
labels: 
  - question
body:
- type: markdown
  attributes:
    value: >
      #### 你可以在这里提出一个使用/咨询问题，提问之前请确保：
      
      - 1）已经百度/谷歌搜索过你的问题，但是没有找到解答；
      
      - 2）已经在官网查询过[API文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/index_cn.html)与[FAQ](https://www.paddlepaddle.org.cn/documentation/docs/zh/faq/index_cn.html)，但是没有找到解答；
      
      - 3）已经在[历史issue](https://github.com/PaddlePaddle/Paddle/issues)中搜索过，没有找到同类issue或issue未被解答。
      
- type: textarea
  id: question
  attributes:
    label: 请提出你的问题
  validations:
    required: true

================================================
FILE: .github/ISSUE_TEMPLATE/bug-report.yml
================================================
# Issue form for bug reports against PaddleFormers.
name: 🐛 Bug Report
description: PaddleFormers问题反馈
title: "[Bug]: "
labels: bug
body:
  # Installed Paddle package versions, pre-filled with a small template.
  - type: textarea
    id: environment
    attributes:
      label: 软件环境
      description: |
        请使用以下命令给出您本地Paddle相关包信息
          ```sh
          pip list | grep paddle
          
          ```
      value: |
          - paddlepaddle:
          - paddlepaddle-gpu: 
          - paddleformers: 
      render: Markdown
    validations:
      required: true
  # Confirmation that existing issues were searched first.
  # (id spelling corrected from "dumplicated-problem".)
  - type: checkboxes
    id: duplicated-problem
    attributes:
      label: 重复问题
      description: 是否已在issues中搜索相关问题
      options:
      - label: I have searched the existing issues
        required: true
  # Detailed description of the error.
  # (id spelling corrected from "descripton".)
  - type: textarea
    id: description
    attributes:
      label: 错误描述
      description: 给出错误详细描述，以便能够更好的追踪相关问题
      render: Markdown
    validations:
      required: true
  # Minimal steps and code that reproduce the problem reliably.
  - type: textarea
    id: mvp-code
    attributes:
      label: 稳定复现步骤 & 代码
      description: 请给出稳定复现该问题的步骤 & 代码，以便相关人员能够快速定位到具体问题。
    validations:
      required: true

================================================
FILE: .github/ISSUE_TEMPLATE/docs-report.yml
================================================
# Issue form for documentation feedback.
name: 🐛 Docs Report
description: PaddleFormers文档反馈
title: '[Docs]: '
labels: [documentation]

body:
  # Installed Paddle package versions, pre-filled with a small template.
  - id: environment
    type: textarea
    validations:
      required: true
    attributes:
      label: 软件环境
      render: Markdown
      description: |
        请使用以下命令给出您本地Paddle相关包信息
          ```sh
          pip list | grep paddle
          
          ```
      value: |
          - paddlepaddle:
          - paddlepaddle-gpu: 
          - paddleformers: 
  # Free-form description of the documentation problem.
  - id: description
    type: textarea
    validations:
      required: true
    attributes:
      label: 详细描述
      render: Markdown
      description: 请详细描述您想要反馈的具体问题

================================================
FILE: .github/ISSUE_TEMPLATE/feature-request.yml
================================================
# Issue form for feature requests.
name: "\U0001F680 Feature request"
description: 请详细描述您所需功能
labels: [ "feature" ]
body:
  # What is being proposed, with links to papers/code when available.
  - type: textarea
    id: feature-request
    validations:
      required: true
    attributes:
      label: Feature request
      description: |
        对特性提案的清晰而简明的描述。如果论文和代码存在，请提供链接。 

  # Why the feature is wanted.
  - type: textarea
    id: motivation
    validations:
      required: true
    attributes:
      label: Motivation
      description: |
        请概述这项建议的动机。您的特性要求与问题有关吗?
        
  # How the reporter can help. The link now targets this repository's
  # contributing guide; it previously pointed at huggingface/transformers.
  - type: textarea
    id: contribution
    validations:
      required: true
    attributes:
      label: Your contribution
      description: |
        Is there any way that you could help, e.g. by submitting a PR? Make sure to read the [contributing guide](https://github.com/PaddlePaddle/PaddleFormers/blob/develop/CONTRIBUTING.md) first.


================================================
FILE: .github/ISSUE_TEMPLATE/new-model.yaml
================================================
# Issue form for proposing a new model.
name: "\U0001F31F 添加新模型"
description: 请为新模型提交一份说明
labels:
  - "New model"

body:
  # Short summary of the model (required).
  - id: description-request
    type: textarea
    attributes:
      label: 简要描述
      description: |
        请简要描述模型的类型、解决的问题等。
    validations:
      required: true

  # Whether the model has been open-sourced; neither box is mandatory.
  - id: information-tasks
    type: checkboxes
    attributes:
      label: 是否已开源
      options:
        - label: 已开源
        - label: 未开源

  # Optional pointers such as paper and code locations.
  - id: additional-info
    type: textarea
    attributes:
      label: 模型详细信息
      description: |
        请给出新模型相关信息，如论文地址、现存代码地址等。

================================================
FILE: .github/ISSUE_TEMPLATE/others.yml
================================================
# Catch-all issue form for topics not covered by the other templates.
name: 🧩 其他 Others
description: 提出其他问题。
labels:
  - others

body:
  # Introductory note shown above the form.
  - type: markdown
    attributes:
      value: >
        #### 你可以在这里提出任何前面几类模板不适用的问题，包括但不限于：优化性建议、框架使用体验反馈、版本兼容性问题、报错信息不清楚等。

  # The only input field; it is required.
  - id: others
    type: textarea
    attributes:
      label: 问题描述
    validations:
      required: true

  # Closing thank-you note.
  - type: markdown
    attributes:
      value: >
        感谢你的贡献 🎉！




================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
<!-- Demo: https://github.com/PaddlePaddle/PaddleFormers/pull/ -->
#### Before submitting

- [ ] Lint code. If there are lint issues, please format the code first.

```shell
# Install and register `pre-commit` in the project folder
pip install pre-commit && pre-commit install

# Process previous code files separately
pre-commit run --files XXXX.py
```

- [ ] Add test cases into the `tests` folder. If there are codecov issues, please add test cases first.

### PR types
<!-- One of [ New features | Bug fixes | Function optimization | Performance optimization | Breaking changes | Others ] -->

### PR changes
<!-- One of [ Models | APIs | Docs | Others ] -->

### Description
<!-- Describe what this PR does -->


================================================
FILE: .github/actions/rerun-workflow/action.yml
================================================
# Composite action that forwards its inputs as environment variables to
# rerun.sh, which talks to the GitHub REST API.
name: "Rerun Workflow"
description: "Re-run GitHub Actions workflow for a given Pull Request"

inputs:
  GITHUB_TOKEN:
    required: true
    description: "GitHub token with repo scope"
  OWNER:
    required: true
    description: "Repository owner"
  REPO:
    required: true
    description: "Repository name"
  PR_ID:
    required: true
    description: "Pull Request ID"
  JOB_NAME:
    required: true
    description: "Job name to rerun"

runs:
  using: composite
  steps:
    # The script path is relative to the working directory, so the calling job
    # is expected to have the repository checked out.
    - shell: bash
      env:
        GITHUB_TOKEN: ${{ inputs.GITHUB_TOKEN }}
        OWNER: ${{ inputs.OWNER }}
        REPO: ${{ inputs.REPO }}
        PR_ID: ${{ inputs.PR_ID }}
        JOB_NAME: ${{ inputs.JOB_NAME }}
      run: bash ./.github/actions/rerun-workflow/rerun.sh

================================================
FILE: .github/actions/rerun-workflow/rerun.sh
================================================
# Copyright (c) 2025 PaddleFormers Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Re-run GitHub Actions jobs for the head commit of a pull request.
#
# Required environment variables:
#   GITHUB_TOKEN  token used to authenticate against the GitHub REST API
#   OWNER, REPO   repository coordinates
#   PR_ID         pull request number whose head commit is looked up
#   JOB_NAME      exact name of the job to re-run, or "all-failed" to re-run
#                 the failed jobs of every workflow run found for the commit
#
# Exits with status 1 when the head commit cannot be resolved or when no
# workflow run exists for it.
set -e

API_ROOT="https://api.github.com/repos/$OWNER/$REPO"

COMMIT_SHA=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
  "$API_ROOT/pulls/$PR_ID" | jq -r '.head.sha')

# jq -r prints the literal string "null" when the response carries no
# .head.sha (bad token, unknown PR, rate limiting). Stop here instead of
# querying workflow runs for a commit called "null".
if [ -z "$COMMIT_SHA" ] || [ "$COMMIT_SHA" = "null" ]; then
  echo "Failed to resolve the head commit of PR $PR_ID."
  exit 1
fi

echo "Commit SHA: $COMMIT_SHA"

response=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
  "$API_ROOT/actions/runs?head_sha=$COMMIT_SHA&per_page=100")

echo "Response: $response"

# "[]?" tolerates an error payload without a workflow_runs array, so the
# explicit "No matching workflow runs" branch below reports the problem
# rather than jq aborting the script without a message.
run_ids=$(echo "$response" | jq -r '.workflow_runs[]?.id')

if [ -n "$run_ids" ]; then
  echo "Found run_ids for commit $COMMIT_SHA: $run_ids"

  for run_id in $run_ids; do
    if [ "$JOB_NAME" = "all-failed" ]; then
      echo "Rerunning all failed jobs for run_id: $run_id"

      # Only the HTTP status is captured; the response body is discarded.
      rerun_response=$(curl -X POST -s -w "%{http_code}" -o /dev/null \
        -H "Accept: application/vnd.github.v3+json" \
        -H "Authorization: Bearer $GITHUB_TOKEN" \
        "$API_ROOT/actions/runs/$run_id/rerun-failed-jobs")
      if [ "$rerun_response" -eq 201 ]; then
        echo "Successfully requested rerun for all failed jobs in run_id: $run_id"
      else
        # Best effort: keep going with the remaining runs.
        echo "Failed to request rerun for run_id: $run_id with status code $rerun_response"
      fi

    else
      # Ask for the largest page so that runs with more jobs than the default
      # page size are still searched for JOB_NAME.
      jobs_response=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
        "$API_ROOT/actions/runs/$run_id/jobs?per_page=100")

      echo "Jobs Response for run_id $run_id: $jobs_response"

      # Every job with the requested name is selected, whatever its conclusion.
      block_jobs=$(echo "$jobs_response" | jq -r --arg job_name "$JOB_NAME" \
        '.jobs[]? | select(.name == $job_name) | .id')

      if [ -n "$block_jobs" ]; then
        echo "Found block jobs for run_id $run_id: $block_jobs"

        for job_id in $block_jobs; do
          echo "Rerunning job_id: $job_id"
          curl -X POST -H "Accept: application/vnd.github.v3+json" \
            -H "Authorization: token $GITHUB_TOKEN" \
            "$API_ROOT/actions/jobs/$job_id/rerun"
        done
      else
        echo "No block jobs found for run_id $run_id with name $JOB_NAME."
      fi
    fi
  done
else
  echo "No matching workflow runs found for commit $COMMIT_SHA."
  exit 1
fi

================================================
FILE: .github/codecov.yml
================================================
# Codecov configuration for the repository.
codecov:
  notify:
    # Do not wait for CI before notifying.
    wait_for_ci: false

coverage:
  status:
    project:
      default: 
        target: 30% # overall project coverage target
        threshold: 1% # allow coverage to drop by 1% and still post a success status
    patch:
      default: 
        target: 75% # coverage target for the lines changed by the patch


================================================
FILE: .github/workflows/_clone_linux.yml
================================================
# Reusable workflow: checks out the repository (merging the PR head into the
# base branch for pull_request events), packs it into PaddleFormers.tar.gz,
# uploads the archive with bos_tools.py and exposes the resulting URL as the
# `repo_archive_url` output.
name: PaddleFormers Code Clone
description: "PaddleFormers clone and upload"

on:
  workflow_call:
    inputs:
        # NOTE(review): bos_dir is declared but not referenced by any step in
        # this file.
        bos_dir:
          type: string
          required: false
          default: 'PaddleFormers'
    outputs:
      repo_archive_url:
        description: "Compressed source code archive."
        value: ${{ jobs.code-clone.outputs.repo_archive_url }}
jobs:
  code-clone:
    runs-on:
      group: HK-Clone
    outputs:
      repo_archive_url: ${{ steps.set_output.outputs.repo_archive_url }}
    steps:
      # For pull requests the base branch is checked out (the PR head is merged
      # in the next step); for every other event the triggering ref is used.
      - name: Clone PaddleFormers
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event_name == 'pull_request'
                && github.event.pull_request.base.ref
                || github.ref_name }}
          submodules: 'recursive'
          fetch-depth: 1000

      # Fetch the PR head into a local pr/<number> branch and merge it with an
      # explicit merge commit, then print the last commits of the PR branch.
      - name: Merge PR (if needed)
        if: ${{ github.event_name == 'pull_request' }}
        run: |
          git config --global user.name "PaddleFormersCI"
          git config --global user.email "paddleformers_ci@example.com"
          echo "Fetching and merging PR..."
          git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}
          git merge --no-ff pr/${{ github.event.pull_request.number }}
          echo "PR Branch log "
          git log --oneline -n 5 pr/${{ github.event.pull_request.number }}
      - uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      # Strips the checkout's http extraheader settings from the repository and
      # its submodules, archives the checkout from its parent directory and
      # uploads it. The upload path depends on the event: PR/<number>/<sha>,
      # TAG/<tag>/<sha> or BRANCH/<branch>/<sha>.
      # AK/SK are placed in the step environment — presumably credentials read
      # by bos_tools.py; verify against that script.
      # NOTE(review): ${{ github.ref_name }} and the other expressions are
      # substituted into the script text before bash runs it.
      - name: Code Info Show and Upload
        id: set_output
        env:
          AK: paddle
          SK: paddle
        run: |
          git config --unset http.https://github.com/.extraheader
          git submodule foreach --recursive sh -c "git config --local --unset-all 'http.https://github.com/.extraheader'"
          git submodule foreach --recursive sh -c "git config remote.origin.fetch '+refs/heads/*:refs/remotes/origin/*'"
          echo "Current HEAD Log:"
          git log --oneline -n 5
          ls
          cd ..
          tar -zcf PaddleFormers.tar.gz PaddleFormers
          if [[ "${{ github.event_name }}" == "pull_request" ]];then
            commit_id=${{ github.event.pull_request.head.sha }}
            pr_num=${{ github.event.pull_request.number }}
            target_path=paddle-github-action/PR/PaddleFormers/${pr_num}/${commit_id}
          elif [[ "${{ github.ref_type }}" == "tag" ]]; then
            commit_id=${{ github.sha }}
            tag_name=${{ github.ref_name }}
            target_path=paddle-github-action/TAG/PaddleFormers/${tag_name}/${commit_id}
          else
            commit_id=${{ github.sha }}
            branch_name=${{ github.ref_name }}
            target_path=paddle-github-action/BRANCH/PaddleFormers/${branch_name}/${commit_id}
          fi
          wget -O bos_tools.py -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
          push_file=$(realpath bos_tools.py)
          python -m pip install bce-python-sdk==0.9.29
          ls
          python ${push_file} PaddleFormers.tar.gz ${target_path}
          target_path_stripped="${target_path#paddle-github-action/}"
          REPO_ARCHIVE_URL=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/PaddleFormers.tar.gz
          echo "repo_archive_url=${REPO_ARCHIVE_URL}" >> $GITHUB_OUTPUT


================================================
FILE: .github/workflows/_xpu_ci_test.yml
================================================
# Reusable workflow that prepares a PaddleFormers checkout on a self-hosted XPU
# runner and runs the XPU CI test suite inside a Docker container.
name: xpu_ci_test

on:
  workflow_call:
    # NOTE(review): DOCKER_IMAGE and MODEL_PATH are marked `required: true` and
    # also declare a `default` — confirm whether the default can ever apply.
    inputs:
      DOCKER_IMAGE:
        description: "Build Images"
        required: true
        type: string
        default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:xpu-ubuntu2204-x86_64-gcc123-py310"
      PADDLEFORMERS_ARCHIVE_URL:
        description: "URL of the compressed PaddleFormers code archive."
        required: true
        type: string
      PADDLE_WHL_URL:
        description: "Paddle Wheel Package URL"
        required: false
        type: string
        default: ""
      MODEL_PATH:
        description: "MODEL Dir Use"
        required: true
        type: string
        default: ""

jobs:
  run_xpu_cases:
    runs-on: [self-hosted, XPU-P800-8Cards]
    timeout-minutes: 60
    steps:
      - name: Print current runner name
        run: |
          echo "Current runner name: ${{ runner.name }}"
      # Removes anything in the working directory whose name starts with the
      # repository name (inside a container, retrying up to three times and
      # failing if something is still left), then downloads and unpacks the
      # source archive and prints its last three commits.
      - name: Code Prepare
        shell: bash
        env:
          docker_image: ${{ inputs.DOCKER_IMAGE }}
          formers_archive_url: ${{ inputs.PADDLEFORMERS_ARCHIVE_URL }}
          model_path: ${{ inputs.MODEL_PATH }}
        run: |
            set -x
            REPO="https://github.com/${{ github.repository }}.git"
            FULL_REPO="${{ github.repository }}"
            REPO_NAME="${FULL_REPO##*/}"
            BASE_BRANCH="${{ github.base_ref }}"
            # 由于ci机器网络问题 暂时屏蔽pull命令
            # docker pull ${docker_image}
            # Clean the repository directory before starting
            docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
            -e "REPO_NAME=${REPO_NAME}" \
            ${docker_image} /bin/bash -c '
              CLEAN_RETRIES=3
              CLEAN_COUNT=0

              while [ $CLEAN_COUNT -lt $CLEAN_RETRIES ]; do
                echo "Attempt $((CLEAN_COUNT+1)) to remove ${REPO_NAME}* ..."
                rm -rf "${REPO_NAME}"* || true
                sleep 2

                # Check if anything matching ${REPO_NAME}* still exists
                if ! ls "${REPO_NAME}"* >/dev/null 2>&1; then
                  echo "All ${REPO_NAME}* removed successfully"
                  break
                fi

                CLEAN_COUNT=$((CLEAN_COUNT + 1))
              done

              if ls "${REPO_NAME}"* >/dev/null 2>&1; then
                echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts"
                ls -ld "${REPO_NAME}"*
                exit 1
              fi
            '

            wget -q --no-proxy ${formers_archive_url}
            tar -xf PaddleFormers.tar.gz
            rm -rf PaddleFormers.tar.gz
            cd PaddleFormers
            git config --global user.name "PaddleFormersCI"
            git config --global user.email "paddleformers_ci@example.com"
            git log -n 3 --oneline

      # Runs the XPU test suite inside the CI image: resets the XPU cards,
      # installs the requirements, the nightly paddlepaddle-xpu wheel and
      # PaddleFormers in editable mode, then runs pytest on scripts/xpu_ci/
      # with cards 0-3 visible.
      - name: Run CI unittest
        env:
          docker_image: ${{ inputs.DOCKER_IMAGE }}
          formers_archive_url: ${{ inputs.PADDLEFORMERS_ARCHIVE_URL }}
          model_path: ${{ inputs.MODEL_PATH }}
          # Optional wheel URL supplied by the caller. It has to be exported
          # here: the docker "-e PADDLE_WHL_URL=..." flag below used to expand
          # a shell variable that was never set, so the input was dropped.
          paddle_whl_url: ${{ inputs.PADDLE_WHL_URL }}
        run: |
          # GITHUB_WORKSPACE is the variable the runner provides; the former
          # $WORKSPACE was undefined, so PARENT_DIR always printed ".".
          PARENT_DIR=$(dirname "$GITHUB_WORKSPACE")
          echo "PARENT_DIR:$PARENT_DIR"
          docker run --rm --net=host --cap-add=SYS_PTRACE --privileged --shm-size=64G  \
          -v $(pwd):/workspace -w /workspace \
          -v "/home/suijiaxin/model:/model" \
          -v "/home/suijiaxin/images:/images" \
          -e "MODEL_PATH=${model_path}" \
          -e "PADDLEFORMERS_ARCHIVE_URL=${formers_archive_url}" \
          -e "PADDLE_WHL_URL=${paddle_whl_url}" \
          -e "http_proxy=$(git config --global --get http.proxy)" \
          -e "https_proxy=$(git config --global --get https.proxy)" \
          -e "no_proxy=bcebos.com,mirrors.tuna.tsinghua.edu.cn,127.0.0.1,localhost" \
          ${docker_image} /bin/bash -c '

          # Reset all eight XPU cards, then print their state
          echo "重启XPU卡..."
          xpu-smi -r -i 0,1,2,3,4,5,6,7
          xpu-smi
          set -e
          git config --global --add safe.directory /workspace/PaddleFormers
          cd PaddleFormers
          python -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
          python -m pip install -r requirements.txt
          echo "安装PaddlePaddle..."
          # Replace any preinstalled packages with the nightly XPU wheel
          python -m pip uninstall paddlepaddle-xpu paddleformers -y
          python -m pip uninstall librosa -y
          python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packages/nightly/xpu-p800/
          echo "安装PaddleFormers..."
          python -m pip install -e .
          echo "============================安装测试依赖============================"
          python -m pip install pytest
          python -m pip install pytest-timeout
          unset http_proxy
          unset https_proxy
          echo "============================开始运行pytest测试============================"
          ln -s /model baidu
          ln -s /images tests/fixtures/dummy/sft-vl/
          export XPU_VISIBLE_DEVICES="0,1,2,3"
          # set -e is active, so the status is captured with "||" instead of
          # reading $? on the next line; otherwise a failing pytest run aborts
          # the script before the failure banner below can be printed.
          exit_code=0
          python -m pytest -v -s --tb=short scripts/xpu_ci/ || exit_code=$?

          if [ $exit_code -eq 0 ]; then
              echo "============================4卡cases测试通过!============================"
          else
              echo "============================4卡cases测试失败,请检查日志!============================"
              exit $exit_code
          fi
          '


================================================
FILE: .github/workflows/ce-build-ci-workflow.yml
================================================
# Nightly pipeline: build a candidate CI image, run the GPU unit tests against
# it, then refresh the "latest" image on both GPU runners.
name: Build CI Images

on:
  schedule:
    - cron: "0 22 * * *"     # every day at 06:00 Beijing time (UTC+8)
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# Workflow-level env is visible to steps of regular jobs only. The `env`
# context cannot be read from `jobs.<job_id>.with`, so the reusable-workflow
# calls below repeat the image name literally; keep the values in sync.
env:
  image_base: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda126-dev"

jobs:
  # Build a timestamped "-test-" image on runner 2 (develop branch only).
  build-ci-images-test:
    name: build-ci-images-test
    if: github.ref == 'refs/heads/develop'
    uses: ./.github/workflows/ce-build-images.yml
    with:
      flag_build: test
      runner: ernie-8gpu-2
      image_base: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda126-dev"

  # Run the GPU unit tests with the candidate image when the build asks for it.
  test-ci-images:
    name: test-ci-images
    needs: build-ci-images-test
    if: ${{ needs.build-ci-images-test.outputs.flag_downstream == 'true' }}
    uses: ./.github/workflows/unittest-gpu.yml
    with:
      runner: ernie-8gpu-2
      image_name: ${{ needs.build-ci-images-test.outputs.image_name }}

  # Always prune dangling images on runner 2, whatever the result above.
  clean-ci-image:
    name: clean-ci-image
    needs: [build-ci-images-test, test-ci-images]
    if: always()
    runs-on: [self-hosted, ernie-8gpu-2]
    steps:
      - name: Remove Dangling Image
        run: docker images -f "dangling=true" -q | xargs -r docker rmi -f

  # Refresh the "latest" image on runner 1 once build and tests are done.
  update-ci-images-1:
    name: update-ci-images-1
    needs: [build-ci-images-test, test-ci-images]
    uses: ./.github/workflows/ce-build-images.yml
    with:
      flag_build: update
      runner: ernie-8gpu-1
      image_base: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda126-dev"

  # Same refresh on runner 2.
  update-ci-images-2:
    name: update-ci-images-2
    needs: [build-ci-images-test, test-ci-images]
    uses: ./.github/workflows/ce-build-images.yml
    with:
      flag_build: update
      runner: ernie-8gpu-2
      image_base: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda126-dev"

================================================
FILE: .github/workflows/ce-build-images.yml
================================================
# Reusable workflow that builds (flag_build=test) or refreshes
# (flag_build=update) the PaddleFormers CI Docker image on a given runner.
name: Build CI Images For Test

on:
  workflow_call:
    inputs:
      flag_build:       # test||update
        required: true
        type: string
      runner:
        required: true
        type: string
      image_base:
        required: true
        type: string
    # A reusable workflow only exposes the outputs declared here. Without this
    # mapping a caller reading `needs.<job>.outputs.flag_downstream` or
    # `.image_name` always gets an empty string.
    outputs:
      flag_downstream:
        description: "'true' when the caller should run downstream tests against the image built by this run."
        value: ${{ jobs.build-images.outputs.flag_downstream }}
      image_name:
        description: "Name of the image tagged by this run."
        value: ${{ jobs.build-images.outputs.image_name }}

env:
  # Prefix of the helper container name created in "Run Container".
  TASK: PaddleFormers-build-CI-image

jobs:
  build-images:
    name: build-images
    runs-on: ${{ inputs.runner }}
    # Both values are produced by the step with id `build_image_flags`.
    outputs:
      flag_downstream: ${{ steps.build_image_flags.outputs.flag_downstream }}
      image_name: ${{ steps.build_image_flags.outputs.image_name }}

    steps:
      # Starts a detached, privileged container from the base image with the
      # workspace mounted at /workspace. The generated container name is
      # appended to the env file so that later steps can refer to it.
      # The GPU flag depends on the Docker server version (below 19.03 uses
      # --runtime=nvidia, otherwise --gpus all).
      - name: Run Container
        env:
          work_dir: ${{ github.workspace }}
          IMAGE_BASE: ${{ inputs.image_base }}
        run: |
          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
          echo "container_name=${container_name}" >> ${{ github.env }}
          docker_image="${IMAGE_BASE}"
          echo "docker_image=${docker_image}"
          DOCKER_VER=$(docker version --format '{{.Server.Version}}' | cut -d. -f1,2)
          if (( $(echo "$DOCKER_VER < 19.03" | bc -l) )); then
            GPU_OPTION="--runtime=nvidia"
          else
            GPU_OPTION="--gpus all"
          fi
          echo "DOCKER_VER=${DOCKER_VER}"
          echo "GPU_OPTION=${GPU_OPTION}"
          docker run -d -t ${GPU_OPTION} --name ${container_name} --net=host -v /dev/shm:/dev/shm --shm-size=32G \
            -v $work_dir/../../..:$work_dir/../../.. \
            -v $work_dir:/workspace \
            -v /home/.cache/pip:/home/.cache/pip \
            -e work_dir \
            -e no_proxy \
            -w /workspace --privileged ${docker_image}

      # Inside the container: empties the working directory, downloads and
      # unpacks the source tarball, then pulls the latest commits and updates
      # submodules. The sourced `proxy` file presumably exports
      # http_proxy/https_proxy (both are unset at the end) — verify.
      - name: Download Code
        env:
          work_dir: ${{ github.workspace }}
        run: |
          docker exec -t ${container_name} /bin/bash -c '
          rm -rf * .[^.]*
          echo "Downloading PaddleFormers.tar.gz"
          wget -q --no-proxy  https://paddleformers.bj.bcebos.com/wheels/PaddleFormers.tar.gz --no-check-certificate
          echo "Extracting PaddleFormers.tar.gz"
          tar xf PaddleFormers.tar.gz && rm -rf PaddleFormers.tar.gz
          source $work_dir/../../../proxy
          cd PaddleFormers
          git config --global user.name "PaddleCI"
          git config --global user.email "paddle_ci@example.com"
          git pull
          git submodule update --init --recursive --force
          git log --pretty=oneline -10
          unset http_proxy && unset https_proxy
          '

      # Writes the Dockerfile used by the "Build Docker Image" step into the
      # working directory.
      - name: Write Dockerfile Inline
        run: |
          # The heredoc delimiter is quoted so the shell writes the text
          # verbatim. With an unquoted delimiter, "$(ls -t dist/*.whl | head -1)"
          # was evaluated here on the runner (where dist/ does not exist) and
          # the Dockerfile ended up installing the requirement "[paddlefleet]"
          # instead of the wheel built inside the image.
          cat <<'EOF' > Dockerfile
          FROM ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda126-dev
          # RUN apt-get update && \
          #     apt-get install -y openjdk-11-jdk && \
          #     rm -rf /var/lib/apt/lists/*
          # ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
          # ENV PATH="$JAVA_HOME/bin:${PATH}"
          RUN python -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
          RUN python -m pip config set global.trusted-host pypi.tuna.tsinghua.edu.cn
          COPY PaddleFormers/ /tmp/paddleformers/
          WORKDIR /tmp/paddleformers
          RUN python -m pip install -r requirements.txt --progress-bar off
          RUN python -m pip install -r tests/requirements.txt --progress-bar off
          RUN python setup.py bdist_wheel
          RUN python -m pip install "$(ls -t dist/*.whl | head -1)[paddlefleet]" --progress-bar off
          WORKDIR /
          RUN rm -rf /tmp/paddleformers
          RUN python -m pip install opt_einsum --progress-bar off  # for paddle
          EOF

      # Builds or rotates the CI image and publishes two step outputs:
      #   flag_downstream  "true" only for flag_build=test
      #   image_name       the tag produced by this run
      # flag_build=test   builds "<image_base>-test-<timestamp>".
      # flag_build=update drops "<image_base>-bak", demotes "<image_base>-latest"
      #                   to "-bak", then promotes an existing "-test-" image to
      #                   "-latest" or builds "-latest" from scratch.
      - name: Build Docker Image
        id: build_image_flags
        env:
          work_dir: ${{ github.workspace }}
          flag_build: ${{ inputs.flag_build }}
          image_base: ${{ inputs.image_base }}
        run: |
          # Print every local image as "repository:tag".
          list_images() {
            docker images --format '{{.Repository}}:{{.Tag}}'
          }

          flag_downstream=false
          if [[ "${flag_build}" == "test" ]]; then
            flag_downstream=true
            image_name="${image_base}-test-$(date +%Y%m%d-%H%M%S)"
            docker build --network host -t "${image_name}" .
          elif [[ "${flag_build}" == "update" ]]; then
            flag_downstream=false
            image_name="${image_base}-latest"

            # The previous form "grep ... || true | while ..." parsed as
            # "(... | grep) || (true | while ...)", so the loops never received
            # the image list. The grep is now grouped with its fallback, and
            # the patterns are limited to names derived from image_base so that
            # unrelated images on the runner are left alone.

            # remove old bak
            list_images | { grep -F -- "${image_base}-bak" || true; } | while read -r img; do
              echo "Removing image $img ..."
              docker rmi "$img" || true
            done
            # mv old latest to bak
            list_images | { grep -F -- "${image_base}-latest" || true; } | while read -r img; do
              echo "Tagging $img as ${image_base}-bak ..."
              docker tag "$img" "${image_base}-bak"
              docker rmi "$img" || true
            done
            # mv test image to latest or new build latest.
            # The first match is used; docker presumably lists the most
            # recently created image first — verify if several exist.
            test_img=$(list_images | { grep -F -- "${image_base}-test-" || true; } | head -n 1)
            if [[ -n "${test_img}" ]]; then
              echo "mv test image to latest, no need to build"
              # "$img" was never set at this point in the old script, so the
              # tag command could not succeed.
              docker tag "${test_img}" "${image_name}"
              docker rmi "${test_img}" || true
            else
              docker build --network host -t "${image_name}" .
            fi
          else
            echo "Invalid flag_build: ${flag_build}"
            exit 1
          fi
          echo "flag_downstream=${flag_downstream}" >> "$GITHUB_OUTPUT"
          echo "image_name=${image_name}" >> "$GITHUB_OUTPUT"
          
      
      # Runs even when an earlier step failed; force-removes the helper
      # container and ignores any error from docker rm.
      - name: Terminate And Delete the Container
        if: always()
        run: |
          docker rm -f ${container_name} 2>/dev/null || true

================================================
FILE: .github/workflows/ce-build-whl.yml
================================================
# On every push to develop or a release/* branch: start a helper container,
# fetch the sources, run scripts/dependence/build.sh and upload everything
# found in PaddleFormers/upload.
name: Build Whl CE

on:
  push:
    branches:
      - develop
      - release/*

env:
  BRANCH: ${{ github.ref_name }}
  COMMIT_ID: ${{ github.sha }}
  # Prefix of the helper container name.
  TASK: PaddleFormers-CE-${{ github.sha }}-build
  CE_name: build-ce
  no_proxy: "localhost,bj.bcebos.com,su.bcebos.com,bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn"

defaults:
  run:
    shell: bash

jobs:
  build-ce:
    name: build-ce
    runs-on: [self-hosted, paddleformers]
    steps:
      # Stores the image name in the env file for the following steps.
      - name: Determine Image Name
        run: |
          echo "IMAGE_NAME=ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda126-dev-latest" >> "$GITHUB_ENV"

      # Starts a detached, privileged container with the workspace mounted at
      # /workspace and records its name in the env file.
      # NOTE(review): `-e ce_scripts` forwards a variable that is not defined
      # anywhere in this workflow file.
      - name: Run Container
        env:
          work_dir: ${{ github.workspace }}
          FLAGS_dynamic_static_unified_comm: "True"
          python_version: "3.10"
          paddle: https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda126-Cudnn95-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
        run: |
          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
          echo "container_name=${container_name}" >> "$GITHUB_ENV"
          DOCKER_VER=$(docker version --format '{{.Server.Version}}' | cut -d. -f1,2)
          if (( $(echo "$DOCKER_VER < 19.03" | bc -l) )); then
            GPU_OPTION="--runtime=nvidia"
          else
            GPU_OPTION="--gpus all"
          fi
          echo "DOCKER_VER=${DOCKER_VER}"
          echo "GPU_OPTION=${GPU_OPTION}"
          docker run -d -t ${GPU_OPTION} --name ${container_name} --net=host -v /dev/shm:/dev/shm --shm-size=32G \
            -v $work_dir/../../..:$work_dir/../../.. \
            -v $work_dir:/workspace \
            -v /home/.cache/pip:/home/.cache/pip \
            -v /home/paddle-1/models/:/home/models/ \
            -e "BRANCH=$BRANCH" \
            -e "COMMIT_ID=$COMMIT_ID" \
            -e work_dir \
            -e ce_scripts \
            -e no_proxy \
            -e CE_name \
            -e paddle \
            -e FLAGS_dynamic_static_unified_comm \
            -e python_version \
            -w /workspace --privileged $IMAGE_NAME
            
      # Inside the container: empties the working directory, unpacks the
      # synced source tarball, pulls, then fetches and checks out the pushed
      # branch from the upstream remote.
      - name: Download Code
        run: |
          docker exec -t $container_name /bin/bash -c '
          rm -rf * .[^.]*
          echo "Downloading PaddleFormers.tar"
          wget -q --no-proxy https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleFormers.tar --no-check-certificate
          echo "Extracting PaddleFormers.tar"
          tar xf PaddleFormers.tar && rm -rf PaddleFormers.tar
          source $work_dir/../../../proxy
          cd PaddleFormers
          git config --global user.name "PaddleCE"
          git config --global user.email "paddle_ce@example.com"
          git pull
          git submodule update --init --recursive --force
          git remote add upstream https://github.com/PaddlePaddle/PaddleFormers.git
          echo "Checking out ${BRANCH}..."
          git fetch upstream ${BRANCH}:${BRANCH}
          git checkout ${BRANCH}
          git log --pretty=oneline -10
          '

      # Despite its name, this step runs the build script.
      - name: Test
        run: |
          docker exec -t $container_name /bin/bash -c '
          ldconfig
          pip config set global.cache-dir "/home/.cache/pip"
          set -e
          cd /workspace/PaddleFormers && git config --global --add safe.directory $PWD
          bash scripts/dependence/build.sh
          '
          
      # Uploads each file in PaddleFormers/upload with bos_tools.py and prints
      # the corresponding download URL; runs even if the build step failed.
      - name: Upload Products
        if: always()
        run: |
          docker exec -t $container_name /bin/bash -c '
          cd /workspace/PaddleFormers/upload
          cp /home/models/bos/bos_tools.py ./
          for FILE in /workspace/PaddleFormers/upload/*; do
            file=$(basename "$FILE")
            python bos_tools.py $file paddle-whl/nightly/cu126/paddleformers/
            echo "$file: https://paddle-whl.bj.bcebos.com/nightly/cu126/paddleformers/$file"
          done
          '

      # Always force-remove the helper container; errors are ignored.
      - name: Terminate And Delete the Container
        if: always()
        run: |
          docker rm -f $container_name 2>/dev/null || true

================================================
FILE: .github/workflows/ce-deadlink.yml
================================================
# Weekly broken-link check for the PaddleFormers documentation.
name: Deadlink CE

on:
  schedule:
    - cron: "0 8 * * 6"     # every Saturday at 08:00 UTC (16:00 Beijing time)
  workflow_dispatch:        # allow to manually trigger the workflow

concurrency:
  group: deadlink-${{ github.workflow }}
  cancel-in-progress: true

# This workflow is only triggered by `schedule` and `workflow_dispatch`, so
# `github.event.pull_request` is never populated. The values below previously
# read from it and were therefore always empty (TASK collapsed to
# "PaddleFormers-CE--deadlink"); they now come from contexts that exist for
# these events.
env:
  COMMIT_ID: ${{ github.sha }}
  TASK: PaddleFormers-CE-${{ github.run_id }}-deadlink
  BRANCH: ${{ github.ref_name }}
  CE_name: deadlink-ce
  no_proxy: "localhost,bj.bcebos.com,su.bcebos.com,bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn"

defaults:
  run:
    shell: bash

jobs:
  deadlink-ce:
    name: deadlink-ce
    if: github.ref == 'refs/heads/develop'
    runs-on: [self-hosted, paddleformers]
    steps:
      - name: Determine Image Name
        run: |
          echo "IMAGE_NAME=ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda126-dev-latest" >> "$GITHUB_ENV"

      - name: Run Container
        env:
          work_dir: ${{ github.workspace }}
          FLAGS_dynamic_static_unified_comm: "True"
          python_version: "3.10"
          paddle_whl: https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda126-Cudnn95-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
        run: |
          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
          echo "container_name=${container_name}" >> "$GITHUB_ENV"
          DOCKER_VER=$(docker version --format '{{.Server.Version}}' | cut -d. -f1,2)
          if (( $(echo "$DOCKER_VER < 19.03" | bc -l) )); then
            GPU_OPTION="--runtime=nvidia"
          else
            GPU_OPTION="--gpus all"
          fi
          echo "DOCKER_VER=${DOCKER_VER}"
          echo "GPU_OPTION=${GPU_OPTION}"
          docker run -d -t ${GPU_OPTION} --name ${container_name} --net=host -v /dev/shm:/dev/shm --shm-size=32G \
            -v $work_dir/../../..:$work_dir/../../.. \
            -v $work_dir:/workspace \
            -v /home/.cache/pip:/home/.cache/pip \
            -e BRANCH \
            -e COMMIT_ID \
            -e work_dir \
            -e ce_scripts \
            -e no_proxy \
            -e CE_name \
            -e paddle_whl \
            -e FLAGS_dynamic_static_unified_comm \
            -e python_version \
            -w /workspace --privileged $IMAGE_NAME
            
      # Inside the container: wipe the working directory, download and unpack the
      # PaddleFormers tarball, source the proxy file located three directories above
      # work_dir, then `git pull` and update submodules. Afterwards download PaddleTest
      # and copy its deadlink scripts into the working directory.
      - name: Download Code
        run: |
          docker exec -t $container_name /bin/bash -c '
          rm -rf * .[^.]*
          echo "Downloading PaddleFormers.tar"
          wget -q --no-proxy https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleFormers.tar --no-check-certificate
          echo "Extracting PaddleFormers.tar"
          tar xf PaddleFormers.tar && rm -rf PaddleFormers.tar
          source $work_dir/../../../proxy
          cd PaddleFormers
          git config --global user.name "PaddleCE"
          git config --global user.email "paddle_ce@example.com"
          git pull
          git submodule update --init --recursive --force
          git log --pretty=oneline -10
          cd -
          echo "Downloading PaddleTest.tar"
          wget -q --no-proxy https://xly-devops.bj.bcebos.com/PaddleTest/PaddleTest.tar.gz --no-check-certificate
          tar xf PaddleTest.tar.gz
          cp -r PaddleTest/models/PaddleNLP/deadlink/* ./
          '

      # Install the Python packages the checker needs and run run.sh (copied into the
      # working directory by the previous step). `set -e` is enabled only after
      # `ldconfig` and `pip config`, so failures of those two commands are tolerated.
      - name: Test
        run: |
          docker exec -t $container_name /bin/bash -c '
          ldconfig
          pip config set global.cache-dir "/home/.cache/pip"
          set -e
          python -m pip install beautifulsoup4 openpyxl
          source $work_dir/../../../proxy
          bash run.sh PaddleFormers develop liujie44@baidu.com "PaddleFormers Broken Link Check Summary Report"
          '
          
      # Upload every file under /workspace/result to BOS and print its public URL.
      # Runs even when earlier steps failed, so the result directory may not exist.
      - name: Upload Logs
        if: always()
        env:
          # Directory three levels above the checkout; holds tooling shared between runs.
          home_path: ${{ github.workspace }}/../../..
          # BOS upload client; downloaded on demand when missing.
          bos_file: ${{ github.workspace }}/../../../bos/BosClient.py
        run: |
          docker exec -t $container_name /bin/bash -c '
          if [ ! -f "${{ env.bos_file }}" ]; then
            wget -q --no-proxy -O ${{ env.home_path }}/bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
            mkdir -p ${{ env.home_path }}/bos
            tar xf ${{ env.home_path }}/bos_new.tar.gz -C ${{ env.home_path }}/bos
          fi
          bos_prefix="schedule/$(date +%Y%m%d)"
          # The directory is missing when the test step failed before producing results.
          if [ ! -d /workspace/result ]; then
            echo "No /workspace/result directory found, nothing to upload."
            exit 0
          fi
          cd /workspace/result
          for FILE in /workspace/result/*; do
            # An unmatched glob stays literal; skip it instead of uploading "*".
            [ -e "$FILE" ] || continue
            file=$(basename "$FILE")
            python ${{ env.bos_file }} "$file" paddle-github-action/PR/PaddleFormers/deadlink/${bos_prefix}/logs
            echo "$file: https://paddle-github-action.bj.bcebos.com/PR/PaddleFormers/deadlink/${bos_prefix}/logs/$file"
          done
          '

      # Always remove the container; errors (for example when it was never created)
      # are silenced so this cleanup step cannot fail the job.
      - name: Terminate And Delete the Container
        if: always()
        run: |
          docker rm -f $container_name 2>/dev/null || true

================================================
FILE: .github/workflows/ce-unittest-gpu.yml
================================================
# Scheduled (and manually triggerable) GPU unit-test run for PaddleFormers.
name: Unittest GPU CE

on:
  schedule:
    - cron: "0 1 * * *"     # every day at 09:00 Beijing time (UTC+8)
  workflow_dispatch:        # allow to manually trigger the workflow
    inputs:
      # Optional override of the Paddle wheel installed for the test run.
      paddle_whl:
        description: "paddle_whl"
        required: false
        default: "https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda126-Cudnn95-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl"
        type: string
      # Optional branch to check out instead of the default one in the tarball.
      repo_branch:
        description: "repo_branch"
        required: false
        type: string

# Only one run of this workflow at a time; a newer run cancels the one in progress.
concurrency:
  group: unittest-ce-${{ github.workflow }}
  cancel-in-progress: true

env:
  # NOTE(review): this workflow is triggered only by `schedule` and `workflow_dispatch`,
  # so the `github.event.pull_request.*` expressions below presumably evaluate to empty
  # strings - confirm the test scripts expect that.
  COMMIT_ID: ${{ github.event.pull_request.head.sha }}
  # Prefix of the container name created in the "Run Container" step.
  TASK: PaddleFormers-CE-unittest-gpu
  # NOTE(review): relative path, unlike the absolute /workspace/... paths used by the
  # upload step of this workflow - verify against the scripts that read it.
  ce_scripts: workspace/PaddleFormers/scripts/unit_test
  BRANCH: ${{ github.event.pull_request.base.ref }}
  AGILE_COMPILE_BRANCH: ${{ github.event.pull_request.base.ref }}
  CE_name: unittest-gpu-ce
  no_proxy: "localhost,bj.bcebos.com,su.bcebos.com,bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn"

defaults:
  run:
    shell: bash

jobs:
  # Single job executed on a self-hosted runner carrying all three labels below.
  unittest-gpu-ce:
    name: unittest-gpu-ce
    runs-on: [self-hosted, ernie-8gpu, distribute]
    steps:
      # Publish the CI image reference through GITHUB_ENV so later steps can read $IMAGE_NAME.
      - name: Determine Image Name
        run: |
          echo "IMAGE_NAME=ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda126-dev-latest" >> "$GITHUB_ENV"

      # Start the detached test container and export its name through GITHUB_ENV.
      # The checkout is bind-mounted at /workspace, which is also the working directory
      # of every later `docker exec` into this container.
      - name: Run Container
        env:
          # Host checkout directory; also forwarded into the container via `-e work_dir`.
          work_dir: ${{ github.workspace }}
          FLAGS_dynamic_static_unified_comm: "True"
          python_version: "3.10"
          # Manual-dispatch input when given, otherwise the default nightly wheel.
          paddle_whl: ${{ github.event.inputs.paddle_whl || 'https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda126-Cudnn95-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl' }}
          # Empty unless a branch was supplied on manual dispatch.
          repo_branch: ${{ github.event.inputs.repo_branch || '' }}
        run: |
          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
          echo "container_name=${container_name}" >> "$GITHUB_ENV"
          echo "Workspace path: ${{ github.workspace }}"
          DOCKER_VER=$(docker version --format '{{.Server.Version}}' | cut -d. -f1,2)
          if (( $(echo "$DOCKER_VER < 19.03" | bc -l) )); then
            GPU_OPTION="--runtime=nvidia"
          else
            GPU_OPTION="--gpus all"
          fi
          echo "DOCKER_VER=${DOCKER_VER}"
          echo "GPU_OPTION=${GPU_OPTION}"
          # Docker requires absolute container paths for bind mounts (-v) and for the
          # working directory (-w), hence /workspace rather than a relative path.
          docker run -d -t --name ${container_name} --net=host -v /dev/shm:/dev/shm --shm-size=32G \
            -v $work_dir/../../..:$work_dir/../../.. \
            -v $work_dir:/workspace \
            -v /home/.cache/pip:/home/.cache/pip \
            -v /home/paddle-1/models/:/home/models/ \
            -e PF_HOME=/home/models/ \
            -e BRANCH \
            -e AGILE_COMPILE_BRANCH \
            -e COMMIT_ID \
            -e work_dir \
            -e ce_scripts \
            -e no_proxy \
            -e CE_name \
            -e paddle_whl \
            -e repo_branch \
            -e FLAGS_dynamic_static_unified_comm \
            -e python_version \
            -e HF_PROXY_PATH=$work_dir/../../../proxy_huggingface \
            -e AISTUDIO_PROXY_PATH=$work_dir/../../../proxy_aistudio \
            -e "HF_DATASETS_CACHE=$work_dir/../../../paddlenlp/huggingface/datasets" \
            -e "TRANSFORMERS_CACHE=$work_dir/../../../paddlenlp/huggingface" \
            -w /workspace ${GPU_OPTION} --privileged $IMAGE_NAME

      # Inside the container: wipe the working directory, unpack the PaddleFormers
      # tarball, optionally switch to $repo_branch, then pull and update submodules.
      - name: Download Code
        run: |
          docker exec -t $container_name /bin/bash -c '
          rm -rf * .[^.]*
          echo "Downloading PaddleFormers.tar"
          wget -q --no-proxy  https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleFormers.tar --no-check-certificate
          echo "Extracting PaddleFormers.tar"
          tar xf PaddleFormers.tar && rm -rf PaddleFormers.tar
          source $work_dir/../../../proxy
          cd PaddleFormers
          git config --global user.name "PaddleCE"
          git config --global user.email "paddle_ce@example.com"
          if [ -n "$repo_branch" ]; then
            echo "Switching to branch: $repo_branch"
            git fetch origin $repo_branch
            git checkout $repo_branch || git checkout -b $repo_branch origin/$repo_branch
            git pull
          else
            echo "No repo_branch provided, just pulling latest changes"
            git pull
          fi
          git submodule update --init --recursive --force
          git log --pretty=oneline -10
          '

      # Run the unit-test driver script with the selected Paddle wheel.
      - name: Test
        run: |
          docker exec -t $container_name /bin/bash -c '
          ldconfig
          pip config set global.cache-dir "/home/.cache/pip"
          set -e
          rm -rf /root/.cache/aistudio/
          # Absolute path: the repository was unpacked under the /workspace mount.
          cd /workspace/PaddleFormers && git config --global --add safe.directory $PWD
          source $work_dir/../../../proxy
          source $work_dir/../../../AISTUDIO_ACCESS_TOKEN
          echo "work_dir = ${work_dir}"
          cp -r ${work_dir}/../../../models ./models
          echo "Check whether the local model file exists:"
          ls -l ./models
          bash scripts/unit_test/ci_unittest.sh ${paddle_whl} true
          '
          
      # Upload every file under /workspace/PaddleFormers/unittest_logs to BOS and print
      # its public URL. Runs even when earlier steps failed, so the log directory may
      # not exist. The coverage and allure uploads are currently commented out.
      - name: Upload Allure-reports & Logs
        if: always()
        env:
          # Directory three levels above the checkout; holds tooling shared between runs.
          home_path: ${{ github.workspace }}/../../..
          # BOS upload client; downloaded on demand when missing.
          bos_file: ${{ github.workspace }}/../../../bos/BosClient.py
          # Only referenced by the commented-out allure section below.
          allure_file: ${{ github.workspace }}/../../../allure-2.19.0/bin/allure
        run: |
          docker exec -t $container_name /bin/bash -c '
          if [ ! -f "${{ env.bos_file }}" ]; then
            wget -q --no-proxy -O ${{ env.home_path }}/bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
            mkdir -p ${{ env.home_path }}/bos
            tar xf ${{ env.home_path }}/bos_new.tar.gz -C ${{ env.home_path }}/bos
          fi
          # if [ ! -f "${{ env.allure_file }}" ]; then
          #   wget -q --no-proxy -O ${{ env.home_path }}/allure-2.19.0.zip https://xly-devops.bj.bcebos.com/tools/allure-2.19.0.zip --no-check-certificate
          #   unzip -q ${{ env.home_path }}/allure-2.19.0.zip
          # fi
          bos_prefix="schedule/github-ce-$(date +%Y%m%d)"
          # # coverage.xml
          # cd /workspace/PaddleFormers
          # python ${{ env.bos_file }} coverage.xml paddle-github-action/PR/PaddleFormers/unittest-gpu/${bos_prefix}/logs
          # echo "cov-report: https://paddle-github-action.bj.bcebos.com/PR/PaddleFormers/unittest-gpu/${bos_prefix}/logs/coverage.xml"
          # logs
          # The directory is missing when the test step failed before writing any log.
          if [ ! -d /workspace/PaddleFormers/unittest_logs ]; then
            echo "No /workspace/PaddleFormers/unittest_logs directory found, nothing to upload."
            exit 0
          fi
          cd /workspace/PaddleFormers/unittest_logs
          for FILE in /workspace/PaddleFormers/unittest_logs/*; do
            # An unmatched glob stays literal; skip it instead of uploading "*".
            [ -e "$FILE" ] || continue
            file=$(basename "$FILE")
            python ${{ env.bos_file }} "$file" paddle-github-action/PR/PaddleFormers/unittest-gpu/${bos_prefix}/logs
            echo "$file: https://paddle-github-action.bj.bcebos.com/PR/PaddleFormers/unittest-gpu/${bos_prefix}/logs/$file"
          done
          # cd /workspace/PaddleFormers/
          # ${{ env.allure_file }} generate result -o report
          # tar -czf products.tar.gz report unittest_logs
          # python ${{ env.bos_file }} products.tar.gz paddle-github-action/PR/PaddleFormers/unittest-gpu/${bos_prefix}/logs
          # echo "report: https://paddle-github-action.bj.bcebos.com/PR/PaddleFormers/unittest-gpu/${bos_prefix}/logs/products.tar.gz"
          '

      # Always remove the container; errors (for example when it was never created)
      # are silenced so this cleanup step cannot fail the job.
      - name: Terminate And Delete the Container
        if: always()
        run: |
          docker rm -f $container_name 2>/dev/null || true

================================================
FILE: .github/workflows/check-release-pr.yaml
================================================
# Validates pull requests that target release/* branches: the description must
# reference a PR using the "dev ... #<number>" form.
name: Check Release PR

on:
  pull_request:
    branches:
      - 'release/*'
    types:
      - opened
      - edited
      - synchronize

jobs:
  validate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # Fail unless some line of the PR body matches "dev.*#<digits>" (case-insensitive).
      - name: Check if PR description contains dev PR link
        env:
          # The body is passed through the environment rather than interpolated into the script.
          PR_BODY: ${{ github.event.pull_request.body }}
        run: |
          echo "Checking PR body: $PR_BODY"
          if ! echo "$PR_BODY" | grep -Eqi "dev.*#([0-9]+)"; then
            echo "::error::PR must include a link to merged PR in dev branch (e.g., 'Merged in dev: #1234')"
            exit 1
          fi
          echo "Dev PR link found"

      # Resolve the PR number referenced as "dev ... #<number>" in the description and
      # fail unless `gh pr view` reports that PR as MERGED.
      - name: Check the referenced PR is merged into dev
        env:
          GH_TOKEN: ${{ github.token }}
          # The PR body is attacker-controlled text: hand it over as an environment
          # variable instead of interpolating it into the shell script.
          PR_BODY: ${{ github.event.pull_request.body }}
        run: |
          # Use the first line matching the "dev ... #N" pattern and take the first
          # "#N" that follows "dev", so an unrelated "#N" elsewhere in the body
          # (for example "Fixes #12") is not mistaken for the dev PR.
          DEV_PR=$(printf '%s\n' "$PR_BODY" | grep -Eio "dev.*#[0-9]+" | head -1 | grep -Eo "#[0-9]+" | head -1 | tr -d "#")
          echo "Found dev PR: $DEV_PR"

          if [ -z "$DEV_PR" ]; then
            echo "::error::No valid PR number found"
            exit 1
          fi

          STATUS=$(gh pr view "$DEV_PR" --json state --jq .state)
          echo "Dev PR status: $STATUS"

          if [ "$STATUS" != "MERGED" ]; then
            echo "::error::The referenced dev PR (#$DEV_PR) is not merged yet"
            exit 1
          fi

          echo "Dev PR merged. Validation completed."

================================================
FILE: .github/workflows/cherry-pick.yml
================================================
# Creates cherry-pick pull requests for merged PRs on develop that carry
# "cherry-pick:<branch>" labels.
name: Cherry Pick

on:
  # Runs in the context of the base repository when a PR against develop is closed or labeled.
  pull_request_target:
    branches: [develop]
    types: [closed, labeled]

permissions:
  contents: write
  pull-requests: write
  issues: write

# Serialize runs per pull request without cancelling one that is already in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
  cancel-in-progress: false

jobs:
  cherry-pick:
    # Only for merged PRs, and only when the event is a labeling or when the joined
    # label names contain the substring "cherry-pick".
    if: >
      github.event.pull_request.merged == true &&
      (
        github.event.action == 'labeled' ||
        contains(join(github.event.pull_request.labels.*.name, ' '), 'cherry-pick')
      )
    runs-on: ubuntu-latest
    steps:
      # Full history is fetched; the checkout token is not persisted in the git config.
      - name: Checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          persist-credentials: false

      # For every "cherry-pick:<branch>" label on the merged PR: cherry-pick the merge
      # commit onto a new branch based on <branch>, push it to the bot fork, open a PR
      # against <branch> and report the outcome as a comment on the original PR.
      - name: Cherry Pick
        env:
          GH_TOKEN: ${{ secrets.CHERRY_PICK_BOT_TOKEN }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          PR_TITLE: ${{ github.event.pull_request.title }}
          PR_BODY: ${{ github.event.pull_request.body }}
          PR_AUTHOR: ${{ github.event.pull_request.user.login }}
          MERGE_COMMIT_SHA: ${{ github.event.pull_request.merge_commit_sha }}
          BOT_USERNAME: ShigureNyako
          BOT_EMAIL: shigure_nyako@outlook.com
          REPO_NAME: ShigureNyako/PaddleFormers
        run: |
          # Function to post comment
          post_comment() {
            gh pr comment "$PR_NUMBER" --body "$1"
          }

          # Configure git for the original author
          echo "Fetching author info for $PR_AUTHOR..."
          AUTHOR_INFO=$(gh api "/users/$PR_AUTHOR" --jq '{email: .email, name: .name}')
          AUTHOR_EMAIL=$(echo "$AUTHOR_INFO" | jq -r '.email')
          AUTHOR_NAME=$(echo "$AUTHOR_INFO" | jq -r '.name')

          if [ "$AUTHOR_EMAIL" = "null" ] || [ -z "$AUTHOR_EMAIL" ]; then
            AUTHOR_EMAIL="${PR_AUTHOR}@users.noreply.github.com"
            echo "Author email not found, using default: $AUTHOR_EMAIL"
          fi
          if [ "$AUTHOR_NAME" = "null" ] || [ -z "$AUTHOR_NAME" ]; then
            AUTHOR_NAME="${PR_AUTHOR}"
            echo "Author name not found, using username: $AUTHOR_NAME"
          fi

          git config user.name "$AUTHOR_NAME"
          git config user.email "$AUTHOR_EMAIL"

          # Capture current SHA to return to later
          ORIGINAL_HEAD_SHA=$(git rev-parse HEAD)

          # Get labels
          LABELS=$(gh pr view "$PR_NUMBER" --json labels --jq '.labels[].name')

          if [ -z "$LABELS" ]; then
            echo "No labels found."
            exit 0
          fi

          # Loop through labels
          while read -r label; do
            if [[ "$label" == cherry-pick:* ]]; then
              TARGET_BRANCH=$(echo "${label#cherry-pick:}" | xargs)

              if [ -z "$TARGET_BRANCH" ]; then
                echo "Empty target branch for label '$label', skipping."
                continue
              fi

              echo "Processing cherry-pick to $TARGET_BRANCH"

              # Check if target branch exists on remote
              if ! git ls-remote --exit-code --heads origin "$TARGET_BRANCH"; then
                echo "Target branch $TARGET_BRANCH does not exist."
                post_comment "❌ Cherry-pick failed: Target branch \`$TARGET_BRANCH\` does not exist."
                continue
              fi

              # Create a new branch for the cherry-pick
              NEW_BRANCH="cherry-pick/$PR_NUMBER/$TARGET_BRANCH"

              # Clean up local branch if it exists (from previous run)
              if git show-ref --verify --quiet "refs/heads/$NEW_BRANCH"; then
                git branch -D "$NEW_BRANCH"
              fi

              # Fetch the target branch and checkout a new branch from it
              git fetch origin "$TARGET_BRANCH"
              git checkout -b "$NEW_BRANCH" "origin/$TARGET_BRANCH"

              # Cherry pick
              # Try standard cherry-pick first (for squash merges or single commits)
              if git cherry-pick "$MERGE_COMMIT_SHA"; then
                echo "Cherry-pick successful."
              else
                echo "Standard cherry-pick failed, trying with -m 1 (for merge commits)..."
                # When the commit is a merge, git refuses before starting a cherry-pick,
                # so there is nothing to abort and --abort exits non-zero. Tolerate that,
                # otherwise the errexit shell stops before the -m 1 fallback runs.
                git cherry-pick --abort || true
                if git cherry-pick -m 1 "$MERGE_COMMIT_SHA"; then
                  echo "Cherry-pick with -m 1 successful."
                else
                  echo "Cherry-pick failed."
                  # Same reasoning: -m 1 on a non-merge commit leaves nothing to abort.
                  git cherry-pick --abort || true
                  post_comment "❌ Cherry-pick failed: Conflicts detected when cherry-picking to \`$TARGET_BRANCH\`. Please resolve manually."

                  # Cleanup
                  git checkout "$ORIGINAL_HEAD_SHA"
                  git branch -D "$NEW_BRANCH"
                  continue
                fi
              fi

              # Push
              # Construct authenticated URL for the fork
              FORK_URL_AUTH="https://${BOT_USERNAME}:${GH_TOKEN}@github.com/${REPO_NAME}.git"

              echo "Pushing to fork..."
              git push "$FORK_URL_AUTH" "$NEW_BRANCH" --force

              # Create PR
              # If PR_TITLE starts with "[", don't insert an extra space.
              if [ "${PR_TITLE:0:1}" = "[" ]; then
                NEW_TITLE="[$TARGET_BRANCH]$PR_TITLE"
              else
                NEW_TITLE="[$TARGET_BRANCH] $PR_TITLE"
              fi

              NEW_BODY="$PR_BODY

          Cherry-pick of #$PR_NUMBER to \`$TARGET_BRANCH\`.

          Merged in dev: #$PR_NUMBER  <!-- For pass CI -->"

              # Prepare head ref for PR creation (owner:branch)
              HEAD_REF="${BOT_USERNAME}:${NEW_BRANCH}"

              # Check if PR already exists
              EXISTING_PR=$(gh pr list --base "$TARGET_BRANCH" --head "$NEW_BRANCH" --json url --jq '.[0].url')

              if [ -n "$EXISTING_PR" ]; then
                echo "PR already exists: $EXISTING_PR"
                post_comment "ℹ️ Cherry-pick PR already exists: $EXISTING_PR"
              else
                # Create PR using gh CLI, ignoring errors because of "Resource not accessible" false positives
                gh pr create --base "$TARGET_BRANCH" --head "$HEAD_REF" --title "$NEW_TITLE" --body "$NEW_BODY" || true

                # Wait a bit for eventual consistency
                sleep 2

                # Search for the created PR
                CREATED_PR_URL=$(gh pr list --head "$NEW_BRANCH" --state all --json url --jq '.[0].url')

                if [ -n "$CREATED_PR_URL" ]; then
                  echo "Created PR: $CREATED_PR_URL"
                  post_comment "✅ Cherry-pick successful! Created PR: $CREATED_PR_URL"

                  # Request review from the original author (best effort).
                  # gh has no "pr review-request" subcommand; reviewers are added via "pr edit".
                  gh pr edit "$CREATED_PR_URL" --add-reviewer "$PR_AUTHOR" || true
                else
                  echo "Failed to create PR."
                  post_comment "❌ Cherry-pick failed: Could not create PR to \`$TARGET_BRANCH\`."

                  # Cleanup, same as the other failure path, before moving to the next label
                  git checkout "$ORIGINAL_HEAD_SHA"
                  git branch -D "$NEW_BRANCH"
                  continue
                fi
              fi

              # Cleanup for next loop
              git checkout "$ORIGINAL_HEAD_SHA"
              git branch -D "$NEW_BRANCH"
            fi
          done <<< "$LABELS"

      # Strip every "cherry-pick:*" label from the pull request once it has been processed.
      - name: Remove Cherry Pick Labels
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          REPO_NAME: ${{ github.repository }}
        run: |
          # One label name per line; nothing to do when the PR carries no labels.
          LABEL_NAMES=$(gh pr view "$PR_NUMBER" --repo "$REPO_NAME" --json labels --jq '.labels[].name')
          [ -n "$LABEL_NAMES" ] || exit 0

          while read -r label_name; do
            case "$label_name" in
              cherry-pick:*)
                echo "Removing label: $label_name"
                gh pr edit "$PR_NUMBER" --repo "$REPO_NAME" --remove-label "$label_name"
                ;;
            esac
          done <<< "$LABEL_NAMES"


================================================
FILE: .github/workflows/ci_iluvatar.yml
================================================
# Pull-request CI on an Iluvatar GPU runner: check out the base branch, merge the
# PR head into it and run the tests under scripts/iluvatar_ci/.
name: CI_ILUVATAR

on:
  pull_request:
    types: [opened, synchronize]
    branches: [develop, release/**]
permissions: read-all

# One run per PR and workflow; a newer push cancels the run in progress.
concurrency:
  group: ${{ github.event.pull_request.number }}-${{ github.workflow }}
  cancel-in-progress: true

jobs:
  iluvatar_test:
    name: iluvatar_test
    runs-on: iluvatar-gpu-2
    timeout-minutes: 60
    # All steps of this job run inside this container image.
    container:
      image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0
      env:
        LD_LIBRARY_PATH: /usr/local/corex/lib
        LIBRARY_PATH: /usr/local/corex/lib
        http_proxy: http://oversea-website-proxy.aistudio.public:8888
        https_proxy: http://oversea-website-proxy.aistudio.public:8888

    steps:
      # Diagnostics: runner name, full environment, and a proxy connectivity probe.
      - name: Print current runner name
        run: |
          echo "Current runner name: ${{ runner.name }}"
          env
          curl -v --proxy http://oversea-website-proxy.aistudio.public:8888 https://www.github.com

      # For pull_request events the base branch is checked out; the PR head is merged
      # in by the next step.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event_name == 'pull_request'
                && github.event.pull_request.base.ref
                || github.ref_name }}
          submodules: 'recursive'
          fetch-depth: 1000

      # Fetch the PR head into a local branch pr/<number> and merge it with --no-ff.
      - name: Merge PR (if needed)
        if: ${{ github.event_name == 'pull_request' }}
        run: |
          git config --global --add safe.directory "$GITHUB_WORKSPACE"
          git config --global user.name "PaddleFormersCI"
          git config --global user.email "paddleformers_ci@example.com"
          echo "Fetching and merging PR..."
          git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}
          git merge --no-ff pr/${{ github.event.pull_request.number }}
          echo "PR Branch log "
          git log --oneline -n 5 pr/${{ github.event.pull_request.number }}

      # Install PaddlePaddle (Iluvatar nightly) and PaddleFormers with retries, then run
      # the pytest suite under scripts/iluvatar_ci/ against a local copy of the model.
      - name: Run CI unittest
        env:
          MODEL_PATH: /aistudio/paddle_ci
        run: |
          set -e

          # install_with_retry <label> <command...>
          # Runs the command up to 3 times, waiting 10 seconds between attempts.
          # <label> is the package name shown in the log messages. Exits the whole
          # script with status 1 when the last attempt fails.
          install_with_retry() {
            label="$1"
            shift
            max_retries=3
            attempt=1
            until "$@"; do
              if [ "$attempt" -ge "$max_retries" ]; then
                echo "${label} installation failed after $max_retries attempts, Please try rerun this job."
                exit 1
              fi
              echo "${label} installation failed, retrying in 10 seconds... (Attempt $attempt/$max_retries)"
              sleep 10
              attempt=$((attempt + 1))
            done
            echo "${label} installation successful"
          }

          git config --global --add safe.directory "$GITHUB_WORKSPACE"

          unset http_proxy
          unset https_proxy

          python -m pip install -r requirements.txt

          echo "Uninstall PaddlePaddle and PaddleFormers beforehand..."
          python -m pip uninstall paddlepaddle paddle-iluvatar-gpu paddlepaddle-iluvatar paddleformers -y

          echo "Install PaddlePaddle..."
          install_with_retry "PaddlePaddle Iluvatar" \
            python -m pip install --pre paddlepaddle-iluvatar -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/

          echo "Install PaddleFormers..."
          install_with_retry "PaddleFormers" python -m pip install -e .

          python -m pip install pytest
          python -m pip install pytest-timeout

          echo "============================Start running tests============================"
          mkdir -p baidu
          cp -r ${MODEL_PATH}/ERNIE-4.5-21B-A3B-PT baidu/
          ixsmi
          python -m pytest -v -s --tb=short scripts/iluvatar_ci/
          echo "============================All tests passed============================"
          rm -rf baidu/
          

================================================
FILE: .github/workflows/ci_xpu.yml
================================================
# Pull-request CI for XPU: both jobs delegate to reusable workflows in this repository.
name: CI_XPU

on:
  pull_request:
    types: [opened, synchronize]
    branches: [develop, release/**]
permissions: read-all

# One run per PR and workflow; a newer push cancels the run in progress.
concurrency:
  group: ${{ github.event.pull_request.number }}-${{ github.workflow }}
  cancel-in-progress: true

jobs:
  # Its `repo_archive_url` output is consumed by xpu_test below.
  clone:
    name: Clone-Linux
    uses: ./.github/workflows/_clone_linux.yml

  # Runs the XPU test workflow with the archive URL produced by the clone job.
  xpu_test:
    name: xpu_test
    needs: [clone]
    uses: ./.github/workflows/_xpu_ci_test.yml
    with:
      PADDLEFORMERS_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:xpu-ubuntu2204-x86_64-gcc123-py310
      MODEL_PATH: /ssd3/model
  # xpu_4cards_case_test:
  #   name: xpu_4cards_case_test
  #   needs: [clone, xpu_build_test]
  #   uses: ./.github/workflows/_xpu_4cards_case_test.yml
  #   with:
  #     FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
  #     DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:ci
  #     FASTDEPLOY_WHEEL_URL: ${{ needs.xpu_build_test.outputs.wheel_path }}
  #     MODEL_PATH: /ssd3/model

  # xpu_8cards_case_test:
  #   name: xpu_8cards_case_test
  #   needs: [clone, xpu_build_test]
  #   uses: ./.github/workflows/_xpu_8cards_case_test.yml
  #   with:
  #     FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
  #     DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:ci
  #     FASTDEPLOY_WHEEL_URL: ${{ needs.xpu_build_test.outputs.wheel_path }}
  #     MODEL_PATH: /ssd3/model

================================================
FILE: .github/workflows/debug-unittest-gpu.yml
================================================
# Manually triggered workflow that prepares a GPU container with the repository
# checked out and prints the commands needed to debug unit tests inside it.
name: Debug Unittest GPU with SSH

on:
  workflow_dispatch:

env:
  # NOTE(review): the only trigger is `workflow_dispatch`, so the
  # `github.event.pull_request.*` expressions below presumably evaluate to empty
  # strings; the "Download Code" step handles an empty PR_ID explicitly.
  PR_ID: ${{ github.event.pull_request.number }}
  COMMIT_ID: ${{ github.event.pull_request.head.sha }}
  BRANCH: ${{ github.event.pull_request.base.ref }}
  AGILE_COMPILE_BRANCH: ${{ github.event.pull_request.base.ref }}
  # Prefix of the container name created in the "Run Container" step.
  TASK: PaddleFormers-CI-${{ github.event.pull_request.number }}-unittest-gpu-debug
  ci_scripts: /workspace/PaddleFormers/scripts/unit_test
  CI_name: unittest-gpu-debug
  no_proxy: "localhost,bj.bcebos.com,su.bcebos.com,bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn"

defaults:
  run:
    shell: bash

jobs:
  debug-container:
    name: unittest-gpu-debug
    runs-on: [self-hosted, 4gpu]
    timeout-minutes: 30
    steps:
      # Start a detached container named "${TASK}-<timestamp>" and export both the
      # container name and the image name through GITHUB_ENV. The GPU flag depends on
      # the Docker server version: `--runtime=nvidia` below 19.03, `--gpus all` otherwise.
      # No step in this job removes the container afterwards.
      - name: Run Container
        env:
          # Host checkout directory; bind-mounted at /workspace inside the container.
          work_dir: ${{ github.workspace }}
          FLAGS_dynamic_static_unified_comm: "True"
          python_version: "3.10"
          # Paddle wheel URL, forwarded to the container via `-e paddle_whl`.
          paddle_whl: https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda126-Cudnn95-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
        run: |
          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
          echo "container_name=${container_name}" >> "$GITHUB_ENV"
          image_name=ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda126-dev-latest
          echo "image_name=${image_name}" >> "$GITHUB_ENV"
          DOCKER_VER=$(docker version --format '{{.Server.Version}}' | cut -d. -f1,2)
          if (( $(echo "$DOCKER_VER < 19.03" | bc -l) )); then
            GPU_OPTION="--runtime=nvidia"
          else
            GPU_OPTION="--gpus all"
          fi
          echo "DOCKER_VER=${DOCKER_VER}"
          echo "GPU_OPTION=${GPU_OPTION}"
          docker run -d -t ${GPU_OPTION} --name ${container_name} --net=host -v /dev/shm:/dev/shm --shm-size=32G \
            -v $work_dir/../../..:$work_dir/../../.. \
            -v $work_dir:/workspace \
            -v /home/.cache/pip:/home/.cache/pip \
            -e BRANCH \
            -e PR_ID \
            -e COMMIT_ID \
            -e work_dir \
            -e ci_scripts \
            -e no_proxy \
            -e CI_name \
            -e paddle_whl \
            -e FLAGS_dynamic_static_unified_comm \
            -e python_version \
            -w /workspace --privileged $image_name
       
      # Inside the container: wipe the working directory, unpack the PaddleFormers
      # tarball, pull and update submodules. When PR_ID is non-empty the PR head is
      # checked out as PR_<id> and ${BRANCH} is merged into it; otherwise the
      # PR-specific part is skipped.
      # NOTE(review): the script fetches `upstream ${BRANCH}` but then merges the ref
      # `${BRANCH}` rather than `upstream/${BRANCH}` - confirm this is intended.
      - name: Download Code
        run: |
          docker exec -t $container_name /bin/bash -c '
          rm -rf * .[^.]*
          echo "Downloading PaddleFormers.tar"
          wget -q --no-proxy  https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleFormers.tar --no-check-certificate
          echo "Extracting PaddleFormers.tar"
          tar xf PaddleFormers.tar && rm -rf PaddleFormers.tar
          source $work_dir/../../../proxy
          cd PaddleFormers
          git config --global user.name "PaddleCI"
          git config --global user.email "paddle_ci@example.com"
          git pull
          git submodule update --init --recursive --force
          if [ -n "${PR_ID}" ]; then
            git fetch origin pull/${PR_ID}/head
            git checkout -b PR_${PR_ID} FETCH_HEAD
            git remote add upstream https://github.com/PaddlePaddle/PaddleFormers.git
            git fetch upstream ${BRANCH}
            git merge ${BRANCH} --no-edit
            git diff --numstat ${BRANCH} -- | awk "{print \$NF}"
          else
            echo "Not in a pull_request event. Skipping PR-specific operations."
          fi
          git log --pretty=oneline -10
          unset http_proxy && unset https_proxy
          '

      # Print the Python version, point pip at the shared cache directory and mark the
      # repository as a safe git directory. `set -e` is enabled only after `ldconfig`.
      - name: Prepare Environment
        run: |
          docker exec -t $container_name /bin/bash -c '
          ldconfig
          set -e
          python -c "import sys; print(sys.version_info[:])"
          pip config set global.cache-dir "/home/.cache/pip"
          cd /workspace/PaddleFormers && git config --global --add safe.directory $PWD
          '

      # Print the commands a developer needs to enter the debug container, run the
      # unit tests inside it, and remove it afterwards.
      - name: Print Info
        env:
          work_dir: ${{ github.workspace }}
        run: |
          echo "docker exec -it $container_name bash"
          echo "work_path: $work_dir/PaddleFormers"
          echo "work_path in docker: /workspace/PaddleFormers"
          echo "cmd: "
          # paddle_whl is not defined in this step, so expanding it here would print an
          # empty argument. Print the reference literally instead: the command is meant
          # to be run inside the container, which received paddle_whl via `-e paddle_whl`.
          echo 'bash scripts/unit_test/ci_unittest.sh ${paddle_whl}'
          echo "or python -m pytest fail_case_name"
          echo "docker rm -f $container_name"

================================================
FILE: .github/workflows/fleet-model-test.yml
================================================
# Pull-request workflow for the fleet model integration tests.
name: Fleet Model Test

on:
  pull_request:
    branches:
      - develop
      - release/**

permissions: read-all

# One run per PR and workflow; a newer push cancels the run in progress.
concurrency:
  group: ${{ github.event.pull_request.number }}-${{ github.workflow }}
  cancel-in-progress: true

env:
  # Each pull_request-derived value falls back to a non-PR equivalent when it is empty.
  PR_ID: ${{ github.event.pull_request.number || '0' }}
  COMMIT_ID: ${{ github.event.pull_request.head.sha || github.sha }}
  work_dir: /paddle
  PADDLE_ROOT: /paddle
  ci_scripts: /paddle/ci
  BRANCH: ${{ github.event.pull_request.base.ref || github.ref_name }}
  CI_name: fleet-model-test
  no_proxy: "bcebos.com,.bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn"
  docker_image: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda129-coverage-test"

defaults:
  run:
    shell: bash

jobs:
  # Sets the job output `skip` to "true" when the pull request carries a label whose
  # name contains "skip-fleet-models-ci", and to "false" otherwise.
  check_skip:
    name: Check skip-fleet-models-ci label
    runs-on: ubuntu-latest

    outputs:
      skip: ${{ steps.check_skip.outputs.skip }}

    steps:
      - name: Check skip-fleet-models-ci label
        id: check_skip
        shell: bash
        env:
          # Context values are passed through the environment instead of being
          # interpolated into the script, so a label name containing quotes cannot
          # alter the shell code.
          EVENT_NAME: ${{ github.event_name }}
          PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
        run: |
          if [[ "$EVENT_NAME" == "pull_request" ]]; then
            labels="$PR_LABELS"
            echo "PR labels: $labels"

            if echo "$labels" | grep -q "skip-fleet-models-ci"; then
              echo "skip=true" >> "$GITHUB_OUTPUT"
            else
              echo "skip=false" >> "$GITHUB_OUTPUT"
            fi
          else
            echo "skip=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Skip CI but mark success
        if: steps.check_skip.outputs.skip == 'true'
        run: |
          echo "skip-fleet-models-ci label found"
          echo "Downstream GPU jobs will be skipped"
  
  # Sets the job output `is_md_only` to "true" when every file changed relative to the
  # base branch ends in ".md", and to "false" otherwise.
  # NOTE(review): when the PR carries the "fleet-models-ci" label the check step is
  # skipped and `is_md_only` stays empty; jobs gated on `is_md_only == 'false'` are
  # then skipped as well - confirm this is the intended effect of that label.
  check_documents_type:
    needs: check_skip
    if: ${{ needs.check_skip.outputs.skip == 'false' }}
    name: check documents type for pull request
    runs-on: ubuntu-latest
    env:
      GITHUB_REPO_NAME: ${{ github.repository }}
    outputs:
      is_md_only: ${{ steps.check_files.outputs.is_md_only }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: git diff and check
        id: check_files
        if: |
          github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'fleet-models-ci')
        env:
          # Passed through the environment rather than interpolated into the script.
          BASE_REF: ${{ github.event.pull_request.base.ref }}
        run: |
          # quotepath=false keeps non-ASCII paths unquoted so the *.md suffix test works.
          changed_files=$(git -c core.quotepath=false diff "origin/${BASE_REF}...HEAD" --name-only)
          echo "Changed files: $changed_files"
          all_count=0
          md_count=0
          # Read one path per line so file names containing whitespace count as one file.
          while IFS= read -r file; do
            # The here-string yields a single empty line when nothing changed.
            [ -n "$file" ] || continue
            all_count=$((all_count+1))
            echo "$file"
            if [[ $file == *.md ]]; then
              echo "Markdown document: $file has been changed."
              md_count=$((md_count+1))
            fi
          done <<< "$changed_files"
          echo "总计: all_count=$all_count, md_count=$md_count"
          if [ "$md_count" -eq "$all_count" ]; then
            echo "is_md_only=true" >> "$GITHUB_OUTPUT"
            echo "is_md_only=true"
          else
            echo "is_md_only=false" >> "$GITHUB_OUTPUT"
            echo "is_md_only=false"
          fi
          echo "is_md_only: $(grep is_md_only "$GITHUB_OUTPUT" || echo '未找到')"
    
  # Single-card integration tests on the Fleet-H-single-card runner group.
  # Flow: pick a GPU, start a container from $docker_image, install
  # PaddleFormers from the PR inside it, run each test script with a timeout,
  # then always remove the container.
  integration-test-H20-single-card:
    needs: [check_documents_type, check_skip]
    if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' && needs.check_skip.outputs.skip == 'false' }}
    name: Integration test (H20, single card)
    runs-on:
      group: Fleet-H-single-card
    env:
      PIP_CACHE_DIR: /home/.cache/pip
      CACHE_DIR: /home/.cache
      TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-single-card
    steps:
      # GPU index = numeric part after '-' in the third '/'-separated field of
      # $PWD, plus 3. Exported to later steps through GITHUB_ENV.
      - name: Determine the runner
        run: |
          gpu_id=$(( $(echo $PWD | awk -F'/' '{print $3}' | awk -F'-' '{print $2}') + 3 ))
          echo GPU_DEVICES="$gpu_id" >> $GITHUB_ENV

      - name: Check docker image and run container
        env:
          GPU_DEVICES: ${{ env.GPU_DEVICES }}
          # The head branch name is chosen by the PR author. It is handed to
          # the script through the environment so it is never parsed as shell.
          PR_HEAD_REF: ${{ github.head_ref }}
          # Forwarded to the container by name (-e GITHUB_TOKEN), so the value
          # does not appear on the traced command line.
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
          echo "container_name=${container_name}" >> "$GITHUB_ENV"
          docker pull $docker_image
          set -x
          docker run -d -t --name ${container_name} --gpus "\"device=${GPU_DEVICES}\"" --shm-size=32G \
            -v "/dev/shm:/dev/shm"  \
            -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
            -v ${{ github.workspace }}/../../..:/root \
            -v /ssd1/paddle-1/action_cache:/home/.cache \
            -v ${{ github.workspace }}:/workspace \
            -e BRANCH \
            -e PR_ID \
            -e COMMIT_ID \
            -e PADDLE_ROOT \
            -e ci_scripts \
            -e CACHE_DIR \
            -e no_proxy \
            -e CI_name \
            -e PIP_CACHE_DIR \
            -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \
            -e GITHUB_HEAD_REF="${PR_HEAD_REF}" \
            -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \
            -e GITHUB_REPO_NAME="${{ github.repository }}" \
            -e GITHUB_EVENT_NAME="${{ github.event_name }}" \
            -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \
            -e GITHUB_TOKEN \
            -e GITHUB_RUN_ID="${{ github.run_id }}" \
            -e PR_USER="${{ github.event.pull_request.user.login }}" \
            -w /workspace --network host ${docker_image}

      # Clones the base branch, merges the PR head into it, and installs the
      # package with the paddlefleet extra. Later steps run only if this succeeds.
      - name: Install PaddleFormers
        id: formers_install
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          rm -rf * .[^.]*
          echo $PR_USER
          source /root/proxy
          mkdir -p /home/.cache/pip
          pip cache dir
          pip install --upgrade pip
          git clone https://github.com/PaddlePaddle/PaddleFormers.git -b ${BRANCH}
          cd PaddleFormers
          git status
          git config --global --add safe.directory /workspace/PaddleFormers
          git config user.name "PaddleCI"
          git config user.email "paddle_ci@example.com"
          git config pull.rebase false
          git pull --no-edit origin pull/${PR_ID}/head
          export UV_SKIP_WHEEL_FILENAME_CHECK=1
          sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/glm45_provider.py
          sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/qwen_provider.py
          pip install -e ".[paddlefleet]" --extra-index-url=https://www.paddlepaddle.org.cn/packages/stable/cu129/ --extra-index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/
          # wget -q --no-proxy --no-check-certificate https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.2/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl
          # pip uninstall paddlefleet -y
          # pip install paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url=https://www.paddlepaddle.org.cn/packages/stable/cu129/
          # wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.9-Cudnn9.9-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
          # pip uninstall paddlepaddle-gpu -y
          # pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/
          echo "paddle commit:"
          python -c "import paddle; print(paddle.version.commit)"
          echo "paddlefleet commit:"
          python -c "import paddlefleet; print(paddlefleet.version.commit)"
          cd /workspace
          wget -q --no-proxy -O bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
          mkdir bos
          tar xf bos_new.tar.gz -C bos
          pip install bce-python-sdk==0.8.74
          pip install coverage==7.6.1
          pip install librosa==0.11.0
          '

      # In the test steps below the inner shell runs with -e, so the exit code
      # is captured with `|| var=$?`. A plain `var=$?` on the following line is
      # never reached on failure and the ::error:: annotation would be lost.
      - name: Preprocess for integration test
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          preprocess_exit_code=0
          timeout 5m bash -x PaddleFormers/tests/integration_test/preprocess.sh || preprocess_exit_code=$?
          if [[ "$preprocess_exit_code" != "0" ]]; then
            echo -e "::error:: \033[31mPreprocess failed.\033[0m"
            exit 1
          else
            echo -e "\033[32mPreprocess succeeded.\033[0m"
          fi
          '

      - name: Integration test (GLM4.5 single-card)
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          glm45_single_card_exit_code=0
          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_single_card.sh || glm45_single_card_exit_code=$?
          if [[ "$glm45_single_card_exit_code" != "0" ]]; then
            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 single-card.\033[0m"
            exit 1
          else
            echo -e "\033[32mIntegration test succeeded: GLM4.5 single-card.\033[0m"
          fi
          '

      - name: Integration test (Qwen3-30B-A3B single-card)
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          qwen3_single_card_exit_code=0
          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_single_card.sh || qwen3_single_card_exit_code=$?
          if [[ "$qwen3_single_card_exit_code" != "0" ]]; then
            echo -e "::error:: \033[31mIntegration test failed: Qwen3-30B-A3B single-card.\033[0m"
            exit 1
          else
            echo -e "\033[32mIntegration test succeeded: Qwen3-30B-A3B single-card.\033[0m"
          fi
          '

      - name: Qwen3-vl-8k-single-card
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          exit_code=0
          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3vl_sft_single_card.sh single || exit_code=$?
          if [[ "$exit_code" != "0" ]]; then
            echo -e "::error:: \033[31mIntegration test failed: Qwen3-vl-8k-single-card.\033[0m"
            exit 1
          else
            echo -e "\033[32mIntegration test succeeded: Qwen3-vl-8k-single-card.\033[0m"
          fi
          '

      # Runs regardless of earlier results: empties the container working
      # directory, then force-removes the container.
      - name: Terminate and delete the container
        if: ${{ always() }}
        run: |
          set +e
          docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
          docker rm -f ${{ env.container_name }}


  # Multi-card integration tests on the Fleet-H-multi-card runner group.
  # Flow: start a container from $docker_image with all GPUs, install
  # PaddleFormers from the PR inside it, run each test script with a timeout,
  # then always remove the container.
  integration-test-H20-multi-card:
    needs: [check_documents_type, check_skip]
    if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' && needs.check_skip.outputs.skip == 'false' }}
    name: Integration test (H20, multi-card)
    runs-on:
      group: Fleet-H-multi-card
    env:
      PIP_CACHE_DIR: /home/.cache/pip
      CACHE_DIR: /home/.cache
      TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-multi-card
    steps:
      - name: Check docker image and run container
        env:
          # The head branch name is chosen by the PR author. It is handed to
          # the script through the environment so it is never parsed as shell.
          PR_HEAD_REF: ${{ github.head_ref }}
          # Forwarded to the container by name (-e GITHUB_TOKEN), so the value
          # does not appear on the command line.
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
          echo "container_name=${container_name}" >> "$GITHUB_ENV"
          docker pull $docker_image
          docker run -d -t --name ${container_name} --gpus all --shm-size=32G \
            -v "/dev/shm:/dev/shm"  \
            -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
            -v ${{ github.workspace }}/../../..:/root \
            -v /ssd1/paddle-1/action_cache:/home/.cache \
            -v ${{ github.workspace }}:/workspace \
            -e BRANCH \
            -e PR_ID \
            -e COMMIT_ID \
            -e PADDLE_ROOT \
            -e ci_scripts \
            -e CACHE_DIR \
            -e no_proxy \
            -e CI_name \
            -e PIP_CACHE_DIR \
            -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \
            -e GITHUB_HEAD_REF="${PR_HEAD_REF}" \
            -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \
            -e GITHUB_REPO_NAME="${{ github.repository }}" \
            -e GITHUB_EVENT_NAME="${{ github.event_name }}" \
            -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \
            -e GITHUB_TOKEN \
            -e GITHUB_RUN_ID="${{ github.run_id }}" \
            -e PR_USER="${{ github.event.pull_request.user.login }}" \
            -w /workspace --network host ${docker_image}

      # Clones the base branch, merges the PR head into it, and installs the
      # package with the paddlefleet extra. Later steps run only if this succeeds.
      - name: Install PaddleFormers
        id: formers_install
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          rm -rf * .[^.]*
          source /root/proxy
          mkdir -p /home/.cache/pip
          pip cache dir
          pip install --upgrade pip
          git clone https://github.com/PaddlePaddle/PaddleFormers.git  -b ${BRANCH}
          cd PaddleFormers
          git status
          git config --global --add safe.directory /workspace/PaddleFormers
          git config user.name "PaddleCI"
          git config user.email "paddle_ci@example.com"
          git config pull.rebase false
          git pull --no-edit origin pull/${PR_ID}/head
          export UV_SKIP_WHEEL_FILENAME_CHECK=1
          pip install -e ".[paddlefleet]" --extra-index-url=https://www.paddlepaddle.org.cn/packages/stable/cu129/ --extra-index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/
          # wget -q --no-proxy --no-check-certificate https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.2/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl
          # pip uninstall paddlefleet -y
          # pip install paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url=https://www.paddlepaddle.org.cn/packages/stable/cu129/
          # wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.9-Cudnn9.9-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
          # pip uninstall paddlepaddle-gpu -y
          # pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/
          echo "paddle commit:"
          python -c "import paddle; print(paddle.version.commit)"
          echo "paddlefleet commit:"
          python -c "import paddlefleet; print(paddlefleet.version.commit)"
          cd /workspace
          wget -q --no-proxy -O bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
          mkdir bos
          tar xf bos_new.tar.gz -C bos
          pip install bce-python-sdk==0.8.74
          pip install coverage==7.6.1
          pip install librosa==0.11.0
          '

      # In the GLM4.5 steps below the inner shell runs with -e, so the exit
      # code is captured with `|| var=$?`. A plain `var=$?` on the following
      # line is never reached on failure and the ::error:: annotation would be lost.
      - name: GLM4.5 pre-train
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          glm45_exit_code=0
          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt.sh || glm45_exit_code=$?
          if [[ "$glm45_exit_code" != "0" ]]; then
            echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m"
            exit 1
          else
            echo -e "\033[32mIntegration test succeeded: GLM4.5.\033[0m"
          fi
          '

      - name: GLM4.5 sft
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          glm45_exit_code=0
          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_sft.sh || glm45_exit_code=$?
          if [[ "$glm45_exit_code" != "0" ]]; then
            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 sft.\033[0m"
            exit 1
          else
            echo -e "\033[32mIntegration test succeeded: GLM4.5 sft.\033[0m"
          fi
          '

      - name: GLM4.5 lora
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          glm45_exit_code=0
          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_lora.sh || glm45_exit_code=$?
          if [[ "$glm45_exit_code" != "0" ]]; then
            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 lora.\033[0m"
            exit 1
          else
            echo -e "\033[32mIntegration test succeeded: GLM4.5 lora.\033[0m"
          fi
          '

      - name: GLM4.5 dpo
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          glm45_exit_code=0
          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_dpo.sh || glm45_exit_code=$?
          if [[ "$glm45_exit_code" != "0" ]]; then
            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 dpo.\033[0m"
            exit 1
          else
            echo -e "\033[32mIntegration test succeeded: GLM4.5 dpo.\033[0m"
          fi
          '

      - name: GLM4.5 dpo_lora
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          glm45_exit_code=0
          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_dpo_lora.sh || glm45_exit_code=$?
          if [[ "$glm45_exit_code" != "0" ]]; then
            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 dpo lora.\033[0m"
            exit 1
          else
            echo -e "\033[32mIntegration test succeeded: GLM4.5 dpo lora.\033[0m"
          fi
          '

      - name: GLM4.5 pre-train (EP4)
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          glm45_exit_code=0
          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_ep4.sh || glm45_exit_code=$?
          if [[ "$glm45_exit_code" != "0" ]]; then
            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 EP4.\033[0m"
            exit 1
          else
            echo -e "\033[32mIntegration test succeeded: GLM4.5 EP4.\033[0m"
          fi
          '

      # The failure message names the FP8 variant, matching the success message.
      - name: GLM4.5 pre-train (FP8)
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          glm45_exit_code=0
          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_fp8.sh || glm45_exit_code=$?
          if [[ "$glm45_exit_code" != "0" ]]; then
            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 FP8.\033[0m"
            exit 1
          else
            echo -e "\033[32mIntegration test succeeded: GLM4.5 FP8.\033[0m"
          fi
          '

      - name: GLM4.5 pre-train (Grouped GEMM)
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          glm45_exit_code=0
          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_grouped_gemm.sh || glm45_exit_code=$?
          if [[ "$glm45_exit_code" != "0" ]]; then
            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 Grouped GEMM.\033[0m"
            exit 1
          else
            echo -e "\033[32mIntegration test succeeded: GLM4.5 Grouped GEMM.\033[0m"
          fi
          '

      # The Qwen steps rely on -e alone: the test script is the last command,
      # so its exit status becomes the status of the step.
      - name: Qwen pre-train
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh pt
          '

      - name: Qwen sft
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh sft
          '

      - name: Qwen lora
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh lora
          '

      - name: Qwen vl sft
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3vl_sft.sh tp8 h20
          '

      - name: Qwen vl lora
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3vl_lora.sh h20
          '

      - name: Qwen vl moe
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          timeout 10m bash -x PaddleFormers/tests/integration_test/qwen3vl_sft.sh moe h20
          '

      - name: Qwen3-vl-8k-fsdp
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          timeout 10m bash -x PaddleFormers/tests/integration_test/qwen3vl_sft.sh fsdp h20
          '

      # Runs regardless of earlier results: empties the container working
      # directory, then force-removes the container.
      - name: Terminate and delete the container
        if: ${{ always() }}
        run: |
          set +e
          docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
          docker rm -f ${{ env.container_name }}
      

  # Integration tests on the Distribute runner group (A100).
  # Flow: start a container from $docker_image with all GPUs, install
  # PaddleFormers from the PR inside it, run each test script with a timeout,
  # then always remove the container.
  integration-test-a100:
    needs: [check_documents_type, check_skip]
    if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' && needs.check_skip.outputs.skip == 'false' }}
    name: Integration test (A100)
    runs-on:
      group: Distribute
    env:
      PIP_CACHE_DIR: /home/.cache/pip
      CACHE_DIR: /home/.cache
      TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-A100
    steps:
      - name: Check docker image and run container
        env:
          # The head branch name is chosen by the PR author. It is handed to
          # the script through the environment so it is never parsed as shell.
          PR_HEAD_REF: ${{ github.head_ref }}
          # Forwarded to the container by name (-e GITHUB_TOKEN), so the value
          # does not appear on the command line.
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
          echo "container_name=${container_name}" >> "$GITHUB_ENV"
          docker pull $docker_image
          docker run -d -t --name ${container_name} --gpus all --shm-size=32G \
            -v "/dev/shm:/dev/shm"  \
            -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
            -v ${{ github.workspace }}/../../..:/root \
            -v /ssd1/paddle-1/action_cache:/home/.cache \
            -v ${{ github.workspace }}:/workspace \
            -e BRANCH \
            -e PR_ID \
            -e COMMIT_ID \
            -e PADDLE_ROOT \
            -e ci_scripts \
            -e CACHE_DIR \
            -e no_proxy \
            -e CI_name \
            -e PIP_CACHE_DIR \
            -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \
            -e GITHUB_HEAD_REF="${PR_HEAD_REF}" \
            -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \
            -e GITHUB_REPO_NAME="${{ github.repository }}" \
            -e GITHUB_EVENT_NAME="${{ github.event_name }}" \
            -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \
            -e GITHUB_TOKEN \
            -e GITHUB_RUN_ID="${{ github.run_id }}" \
            -e PR_USER="${{ github.event.pull_request.user.login }}" \
            -w /workspace --network host ${docker_image}

      # Clones the base branch, merges the PR head into it, and installs the
      # package with the paddlefleet extra. Later steps run only if this succeeds.
      - name: Install PaddleFormers
        id: formers_install
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          rm -rf * .[^.]*
          source /root/proxy
          mkdir -p /home/.cache/pip
          pip cache dir
          pip install --upgrade pip
          git clone https://github.com/PaddlePaddle/PaddleFormers.git -b ${BRANCH}
          cd PaddleFormers
          git status
          git config --global --add safe.directory /workspace/PaddleFormers
          git config user.name "PaddleCI"
          git config user.email "paddle_ci@example.com"
          git config pull.rebase false
          git pull --no-edit origin pull/${PR_ID}/head
          export UV_SKIP_WHEEL_FILENAME_CHECK=1
          sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/glm45_provider.py
          sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/qwen_provider.py
          pip install -e ".[paddlefleet]" --extra-index-url=https://www.paddlepaddle.org.cn/packages/stable/cu129/ --extra-index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/
          # wget -q --no-proxy --no-check-certificate https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.2/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl
          # pip uninstall paddlefleet -y
          # pip install paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url=https://www.paddlepaddle.org.cn/packages/stable/cu129/
          # wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.9-Cudnn9.9-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
          # pip uninstall paddlepaddle-gpu -y
          # pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/
          echo "paddle commit:"
          python -c "import paddle; print(paddle.version.commit)"
          echo "paddlefleet commit:"
          python -c "import paddlefleet; print(paddlefleet.version.commit)"
          cd /workspace
          wget -q --no-proxy -O bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
          mkdir bos
          tar xf bos_new.tar.gz -C bos
          pip install bce-python-sdk==0.8.74
          pip install coverage==7.6.1
          pip install librosa==0.11.0
          '

      # In the GLM4.5 steps below the inner shell runs with -e, so the exit
      # code is captured with `|| var=$?`. A plain `var=$?` on the following
      # line is never reached on failure and the ::error:: annotation would be lost.
      - name: GLM4.5 pre-train
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          glm45_exit_code=0
          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh pt || glm45_exit_code=$?
          if [[ "$glm45_exit_code" != "0" ]]; then
            echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m"
            exit 1
          else
            echo -e "\033[32mIntegration test succeeded: GLM4.5.\033[0m"
          fi
          '

      - name: GLM4.5 sft
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          glm45_exit_code=0
          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh sft || glm45_exit_code=$?
          if [[ "$glm45_exit_code" != "0" ]]; then
            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 sft.\033[0m"
            exit 1
          else
            echo -e "\033[32mIntegration test succeeded: GLM4.5 sft.\033[0m"
          fi
          '

      - name: GLM4.5 lora
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          glm45_exit_code=0
          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora || glm45_exit_code=$?
          if [[ "$glm45_exit_code" != "0" ]]; then
            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 lora.\033[0m"
            exit 1
          else
            echo -e "\033[32mIntegration test succeeded: GLM4.5 lora.\033[0m"
          fi
          '

      - name: GLM4.5 dpo
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          glm45_exit_code=0
          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh dpo || glm45_exit_code=$?
          if [[ "$glm45_exit_code" != "0" ]]; then
            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 dpo.\033[0m"
            exit 1
          else
            echo -e "\033[32mIntegration test succeeded: GLM4.5 dpo.\033[0m"
          fi
          '

      - name: GLM4.5 dpo_lora
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          glm45_exit_code=0
          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh dpo_lora || glm45_exit_code=$?
          if [[ "$glm45_exit_code" != "0" ]]; then
            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 dpo lora.\033[0m"
            exit 1
          else
            echo -e "\033[32mIntegration test succeeded: GLM4.5 dpo lora.\033[0m"
          fi
          '

      # The Qwen steps rely on -e alone: the test script is the last command,
      # so its exit status becomes the status of the step.
      - name: Qwen pre-train
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_a100.sh pt
          '

      - name: Qwen sft
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_a100.sh sft
          '

      - name: Qwen lora
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_a100.sh lora
          '

      - name: Qwen vl sft
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3vl_sft.sh tp8 a100
          '

      - name: Qwen vl lora
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3vl_lora.sh a100
          '

      - name: Qwen vl moe
        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -ce '
          source /root/proxy
          timeout 10m bash -x PaddleFormers/tests/integration_test/qwen3vl_sft.sh moe a100
          '

      # - name: Qwen3-vl-8k-fsdp
      #   if: (success() || failure()) && steps.formers_install.conclusion == 'success'
      #   run: |
      #     docker exec -t ${{ env.container_name }} /bin/bash -ce '
      #     source /root/proxy
      #     timeout 10m bash -x PaddleFormers/tests/integration_test/qwen3vl_sft.sh fsdp h20
      #     '

      # Runs regardless of earlier results: empties the container working
      # directory, then force-removes the container.
      - name: Terminate and delete the container
        if: ${{ always() }}
        run: |
          set +e
          docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
          docker rm -f ${{ env.container_name }}


================================================
FILE: .github/workflows/lint.yml
================================================
# Workflow header for the code style check; triggered by pushes and pull requests.
name: Codestyle Check

on: [push, pull_request]

# One active run per pull request. Push events carry no pull request number,
# so they fall back to the pushed ref; without the fallback every push run
# would share a single group and cancel the others.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

# The pull_request fields below are empty on push events.
env:
  PR_ID: ${{ github.event.pull_request.number }}
  COMMIT_ID: ${{ github.event.pull_request.head.sha }}
  BRANCH: ${{ github.event.pull_request.base.ref }}
  TASK: PaddleFormers-CI-Lint-${{ github.event.pull_request.number }}

jobs:
  Lint:
    name: Lint
    runs-on: [self-hosted, ernie-cpu-01]
    steps:
      # Starts a detached container named "<TASK>-<timestamp>" and exports the name
      # through GITHUB_ENV so later steps can `docker exec` into it.
      # The workspace is mounted at /workspace; the directory three levels above the
      # workspace is mounted at its own host path.
      # The bare `-e NAME` flags copy the value of NAME from this step's environment.
      - name: Run Container
        env:
          work_dir: ${{ github.workspace }}
          python_version: "3.10"
        run: |
          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
          echo "container_name=${container_name}" >> "$GITHUB_ENV"
          docker_image="iregistry.baidu-int.com/paddlecloud/base-images:paddlecloud-ubuntu20.04-gcc12.2-cuda12.3-cudnn9.0-nccl2.20.3.1-openmpi4.1.5-latest"
          docker run -d -t --name ${container_name} --net=host -v /dev/shm:/dev/shm --shm-size=32G \
            -v $work_dir/../../..:$work_dir/../../.. \
            -v $work_dir:/workspace \
            -v /home/.cache/pip:/home/.cache/pip \
            -e BRANCH \
            -e PR_ID \
            -e COMMIT_ID \
            -e work_dir \
            -e no_proxy \
            -e python_version \
            -w /workspace ${docker_image}
      # Populates /workspace inside the container:
      #   1. wipes the directory and unpacks the PaddleFormers.tar snapshot,
      #   2. sources the proxy file and updates the checkout with `git pull`,
      #   3. for pull requests, checks out the PR head and merges the base branch,
      #   4. makes sure a local `develop` branch exists.
      # The base branch is fetched as ${BRANCH}:${BRANCH} so that the local branch
      # used by the following `git merge` / `git diff` exists and is current, matching
      # the other workflows in this repository.
      - name: Download Code
        env:
          work_dir: ${{ github.workspace }}
        run: |
          docker exec -t ${container_name} /bin/bash -c '
          rm -rf * .[^.]*
          echo "Downloading PaddleFormers.tar"
          wget -q --no-proxy  https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleFormers.tar --no-check-certificate
          echo "Extracting PaddleFormers.tar"
          tar xf PaddleFormers.tar && rm -rf PaddleFormers.tar
          source $work_dir/../../../proxy
          cd PaddleFormers
          git config --global user.name "PaddleCI"
          git config --global user.email "paddle_ci@example.com"
          git pull
          git submodule update --init --recursive --force
          if [ -n "${PR_ID}" ]; then
            git fetch origin pull/${PR_ID}/head
            git checkout -b PR_${PR_ID} FETCH_HEAD
            git remote add upstream https://github.com/PaddlePaddle/PaddleFormers.git
            git fetch upstream ${BRANCH}:${BRANCH}
            git merge ${BRANCH} --no-edit
            git diff --numstat ${BRANCH} -- | awk "{print \$NF}"
          else
            echo "Not in a pull_request event. Skipping PR-specific operations."
          fi
          git log --pretty=oneline -10
          if ! git show-ref --quiet refs/heads/develop; then \
              echo "local develop branch is missing, creating local develop branch that tracks remote develop branch"
              git fetch origin develop
              git branch develop --track origin/develop
          else
            echo "local develop branch exist, skipping"
          fi
          unset http_proxy && unset https_proxy
          '
      # Points /usr/local/bin/python at the interpreter selected by python_version,
      # configures pip (index URL and cache directory), upgrades pip and runs
      # `make install` in the checkout. `set -e` is enabled only after the symlink
      # is replaced, so a failing `unlink` does not abort the step.
      - name: Setup Environment
        run: |
          docker exec -t $container_name /bin/bash -c '
          unlink /usr/local/bin/python
          ln -sf $(which python${python_version}) /usr/local/bin/python
          set -e
          python -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
          python -m pip config set global.cache-dir "/home/.cache/pip"
          python -m pip install --upgrade pip
          cd /workspace/PaddleFormers && git config --global --add safe.directory $PWD
          make install
          '
      # Runs `make lint` in the checkout after sourcing the proxy file; `set -e`
      # makes any failing command fail the step.
      - name: Test
        run: |
          docker exec -t $container_name /bin/bash -c '
          set -e
          cd /workspace/PaddleFormers
          source $work_dir/../../../proxy
          make lint
          '
      
      # Always force-removes the container; errors are discarded so the cleanup
      # itself never fails the job.
      - name: Terminate And Delete the Container
        if: always()
        run: |
          docker rm -f $container_name 2>/dev/null || true

================================================
FILE: .github/workflows/model-unittest-gpu.yml
================================================
name: Model Unittest GPU CI

on:
  pull_request:
  schedule:
    - cron: "0 18 * * *"
  workflow_call:
    inputs:
      runner:
        required: false
        type: string
      image_name:
        required: false
        type: string

concurrency:
  group: model-unittest-${{ github.workflow }}-${{ github.event.pull_request.number || github.run_id }}
  cancel-in-progress: true

# Workflow-wide variables. PR_ID falls back to '0' and COMMIT_ID / BRANCH fall back
# to the triggering commit / ref name when there is no pull_request payload.
# NOTE(review): CI_SCRIPTS_PATH points at scripts/ci_model_unittest.sh, while the
# Test step invokes scripts/regression/ci_model_unittest.sh — confirm which is current.
env:
  PR_ID: ${{ github.event.pull_request.number || '0' }}
  COMMIT_ID: ${{ github.event.pull_request.head.sha || github.sha }}
  TASK: PaddleFormers-CI-${{ github.event.pull_request.number }}-model-unittest-gpu
  CI_SCRIPTS_PATH: /workspace/PaddleFormers/scripts/ci_model_unittest.sh
  BRANCH: ${{ github.event.pull_request.base.ref || github.ref_name }}
  AGILE_COMPILE_BRANCH: ${{ github.event.pull_request.base.ref }}
  CI_JOB_NAME: model-unittest-gpu-ci
  no_proxy: "localhost,bj.bcebos.com,su.bcebos.com,bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn"

defaults:
  run:
    shell: bash

jobs:
  # Gate job: publishes `skip` = 'true' when the pull request carries a label
  # containing "skip-models-ci", and 'false' otherwise (including non-PR events).
  check_skip:
    name: Check skip-models-ci label
    runs-on: ubuntu-latest

    outputs:
      skip: ${{ steps.check_skip.outputs.skip }}

    steps:
      - name: Check skip-models-ci label
        id: check_skip
        shell: bash
        # Event data is handed to the script through environment variables rather
        # than being expanded into the script text, so a label name containing
        # quotes or shell metacharacters cannot alter the script.
        env:
          EVENT_NAME: ${{ github.event_name }}
          PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
        run: |
          if [[ "$EVENT_NAME" == "pull_request" ]]; then
            echo "PR labels: $PR_LABELS"

            if echo "$PR_LABELS" | grep -q "skip-models-ci"; then
              echo "skip=true" >> "$GITHUB_OUTPUT"
            else
              echo "skip=false" >> "$GITHUB_OUTPUT"
            fi
          else
            echo "skip=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Skip CI but mark success
        if: steps.check_skip.outputs.skip == 'true'
        run: |
          echo "skip-models-ci label found"
          echo "Downstream GPU jobs will be skipped"

  # GPU job; runs only when check_skip reported skip == 'false'.
  model-unittest-gpu-ci:
    needs: check_skip
    if: needs.check_skip.outputs.skip == 'false'
    name: model-unittest-gpu-ci
    # Use the caller-supplied runner when given, otherwise 'ernie-8gpu'. A non-empty
    # string literal is always truthy, so any further `||` fallback would be dead.
    runs-on: ${{ inputs.runner || 'ernie-8gpu' }}
    steps:
      # Exports IMAGE_NAME for the following steps: the `image_name` workflow input
      # when it is non-empty, otherwise the default cuda126 dev image.
      - name: Determine Image Name
        env:
          IMAGE_NAME: ${{ inputs.image_name }}
        run: |
          if [[ -n "${IMAGE_NAME}" ]]; then
            echo "IMAGE_NAME=${IMAGE_NAME}" >> "$GITHUB_ENV"
          else
            echo "IMAGE_NAME=ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda126-dev-latest" >> "$GITHUB_ENV"
          fi
      # Starts the privileged GPU container used by the remaining steps and exports
      # its name through GITHUB_ENV. The GPU flag depends on the Docker server
      # version: `--runtime=nvidia` below 19.03, `--gpus all` otherwise.
      # The workspace is mounted at /workspace and the host model directory at
      # /home/models/, which is also exported to the container as PF_HOME.
      - name: Run Container
        env:
          work_dir: ${{ github.workspace }}
          FLAGS_dynamic_static_unified_comm: "True"
          python_version: "3.10"
          PIP_CACHE_DIR: /root/.cache/pip
          paddle_whl: https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
        run: |
          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
          echo "container_name=${container_name}" >> "$GITHUB_ENV"
          echo "Workspace path: ${{ github.workspace }}"
          DOCKER_VER=$(docker version --format '{{.Server.Version}}' | cut -d. -f1,2)
          if (( $(echo "$DOCKER_VER < 19.03" | bc -l) )); then
            GPU_OPTION="--runtime=nvidia"
          else
            GPU_OPTION="--gpus all"
          fi
          echo "DOCKER_VER=${DOCKER_VER}"
          echo "GPU_OPTION=${GPU_OPTION}"
          docker run -d -t ${GPU_OPTION} --name ${container_name} --net=host -v /dev/shm:/dev/shm --shm-size=32G \
            -v ${work_dir}/../../..:${work_dir}/../../.. \
            -v ${work_dir}:/workspace \
            -v /home/.cache/:/root/.cache/ \
            -v /home/paddle-1/models/:/home/models/ \
            -e "BRANCH=$BRANCH" \
            -e "AGILE_COMPILE_BRANCH=$AGILE_COMPILE_BRANCH" \
            -e "PR_ID=$PR_ID" \
            -e "COMMIT_ID=$COMMIT_ID" \
            -e "work_dir=$work_dir" \
            -e "CI_SCRIPTS_PATH=$CI_SCRIPTS_PATH" \
            -e "no_proxy=$no_proxy" \
            -e "CI_JOB_NAME=$CI_JOB_NAME" \
            -e "paddle_whl=$paddle_whl" \
            -e "FLAGS_dynamic_static_unified_comm=$FLAGS_dynamic_static_unified_comm" \
            -e "python_version=$python_version" \
            -e HF_PROXY_PATH=${work_dir}/../../../proxy_huggingface \
            -e AISTUDIO_PROXY_PATH=${work_dir}/../../../proxy_aistudio \
            -e PF_HOME=/home/models/ \
            -e PIP_CACHE_DIR \
            -w /workspace --privileged ${IMAGE_NAME}
            
      # Populates /workspace inside the container: wipes it, unpacks the
      # PaddleFormers.tar snapshot, updates it with `git pull`, and, when PR_ID is
      # set and not "0", checks out the PR head and merges the freshly fetched
      # base branch into it. Finally prints the last ten commits.
      - name: Download Code
        run: |
          docker exec -t $container_name /bin/bash -c '
            rm -rf * .[^.]*
            echo "Downloading PaddleFormers.tar"
            wget -q --no-proxy  https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleFormers.tar --no-check-certificate
            echo "Extracting PaddleFormers.tar"
            rm -rf PaddleFormers
            tar xf PaddleFormers.tar && rm -rf PaddleFormers.tar
            echo "work_dir = ${work_dir}"
            source ${work_dir}/../../../proxy
            cd PaddleFormers
            git config --global user.name "PaddleCI"
            git config --global user.email "paddle_ci@example.com"
            git pull
            git submodule update --init --recursive --force
            if [ -n "$PR_ID" ] && [ "$PR_ID" != "0" ]; then
              git fetch origin pull/${PR_ID}/head
              git checkout -b PR_${PR_ID} FETCH_HEAD
              git remote add upstream https://github.com/PaddlePaddle/PaddleFormers.git
              echo "Checking out ${BRANCH}..."
              git fetch upstream ${BRANCH}:${BRANCH}
              git merge ${BRANCH} --no-edit
              git diff --numstat ${BRANCH} -- | awk "{print \$NF}"
            else
              echo "Not in a pull_request event. Skipping PR-specific operations."
            fi
            git log --pretty=oneline -10
            '

      # Prepares the container (ldconfig, pip cache directory, removal of the
      # aistudio cache), copies the models directory next to the checkout and runs
      # scripts/regression/ci_model_unittest.sh with the Paddle wheel URL, the
      # literal "false" and AGILE_COMPILE_BRANCH as arguments.
      # `set -e` is enabled only after the initial preparation commands.
      - name: Test
        run: |
          docker exec -t $container_name /bin/bash -c '
          ldconfig
          mkdir -p /root/.cache/pip
          pip cache dir
          set -e
          rm -rf /root/.cache/aistudio/
          cd /workspace/PaddleFormers && git config --global --add safe.directory $PWD
          echo "work_dir = ${work_dir}"
          cp -r ${work_dir}/../../../models ./models
          echo "Check whether the local model file exists:"
          ls -l ./models
          bash -x scripts/regression/ci_model_unittest.sh ${paddle_whl} false ${AGILE_COMPILE_BRANCH}
          '
          
      # Uploads every file in model_unittest_logs through BosClient.py, downloading
      # and unpacking the BOS client first when BosClient.py is missing.
      # The destination prefix is "<PR_ID>/<COMMIT_ID>" for pull requests and
      # "schedule/<YYYYMMDD>" otherwise. Runs regardless of earlier step results.
      - name: Upload Products
        if: always()
        env:
          home_path: ${{ github.workspace }}/../../..
          bos_file: ${{ github.workspace }}/../../../bos/BosClient.py
        run: |
          docker exec -t $container_name /bin/bash -c '
          if [ ! -f "${{ env.bos_file }}" ]; then
            wget -q --no-proxy -O ${{ env.home_path }}/bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
            mkdir ${{ env.home_path }}/bos
            tar xf ${{ env.home_path }}/bos_new.tar.gz -C ${{ env.home_path }}/bos
          fi
          if [ -n "$PR_ID" ] && [ "$PR_ID" != "0" ]; then
            bos_prefix="${PR_ID}/${COMMIT_ID}"
          else
            bos_prefix="schedule/$(date +%Y%m%d)"
          fi
          # logs
          cd /workspace/PaddleFormers/model_unittest_logs
          for FILE in /workspace/PaddleFormers/model_unittest_logs/*; do
            file=$(basename "$FILE")
            python ${{ env.bos_file }} $file paddle-github-action/PR/PaddleFormers/model-unittest-gpu/${bos_prefix}/logs
            echo "$file: https://paddle-github-action.bj.bcebos.com/PR/PaddleFormers/model-unittest-gpu/${bos_prefix}/logs/$file"
          done
          '

      # Always force-removes the container; errors are discarded so the cleanup
      # itself never fails the job.
      - name: Terminate And Delete the Container
        if: always()
        run: |
          docker rm -f $container_name 2>/dev/null || true

================================================
FILE: .github/workflows/requirements-review.yml
================================================
name: Check Requirements Need Approval

on:
  pull_request:
    types: [opened, synchronize, reopened, ready_for_review]
    paths:
      - 'requirements.txt'
  pull_request_review:
    types: [submitted]

jobs:
  # Fails unless a pull request that changes requirements.txt currently holds an
  # approval from at least one of the listed reviewers. Runs on pull request
  # updates that touch requirements.txt and on every submitted review.
  check-review:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Check if requirements.txt changed
        id: check_diff
        env:
          # github.base_ref is only populated for pull_request / pull_request_target
          # events. The payload field below is also present for pull_request_review,
          # so the check works for both triggers of this workflow.
          BASE_REF: ${{ github.event.pull_request.base.ref }}
        run: |
          git fetch origin "$BASE_REF"
          # -F/-x: match the file name literally and as a whole line.
          if git diff "origin/$BASE_REF" --name-only | grep -Fxq 'requirements.txt'; then
            echo "changed=true" >> "$GITHUB_OUTPUT"
          else
            echo "changed=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Skip if requirements.txt not changed
        if: steps.check_diff.outputs.changed == 'false'
        run: |
          echo "requirements.txt not changed, skip approval check"

      - name: Check required reviewer approval
        if: steps.check_diff.outputs.changed == 'true'
        uses: actions/github-script@v7
        with:
          script: |
            const required = [
              "nepeplwu",
              "lugimzzz",
              "zjjlivein"
            ];

            // Fetch every page of reviews; a single listReviews call returns
            // only the first page.
            const reviews = await github.paginate(github.rest.pulls.listReviews, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              pull_number: context.payload.pull_request.number,
              per_page: 100,
            });

            // Reviews arrive in chronological order. Keep only each required
            // reviewer's latest decisive state: a later CHANGES_REQUESTED or
            // DISMISSED revokes an earlier approval, while plain comments
            // leave it untouched.
            const approved = new Set();
            for (const r of reviews) {
              const login = r.user ? r.user.login : undefined;
              if (!required.includes(login)) {
                continue;
              }
              if (r.state === "APPROVED") {
                approved.add(login);
              } else if (r.state === "CHANGES_REQUESTED" || r.state === "DISMISSED") {
                approved.delete(login);
              }
            }

            if (approved.size === 0) {
              core.setFailed(
                `requirements.txt changed: need approval from one of ${required.join(", ")}`
              );
            } else {
              core.info(`Approved by: ${Array.from(approved).join(", ")}`);
            }

================================================
FILE: .github/workflows/rerun.yml
================================================
name: Re-run

on:
  issue_comment:
    types: [created]

jobs:
  # Re-runs CI jobs when the pull request author comments "/re-run" on their own
  # pull request. The comment body selects what is re-run: "all-failed",
  # "Unittest GPU", "Unittest CPU" and/or "Codestyle Check".
  re-run:
    if: ${{ github.event.issue.pull_request && contains(github.event.comment.body, '/re-run')  && github.event.comment.user.login == github.event.issue.user.login }}
    runs-on: ubuntu-latest
    steps:
      # Needed because the rerun action is referenced by local path below.
      # Uses the same checkout major version as the other workflows in this repository.
      - name: Checkout Code
        uses: actions/checkout@v4

      - name: Rerun All Failed Jobs
        if: ${{ contains(github.event.comment.body, 'all-failed') }}
        uses: ./.github/actions/rerun-workflow
        with:
          PR_ID: ${{ github.event.issue.number }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          OWNER: ${{ github.repository_owner }}
          REPO: ${{ github.event.repository.name }}
          JOB_NAME: 'all-failed'

      - name: Rerun Unittest GPU
        if: ${{ contains(github.event.comment.body, 'Unittest GPU') }}
        uses: ./.github/actions/rerun-workflow
        with:
          PR_ID: ${{ github.event.issue.number }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          OWNER: ${{ github.repository_owner }}
          REPO: ${{ github.event.repository.name }}
          JOB_NAME: 'Unittest GPU CI / unittest-gpu-ci'

      - name: Rerun Unittest CPU
        if: ${{ contains(github.event.comment.body, 'Unittest CPU') }}
        uses: ./.github/actions/rerun-workflow
        with:
          PR_ID: ${{ github.event.issue.number }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          OWNER: ${{ github.repository_owner }}
          REPO: ${{ github.event.repository.name }}
          JOB_NAME: 'Unittest CPU CI / unittest-cpu-ci'

      - name: Rerun Codestyle Check
        if: ${{ contains(github.event.comment.body, 'Codestyle Check') }}
        uses: ./.github/actions/rerun-workflow
        with:
          PR_ID: ${{ github.event.issue.number }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          OWNER: ${{ github.repository_owner }}
          REPO: ${{ github.event.repository.name }}
          JOB_NAME: 'Codestyle Check / Lint'

================================================
FILE: .github/workflows/stale.yml
================================================
name: Stale

on:
  # Allow manual run via GitHub web or CLI
  workflow_dispatch:
  schedule:
    # Run daily at midnight UTC
    - cron: 0 0 * * *

permissions:
  issues: write
  pull-requests: write

jobs:
  # Labels inactive issues and pull requests as "stale" using actions/stale.
  stale:
    runs-on: ubuntu-24.04

    permissions:
      issues: write
      pull-requests: write

    steps:
      - uses: actions/stale@v6.0.1
        with:
          # Issues: marked stale after 60 days, closed 14 days later.
          days-before-issue-stale: 60
          days-before-issue-close: 14
          stale-issue-label: "stale"
          stale-issue-message: "This issue is stale because it has been open for 60 days with no activity. 当前issue 60天内无活动，被标记为stale。"
          close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale. 当前issue 被标记为stale已有14天，即将关闭。"
          # Issues carrying either of these labels are exempt.
          exempt-issue-labels: 'triage,keep'
          # Pull requests: marked stale after 60 days.
          # NOTE(review): -1 presumably disables automatic closing — confirm against the action docs.
          days-before-pr-stale: 60
          days-before-pr-close: -1
          stale-pr-label: "stale"
          stale-pr-message: "This Pull Request is stale because it has been open for 60 days with no activity. 当前Pull Request 60天内无活动，被标记为stale。"
          operations-per-run: 400


================================================
FILE: .github/workflows/unittest-gpu.yml
================================================
name: Unittest GPU CI

on:
  pull_request:
  schedule:
    - cron: "0 18 * * *"
  workflow_call:
    inputs:
      runner:
        required: false
        type: string
      image_name:
        required: false
        type: string

concurrency:
  group: unittest-${{ github.workflow }}-${{ github.event.pull_request.number || github.run_id }}
  cancel-in-progress: true

# Workflow-wide variables. PR_ID falls back to '0' and COMMIT_ID / BRANCH fall back
# to the triggering commit / ref name when there is no pull_request payload.
env:
  PR_ID: ${{ github.event.pull_request.number || '0' }}
  COMMIT_ID: ${{ github.event.pull_request.head.sha || github.sha }}
  TASK: PaddleFormers-CI-${{ github.event.pull_request.number }}-unittest-gpu
  ci_scripts: /workspace/PaddleFormers/scripts/unit_test
  BRANCH: ${{ github.event.pull_request.base.ref || github.ref_name }}
  AGILE_COMPILE_BRANCH: ${{ github.event.pull_request.base.ref }}
  CI_name: unittest-gpu-ci
  # The "Run Container" step forwards $CI_JOB_NAME into the container; define it
  # here so that value is not empty. CI_name is kept for anything still reading it.
  CI_JOB_NAME: unittest-gpu-ci
  no_proxy: "localhost,bj.bcebos.com,su.bcebos.com,bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn"
  # Flag file path on the runner, keyed by PR number and commit; the job checks for
  # its existence to decide whether coverage is uploaded.
  PYTEST_EXECUTE_FLAG_FILE: ${{ github.workspace }}/../../../PYTEST_EXECUTE_FLAG_FILE/${{ github.event.pull_request.number || '0' }}/${{ github.event.pull_request.head.sha || github.sha }}/pytest_execute.flag
  PYTEST_EXECUTE_FLAG: false

defaults:
  run:
    shell: bash

jobs:
  # Gate job: publishes `skip` = 'true' when the pull request carries a label
  # containing "skip-unittest-ci", and 'false' otherwise (including non-PR events).
  check_skip:
    name: Check skip-unittest-ci label
    runs-on: ubuntu-latest

    outputs:
      skip: ${{ steps.check_skip.outputs.skip }}

    steps:
      - name: Check skip-unittest-ci label
        id: check_skip
        shell: bash
        # Event data is handed to the script through environment variables rather
        # than being expanded into the script text, so a label name containing
        # quotes or shell metacharacters cannot alter the script.
        env:
          EVENT_NAME: ${{ github.event_name }}
          PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
        run: |
          if [[ "$EVENT_NAME" == "pull_request" ]]; then
            echo "PR labels: $PR_LABELS"

            if echo "$PR_LABELS" | grep -q "skip-unittest-ci"; then
              echo "skip=true" >> "$GITHUB_OUTPUT"
            else
              echo "skip=false" >> "$GITHUB_OUTPUT"
            fi
          else
            echo "skip=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Skip CI but mark success
        if: steps.check_skip.outputs.skip == 'true'
        run: |
          echo "skip-unittest-ci label found"
          echo "Downstream GPU jobs will be skipped"

  # GPU job; runs only when check_skip reported skip == 'false'.
  unittest-gpu-ci:
    needs: check_skip
    if: needs.check_skip.outputs.skip == 'false'
    name: unittest-gpu-ci
    # Use the caller-supplied runner when given, otherwise 'ernie-8gpu'. A non-empty
    # string literal is always truthy, so any further `||` fallback would be dead.
    runs-on: ${{ inputs.runner || 'ernie-8gpu' }}
    # Consumed by the upload-coverage job to decide whether coverage is uploaded.
    outputs:
      pytest_execute_flag: ${{ steps.set_pytest_flag.outputs.pytest_execute_flag }}
    steps:
      # Exports IMAGE_NAME for the following steps: the `image_name` workflow input
      # when it is non-empty, otherwise the default cuda126 dev image.
      - name: Determine Image Name
        env:
          IMAGE_NAME: ${{ inputs.image_name }}
        run: |
          if [[ -n "${IMAGE_NAME}" ]]; then
            echo "IMAGE_NAME=${IMAGE_NAME}" >> "$GITHUB_ENV"
          else
            echo "IMAGE_NAME=ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda126-dev-latest" >> "$GITHUB_ENV"
          fi
      
      # Starts the privileged GPU container used by the remaining steps and exports
      # its name through GITHUB_ENV. The GPU flag depends on the Docker server
      # version: `--runtime=nvidia` below 19.03, `--gpus all` otherwise.
      # The workspace is mounted at /workspace and the host model directory at
      # /home/models/, which is also exported to the container as PF_HOME.
      - name: Run Container
        env:
          work_dir: ${{ github.workspace }}
          FLAGS_dynamic_static_unified_comm: "True"
          python_version: "3.10"
          PIP_CACHE_DIR: /root/.cache/pip
          paddle_whl: https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
        run: |
          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
          echo "container_name=${container_name}" >> "$GITHUB_ENV"
          echo "Workspace path: ${{ github.workspace }}"
          DOCKER_VER=$(docker version --format '{{.Server.Version}}' | cut -d. -f1,2)
          if (( $(echo "$DOCKER_VER < 19.03" | bc -l) )); then
            GPU_OPTION="--runtime=nvidia"
          else
            GPU_OPTION="--gpus all"
          fi
          echo "DOCKER_VER=${DOCKER_VER}"
          echo "GPU_OPTION=${GPU_OPTION}"
          docker run -d -t ${GPU_OPTION} --name ${container_name} --net=host -v /dev/shm:/dev/shm --shm-size=32G \
            -v $work_dir/../../..:$work_dir/../../.. \
            -v $work_dir:/workspace \
            -v /home/.cache/:/root/.cache/ \
            -v /home/paddle-1/models/:/home/models/ \
            -e PF_HOME=/home/models/ \
            -e "BRANCH=$BRANCH" \
            -e "AGILE_COMPILE_BRANCH=$AGILE_COMPILE_BRANCH" \
            -e "PR_ID=$PR_ID" \
            -e "COMMIT_ID=$COMMIT_ID" \
            -e "work_dir=$work_dir" \
            -e "ci_scripts=$ci_scripts" \
            -e "PYTEST_EXECUTE_FLAG_FILE=$PYTEST_EXECUTE_FLAG_FILE" \
            -e "no_proxy=$no_proxy" \
            -e "CI_JOB_NAME=$CI_JOB_NAME" \
            -e "paddle_whl=$paddle_whl" \
            -e "FLAGS_dynamic_static_unified_comm=$FLAGS_dynamic_static_unified_comm" \
            -e "python_version=$python_version" \
            -e "HF_PROXY_PATH=$work_dir/../../../proxy_huggingface" \
            -e "AISTUDIO_PROXY_PATH=$work_dir/../../../proxy_aistudio" \
            -e "HF_DATASETS_CACHE=$work_dir/../../../paddlenlp/huggingface/datasets" \
            -e "TRANSFORMERS_CACHE=$work_dir/../../../paddlenlp/huggingface" \
            -e PIP_CACHE_DIR \
            -w /workspace --privileged $IMAGE_NAME

      # Populates /workspace inside the container: wipes it, unpacks the
      # PaddleFormers.tar snapshot, updates it with `git pull`, and, when PR_ID is
      # set and not "0", checks out the PR head and merges the freshly fetched
      # base branch into it. Finally prints the last ten commits.
      - name: Download Code
        run: |
          docker exec -t $container_name /bin/bash -c '
            rm -rf * .[^.]*
            echo "Downloading PaddleFormers.tar"
            wget -q --no-proxy  https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleFormers.tar --no-check-certificate
            echo "Extracting PaddleFormers.tar"
            rm -rf PaddleFormers
            tar xf PaddleFormers.tar && rm -rf PaddleFormers.tar
            source $work_dir/../../../proxy
            cd PaddleFormers
            git config --global user.name "PaddleCI"
            git config --global user.email "paddle_ci@example.com"
            git pull
            git submodule update --init --recursive --force
            if [ -n "$PR_ID" ] && [ "$PR_ID" != "0" ]; then
              git fetch origin pull/${PR_ID}/head
              git checkout -b PR_${PR_ID} FETCH_HEAD
              git remote add upstream https://github.com/PaddlePaddle/PaddleFormers.git
              git fetch upstream ${BRANCH}:${BRANCH}
              git merge ${BRANCH} --no-edit
              git diff --numstat ${BRANCH} -- | awk "{print \$NF}"
            else
              echo "Not in a pull_request event. Skipping PR-specific operations."
            fi
            git log --pretty=oneline -10
            '

      # Prepares the container (ldconfig, pip cache directory, removal of the
      # aistudio cache, installation of the .deb packages under
      # /home/models/my_packages), sources the proxy and AISTUDIO_ACCESS_TOKEN
      # files, copies the models directory next to the checkout and runs
      # scripts/unit_test/ci_unittest.sh with the Paddle wheel URL, the literal
      # "false", the flag-file path and AGILE_COMPILE_BRANCH as arguments.
      # `set -e` is enabled only after the initial preparation commands.
      - name: Test
        run: |
          docker exec -t $container_name /bin/bash -c '
          ldconfig
          mkdir -p /root/.cache/pip
          pip cache dir
          set -e
          rm -rf /root/.cache/aistudio/
          cd /home/models/my_packages && dpkg -i *.deb
          cd /workspace/PaddleFormers && git config --global --add safe.directory $PWD
          source $work_dir/../../../proxy
          source $work_dir/../../../AISTUDIO_ACCESS_TOKEN
          echo "work_dir = ${work_dir}"
          cp -r ${work_dir}/../../../models ./models
          echo "Check whether the local model file exists:"
          ls -l ./models
          bash -x scripts/unit_test/ci_unittest.sh ${paddle_whl} false ${PYTEST_EXECUTE_FLAG_FILE} ${AGILE_COMPILE_BRANCH}
          '
          
      # Uploads coverage.xml (only when the pytest flag file exists) and every file
      # in unittest_logs through BosClient.py, downloading and unpacking the BOS
      # client first when BosClient.py is missing. The destination prefix is
      # "<PR_ID>/<COMMIT_ID>" for pull requests and "schedule/github-ci-<YYYYMMDD>"
      # otherwise. The allure report generation is currently commented out.
      # NOTE(review): unlike the cleanup step this one has no `if: always()`, so it
      # does not run after a failed Test step — confirm that is intended.
      - name: Upload Products
        env:
          home_path: ${{ github.workspace }}/../../..
          bos_file: ${{ github.workspace }}/../../../bos/BosClient.py
          allure_file: ${{ github.workspace }}/../../../allure-2.19.0/bin/allure
        run: |
          docker exec -t $container_name /bin/bash -c '
          if [ ! -f "${{ env.bos_file }}" ]; then
            wget -q --no-proxy -O ${{ env.home_path }}/bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
            mkdir ${{ env.home_path }}/bos
            tar xf ${{ env.home_path }}/bos_new.tar.gz -C ${{ env.home_path }}/bos
          fi
          # if [ ! -f "${{ env.allure_file }}" ]; then
          #   wget -q --no-proxy -O ${{ env.home_path }}/allure-2.19.0.zip https://xly-devops.bj.bcebos.com/tools/allure-2.19.0.zip --no-check-certificate
          #   unzip -q ${{ env.home_path }}/allure-2.19.0.zip
          # fi
          if [ -n "$PR_ID" ] && [ "$PR_ID" != "0" ]; then
            bos_prefix="${PR_ID}/${COMMIT_ID}"
          else
            bos_prefix="schedule/github-ci-$(date +%Y%m%d)"
          fi
          # coverage.xml
          if [ -f "${PYTEST_EXECUTE_FLAG_FILE}" ]; then
            echo "PYTEST_EXECUTE_FLAG_FILE found, uploading coverage.xml."
            cd /workspace/PaddleFormers
            python ${{ env.bos_file }} coverage.xml paddle-github-action/PR/PaddleFormers/unittest-gpu/${bos_prefix}/logs
            echo "cov-report: https://paddle-github-action.bj.bcebos.com/PR/PaddleFormers/unittest-gpu/${bos_prefix}/logs/coverage.xml"
          else
            echo "PYTEST_EXECUTE_FLAG_FILE not found, skipping coverage.xml upload."
          fi
          # logs
          cd /workspace/PaddleFormers/unittest_logs
          for FILE in /workspace/PaddleFormers/unittest_logs/*; do
            file=$(basename "$FILE")
            python ${{ env.bos_file }} $file paddle-github-action/PR/PaddleFormers/unittest-gpu/${bos_prefix}/logs
            echo "$file: https://paddle-github-action.bj.bcebos.com/PR/PaddleFormers/unittest-gpu/${bos_prefix}/logs/$file"
          done
          # allure
          # cd /workspace/PaddleFormers/
          # ${{ env.allure_file }} generate result -o report
          # tar -czf report.tar.gz report
          # python ${{ env.bos_file }} report.tar.gz paddle-github-action/PR/PaddleFormers/unittest-gpu/${bos_prefix}/logs
          # echo "report: https://paddle-github-action.bj.bcebos.com/PR/PaddleFormers/unittest-gpu/${bos_prefix}/logs/report.tar.gz"
          '
      # Publishes the step output `pytest_execute_flag` ("true" or "false") depending
      # on whether the flag file exists on the runner; the job exposes it as a job output.
      - name: Set pytest execute flag output
        id: set_pytest_flag  # Step dedicated to setting the output
        run: |
          # Check whether the flag file exists and set the job output accordingly
          if [ -f "${{ env.PYTEST_EXECUTE_FLAG_FILE }}" ]; then
            echo "pytest_execute_flag=true" >> $GITHUB_OUTPUT
            echo "PYTEST_EXECUTE_FLAG_FILE exists, setting flag to true"
          else
            echo "pytest_execute_flag=false" >> $GITHUB_OUTPUT
            echo "PYTEST_EXECUTE_FLAG_FILE does not exist, setting flag to false"
          fi

      # Always force-removes the container; errors are discarded so the cleanup
      # itself never fails the job.
      - name: Terminate And Delete the Container
        if: always()
        run: |
          docker rm -f $container_name 2>/dev/null || true
          
  # Downloads the coverage.xml produced by unittest-gpu-ci from BOS, rewrites its
  # <source> entry to "paddleformers" and uploads it to Codecov. Every step after
  # the flag check is skipped unless unittest-gpu-ci reported pytest_execute_flag == "true".
  upload-coverage:
    name: upload-coverage
    needs: [unittest-gpu-ci]
    # Run even when unittest-gpu-ci failed or was skipped; the flag check decides the rest.
    if: always()
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Check if should upload coverage
        id: flag_check
        run: |
          echo "needs.unittest-gpu-ci.outputs.pytest_execute_flag = ${{ needs.unittest-gpu-ci.outputs.pytest_execute_flag }}"
          if [ "${{ needs.unittest-gpu-ci.outputs.pytest_execute_flag }}" = "true" ]; then
            echo "pytest_execute_flag is true, proceeding to upload coverage."
            echo "should_upload=true" >> "$GITHUB_OUTPUT"
          else
            echo "pytest_execute_flag is false, skipping coverage upload."
            echo "should_upload=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Download coverage.xml
        if: steps.flag_check.outputs.should_upload == 'true'
        env:
          PR_ID: ${{ github.event.pull_request.number || '0' }}
          COMMIT_ID: ${{ github.event.pull_request.head.sha || github.sha }}
        run: |
          if [ -n "$PR_ID" ] && [ "$PR_ID" != "0" ]; then
            bos_prefix="${PR_ID}/${COMMIT_ID}"
          else
            # Must match the prefix used by the "Upload Products" step of
            # unittest-gpu-ci, which writes to schedule/github-ci-<date>.
            bos_prefix="schedule/github-ci-$(date +%Y%m%d)"
          fi
          echo "bos_prefix=${bos_prefix}"
          wget -q --no-proxy \
            https://paddle-github-action.bj.bcebos.com/PR/PaddleFormers/unittest-gpu/${bos_prefix}/logs/coverage.xml \
            --no-check-certificate -O coverage.xml

      - name: Fix coverage.xml paths
        if: steps.flag_check.outputs.should_upload == 'true'
        run: |
          echo "Before fix:"
          head -n 10 coverage.xml || true

          # Take the first <source> entry; replace it if present, otherwise
          # inject one right after the opening <sources> tag.
          old_source=$(grep -oPm1 '(?<=<source>).*?(?=</source>)' coverage.xml || true)
          if [ -n "$old_source" ]; then
            echo "Replacing source '$old_source' with 'paddleformers'"
            sed -i "s|<source>$old_source</source>|<source>paddleformers</source>|g" coverage.xml
          else
            echo "No <source> found, injecting <source>paddleformers</source>"
            sed -i 's|<sources>|<sources>\n        <source>paddleformers</source>|' coverage.xml
          fi

          echo "After fix:"
          head -n 10 coverage.xml || true

      - name: Upload coverage to Codecov
        if: steps.flag_check.outputs.should_upload == 'true'
        uses: codecov/codecov-action@v4
        with:
          files: coverage.xml
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

================================================
FILE: .github/workflows/update-precision.yml
================================================
name: update precision

on:
  push:
    branches:
      - develop

jobs:
  # Decides whether the precision baseline needs updating for the pushed commit.
  # It waits 30 seconds, asks the GitHub API for the pull requests associated with
  # the commit, and tries to download precision_list.txt for the resulting PR
  # number. `need_change` is "true" when that download succeeds, "false" otherwise.
  # Only runs in the PaddlePaddle/PaddleFormers repository.
  determine-whether-update:
    name: Determine whether to update
    if: github.repository == 'PaddlePaddle/PaddleFormers'
    runs-on: ubuntu-latest
    outputs:
      need_change: ${{ steps.determine.outputs.need_change }}
    steps:
      # NOTE(review): pr_number is empty when the commit has no associated pull
      # request and may hold several lines when it has more than one; the wget
      # below is then presumably expected to fail — confirm.
      - name: Determine whether to update
        id: determine
        run: |
          sleep 30
          response=$(curl -L \
          -H "Accept: application/vnd.github+json" \
          -H "Authorization: Bearer ${{ github.token }}" \
          -H "X-GitHub-Api-Version: 2022-11-28" \
          https://api.github.com/repos/PaddlePaddle/PaddleFormers/commits/${{ github.sha }}/pulls)
          pr_number=$(echo "$response" | jq -r '.[] | select(.url | contains("PaddlePaddle/PaddleFormers")) | .number')
          set +e
          wget --no-proxy --no-check-certificate https://paddle-github-action.cdn.bcebos.com/PaddleFleet/precision/PaddleFormers/${pr_number}/precision_list.txt
          if [ $? -eq 0 ]; then
            echo "need_change=true" >> "$GITHUB_OUTPUT"
          else
            echo "need_change=false" >> "$GITHUB_OUTPUT"
          fi

  # Runs tests/integration_test/update_precision.sh when the previous job reported
  # need_change == 'true'. It installs bce-python-sdk, downloads and unpacks the
  # BOS client into ./bos, and passes the AK/SK secrets, the GitHub token, the
  # commit id and the repository name to the script through the environment.
  update-precision:
    name: Update precision
    needs: determine-whether-update
    if: needs.determine-whether-update.outputs.need_change == 'true'
    runs-on: ubuntu-latest
    steps:
      # NOTE(review): this workflow triggers on push, where the pull_request payload
      # is absent, so `ref` is presumably empty and checkout falls back to its
      # default ref — confirm.
      - name: Clone PaddleFormers
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.base.ref }}
          fetch-depth: 100

      - name: Setup python3.10
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Update precision
        env:
          AK: ${{ secrets.AK }}
          SK: ${{ secrets.SK }}
          GITHUB_TOKEN: ${{ github.token }}
          COMMIT_ID: ${{ github.sha }}
          GITHUB_REPO_NAME: ${{ github.repository }}
        run: |
          python -m pip install bce-python-sdk==0.8.74
          wget -q -O bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
          mkdir bos
          tar xf bos_new.tar.gz -C bos
          bash -x tests/integration_test/update_precision.sh
          
        

================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
build*
!scripts/dependence/build.sh
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.doctree
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pycharm
.idea/

# vscode
.vscode
# gitignore patterns must not start with "./" (such a pattern never matches)
ppdiffusers/ppdiffusers/version.py

# third party
csrc/third_party/
dataset/
output/
!tests/dataset/

# gen codes
autogen/

# cutlass kernel (re-included because "build*" above would otherwise match it)
!csrc/gpu/cutlass_kernels/gemm/collective/builders


#fp8
ops/csrc/fp8/deep_gemm/include/cutlass
ops/csrc/fp8/deep_gemm/include/cute
.ccls-cache
.DS_Store
FETCH_HEAD

# logs and running results
paddleformers_dist_log
checkpoints

================================================
FILE: .pre-commit-config.yaml
================================================
# pre-commit hook configuration: each entry under `repos` pins a hook
# repository to a revision and lists the hooks enabled from it.
repos:
# For Python files
# black formatter, restricted to .py / .pyi files.
-   repo: https://github.com/psf/black.git
    rev: 22.8.0
    hooks:
    -   id: black
        files: \.(py|pyi)$
        additional_dependencies: [toml]
# isort import sorter.
-   repo: https://github.com/PyCQA/isort
    rev: 5.11.5
    hooks:
    -   id: isort
# flake8 linter.
-   repo: https://github.com/PyCQA/flake8
    rev: 4.0.1
    hooks:
    -   id: flake8
# Generic repository hygiene hooks.
-   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.1.0
    hooks:
    -   id: check-merge-conflict
    -   id: check-symlinks
    # The negative lookahead makes this hook skip every path containing "paddle".
    -   id: detect-private-key
        files: (?!.*paddle)^.*$
    # The two whitespace fixers below only run on Markdown files.
    -   id: end-of-file-fixer
        files: \.md$
    -   id: trailing-whitespace
        files: \.md$
# CRLF and tab checks/fixers, all limited to Markdown files.
-   repo: https://github.com/Lucas-C/pre-commit-hooks
    rev: v1.1.14
    hooks:
    -   id: forbid-crlf
        files: \.md$
    -   id: remove-crlf
        files: \.md$
    -   id: forbid-tabs
        files: \.md$
    -   id: remove-tabs
        files: \.md$
# Local hook: runs the repository's .copyright.hook script with the system
# Python on C/C++/CUDA/proto/Python/shell sources.
-   repo: local
    hooks:
    -   id: copyright_checker
        name: copyright_checker
        entry: python .copyright.hook
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps|py|sh)$
# For Markdown files
# Local hook: passes the matching Markdown file names to check_spaces.py.
-   repo: local
    hooks:
    -   id: add-spaces-between-chinese-and-english
        name: Add spaces between Chinese and English characters
        entry: python scripts/codestyle/check_spaces.py
        language: python
        files: \.(md|markdown)$
        pass_filenames: true
# For dead links
# Local hook: passes the matching Markdown / reST file names to check_dead_links.py.
-   repo: local
    hooks:
    -   id: check-dead-links
        name: Check dead links
        entry: python scripts/codestyle/check_dead_links.py
        language: python
        files: \.(md|markdown|rst)$
        pass_filenames: true

================================================
FILE: .readthedocs.yaml
================================================
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required: configuration schema version and build environment
version: 2
build:
  os: "ubuntu-20.04"
  tools:
    python: "3.10"

# Check out every git submodule, recursively, before building
submodules:
  include: all
  recursive: true

# Build the documentation with Sphinx, using the Chinese docs configuration
sphinx:
   configuration: docs/zh/conf.py

# Optionally build your docs in additional formats such as PDF
#formats:
#   - pdf

# Python requirements installed before the docs are built
python:
  install:
    - requirements: docs/requirements.txt


================================================
FILE: CONTRIBUTING.md
================================================
**简体中文**🀄 | [English🌎](.github/CONTRIBUTING_en.md)

# Contributing to PaddleFormers

我们非常欢迎并希望您对`PaddleFormers`做出开源贡献。在您开始提交您的贡献之前，请先行签署[PaddlePaddle 贡献者许可协议](https://cla-assistant.io/PaddlePaddle/PaddleFormers)。
本文接下来将介绍我们的开发与贡献流程：

## 贡献方式

我们欢迎不同的向`PaddleFormers`做出贡献的方式，例如：

- 修复已知的 Issue
- 提交新的 Issue，例如提出功能需求或者 bug 报告
- 实现新的模型结构

如果您不知道从哪里开始，请查看 Issues 板块中的`Good First Issue`标签。它为您提供一个对初学者友好的已知 Issue 列表，可以降低贡献的门槛，帮助您开始为开源做出贡献。您只需在您想处理的 Issue 中告知我们您想负责此 Issue 即可。

## 开发流程

PaddleFormers 使用 [Git 分支模型](http://nvie.com/posts/a-successful-git-branching-model/)。对于常见的开源贡献，我们有以下的贡献流程：

### 1. Fork

   因为 PaddleFormers 的开发社区一直在发展，如果每位贡献者都直接向官方 Repo 提交 commit 将会难以管理。因此，请从您的分支中提交 Pull Requests。建议您通过 GitHub 的[“Fork”按钮](https://help.github.com/articles/fork-a-repo/)来创建您的 Fork 分支。

### 2. Clone

   请运行以下命令将您的分支 clone 到本地

   ```bash
   git clone https://github.com/<your-github-account>/PaddleFormers
   cd PaddleFormers
   ```

### 3. 创建本地开发分支

   对于添加新功能或修复错误等日常工作，请在开发前创建您的本地开发分支：

   ```bash
   git checkout -b my-cool-feature
   ```

### 4. 配置开发环境

   在开始编码之前，您需要设置开发环境。我们强烈建议您在虚拟环境中进行所有开发，例如[venv](https://docs.python.org/3/library/venv.html)或[conda](https://docs.conda.io/en/latest/)。
   请您设置并激活虚拟环境后，运行以下命令：

   ```bash
   make install
   ```

   这将设置 `PaddleFormers` 的所有依赖以及 [`pre-commit`](http://pre-commit.com/) 工具。

   如果您需要开发 `examples` 或 `applications` 模块并加载 `PaddleFormers`，请确保以可编辑模式（`-e`）安装 `PaddleFormers`。
   如果在虚拟环境中已经安装 `PaddleFormers` ，请使用 `pip uninstall paddleformers` 将其删除，然后以可编辑模式重新安装它
   `pip install -e .`

### 5. 开发

   当您开发时，请确保您新增的代码会被单元测试所覆盖。我们所有的单元测试都可以在 `tests` 目录下找到。
   您可以修改现有单元测试以覆盖新功能，也可以从头开始创建新测试。
   当您完成代码时，您应该确保相关的单元测试可以通过。您可以像这样运行受更改影响的测试：

   ```bash
   pytest tests/<test_to_run>.py
   ```

### 6. Commit

   我们使用 [`pre-commit`](http://pre-commit.com/)工具（包括[black](https://black.readthedocs.io/en/stable/)、[isort](https://pycqa.github.io/isort/) 和
   [flake8](https://flake8.pycqa.org/en/latest/)）来检查每次提交中的代码和文档的风格。当你运行 `git commit` 时，你会看到
   类似于以下内容：

   ```text
    ➜  (my-virtual-env) git commit -m "committing my cool feature"
    black....................................................................Passed
    isort....................................................................Passed
    flake8...................................................................Passed
    check for merge conflicts................................................Passed
    check for broken symlinks............................(no files to check)Skipped
    detect private key.......................................................Passed
    fix end of files.....................................(no files to check)Skipped
    trim trailing whitespace.............................(no files to check)Skipped
    CRLF end-lines checker...............................(no files to check)Skipped
    CRLF end-lines remover...............................(no files to check)Skipped
    No-tabs checker......................................(no files to check)Skipped
    Tabs remover.........................................(no files to check)Skipped
    copyright_checker........................................................Passed
   ```

   但大多数时候事情并没有那么顺利。当您的代码或文档不符合标准时，`pre-commit` 检查将失败。

   ```text
    ➜  (my-virtual-env) git commit -m "committing my cool feature"
    black....................................................................Passed
    isort....................................................................Failed
    - hook id: isort
    - files were modified by this hook

    Fixing examples/information_extraction/waybill_ie/run_ernie_crf.py

    flake8...................................................................Passed
    check for merge conflicts................................................Passed
    check for broken symlinks............................(no files to check)Skipped
    detect private key.......................................................Passed
    fix end of files.....................................(no files to check)Skipped
    trim trailing whitespace.............................(no files to check)Skipped
    CRLF end-lines checker...............................(no files to check)Skipped
    CRLF end-lines remover...............................(no files to check)Skipped
    No-tabs checker......................................(no files to check)Skipped
    Tabs remover.........................................(no files to check)Skipped
    copyright_checker........................................................Passed
   ```

   我们的工具将自动修复大部分样式错误，但是有些错误需要手动解决。幸运的是，错误信息一般通俗易懂，很容易修复。
   解决错误后，您可以再次运行 `git add <files>` 和 `git commit`，这将再次触发 pre-commit 。
   一旦 pre-commit 检查通过，您就可以推送代码了。

   [Google](https://google.com/) 或 [StackOverflow](https://stackoverflow.com/) 是帮助您了解代码风格错误的好工具。
   如果您仍然无法弄清楚，请不要担心。您可以使用 `git commit -m "style error" --no-verify` 提交，我们很乐意在您创建 Pull Request 后帮助您。

### 7. git pull 与代码冲突

   有经验的 Git 用户经常从官方 Repo 中 git pull。因为这样子他们会及早注意到与其他人的代码冲突，并且让代码冲突更容易解决

   ```bash
   git remote add upstream https://github.com/PaddlePaddle/PaddleFormers
   git pull upstream develop
   ```

### 8. git push 与提交 Pull Request

   您可以将您的本地开发分支中的工作 push 到您的 fork 的分支中：

   ```bash
   git push origin my-cool-feature
   ```

   git push 之后，您可以提交 Pull Request，请求[官方 repo](https://github.com/PaddlePaddle/PaddleFormers) 采纳您的开发工作。请您依照[这些步骤](https://help.github.com/articles/creating-a-pull-request/)创建 Pull Request。

### 9. 删除已经合入的本地和远程分支

   为了保持您本地的工作区和 fork 分支的干净整洁，建议您在 Pull Request 合入之后删除本地的残余分支：

   ```bash
   git push origin --delete my-cool-feature
   git checkout develop
   git pull upstream develop
   git branch -d my-cool-feature
   ```

## 代码 Review

- 在您的 Pull Request 能够顺利通过本地测试以及 CI 的情况下，您可以在 Pull Request 中 @ 相关的 Reviewer，提醒他们尽快对您的 Pull Request 进行 Review。

- 请处理 Reviewer 的每一条评论。如果您已按照评论修改，请回复“完成”；否则，可以在评论下展开讨论。

- 如果您不希望您的 Reviewer 被电子邮件通知淹没，您可以[批量回复](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/)。


================================================
FILE: LICENSE
================================================
Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: Makefile
================================================
# Makefile for PaddleFormers
#
# 	GitHub: https://github.com/PaddlePaddle/PaddleFormers
# 	Author: Paddle Team https://github.com/PaddlePaddle
#

# Default goal (first target in the file): lint, then run the tests.
.PHONY: all
all: lint test

# Directories handed to scripts/codestyle/get_modified_files.py by `lint`.
check_dirs := paddleformers scripts tests
# # # # # # # # # # # # # # # Format Block # # # # # # # # # # # # # # # 

# Run the isort and black pre-commit hooks. Declared phony so that a file or
# directory named "format" can never make this target look up to date.
.PHONY: format
format:
	pre-commit run isort
	pre-commit run black

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

# # # # # # # # # # # # # # # Lint Block # # # # # # # # # # # # # # # 

# Run pre-commit on the Python files that get_modified_files.py reports for
# $(check_dirs); print a notice instead when it reports none.
.PHONY: lint
lint:
	$(eval modified_py_files := $(shell python scripts/codestyle/get_modified_files.py $(check_dirs)))
	@if [ -z "$(modified_py_files)" ]; then \
		echo "No library .py files were modified"; \
	else \
		echo $(modified_py_files); \
		pre-commit run --files $(modified_py_files); \
	fi

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

# # # # # # # # # # # # # # # Test Block # # # # # # # # # # # # # # # 

# `test` is an alias for `unit-test`. Both are phony so that a file or
# directory with the same name cannot make them look up to date.
.PHONY: test unit-test
test: unit-test

# Run pytest from the repository root with one retry per failing test, the
# 20 slowest durations reported, and XML coverage for ./paddleformers.
unit-test:
	DOWNLOAD_SOURCE=aistudio \
	PYTHONPATH=$(shell pwd) pytest -v \
		--retries 1 --retry-delay 1 \
		--durations 20 \
		--cov=./paddleformers \
		--cov-report=xml:coverage.xml

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Install test and runtime requirements plus the pre-commit git hook.
# Requires nvcc: the detected CUDA release (12.6 / 12.9 / 13.0) selects the
# matching Paddle nightly index, which is passed to pip as an extra index.
# The three install steps are chained with `&&` so that a failure stops the
# recipe. Previously the first pip line had no separator, so both pip
# commands were merged into a single invocation that treated "pip" and
# "install" as package names.
.PHONY: install
install:
	@echo "Checking CUDA version and selecting pip source..."
	@if ! command -v nvcc >/dev/null 2>&1; then \
	    echo "ERROR: nvcc (CUDA) not found. Please install CUDA before proceeding."; \
	    exit 1; \
	fi; \
	cuda_version=$$(nvcc --version | grep release | awk '{print $$5}' | sed 's/,//'); \
	echo "Detected CUDA version: $$cuda_version"; \
	if [ "$$cuda_version" = "12.6" ]; then \
	    PADDLE_SOURCE="https://www.paddlepaddle.org.cn/packages/nightly/cu126/"; \
	elif [ "$$cuda_version" = "12.9" ]; then \
	    PADDLE_SOURCE="https://www.paddlepaddle.org.cn/packages/nightly/cu129/"; \
	elif [ "$$cuda_version" = "13.0" ]; then \
	    PADDLE_SOURCE="https://www.paddlepaddle.org.cn/packages/nightly/cu130/"; \
	else \
	    PADDLE_SOURCE=""; \
	    echo "Unknown CUDA version."; \
	fi; \
	echo "Using pip source: $$PADDLE_SOURCE"; \
	pip install -r tests/requirements.txt && \
	pip install -r requirements.txt --extra-index-url "$$PADDLE_SOURCE" && \
	pre-commit install


# Install and build inside ppdiffusers/ using $(MAKE) so that command-line
# flags and the jobserver propagate to the sub-make.
# NOTE(review): assumes a ppdiffusers/ directory with its own Makefile exists -- confirm.
.PHONY: deploy-ppdiffusers
deploy-ppdiffusers:
	cd ppdiffusers && $(MAKE) install && $(MAKE)

# Install and build inside pipelines/ using $(MAKE) so that command-line
# flags and the jobserver propagate to the sub-make.
# NOTE(review): assumes a pipelines/ directory with its own Makefile exists -- confirm.
.PHONY: deploy-paddle-pipelines
deploy-paddle-pipelines:
	cd pipelines && $(MAKE) install && $(MAKE)

# Release flow: install dependencies, build sdist + wheel, upload with twine.
# The nested install goes through $(MAKE) so that command-line flags and the
# jobserver propagate to the sub-make.
.PHONY: deploy-paddleformers
deploy-paddleformers:
	# install related package
	$(MAKE) install
	# build
	python3 setup.py sdist bdist_wheel
	# upload
	twine upload --skip-existing dist/*


================================================
FILE: README.md
================================================
<p align="center">
  <img src="https://github.com/user-attachments/assets/9d1c1937-7fac-48f8-9d61-f7ac67b61b18" align="middle"  width="500" />
</p>

------------------------------------------------------------------------------------------

<p align="center">
    <a href=""><img src="https://img.shields.io/badge/python-3.10+-aff.svg"></a>
    <a href=""><img src="https://img.shields.io/badge/os-linux%2C%20win-pink.svg"></a>
    <a href="./LICENSE"><img src="https://img.shields.io/badge/license-Apache%202-dfd.svg"></a>
    <a href="https://github.com/PaddlePaddle/PaddleFormers/stargazers"><img src="https://img.shields.io/github/stars/PaddlePaddle/PaddleFormers?color=ccf"></a>
</p>

<h4 align="center">
    <a href=#最新更新> 最新更新 </a> |
    <a href=#特性> 特性 </a> |
    <a href=#安装> 安装 </a> |
    <a href=#快速体验> 快速体验 </a> |
    <a href=#社区交流> 社区交流 </a>
</h4>

# PaddleFormers
## 📝简介
PaddleFormers 是基于百度深度学习框架 PaddlePaddle 搭建的 Transformers 库，旨在为 PaddlePaddle 生态提供与 Hugging Face Transformers 项目对等的模型接口与功能体验，支持大语言模型（LLM）与视觉语言模型（VLM）的训练能力。PaddleFormers 充分发挥 PaddlePaddle 在高性能训练方面的内置优势，全面支持包括张量并行、流水线并行和专家并行在内的主流大模型分布式训练策略，以及自动混合精度等加速技术，在 DeepSeek-V3、GLM-4.5-Air 等重点模型上，训练性能明显超越 Megatron-LM ，实现了高效的预训练与后训练性能。

结合业界主流优化方法与飞桨在业务实践中积累的高效特性，PaddleFormers 致力于打造**高性能、低资源占用**的训练体验，帮助用户高效便捷地完成大模型训练，而无需关注底层复杂的优化细节。

## 🆕最新更新
* 2026.01.21 - PaddleFormers v1.0版本发布啦！我们提供了针对 LLM 和 VLM 等模型的训练能力，针对 DeepSeek-V3模型和 GLM-4.5-Air 等重点模型，我们实现了极致性能优化（训练性能明显超越 Megatron-LM ）。针对 PaddleOCR-VL，我们在昆仑芯 P800、天数天垓150等国产计算芯片上进行了适配，更好的满足国内用户需求。

## ✨特性
* **丰富的模型支持：** PaddleFormers 实现了对于 100+ 主流的大语言模型和视觉语言模型的训练能力支持，涵盖了 DeepSeek-V3、GLM-4.5系列、Qwen2和 Qwen3系列、Qwen3-VL 等前沿模型。同时提供了对 ERNIE-4.5、ERNIE-4.5-VL、PaddleOCR-VL 等文心系列模型完备的训练能力。
* **高性能组网实现：** 实现了 FP8低精度训练与高性能算子优化、通信计算重叠优化、精细化存算均衡等策略，大幅提升大模型训练的计算、通信和存储效率。在 DeepSeek-V3、GLM-4.5-Air 等模型上，训练性能明显超越 Megatron-LM。
* **全流程能力支持：** PaddleFormers 实现了从预训练到后训练的全流程训练能力支持，其中后训练支持 CPT / SFT / SFT-LoRA / DPO / DPO-LoRA 等主流能力，帮助用户高效、便捷地完成大模型的迭代与优化。PaddleFormers 还实现了对 Safetensors 格式的 **全面支持** ，训练完成的模型，其存储格式与 Hugging Face 上托管的权重格式一致，可以在任意支持该格式的框架或工具中使用（如 FastDeploy / vLLM / SGLang 等）。
* **完备的训练能力支持：** PaddleFormers 实现了对于 **Function Call** 、 **Thinking** 等大模型前沿能力的训练支持，并通过 **Data Packing** 、 **Padding Free** 等数据流技术显著优化训练性能。
* **国产芯片深度适配：** 支持昆仑芯 P800、天数天垓150、沐曦 C550等国产计算平台，基于128卡昆仑芯 P800支持 DeepSeek V3的 SFT，成为最少国产算力资源后训练方案。

## 📋模型列表

<table border="1" cellpadding="8" cellspacing="0" style="width:100%; border-collapse: collapse;">
  <thead>
    <tr>
      <th style="text-align: left;">模型类型</th>
      <th style="text-align: left;">模型系列</th>
      <th style="text-align: left;">模型名称</th>
      <th style="text-align: left;">Chat Template</th>
    </tr>
  </thead>
  <tbody>
    <!-- LLM 分类 - 跨行合并开始 -->
    <tr>
      <td rowspan="10" style="vertical-align: top;">LLM</td>
      <td>DeepSeekv3</td>
      <td>deepseek-ai/DeepSeek-V3-Base、deepseek-ai/DeepSeek-V3、deepseek-ai/DeepSeek-V3-0324</td>
      <td>deepseek3</td>
    </tr>
    <tr>
      <td>🏛️ERNIE-4.5</td>
      <td>baidu/ERNIE-4.5-0.3B-Base-PT、baidu/ERNIE-4.5-0.3B-PT、baidu/ERNIE-4.5-21B-A3B-Base-PT、baidu/ERNIE-4.5-21B-A3B-PT、baidu/ERNIE-4.5-300B-A47B-Base-PT、baidu/ERNIE-4.5-300B-A47B-PT、baidu/ERNIE-4.5-21B-A3B-Thinking</td>
      <td>ernie、ernie_nothink</td>
    </tr>
    <tr>
      <td>gemma3</td>
      <td>google/gemma-3-270m、google/gemma-3-270m-it、google/gemma-3-1b-pt、google/gemma-3-1b-it、google/gemma-3-4b-pt、google/gemma-3-4b-it、google/gemma-3-12b-pt、google/gemma-3-12b-it、google/gemma-3-27b-pt、google/gemma-3-27b-it</td>
      <td>gemma</td>
    </tr>
    <tr>
      <td>GLM-4.5</td>
      <td>zai-org/GLM-4.5-Air-Base、zai-org/GLM-4.5-Air、zai-org/GLM-4.5-Base、zai-org/GLM-4.5</td>
      <td>glm4_moe</td>
    </tr>
    <tr>
      <td>gpt-oss</td>
      <td>openai/gpt-oss-20b、openai/gpt-oss-120b</td>
      <td>gpt</td>
    </tr>
    <tr>
      <td>Llama-3</td>
      <td>meta-llama/Meta-Llama-3-8B、meta-llama/Meta-Llama-3-8B-Instruct、meta-llama/Meta-Llama-3-70B、meta-llama/Meta-Llama-3-70B-Instruct、meta-llama/Llama-3.1-8B、meta-llama/Llama-3.1-8B-Instruct、meta-llama/Llama-3.1-70B、meta-llama/Llama-3.1-70B-Instruct、meta-llama/Llama-3.1-405B、meta-llama/Llama-3.1-405B-Instruct、meta-llama/Llama-3.2-1B、meta-llama/Llama-3.2-1B-Instruct、meta-llama/Llama-3.2-3B、meta-llama/Llama-3.2-3B-Instruct、meta-llama/Llama-3.3-70B-Instruct</td>
      <td>llama3</td>
    </tr>
    <tr>
      <td>phi-4</td>
      <td>microsoft/phi-4</td>
      <td>phi4</td>
    </tr>
    <tr>
      <td>Qwen2</td>
      <td>Qwen/Qwen2-0.5B、Qwen/Qwen2-0.5B-Instruct、Qwen/Qwen2-1.5B、Qwen/Qwen2-1.5B-Instruct、Qwen/Qwen2-7B、Qwen/Qwen2-7B-Instruct、Qwen/Qwen2-57B-A14B、Qwen/Qwen2-57B-A14B-Instruct、Qwen/Qwen2-72B、Qwen/Qwen2-72B-Instruct</td>
      <td>qwen</td>
    </tr>
    <tr>
      <td>Qwen3</td>
      <td>Qwen/Qwen3-0.6B-Base、Qwen/Qwen3-0.6B、Qwen/Qwen3-1.7B-Base、Qwen/Qwen3-1.7B、Qwen/Qwen3-4B-Base、Qwen/Qwen3-4B、Qwen/Qwen3-4B-Instruct-2507、Qwen/Qwen3-4B-Thinking-2507、Qwen/Qwen3-8B-Base、Qwen/Qwen3-8B、Qwen/Qwen3-14B-Base、Qwen/Qwen3-14B、Qwen/Qwen3-32B、Qwen/Qwen3-30B-A3B-Base、Qwen/Qwen3-30B-A3B、Qwen/Qwen3-30B-A3B-Instruct-2507、Qwen/Qwen3-30B-A3B-Thinking-2507、Qwen/Qwen3-235B-A22B、Qwen/Qwen3-235B-A22B-Instruct-2507、Qwen/Qwen3-235B-A22B-Thinking-2507</td>
      <td>qwen3、qwen3_nothink</td>
    </tr>
    <tr>
      <td>Qwen3-Next</td>
      <td>Qwen/Qwen3-Next-80B-A3B-Instruct、Qwen/Qwen3-Next-80B-A3B-Thinking</td>
      <td>qwen3、qwen3_nothink</td>
    </tr>
    <!-- VLM 分类 - 跨行合并开始 -->
    <tr>
      <td rowspan="4" style="vertical-align: top;">VLM</td>
      <td>🏛️ERNIE-4.5-VL</td>
      <td>baidu/ERNIE-4.5-VL-28B-A3B-Base-PT、baidu/ERNIE-4.5-VL-28B-A3B-PT、baidu/ERNIE-4.5-VL-424B-A47B-Base-PT、baidu/ERNIE-4.5-VL-424B-A47B-PT、baidu/ERNIE-4.5-VL-28B-A3B-Thinking</td>
      <td>ernie_vl、ernie_vl_nothink</td>
    </tr>
    <tr>
      <td>🏛️PaddleOCR-VL</td>
      <td>PaddlePaddle/PaddleOCR-VL</td>
      <td>paddleocr_vl</td>
    </tr>
    <tr>
      <td>Qwen2.5-VL</td>
      <td>Qwen/Qwen2.5-VL-3B-Instruct、Qwen/Qwen2.5-VL-7B-Instruct、Qwen/Qwen2.5-VL-32B-Instruct、Qwen/Qwen2.5-VL-72B-Instruct</td>
      <td>qwen2_vl</td>
    </tr>
    <tr>
      <td>Qwen3-VL</td>
      <td>Qwen/Qwen3-VL-2B-Instruct、Qwen/Qwen3-VL-2B-Thinking、Qwen/Qwen3-VL-4B-Instruct、Qwen/Qwen3-VL-4B-Thinking、Qwen/Qwen3-VL-8B-Instruct、Qwen/Qwen3-VL-8B-Thinking、Qwen/Qwen3-VL-32B-Instruct、Qwen/Qwen3-VL-32B-Thinking、Qwen/Qwen3-VL-30B-A3B-Instruct、Qwen/Qwen3-VL-30B-A3B-Thinking、Qwen/Qwen3-VL-235B-A22B-Instruct、Qwen/Qwen3-VL-235B-A22B-Thinking</td>
      <td>qwen3_vl、qwen3_vl_nothink</td>
    </tr>
  </tbody>
</table>

* 更多关于模型训练能力的支持细节，请参考：[PaddleFormers 模型能力矩阵](./docs/zh/model_capability.md)
* 带有🏛️标签的模型是 PaddleFormers 官方维护的模型

## 💾安装
**环境依赖**

* python ≥ 3.10
* CUDA ≥ 12.0
* PaddleFleet ≥ 0.2（仅为 GPU 训练功能依赖）

**安装依赖（GPU）**

<details>
  <summary>基于 Docker 容器的方式（<b>推荐</b>）</summary>

------
> 为了避免本地环境存在较多冲突，我们建议使用 PaddleFormers 的预置镜像来准备环境，容器中已经拉取了 PaddleFormers 仓库并完成了安装：
>
> ```shell
> # 以cuda12.6为例
> docker run --gpus all --name paddleformers-work -v $(pwd):/work  \
>     -w=/work --shm-size=512G --network=host -it \
>     ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.3.0-gpu-cuda12.6-cudnn9.5 /bin/bash
>
> # cuda12.9镜像：ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.3.0-gpu-cuda12.9-cudnn9.9
> # cuda13.0镜像：ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.3.0-gpu-cuda13.0-cudnn9.13
> ```
------

</details>

<details>
  <summary>基于 pip/源码的安装方式</summary>

------
> 我们推荐使用 `conda` / `venv` / `uv` 等虚拟环境工具管理 python 环境。
>
> ```shell
> # conda
> conda create -n paddleformers-work python=3.10 #支持python3.10～3.13
> conda activate paddleformers-work
> # venv
> python -m venv .paddleformers-work
> source .paddleformers-work/bin/activate
> # uv
> uv venv .paddleformers-work
> source .paddleformers-work/bin/activate
> ```
------
> **安装方案一：** 拉取源码安装
>
> ```shell
> # Install development version
> git clone https://github.com/PaddlePaddle/PaddleFormers.git
> cd PaddleFormers
> # cuda12.6
> python -m pip install -e '.[paddlefleet]' --extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu126/
> # cuda12.9
> # python -m pip install -e '.[paddlefleet]' --extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu129/
> # cuda13.0
> # python -m pip install -e '.[paddlefleet]' --extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu130/
> ```
------
> **安装方案二：** 如果您不想拉取源码，可以基于下面的命令安装 PaddleFormers 和 PaddleFleet。
>
> ```shell
> # Install via pip
> # cuda12.6
> python -m pip install "paddleformers[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
> # cuda12.9
> # python -m pip install "paddleformers[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
> # cuda13.0
> # python -m pip install "paddleformers[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu130/
> ```
------
> **安装方案三：** 如果您只需使用 tokenizer 或者 processor，可以通过以下命令安装，这种情况下不会安装训练相关的依赖，安装速度更加快。
>
> ```shell
> python -m pip install paddleformers
> ```
------

</details>

 **安装依赖（XPU & ILUVATAR-GPU & Metax GPU）**

* [昆仑芯安装说明文档](./docs/zh/XPU_installation_guide.md)
* [天数智芯安装说明文档](./docs/zh/ILUVATAR-GPU_installation_guide.md)
* [沐曦安装说明文档](./docs/zh/Metax-GPU_installation_guide.md)

## ⚡快速体验

PaddleFormers 在 API 设计上与 Hugging Face Transformers 保持了高度一致，使用示例如下：

**使用 tokenizer**

```python
from paddleformers.transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B-Base")
print(tokenizer.encode("中华人民共和国"))
# 中华人民共和国将会被编码为两个token：
# [105492, 104773]
```

**文本生成**

```python
from paddleformers.transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B-Base")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B-Base", dtype="bfloat16").eval()

input_features = tokenizer("请给我一段大模型的简短介绍：", return_tensors="pd")
outputs = model.generate(**input_features, max_new_tokens=256)
output_ids = outputs[0].tolist()[0]

print(tokenizer.decode(output_ids, skip_special_tokens=True))
```

**模型训练**

```shell
paddleformers-cli train ./examples/config/sft/full.yaml
```

## 📊数据处理
* [数据集格式说明](./docs/zh/dataset_format.md)
* [Chat Template 说明](./docs/zh/chat_template_guide.md)
* [数据流参数说明](./docs/zh/data_processing_guide.md)

## 🚀模型训练 & 部署
* [PaddleFormers 命令行工具](./docs/zh/cli_usage.md)
* [训练参数配置说明](./docs/zh/training_arguments.md)
* [基于 PaddleFormers 进行模型预训练/后预训练](./docs/zh/pt_and_cpt_guide.md)
* [基于 PaddleFormers 进行指令微调（SFT & LoRA）](./docs/zh/sft_and_lora_guide.md)
* [基于 PaddleFormers 进行偏好对齐（DPO & LoRA）](./docs/zh/dpo_and_lora_guide.md)
* [基于 FastDeploy / vLLM 部署模型](./docs/zh/deployment_guide.md)

## 💻多硬件使用
* [昆仑芯使用说明文档](./docs/zh/XPU_usage_guide.md)
* [天数智芯使用说明文档](./docs/zh/ILUVATAR-GPU_usage_guide.md)
* [沐曦使用说明文档](./docs/zh/Metax-GPU_usage_guide.md)

## 🔍最佳实践
* [基于 DeepSeekv3的高效预训练](./examples/best_practices/DeepSeek-V3/)
* [基于 ERNIE-4.5的高效预训练](./examples/best_practices/ERNIE-4.5/)
* [训练一个偏好 Emoji 输出的对齐模型](./examples/best_practices/tutorials/how_to_train_an_emoji_model.md)
* [训练一个支持思考能力的模型](./examples/best_practices/tutorials/how_to_train_a_reasoning_model.md)
* [训练一个支持 Function Call 能力的模型](./examples/best_practices/tutorials/how_to_train_a_function_call_model.md)
* [基于 PaddleOCR-VL 微调实现孟加拉语识别能力](./examples/best_practices/PaddleOCR-VL/)
* [训练一个支持 Grounding 的模型](./examples/best_practices/tutorials/how_to_train_a_visual_grounding_model.md)

## ➕其他
* [如何下载模型](./docs/zh/how_to_download_model.md)
* [常见问题处理](https://github.com/PaddlePaddle/PaddleFormers/issues/3699)

## 💬社区相关

**贡献代码**

* 欢迎社区用户为 PaddleFormers 贡献代码，详情请参考 [贡献指南](CONTRIBUTING.md)。

**和我们交流**

* 微信扫描二维码并填写问卷，即可加入交流群与众多社区开发者以及官方团队深度交流。

<div align="center">
  <img src="https://github.com/user-attachments/assets/9f0a736c-b047-4912-a70f-8b1ea772c3eb" width="300" alt="qrcode">
</div>

## 🙏致谢
我们借鉴了 Hugging Face 的[Transformers](https://github.com/huggingface/transformers)🤗关于预训练模型使用的优秀设计，在此对 Hugging Face 作者及其开源社区表示感谢。

## 📜许可证
PaddleFormers 遵循[Apache-2.0开源协议](LICENSE)。


================================================
FILE: docs/en/cli_usage.md
================================================
# CLI

## Overview

CLI (Command Line Interface) provides terminal-based interaction with the program, enabling efficient and flexible execution of model training, inference, and evaluation tasks through parameterized configurations.

## Quick Start

**Installation**

Run in the PaddleFormers root directory:
```bash
python -m pip install -e .
```

Verify installation:
```bash
paddleformers-cli help
```

Expected output:
```
------------------------------------------------------------
| Usage:                                                    |
|   paddleformers-cli train : model finetuning              |
|   paddleformers-cli export : model export                 |
|   paddleformers-cli help: show helping info               |
------------------------------------------------------------
```

**GPU Configuration**

By default, all available gpus are used in CLI.
If you want to specify certain gpus, please set CUDA_VISIBLE_DEVICES before running CLI:

```bash
# Single GPU
export CUDA_VISIBLE_DEVICES=0
# Multi GPUs
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# Single XPU
export XPU_VISIBLE_DEVICES=0
# Multi XPUs
export XPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# Single NPU
export ASCEND_RT_VISIBLE_DEVICES=0
# Multi NPUs
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
```

* Note: In `Chat` module, the number of gpus configured by CUDA_VISIBLE_DEVICES should be equal to `tensor_model_parallel_size` in the config.
Alternatively, you can also unset CUDA_VISIBLE_DEVICES.

**Proxy Configuration**

```bash
export HTTPS_PROXY={your_proxy}
export HTTP_PROXY={your_proxy}
```

## CLI Specific Usage

Example using the **Qwen/Qwen3-0.6B-Base** model:

### 1. Chat
To be supplemented

### 2. Model Pre-training

```bash
# Example 1: PT-Full using online dataset
paddleformers-cli train examples/config/pt/full.yaml
# Example 2: PT-Full using offline dataset
paddleformers-cli train examples/config/pt/full_offline_data.yaml
```

### 3. Model Fine-tuning

#### 3.1. SFT and LoRA Fine-tuning
```bash
# Example 1: SFT-LoRA
paddleformers-cli train examples/config/sft/lora.yaml
# Example 2: SFT-Full
paddleformers-cli train examples/config/sft/full.yaml
```

#### 3.2. DPO and LoRA Fine-tuning
```bash
# Example 1: 8K seq length, DPO
paddleformers-cli train examples/config/dpo/full.yaml
# Example 2: 8K seq length, DPO-LoRA
paddleformers-cli train examples/config/dpo/lora.yaml
```

### 4. Model Evaluation
To be supplemented

### 5. Model Export
```bash
paddleformers-cli export examples/config/run_export.yaml
```

### 6. Multi-node Training

#### 6.1. Method 1

```bash
NNODES={num_nodes} MASTER_ADDR={your_master_addr} MASTER_PORT={your_master_port} RANK={rank} CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 paddleformers-cli train examples/config/sft/full.yaml
```

#### 6.2. Method 2 (mpirun)

First, write a script, such as `scripts/train_96_gpus.sh`, with the following content:
```bash
NNODES={num_nodes} MASTER_ADDR={your_master_addr} MASTER_PORT={your_master_port} RANK={rank} CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 paddleformers-cli train examples/config/sft/full.yaml
```

Then:
```bash
mpirun bash scripts/train_96_gpus.sh
```


================================================
FILE: docs/en/datasets.md
================================================
# Data Format Specification

## Pre-training offline dataset

- **CLI**: Modify the following fields in the YAML configuration file:
  - `input_dir` specifies the prefix of the dataset. For example, for the dataset `data-1-part0.bin`, set `input_dir: "1.0 ./data-1-part0"`, where `1.0` is the dataset prob
  - `split` specifies the `train/eval` distribution ratio, such as `split: "998,2"`, where `train` is the training set and `eval` is the evaluation set
  - `dataset_type` must be set to `pretrain`, such as: `dataset_type: "pretrain"`

- Example:
```yaml
dataset_type: "pretrain"
input_dir: "1.0 ./data/pre-training/demo_data/data-1-part0"
split: "998,2"
```

## Pre-training online dataset + others

- **CLI**: Modify the following fields in the YAML config file:
  - Set `train_dataset_path` / `eval_dataset_path` to the absolute or relative path of your local dataset file
  - Set `train_dataset_type` / `eval_dataset_type` to the dataset format (erniekit/chatml)
  - Set `train_dataset_prob` / `eval_dataset_prob` for multi-source dataset mixing probabilities
```yaml
# single-source
train_dataset_type: "erniekit"
train_dataset_path: "./examples/data/sft-train.jsonl"
train_dataset_prob: "1.0"

# multi-source
train_dataset_type: "erniekit,erniekit"
train_dataset_path: "./examples/data/sft-train1.jsonl,./examples/data/sft-train2.jsonl"
train_dataset_prob: "0.8,0.2"
```

- Supplement: The `truncate_packing` strategy is also supported in the online pre-training data stream, which supports truncating the data to effectively reduce padding tokens. You can use `truncate_packing` by setting it to `True`, as shown in the figure below:

<div align="center">
<img src="https://github.com/user-attachments/assets/f7ec5b76-aee7-4f64-8331-ca00cac5339a">
</div>

# Data Packing Strategy

`Packing` is a technique used to optimize batch processing by combining multiple short input sequences into a single longer sequence before feeding them into the LLM. This reduces padding overhead and improves hardware utilization (e.g., GPU/TPU efficiency).

`The greedy intokens strategy` is a token-level optimization that prioritizes filling the available token budget (e.g., max sequence length) in a greedy manner during batch processing. It ensures that the model generates as many tokens as possible within the constraints, minimizing wasted capacity.

| packing      | greedy_intokens | Packing Strategy |
|--------------|-----------------|------------------|
| false | any   | No packing  |
| true  | false | packing is enabled without greedy intokens strategy |
| true  | true  | greedy intokens packing is enabled |

# Data Sampling Strategy

Currently, four data sampling strategies are supported: `random`, `concat`, `interleave_under`, `interleave_over`

| Data Sampling Strategy | Applicable Scenarios    | Limitations | Description |
|------------------|-----------------|------------------|------------------|
| `random`           | The dataset is extremely large and strict data proportioning is required | max_steps > 0 | In `random` mode, based on the input dataset probs, a fixed-size sample pool of `num_samples_each_epoch` is constructed, and the data loader randomly acquires data from this sample pool. |
| `concat`           | Need to train all data in the datasets | None | In `concat` mode, the input dataset probs are not used. Instead, multiple datasets are directly concatenated. The size of the dataset is equal to the total size of the input multi-source datasets. When max_steps = -1, setting `num_train_epochs` allows for a complete traversal of the input datasets for `num_train_epochs` rounds. |
| `interleave_under` | When small datasets are important but have limited samples | None | The `interleave` strategy involves cross-concatenating multiple datasets according to data proportioning. `interleave_under` indicates undersampling, meaning that sampling stops as soon as one of the datasets is exhausted. |
| `interleave_over`  | When small datasets are important but have limited samples | None | The `interleave` strategy involves cross-concatenating multiple datasets according to data proportioning. `interleave_over` indicates oversampling, meaning that sampling stops only after all datasets have been exhausted. |

- Note: `num_samples_each_epoch` only works in `random` data sampling strategy.

# Attention Mask

The data stream defaults to passing in a causal Attention Mask. In the packing case, when `use_global_causal_attn` is true, it corresponds to the `Causal Attention` shown in the figure below: different samples within a `Sequence` are visible to each other. When `use_global_causal_attn` is false, it corresponds to the `Causal Document Attention` shown in the figure below: different samples within a `Sequence` are not visible to each other.

<div align="center" style="display: flex; justify-content: center; gap: 20px;">
  <div>
    <img
      src="https://github.com/user-attachments/assets/57c414e3-6783-4a40-a5bf-eb67c6129b06"
      width="200px"
      alt="Causal Attention"
    >
    <br>
    <em>Causal Attention</em>
  </div>
  <div>
    <img
      src="https://github.com/user-attachments/assets/ffd61730-32f0-4d25-8558-086d2d43aa1f"
      width="200px"
      alt="Causal Document Attention"
    >
    <br>
    <em>Causal Document Attention</em>
  </div>
</div>


================================================
FILE: docs/en/datasets_format.md
================================================
# Data Stream Format Documentation

## Data Stream File Format Support

Currently, pre-training and post-training data streams only support the `jsonl` format.

## 1. Pre-training Data Stream

### 1.1. Online Data Stream

In the pre-training data stream, each data entry is a dictionary containing the following fields:

- `text` : `str, List(str)`, pre-training text.

Sample data:

```text
{"text": ["An example of a classification problem that requires continuous input values is house price prediction. The price of a house is usually based on factors such as square footage, location, number of bedrooms and bathrooms, and features like a backyard or garage. To accurately predict house prices, these criteria must be entered into the classification model as continuous input values."]}
...
```

For ease of testing, we also provide a [demo dataset](https://paddleformers.bj.bcebos.com/datasets/pt_data.tar.gz) that can be used directly:

```shell
wget https://paddleformers.bj.bcebos.com/datasets/pt_data.tar.gz
mkdir -p data/pt && tar -xf pt_data.tar.gz -C data/pt/
```

### 1.2. Offline Data Stream

We can also choose to use an offline binary pre-training data stream, which saves more memory.

For ease of testing, we also provide an [offline pre-training demo dataset](https://paddleformers.bj.bcebos.com/datasets/pretrain_offline_data.tar.gz) that can be used directly:

```shell
wget https://paddleformers.bj.bcebos.com/datasets/pretrain_offline_data.tar.gz
mkdir -p data/pre-training && tar -xf pretrain_offline_data.tar.gz -C data/pre-training/
```

You can also create your own offline data stream. The method for creating an offline data stream is as follows:

Download a text dataset, such as https://modelscope.cn/datasets/BazingaLyn/mini_pretrain_dataset

The format must be jsonl, and the format of each line is like BazingaLyn/mini_pretrain_dataset/pretrain_hq_v7.jsonl:
```text
{"text": "Scrambled eggs with tomatoes\nIngredients:\n3 eggs, 1 tomato, oil, salt, sugar, cornstarch\nInstructions:..."}
{"text": "Please describe how to properly plan personal finance. Properly planning personal finance requires the following steps..."}
{"text": "Please enter a scene dialogue about marine conservation. Person A: Wow, this beach is really..."}
{"text": "Identify two different types of wine. The method of identifying wine varies depending on its type and variety, below..."}
```

Run `examples/tools/create_pretraining_data.py`, and the generated data will be saved in `./pretrain_data.bin` and `./pretrain_data.idx` in the current directory.
```text
python -u examples/tools/create_pretraining_data.py \
    --model_name_or_path "/path/to/your/Qwen3-0.6B-base" \
    --data_format "JSON" \
    --input_path "/path/to/your/BazingaLyn/mini_pretrain_dataset/pretrain_hq_v7.jsonl" \
    --append_eos \
    --output_prefix "./pretrain_data"  \
    --workers 1 \
    --log_interval 10000 \
    --data_impl "mmap"
```

- Parameter Description

| Parameter Name              | Type        | Description                 |
|--------------------|----------- |-----------------|
| `--model_name_or_path`     | string     | Model path  |
| `--data_format`    | stri

Download .txt

gitextract__9f9_ucr/

├── .copyright.hook
├── .flake8
├── .github/
│   ├── CODE_OF_CONDUCT.md
│   ├── CODE_OF_CONDUCT_en.md
│   ├── CONTRIBUTING_en.md
│   ├── ISSUE_TEMPLATE/
│   │   ├── ask-question.yml
│   │   ├── bug-report.yml
│   │   ├── docs-report.yml
│   │   ├── feature-request.yml
│   │   ├── new-model.yaml
│   │   └── others.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── actions/
│   │   └── rerun-workflow/
│   │       ├── action.yml
│   │       └── rerun.sh
│   ├── codecov.yml
│   └── workflows/
│       ├── _clone_linux.yml
│       ├── _xpu_ci_test.yml
│       ├── ce-build-ci-workflow.yml
│       ├── ce-build-images.yml
│       ├── ce-build-whl.yml
│       ├── ce-deadlink.yml
│       ├── ce-unittest-gpu.yml
│       ├── check-release-pr.yaml
│       ├── cherry-pick.yml
│       ├── ci_iluvatar.yml
│       ├── ci_xpu.yml
│       ├── debug-unittest-gpu.yml
│       ├── fleet-model-test.yml
│       ├── lint.yml
│       ├── model-unittest-gpu.yml
│       ├── requirements-review.yml
│       ├── rerun.yml
│       ├── stale.yml
│       ├── unittest-gpu.yml
│       └── update-precision.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── CONTRIBUTING.md
├── LICENSE
├── Makefile
├── README.md
├── docs/
│   ├── en/
│   │   ├── cli_usage.md
│   │   ├── datasets.md
│   │   ├── datasets_format.md
│   │   ├── image_processors.md
│   │   ├── processors.md
│   │   └── video_processors.md
│   └── zh/
│       ├── ILUVATAR-GPU_installation_guide.md
│       ├── ILUVATAR-GPU_usage_guide.md
│       ├── Metax-GPU_installation_guide.md
│       ├── Metax-GPU_usage_guide.md
│       ├── XPU_installation_guide.md
│       ├── XPU_usage_guide.md
│       ├── chat_template_guide.md
│       ├── cli_usage.md
│       ├── custom_datasets_format_zh.md
│       ├── data_processing_guide.md
│       ├── dataset_format.md
│       ├── deployment_guide.md
│       ├── dpo_and_lora_guide.md
│       ├── ernie4.5_pretraining.md
│       ├── how_to_download_model.md
│       ├── image_processors_zh.md
│       ├── model_capability.md
│       ├── processors_zh.md
│       ├── pt_and_cpt_guide.md
│       ├── sft_and_lora_guide.md
│       ├── template.md
│       ├── template_zh.md
│       ├── training_arguments.md
│       └── video_processors_zh.md
├── examples/
│   ├── FAQ.md
│   ├── README.md
│   ├── best_practices/
│   │   ├── DeepSeek-V3/
│   │   │   ├── README.md
│   │   │   ├── SFT-Practice.md
│   │   │   ├── dsv3_128k_config.yaml
│   │   │   ├── dsv3_32k_config.yaml
│   │   │   ├── dsv3_4k_config.yaml
│   │   │   ├── pretrain/
│   │   │   │   ├── config/
│   │   │   │   │   ├── config.json
│   │   │   │   │   ├── pretrain_argument.yaml
│   │   │   │   │   ├── tokenizer.json
│   │   │   │   │   └── tokenizer_config.json
│   │   │   │   ├── run.sh
│   │   │   │   └── train_gpu.sh
│   │   │   ├── run_dsv3_128k.sh
│   │   │   ├── run_dsv3_32k.sh
│   │   │   └── run_dsv3_4k.sh
│   │   ├── ERNIE-4.5/
│   │   │   └── README.md
│   │   ├── ERNIE-4.5-VL/
│   │   │   ├── README.md
│   │   │   ├── ernie45vl_32k_config.yaml
│   │   │   ├── ernie45vl_8k_config.yaml
│   │   │   └── ernie45vl_8k_lora_config.yaml
│   │   ├── PaddleOCR-VL/
│   │   │   ├── README.md
│   │   │   ├── paddleocr-vl_full_16k_config.yaml
│   │   │   ├── paddleocr-vl_lora_16k_config.yaml
│   │   │   ├── paddleocr-vl_lora_export.yaml
│   │   │   ├── run_paddleocr-vl_full_16k.sh
│   │   │   ├── run_paddleocr-vl_full_16k_4090D.sh
│   │   │   ├── run_paddleocr-vl_lora_16k.sh
│   │   │   ├── run_paddleocr-vl_lora_16k_4090D.sh
│   │   │   └── run_paddleocr-vl_lora_export.sh
│   │   ├── PaddleOCR-VL-1.5/
│   │   │   ├── README.md
│   │   │   ├── paddleocr-vl-v15_full_16k_region_config.yaml
│   │   │   ├── paddleocr-vl-v15_full_16k_table_config.yaml
│   │   │   ├── paddleocr-vl-v15_lora_16k_region_config.yaml
│   │   │   ├── paddleocr-vl-v15_lora_16k_table_config.yaml
│   │   │   ├── region_ocr.md
│   │   │   └── table_ocr.md
│   │   ├── function_call.md
│   │   └── tutorials/
│   │       ├── how_to_train_a_function_call_model.md
│   │       ├── how_to_train_a_reasoning_model.md
│   │       ├── how_to_train_a_visual_grounding_model.md
│   │       └── how_to_train_an_emoji_model.md
│   ├── config/
│   │   ├── dpo/
│   │   │   ├── full.yaml
│   │   │   ├── full_function_call.yaml
│   │   │   ├── full_tp_pp.yaml
│   │   │   ├── full_tp_pp_ep.yaml
│   │   │   ├── lora.yaml
│   │   │   ├── lora_tp_pp.yaml
│   │   │   └── lora_tp_pp_ep.yaml
│   │   ├── dpo-vl/
│   │   │   ├── full.yaml
│   │   │   ├── full_fsdp.yaml
│   │   │   ├── full_tp.yaml
│   │   │   ├── lora.yaml
│   │   │   ├── lora_fsdp.yaml
│   │   │   └── lora_tp.yaml
│   │   ├── iluvatar/
│   │   │   ├── ERNIE-4.5-0.3B-PT/
│   │   │   │   └── sft/
│   │   │   │       ├── full_8k.yaml
│   │   │   │       ├── lora_8k.yaml
│   │   │   │       ├── lora_export.yaml
│   │   │   │       ├── run_full_8k.sh
│   │   │   │       ├── run_lora_8k.sh
│   │   │   │       └── run_lora_export.sh
│   │   │   ├── ERNIE-4.5-21B-A3B-PT/
│   │   │   │   └── sft/
│   │   │   │       ├── full_8k.yaml
│   │   │   │       ├── lora_8k.yaml
│   │   │   │       ├── lora_export.yaml
│   │   │   │       ├── run_full_8k.sh
│   │   │   │       ├── run_lora_8k.sh
│   │   │   │       └── run_lora_export.sh
│   │   │   └── PaddleOCR-VL/
│   │   │       └── sft/
│   │   │           ├── paddleocr-vl_full_16k_config.yaml
│   │   │           ├── paddleocr-vl_lora_16k_config.yaml
│   │   │           ├── paddleocr-vl_lora_export.yaml
│   │   │           ├── run_paddleocr-vl_full_16k.sh
│   │   │           ├── run_paddleocr-vl_lora_16k.sh
│   │   │           └── run_paddleocr-vl_lora_export.sh
│   │   ├── metax/
│   │   │   ├── ERNIE-4.5-0.3B/
│   │   │   │   └── sft/
│   │   │   │       ├── lora.yaml
│   │   │   │       ├── run_lora.sh
│   │   │   │       ├── run_sft.sh
│   │   │   │       └── sft.yaml
│   │   │   └── ERNIE-4.5-21B-A3B/
│   │   │       └── sft/
│   │   │           ├── lora.yaml
│   │   │           ├── run_lora.sh
│   │   │           ├── run_sft.sh
│   │   │           └── sft.yaml
│   │   ├── pt/
│   │   │   ├── eb45_pretrain/
│   │   │   │   ├── 21b_8_gpus.yaml
│   │   │   │   ├── 300b_2016_gpus.yaml
│   │   │   │   ├── 300b_4_nodes_ce.yaml
│   │   │   │   ├── 300b_8_gpus_ci.yaml
│   │   │   │   ├── 300b_96gpus.yaml
│   │   │   │   └── 300b_96gpus_small_acc.yaml
│   │   │   ├── full.yaml
│   │   │   ├── full_offline_data.yaml
│   │   │   ├── full_tp_pp.yaml
│   │   │   ├── full_tp_pp_ep.yaml
│   │   │   ├── lora.yaml
│   │   │   ├── lora_tp_pp.yaml
│   │   │   └── lora_tp_pp_ep.yaml
│   │   ├── run_export.yaml
│   │   ├── sft/
│   │   │   ├── full.yaml
│   │   │   ├── full_function_call.yaml
│   │   │   ├── full_tp_pp.yaml
│   │   │   ├── full_tp_pp_ep.yaml
│   │   │   ├── lora.yaml
│   │   │   ├── lora_tp_pp.yaml
│   │   │   └── lora_tp_pp_ep.yaml
│   │   ├── sft-vl/
│   │   │   ├── full.yaml
│   │   │   ├── full_fsdp.yaml
│   │   │   ├── full_tp.yaml
│   │   │   ├── lora.yaml
│   │   │   ├── lora_fsdp.yaml
│   │   │   └── lora_tp.yaml
│   │   └── xpu/
│   │       ├── DeepseekV3/
│   │       │   └── sft/
│   │       │       ├── full_32k_config.yaml
│   │       │       ├── full_4k_config.yaml
│   │       │       ├── run_full_32k.sh
│   │       │       └── run_full_4k.sh
│   │       ├── ERNIE-4.5-0.3B/
│   │       │   └── sft/
│   │       │       ├── full_8k.yaml
│   │       │       ├── lora_8k.yaml
│   │       │       └── lora_8k_export.yaml
│   │       ├── ERNIE-4.5-21B-A3B/
│   │       │   └── sft/
│   │       │       ├── full_32k.yaml
│   │       │       ├── lora_32k.yaml
│   │       │       ├── lora_32k_export.yaml
│   │       │       └── run_lora_32k.sh
│   │       ├── ERNIE-4.5-21B-A3B-Thinking/
│   │       │   └── sft/
│   │       │       └── full_8k.yaml
│   │       ├── ERNIE-4.5-VL-28B-A3B-Thinking/
│   │       │   └── sft/
│   │       │       └── full_32k.yaml
│   │       └── PaddleOCR-VL/
│   │           └── sft/
│   │               ├── paddleocr-vl_full_16k_config.yaml
│   │               ├── paddleocr-vl_lora_16k_config.yaml
│   │               ├── paddleocr-vl_lora_export.yaml
│   │               ├── run_paddleocr-vl_full_16k.sh
│   │               ├── run_paddleocr-vl_lora_16k.sh
│   │               └── run_paddleocr-vl_lora_export.sh
│   ├── experiments/
│   │   ├── deepseek_v3_pretrain/
│   │   │   ├── README.md
│   │   │   ├── config/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── config.json
│   │   │   │   ├── configuration.py
│   │   │   │   ├── pretrain_argument.json
│   │   │   │   └── pretrain_argument.yaml
│   │   │   ├── convert_ckpt_to_sft.py
│   │   │   ├── fp8_linear.py
│   │   │   ├── kernel.py
│   │   │   ├── load_hf_ckpt.py
│   │   │   ├── modeling.py
│   │   │   ├── modeling_pp.py
│   │   │   ├── moe_gate.py
│   │   │   ├── moe_layer.py
│   │   │   ├── moe_utils.py
│   │   │   ├── run.sh
│   │   │   ├── run_pretrain.py
│   │   │   ├── script/
│   │   │   │   └── train_gpu.sh
│   │   │   └── token_dispatcher.py
│   │   ├── ernie_pretrain/
│   │   │   ├── README.md
│   │   │   ├── README_zh.md
│   │   │   ├── demo_data/
│   │   │   │   ├── data-1-part0.idx
│   │   │   │   └── data-1-part1.idx
│   │   │   ├── ernie/
│   │   │   │   ├── config.py
│   │   │   │   ├── model_config.py
│   │   │   │   ├── pretrain.py
│   │   │   │   └── src/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── callbacks/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   ├── fp8_quant_weight_callback.py
│   │   │   │       │   ├── gc_callback.py
│   │   │   │       │   ├── logging_callback.py
│   │   │   │       │   ├── moe_correction_bias_adjust_callback.py
│   │   │   │       │   ├── moe_logging_callback.py
│   │   │   │       │   ├── ortho_loss_callback.py
│   │   │   │       │   ├── sp_grad_sync_callback.py
│   │   │   │       │   └── tensorboard_callback.py
│   │   │   │       ├── clip/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   └── moe_clip.py
│   │   │   │       ├── lr_schedulers/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   ├── cosine_lr.py
│   │   │   │       │   └── wsd_lr.py
│   │   │   │       ├── tokenizers/
│   │   │   │       │   ├── tokenization_eb_v2.py
│   │   │   │       │   └── tokenizer_model/
│   │   │   │       │       ├── added_tokens.json
│   │   │   │       │       ├── special_tokens_map.json
│   │   │   │       │       ├── tokenizer.model
│   │   │   │       │       └── tokenizer_config.json
│   │   │   │       ├── trainers/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   ├── data_parallel.py
│   │   │   │       │   ├── dygraph_optimizer/
│   │   │   │       │   │   └── hybrid_parallel_optimizer.py
│   │   │   │       │   └── pretraining_trainer.py
│   │   │   │       └── utils/
│   │   │   │           ├── __init__.py
│   │   │   │           ├── logging.py
│   │   │   │           ├── misc.py
│   │   │   │           ├── seed_utils.py
│   │   │   │           └── training_utils.py
│   │   │   ├── model_configs/
│   │   │   │   ├── ERNIE-4p5-21B-A3B/
│   │   │   │   │   └── model_config.json
│   │   │   │   └── ERNIE-4p5-300B-A47B/
│   │   │   │       └── model_config.json
│   │   │   ├── models/
│   │   │   │   ├── comm_utils.py
│   │   │   │   ├── ernie/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── configuration.py
│   │   │   │   │   ├── modeling.py
│   │   │   │   │   ├── modeling_moe.py
│   │   │   │   │   └── modeling_pp.py
│   │   │   │   ├── fp8_linear.py
│   │   │   │   ├── moe/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── moe_layer.py
│   │   │   │   │   ├── token_dispatcher/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── fp8_utils.py
│   │   │   │   │   │   └── moe_utils.py
│   │   │   │   │   └── top2_gate.py
│   │   │   │   ├── sequence_parallel_utils.py
│   │   │   │   └── utils.py
│   │   │   ├── requirements.txt
│   │   │   ├── scripts/
│   │   │   │   └── ERNIE-4p5-300B-A47B/
│   │   │   │       ├── ci_ce/
│   │   │   │       │   ├── train_4_nodes_ce.sh
│   │   │   │       │   └── train_8_gpus_ci.sh
│   │   │   │       ├── train_2016_gpus.sh
│   │   │   │       └── train_96_gpus.sh
│   │   │   ├── tools/
│   │   │   │   ├── sharded_to_uc/
│   │   │   │   │   ├── README_zh.md
│   │   │   │   │   ├── convert_multi_nodes_sharded_to_single_uc.sh
│   │   │   │   │   ├── convert_sharded_to_uc.py
│   │   │   │   │   ├── gather_all_ckpt.py
│   │   │   │   │   └── merge_sharding_ep.py
│   │   │   │   └── uc_to_sharded/
│   │   │   │       ├── README.md
│   │   │   │       ├── README_zh.md
│   │   │   │       └── convert_uc_to_sharded.py
│   │   │   └── yamls/
│   │   │       ├── ERNIE-4p5-21B-A3B/
│   │   │       │   └── pretrain_8_gpus.yaml
│   │   │       └── ERNIE-4p5-300B-A47B/
│   │   │           ├── ci_ce/
│   │   │           │   ├── pretrain_4_nodes_ce.yaml
│   │   │           │   └── pretrain_8_gpus_ci.yaml
│   │   │           ├── pretrain_2016_gpus.yaml
│   │   │           ├── pretrain_96_gpus.yaml
│   │   │           └── pretrain_96_gpus_small_acc.yaml
│   │   ├── glm_pretrain/
│   │   │   └── GLM4.5-Air.yaml
│   │   └── paddlefleet/
│   │       ├── glm45.json
│   │       ├── glm45_provider.py
│   │       ├── glm45_single_card.json
│   │       ├── qwen_provider.py
│   │       ├── qwen_single_card.json
│   │       ├── run_glm45.sh
│   │       └── run_pretrain.py
│   └── tools/
│       ├── create_pretraining_data.py
│       ├── gpt-oss_weight_change/
│       │   ├── README.md
│       │   └── change_weight_dtype.py
│       ├── merge.py
│       └── trans_paddlenlp2hf.py
├── paddleformers/
│   ├── __init__.py
│   ├── cli/
│   │   ├── __init__.py
│   │   ├── cli.py
│   │   ├── export/
│   │   │   ├── __init__.py
│   │   │   └── export.py
│   │   ├── hparams/
│   │   │   ├── __init__.py
│   │   │   ├── data_args.py
│   │   │   ├── export_args.py
│   │   │   ├── finetuning_args.py
│   │   │   ├── generating_args.py
│   │   │   ├── model_args.py
│   │   │   ├── parser.py
│   │   │   ├── preprocess_args.py
│   │   │   └── server_args.py
│   │   ├── launcher.py
│   │   ├── train/
│   │   │   ├── __init__.py
│   │   │   ├── auto_parallel/
│   │   │   │   ├── __init__.py
│   │   │   │   └── workflow.py
│   │   │   ├── deepseek_v3_pretrain/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── configuration.py
│   │   │   │   ├── fp8_linear.py
│   │   │   │   ├── kernel.py
│   │   │   │   ├── modeling.py
│   │   │   │   ├── modeling_pp.py
│   │   │   │   ├── moe_gate.py
│   │   │   │   ├── moe_layer.py
│   │   │   │   ├── moe_utils.py
│   │   │   │   ├── token_dispatcher.py
│   │   │   │   ├── utils/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── convert_ckpt_to_sft.py
│   │   │   │   │   └── load_hf_ckpt.py
│   │   │   │   └── workflow.py
│   │   │   ├── dpo/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── data_config.py
│   │   │   │   ├── dpo_argument.py
│   │   │   │   ├── dpo_estimate_training.py
│   │   │   │   ├── dpo_trainer.py
│   │   │   │   └── workflow.py
│   │   │   ├── ernie_pretrain/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── model_config.py
│   │   │   │   ├── models/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── comm_utils.py
│   │   │   │   │   ├── ernie/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── configuration.py
│   │   │   │   │   │   ├── modeling.py
│   │   │   │   │   │   ├── modeling_moe.py
│   │   │   │   │   │   └── modeling_pp.py
│   │   │   │   │   ├── fp8_linear.py
│   │   │   │   │   ├── moe/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── moe_layer.py
│   │   │   │   │   │   ├── token_dispatcher/
│   │   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   │   ├── fp8_utils.py
│   │   │   │   │   │   │   └── moe_utils.py
│   │   │   │   │   │   └── top2_gate.py
│   │   │   │   │   ├── sequence_parallel_utils.py
│   │   │   │   │   └── utils.py
│   │   │   │   ├── src/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── callbacks/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── fp8_quant_weight_callback.py
│   │   │   │   │   │   ├── gc_callback.py
│   │   │   │   │   │   ├── logging_callback.py
│   │   │   │   │   │   ├── moe_correction_bias_adjust_callback.py
│   │   │   │   │   │   ├── moe_logging_callback.py
│   │   │   │   │   │   ├── ortho_loss_callback.py
│   │   │   │   │   │   ├── sp_grad_sync_callback.py
│   │   │   │   │   │   └── tensorboard_callback.py
│   │   │   │   │   ├── clip/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── moe_clip.py
│   │   │   │   │   ├── lr_schedulers/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── cosine_lr.py
│   │   │   │   │   │   └── wsd_lr.py
│   │   │   │   │   ├── tokenizers/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── tokenization_eb_v2.py
│   │   │   │   │   ├── trainers/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── data_parallel.py
│   │   │   │   │   │   ├── dygraph_optimizer/
│   │   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   │   └── hybrid_parallel_optimizer.py
│   │   │   │   │   │   └── pretraining_trainer.py
│   │   │   │   │   └── utils/
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── logging.py
│   │   │   │   │       ├── misc.py
│   │   │   │   │       ├── seed_utils.py
│   │   │   │   │       └── training_utils.py
│   │   │   │   └── workflow.py
│   │   │   ├── sft/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── dataset_formatting.py
│   │   │   │   ├── make_data_utils.py
│   │   │   │   ├── sft_config.py
│   │   │   │   ├── sft_trainer.py
│   │   │   │   └── workflow.py
│   │   │   └── tuner.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── llm_utils.py
│   │       ├── mllm_utils.py
│   │       └── process.py
│   ├── data/
│   │   ├── __init__.py
│   │   ├── blendable_dataset.py
│   │   ├── causal_dataset.py
│   │   ├── collate.py
│   │   ├── data_collator.py
│   │   ├── dist_dataloader.py
│   │   ├── indexed_dataset.py
│   │   ├── iterator.py
│   │   ├── sampler.py
│   │   ├── tokenizer.py
│   │   └── vocab.py
│   ├── datasets/
│   │   ├── DPODataset.py
│   │   ├── SFTDataset.py
│   │   ├── __init__.py
│   │   ├── collate.py
│   │   ├── data_utils.py
│   │   ├── dataset.py
│   │   ├── loader.py
│   │   ├── reader/
│   │   │   ├── __init__.py
│   │   │   ├── convertor.py
│   │   │   ├── data_info.json
│   │   │   ├── download_manager.py
│   │   │   ├── file_reader.py
│   │   │   ├── io.py
│   │   │   ├── mix_datasets.py
│   │   │   └── multi_source_datasets.py
│   │   ├── rlhf_datasets/
│   │   │   ├── __init__.py
│   │   │   ├── protocol.py
│   │   │   └── rl_dataset.py
│   │   ├── sampler/
│   │   │   └── __init__.py
│   │   └── template/
│   │       ├── __init__.py
│   │       ├── augment_utils.py
│   │       ├── formatter.py
│   │       ├── grounding_plugin.py
│   │       ├── mm_plugin.py
│   │       ├── template.py
│   │       └── tool_utils.py
│   ├── generation/
│   │   ├── __init__.py
│   │   ├── configuration_utils.py
│   │   ├── logits_process.py
│   │   ├── stopping_criteria.py
│   │   ├── streamers.py
│   │   └── utils.py
│   ├── mergekit/
│   │   ├── __init__.py
│   │   ├── merge_config.py
│   │   ├── merge_method.py
│   │   ├── merge_model.py
│   │   ├── merge_utils.py
│   │   └── sparsify_method.py
│   ├── nn/
│   │   ├── __init__.py
│   │   ├── activation.py
│   │   ├── attention/
│   │   │   ├── __init__.py
│   │   │   ├── eager_attention.py
│   │   │   ├── flashmask_attention.py
│   │   │   ├── interface.py
│   │   │   ├── sdpa_attention.py
│   │   │   ├── sink_impl.py
│   │   │   └── utils.py
│   │   ├── criterion/
│   │   │   ├── __init__.py
│   │   │   ├── dpo_loss.py
│   │   │   ├── interface.py
│   │   │   ├── kto_loss.py
│   │   │   ├── loss_utils.py
│   │   │   └── sft_loss.py
│   │   ├── embedding.py
│   │   ├── general.py
│   │   ├── linear.py
│   │   ├── lm_head.py
│   │   ├── mlp.py
│   │   ├── moe/
│   │   │   ├── __init__.py
│   │   │   ├── abstract.py
│   │   │   ├── all_gather.py
│   │   │   ├── all_to_all.py
│   │   │   ├── moe_allgather_layer.py
│   │   │   ├── moe_alltoall_layer.py
│   │   │   ├── moe_block.py
│   │   │   ├── topk_gate.py
│   │   │   └── utils.py
│   │   ├── moe_deepep/
│   │   │   ├── __init__.py
│   │   │   ├── modular_moe_layer.py
│   │   │   ├── moe_communication.py
│   │   │   ├── moe_expert.py
│   │   │   ├── moe_factory.py
│   │   │   ├── moe_gate.py
│   │   │   ├── moe_loss.py
│   │   │   └── moe_loss_instance.py
│   │   ├── norm.py
│   │   └── pp_model.py
│   ├── peft/
│   │   ├── __init__.py
│   │   └── lora/
│   │       ├── __init__.py
│   │       ├── auto_lora_model.py
│   │       ├── lora_config.py
│   │       ├── lora_layers.py
│   │       ├── lora_model.py
│   │       ├── lora_quant_layers.py
│   │       ├── lora_quantization_layers.py
│   │       ├── loraga_utils.py
│   │       └── utils.py
│   ├── quantization/
│   │   ├── __init__.py
│   │   ├── checkpoint_quantization_utils.py
│   │   ├── hadamard_utils.py
│   │   ├── qat_utils.py
│   │   ├── qlora.py
│   │   ├── quantization_config.py
│   │   ├── quantization_linear.py
│   │   ├── quantization_utils.py
│   │   └── unified_checkpoint_quantization.py
│   ├── trainer/
│   │   ├── __init__.py
│   │   ├── argparser.py
│   │   ├── integrations.py
│   │   ├── plugins/
│   │   │   ├── __init__.py
│   │   │   ├── npu_plugin.py
│   │   │   └── timer.py
│   │   ├── trainer.py
│   │   ├── trainer_callback.py
│   │   ├── trainer_utils.py
│   │   ├── training_args.py
│   │   ├── unified_checkpoint/
│   │   │   ├── __init__.py
│   │   │   ├── async_handler.py
│   │   │   ├── check_completion.py
│   │   │   ├── load_dynamic.py
│   │   │   ├── load_local.py
│   │   │   ├── load_save_single_card.py
│   │   │   ├── sharding_split_param_utils.py
│   │   │   ├── shared_memory_utils.py
│   │   │   ├── unified_checkpoint.py
│   │   │   └── utils.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── async_save.py
│   │       ├── ckpt_converter.py
│   │       ├── doc.py
│   │       ├── helper.py
│   │       ├── offload_optimizer.py
│   │       ├── reshard/
│   │       │   ├── __init__.py
│   │       │   ├── common.py
│   │       │   ├── pp_reshard.py
│   │       │   ├── sharding_v1.py
│   │       │   └── sharding_v2.py
│   │       ├── sharding_io.py
│   │       └── zero_cost_checkpoint.py
│   ├── transformers/
│   │   ├── __init__.py
│   │   ├── activations.py
│   │   ├── aistudio_utils.py
│   │   ├── attention_utils.py
│   │   ├── audio_processing_utils.py
│   │   ├── audio_utils.py
│   │   ├── auto/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── factory.py
│   │   │   ├── feature_extraction.py
│   │   │   ├── image_processing.py
│   │   │   ├── modeling.py
│   │   │   ├── processing.py
│   │   │   ├── tokenizer.py
│   │   │   └── video_processing.py
│   │   ├── auto_utils.py
│   │   ├── cache_utils.py
│   │   ├── configuration_utils.py
│   │   ├── context_parallel_utils.py
│   │   ├── contrastive_loss.py
│   │   ├── conversion_utils.py
│   │   ├── deepseek_v3/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── mfu_utils.py
│   │   │   └── modeling.py
│   │   ├── download_utils.py
│   │   ├── dpo_criterion.py
│   │   ├── embedding_utils.py
│   │   ├── ernie4_5/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── modeling.py
│   │   │   └── tokenizer.py
│   │   ├── ernie4_5_moe/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   └── modeling.py
│   │   ├── ernie4_5_moe_vl/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── image_processor.py
│   │   │   ├── model/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── comm_utils.py
│   │   │   │   ├── configuration.py
│   │   │   │   ├── dfnrope/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── activation.py
│   │   │   │   │   ├── configuration.py
│   │   │   │   │   ├── modeling.py
│   │   │   │   │   └── modeling_pp.py
│   │   │   │   ├── distributed/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── common_dist_utils.py
│   │   │   │   │   └── xpu_dist_utils.py
│   │   │   │   ├── fusion_ops/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── common_fusion_ops.py
│   │   │   │   │   └── npu_fusion_ops.py
│   │   │   │   ├── longcontext_ops.py
│   │   │   │   ├── loss/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── dpo.py
│   │   │   │   ├── modeling.py
│   │   │   │   ├── modeling_moe.py
│   │   │   │   ├── modeling_moe_pp.py
│   │   │   │   ├── modeling_moe_vl.py
│   │   │   │   ├── modeling_moe_vl_pp.py
│   │   │   │   ├── moe/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── moe_all_gather_layer.py
│   │   │   │   │   ├── moe_layer.py
│   │   │   │   │   └── topk_gate.py
│   │   │   │   ├── refined_recompute/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── utils.py
│   │   │   │   ├── sequence_parallel_utils.py
│   │   │   │   └── utils/
│   │   │   │       ├── __init__.py
│   │   │   │       └── misc.py
│   │   │   ├── modeling.py
│   │   │   ├── processor.py
│   │   │   ├── tokenizer.py
│   │   │   └── vision_process.py
│   │   ├── feature_extraction_utils.py
│   │   ├── fp8_utils.py
│   │   ├── fused_a2a.py
│   │   ├── gemma3_text/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   └── modeling.py
│   │   ├── glm4_moe/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   └── modeling.py
│   │   ├── glm4v_moe/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── image_processor.py
│   │   │   ├── image_processor_fast.py
│   │   │   ├── modeling.py
│   │   │   ├── processor.py
│   │   │   └── video_processor.py
│   │   ├── glm_ocr/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── image_processor.py
│   │   │   ├── modeling.py
│   │   │   └── processor.py
│   │   ├── gpt_oss/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   └── modeling.py
│   │   ├── gpt_provider.py
│   │   ├── image_processing_utils.py
│   │   ├── image_processing_utils_fast.py
│   │   ├── image_transforms.py
│   │   ├── image_utils.py
│   │   ├── kimi_k2/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── modeling.py
│   │   │   └── tokenizer.py
│   │   ├── kimi_k25/
│   │   │   ├── __init__.py
│   │   │   ├── media_utils.py
│   │   │   ├── processor.py
│   │   │   ├── tokenizer.py
│   │   │   ├── tool_declaration_ts.py
│   │   │   └── vision_processor.py
│   │   ├── kto_criterion.py
│   │   ├── legacy/
│   │   │   ├── __init__.py
│   │   │   ├── tokenizer_utils.py
│   │   │   └── tokenizer_utils_base.py
│   │   ├── linear_utils.py
│   │   ├── llama/
│   │   │   ├── __init__.py
│   │   │   ├── auto_dist_config.py
│   │   │   ├── configuration.py
│   │   │   ├── modeling.py
│   │   │   ├── tokenizer.py
│   │   │   └── tokenizer_fast.py
│   │   ├── masking_utils.py
│   │   ├── mc2_parallel_linear.py
│   │   ├── model_outputs.py
│   │   ├── model_provider.py
│   │   ├── model_utils.py
│   │   ├── modeling_rope_utils.py
│   │   ├── modelscope_utils.py
│   │   ├── moe_gate.py
│   │   ├── moe_gate_auto.py
│   │   ├── moe_layer.py
│   │   ├── moe_layer_auto.py
│   │   ├── moe_utils.py
│   │   ├── ofa_utils.py
│   │   ├── optimization.py
│   │   ├── paddle_vision_utils.py
│   │   ├── paddleocr_vl/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── image_processor.py
│   │   │   ├── modeling.py
│   │   │   └── processor.py
│   │   ├── phi3/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── modeling.py
│   │   │   └── tokenizer.py
│   │   ├── processing_utils.py
│   │   ├── qwen2/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── modeling.py
│   │   │   ├── tokenizer.py
│   │   │   └── tokenizer_fast.py
│   │   ├── qwen2_5_vl/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── modeling.py
│   │   │   └── processor.py
│   │   ├── qwen2_moe/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   └── modeling.py
│   │   ├── qwen2_vl/
│   │   │   ├── __init__.py
│   │   │   ├── image_processor.py
│   │   │   ├── image_processor_fast.py
│   │   │   ├── processor.py
│   │   │   ├── video_processor.py
│   │   │   └── vision_process.py
│   │   ├── qwen3/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   └── modeling.py
│   │   ├── qwen3_5/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   └── modeling.py
│   │   ├── qwen3_moe/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   └── modeling.py
│   │   ├── qwen3_next/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   └── modeling.py
│   │   ├── qwen3_omni_moe/
│   │   │   ├── __init__.py
│   │   │   └── processor.py
│   │   ├── qwen3_vl/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   ├── modeling.py
│   │   │   ├── modeling_fleet.py
│   │   │   ├── processor.py
│   │   │   └── video_processor.py
│   │   ├── qwen3_vl_moe/
│   │   │   ├── __init__.py
│   │   │   ├── configuration.py
│   │   │   └── modeling.py
│   │   ├── refined_recompute.py
│   │   ├── ring_flash_attention.py
│   │   ├── segment_parallel_utils.py
│   │   ├── sequence_parallel_utils.py
│   │   ├── tensor_parallel_utils.py
│   │   ├── token_dispatcher.py
│   │   ├── tokenizer_utils.py
│   │   ├── tokenizer_utils_base.py
│   │   ├── utils.py
│   │   ├── video_processing_utils.py
│   │   ├── video_utils.py
│   │   ├── vocab_utils.py
│   │   └── whisper/
│   │       ├── __init__.py
│   │       └── processor.py
│   ├── triton_kernels/
│   │   ├── __init__.py
│   │   └── rope_triton.py
│   ├── utils/
│   │   ├── __init__.py
│   │   ├── adamw_triton.py
│   │   ├── batch_sampler.py
│   │   ├── converter.py
│   │   ├── distributed.py
│   │   ├── doc_parser.py
│   │   ├── download/
│   │   │   ├── __init__.py
│   │   │   ├── aistudio_hub_download.py
│   │   │   ├── common.py
│   │   │   └── download.py
│   │   ├── downloader.py
│   │   ├── env.py
│   │   ├── fault_tolerance.py
│   │   ├── ie_utils.py
│   │   ├── image_utils.py
│   │   ├── import_utils.py
│   │   ├── infohub.py
│   │   ├── initializer.py
│   │   ├── lazy_import.py
│   │   ├── log.py
│   │   ├── masking_utils.py
│   │   ├── memory_utils.py
│   │   ├── moe_hybrid_parallel_optimizer.py
│   │   ├── nested.py
│   │   ├── optimizer.py
│   │   ├── paddle_patch.py
│   │   ├── pdc_sdk.py
│   │   ├── perf_utils.py
│   │   ├── profiler.py
│   │   ├── safetensors.py
│   │   ├── serialization.py
│   │   ├── tools.py
│   │   ├── type_validators.py
│   │   └── upcast_downcast_triton.py
│   └── version/
│       ├── __init__.py
│       └── git.py
├── pyproject.toml
├── requirements.txt
├── scripts/
│   ├── ci_utils/
│   │   ├── __init__.py
│   │   ├── log_analyzer.py
│   │   └── training_utils.py
│   ├── codestyle/
│   │   ├── check_dead_links.py
│   │   ├── check_spaces.py
│   │   └── get_modified_files.py
│   ├── dependence/
│   │   └── build.sh
│   ├── iluvatar_ci/
│   │   ├── base_value/
│   │   │   └── ERNIE-21B-SFT-LOSS.json
│   │   ├── config/
│   │   │   └── ERNIE-21B-SFT.yaml
│   │   ├── conftest.py
│   │   └── test_ernie_21b_sft.py
│   ├── regression/
│   │   ├── ci_model_unittest.sh
│   │   ├── test_dpo_tiny-random-glm4moe.py
│   │   ├── test_pt_tiny-random-glm4moe.py
│   │   └── test_sft_tiny-random-glm4moe.py
│   ├── unit_test/
│   │   ├── ci_unittest.sh
│   │   └── gen_allure_report.py
│   └── xpu_ci/
│       ├── README.md
│       ├── base_value/
│       │   ├── ernie_21b_sft_loss.json
│       │   └── ernie_28b_thinking_sft_loss.json
│       ├── config/
│       │   ├── ernie_21b_sft.yaml
│       │   └── ernie_vl_28b_sft.yaml
│       ├── conftest.py
│       ├── test_ernie_21b_sft.py
│       ├── test_ernie_28b_thinking_sft.py
│       └── test_example_template.py.template
├── setup.py
└── tests/
    ├── README.md
    ├── __init__.py
    ├── check_log_for_exitcode.py
    ├── common_test.py
    ├── config/
    │   ├── benchmark/
    │   │   └── config/
    │   │       ├── pt/
    │   │       │   ├── DeepSeek-V3.yaml
    │   │       │   ├── ERNIE45-21B.yaml
    │   │       │   ├── ERNIE45-300B.yaml
    │   │       │   ├── GLM4.5-Air.yaml
    │   │       │   ├── GLM4.5-Air_64k.yaml
    │   │       │   ├── GLM4.5-Air_FP8.yaml
    │   │       │   ├── Qwen3-30B-A3B-Base-64k.yaml
    │   │       │   └── Qwen3-30B-A3B-Base.yaml
    │   │       └── sft/
    │   │           ├── GLM4.5-Air.yaml
    │   │           ├── GLM4.5-Air_128k.yaml
    │   │           ├── GLM4.5-Air_64k.yaml
    │   │           ├── Qwen3-30B-A3B-Base-64k.yaml
    │   │           ├── Qwen3-30B-A3B-Base.yaml
    │   │           ├── Qwen3-VL-30B-A3B-Instruct.yaml
    │   │           └── Qwen3-VL-8B-Instruct.yaml
    │   └── ci/
    │       ├── glm45_dpo.yaml
    │       ├── glm45_dpo_lora.yaml
    │       ├── glm45_lora.yaml
    │       ├── glm45_pt.yaml
    │       ├── glm45_pt_fp8.yaml
    │       ├── glm45_pt_grouped_gemm.yaml
    │       ├── glm45_sft.yaml
    │       ├── glm45_single_pt-test.yaml
    │       ├── qwen3_multicard_lora.yaml
    │       ├── qwen3_multicard_pt.yaml
    │       ├── qwen3_multicard_sft.yaml
    │       ├── qwen3_pt.yaml
    │       ├── qwen3vl_lora.yaml
    │       ├── qwen3vl_sft.yaml
    │       ├── qwen3vl_sft_fsdp.yaml
    │       ├── qwen3vl_sft_moe.yaml
    │       ├── qwen3vl_sft_moe_a100.yaml
    │       └── qwen3vl_sft_single.yaml
    ├── conftest.py
    ├── data/
    │   ├── __init__.py
    │   ├── test_blendable_dataset.py
    │   ├── test_collate.py
    │   ├── test_data_collator.py
    │   ├── test_sampler.py
    │   └── test_vocab.py
    ├── dataset/
    │   ├── __init__.py
    │   ├── test_convertor.py
    │   ├── test_ernie_datasets.py
    │   ├── test_file_reader.py
    │   ├── test_io.py
    │   └── test_iter_datasets.py
    ├── fixtures/
    │   ├── chat_template.json
    │   ├── chat_template_with_context.json
    │   ├── dummy/
    │   │   ├── dpo/
    │   │   │   ├── eval.jsonl
    │   │   │   ├── function-call-eval.jsonl
    │   │   │   ├── function-call-train.jsonl
    │   │   │   └── train.jsonl
    │   │   ├── dpo-vl/
    │   │   │   ├── eval.jsonl
    │   │   │   └── train.jsonl
    │   │   ├── io/
    │   │   │   ├── train.jsonl
    │   │   │   └── train.parquet
    │   │   ├── pt/
    │   │   │   ├── eval.jsonl
    │   │   │   └── train.jsonl
    │   │   ├── sft/
    │   │   │   ├── eval.jsonl
    │   │   │   ├── function-call-eval.jsonl
    │   │   │   ├── function-call-train.jsonl
    │   │   │   └── train.jsonl
    │   │   ├── sft-vl/
    │   │   │   ├── thinking_safety_demo.jsonl
    │   │   │   └── train.jsonl
    │   │   └── tnews/
    │   │       ├── dev.json
    │   │       └── train.json
    │   └── sample_text.txt
    ├── generation/
    │   ├── __init__.py
    │   ├── test_logits_process.py
    │   ├── test_stopping_criteria.py
    │   ├── test_streamers.py
    │   └── test_synced_gpus.py
    ├── integration_test/
    │   ├── check_loss.py
    │   ├── check_pr_approval.py
    │   ├── check_precision_approval.sh
    │   ├── glm45_a100.sh
    │   ├── glm45_dpo.sh
    │   ├── glm45_dpo_lora.sh
    │   ├── glm45_lora.sh
    │   ├── glm45_pt.sh
    │   ├── glm45_pt_ep4.sh
    │   ├── glm45_pt_fp8.sh
    │   ├── glm45_pt_grouped_gemm.sh
    │   ├── glm45_pt_single_card.sh
    │   ├── glm45_sft.sh
    │   ├── preprocess.sh
    │   ├── qwen.sh
    │   ├── qwen3_a100.sh
    │   ├── qwen3_single_card.sh
    │   ├── qwen3vl_lora.sh
    │   ├── qwen3vl_sft.sh
    │   ├── qwen3vl_sft_single_card.sh
    │   └── update_precision.sh
    ├── mergekit/
    │   ├── __init__.py
    │   ├── test_merge_config.py
    │   ├── test_merge_method.py
    │   ├── test_merge_model.py
    │   └── test_sparsify_method.py
    ├── nn/
    │   ├── __init__.py
    │   ├── test_activation.py
    │   ├── test_attention.py
    │   ├── test_criterion.py
    │   ├── test_embedding.py
    │   ├── test_linear.py
    │   ├── test_lm_head.py
    │   ├── test_mlp.py
    │   └── test_norm.py
    ├── parallel_launch.py
    ├── peft/
    │   ├── __init__.py
    │   ├── test_lora.py
    │   └── test_quant_lora.py
    ├── quantization/
    │   ├── __init__.py
    │   └── test_quant.py
    ├── requirements.txt
    ├── testing_utils.py
    ├── trainer/
    │   ├── test_argparser.py
    │   ├── test_hf_format_saver_tp4_sharding2.py
    │   ├── test_lora_unified_checkpoint.py
    │   ├── test_moe_unified_checkpoint.py
    │   ├── test_trainer_callback.py
    │   ├── test_trainer_visualization.py
    │   ├── test_unified_checkpoint.py
    │   ├── trainer_utils.py
    │   └── unified-ckpt-llama-170m/
    │       └── config.json
    ├── transformers/
    │   ├── __init__.py
    │   ├── auto/
    │   │   ├── __init__.py
    │   │   ├── test_configuration.py
    │   │   ├── test_feature_extraction.py
    │   │   ├── test_image_processor.py
    │   │   ├── test_modeling.py
    │   │   ├── test_processor.py
    │   │   ├── test_tokenizer.py
    │   │   ├── test_tokenizer_without_paddle.py
    │   │   └── test_video_processor.py
    │   ├── deepseek_v3/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── ernie4_5/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── ernie4_5_moe/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── ernie4_5_moe_vl/
    │   │   ├── __init__.py
    │   │   ├── test_modeling.py
    │   │   ├── test_processor.py
    │   │   ├── test_tokenizer.py
    │   │   └── test_vision_process.py
    │   ├── gemma3_text/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── glm4_moe/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── glm4v_moe/
    │   │   ├── __init__.py
    │   │   ├── test_image_processor.py
    │   │   ├── test_modeling.py
    │   │   └── test_processor.py
    │   ├── glm_ocr/
    │   │   ├── __init__.py
    │   │   ├── test_image_processor.py
    │   │   ├── test_modeling.py
    │   │   └── test_processor.py
    │   ├── gpt_oss/
    │   │   ├── __init__.py
    │   │   ├── test_fp4_to_bf16.py
    │   │   └── test_modeling.py
    │   ├── kimi_k2/
    │   │   └── test_modeling.py
    │   ├── kimi_k25/
    │   │   ├── __init__.py
    │   │   └── test_processor.py
    │   ├── llama/
    │   │   ├── __init__.py
    │   │   ├── test_modeling.py
    │   │   └── test_tokenizer.py
    │   ├── paddleocr_vl/
    │   │   ├── __init__.py
    │   │   ├── test_modeling.py
    │   │   └── test_processor.py
    │   ├── phi3/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── qwen2/
    │   │   ├── __init__.py
    │   │   ├── test_modeling.py
    │   │   └── test_tokenizer.py
    │   ├── qwen2_5_vl/
    │   │   ├── __init__.py
    │   │   ├── test_modeling.py
    │   │   └── test_processor.py
    │   ├── qwen2_vl/
    │   │   ├── __init__.py
    │   │   ├── test_image_processor.py
    │   │   ├── test_processor.py
    │   │   ├── test_video_processor.py
    │   │   └── test_vision_process.py
    │   ├── qwen2moe/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── qwen3/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── qwen3_omni_moe/
    │   │   ├── __init__.py
    │   │   └── test_processor.py
    │   ├── qwen3_vl/
    │   │   ├── __init__.py
    │   │   ├── test_modeling.py
    │   │   ├── test_processor.py
    │   │   └── test_video_processor.py
    │   ├── qwen3_vl_moe/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── qwen3moe/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── qwen3next/
    │   │   ├── __init__.py
    │   │   └── test_modeling.py
    │   ├── test_cache_utils.py
    │   ├── test_configuration_common.py
    │   ├── test_configuration_utils.py
    │   ├── test_conversion_common.py
    │   ├── test_conversion_tp_split_merge.py
    │   ├── test_generation_utils.py
    │   ├── test_hf_feature_extractor.py
    │   ├── test_hf_image_processor.py
    │   ├── test_hf_processor.py
    │   ├── test_hf_tokenizer.py
    │   ├── test_hf_video_processor.py
    │   ├── test_image_processing_common.py
    │   ├── test_masking_utils.py
    │   ├── test_modeling_common.py
    │   ├── test_modeling_rope_utils.py
    │   ├── test_modeling_utils.py
    │   ├── test_processing_common.py
    │   ├── test_ring_flash_attention.py
    │   ├── test_safetensors.py
    │   ├── test_segment_parallel_utils.py
    │   ├── test_tensor_parallel.py
    │   ├── test_utils.py
    │   └── test_video_processing_common.py
    ├── triton/
    │   └── test_rope_triton.py
    └── utils/
        ├── __init__.py
        ├── test_aistudio_download.py
        ├── test_downloader.py
        ├── test_import_utils.py
        ├── test_module/
        │   ├── __init__.py
        │   ├── custom_configuration.py
        │   ├── custom_model.py
        │   ├── custom_tokenizer.py
        │   └── custom_tokenizer_fast.py
        ├── test_serialization.py
        └── test_set_nccl_config.py

Download .txt

Showing preview only (552K chars total). Download the full file or copy to clipboard to get everything.

SYMBOL INDEX (6547 symbols across 355 files)

FILE: examples/experiments/deepseek_v3_pretrain/config/configuration.py
  class DeepseekV2FastConfig (line 23) | class DeepseekV2FastConfig(PretrainedConfig):
    method __init__ (line 132) | def __init__(

FILE: examples/experiments/deepseek_v3_pretrain/convert_ckpt_to_sft.py
  function paddle_name_to_hf_names (line 45) | def paddle_name_to_hf_names(paddle_name: str) -> List[str]:
  function _handle_expert_weights (line 134) | def _handle_expert_weights(hf_prefix: str, rest: str) -> Optional[List[s...
  function _handle_shared_expert_weights (line 149) | def _handle_shared_expert_weights(hf_prefix: str, rest: str) -> Optional...
  function _handle_mlp_weights (line 162) | def _handle_mlp_weights(hf_prefix: str, rest: str) -> Optional[List[str]]:
  function _is_need_transpose (line 172) | def _is_need_transpose(key):
  function prepare_tensor (line 191) | def prepare_tensor(key, value):
  function load_pretrained_ckpt (line 218) | def load_pretrained_ckpt(ckpt_path, output_path):

FILE: examples/experiments/deepseek_v3_pretrain/fp8_linear.py
  function fp8_linear (line 54) | def fp8_linear(
  function register_scale (line 95) | def register_scale(self):
  class Linear (line 109) | class Linear(PD_Linear):
    method __init__ (line 110) | def __init__(self, *args, **kwargs):
  class ColumnParallelLinear (line 116) | class ColumnParallelLinear(PD_ColumnParallelLinear):
    method __init__ (line 117) | def __init__(self, *args, **kwargs):
  class RowParallelLinear (line 123) | class RowParallelLinear(PD_RowParallelLinear):
    method __init__ (line 124) | def __init__(self, *args, **kwargs):
  class ColumnSequenceParallelLinear (line 130) | class ColumnSequenceParallelLinear(PD_ColumnSequenceParallelLinear):
    method __init__ (line 131) | def __init__(self, *args, **kwargs):
  class RowSequenceParallelLinear (line 137) | class RowSequenceParallelLinear(PD_RowSequenceParallelLinear):
    method __init__ (line 138) | def __init__(self, *args, **kwargs):

FILE: examples/experiments/deepseek_v3_pretrain/kernel.py
  function act_quant_kernel (line 30) | def act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr):
  function act_quant (line 51) | def act_quant(x: paddle.Tensor, block_size: int = 128) -> Tuple[paddle.T...
  function weight_dequant_kernel (line 74) | def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.cons...
  function weight_dequant (line 100) | def weight_dequant(x: paddle.Tensor, s: paddle.Tensor, block_size: int =...
  function fp8_gemm_kernel (line 130) | def fp8_gemm_kernel(
  function fp8_gemm (line 190) | def fp8_gemm(a: paddle.Tensor, a_s: paddle.Tensor, b: paddle.Tensor, b_s...

FILE: examples/experiments/deepseek_v3_pretrain/load_hf_ckpt.py
  function paddle_name_to_hf_names_ds_v2 (line 53) | def paddle_name_to_hf_names_ds_v2(paddle_name: str) -> List[str]:
  function paddle_name_to_hf_names (line 128) | def paddle_name_to_hf_names(paddle_name: str) -> List[str]:
  function _get_hf_prefix (line 196) | def _get_hf_prefix(segment_id: int, id_in_segment: int) -> str:
  function _handle_expert_weights (line 206) | def _handle_expert_weights(hf_prefix: str, rest: str) -> Optional[List[s...
  function _handle_shared_expert_weights (line 221) | def _handle_shared_expert_weights(hf_prefix: str, rest: str) -> Optional...
  function _handle_mlp_weights (line 234) | def _handle_mlp_weights(hf_prefix: str, rest: str) -> Optional[List[str]]:
  function prepare_tensor (line 244) | def prepare_tensor(tensor, dst_shape, *, force_transpose=False):
  function load_huggingface_ckpt (line 274) | def load_huggingface_ckpt(model, huggingface_ckpt_path):

FILE: examples/experiments/deepseek_v3_pretrain/modeling.py
  function swiglu (line 109) | def swiglu(x, y=None):
  function get_use_casual_mask (line 133) | def get_use_casual_mask():
  function set_global_step (line 138) | def set_global_step(cur_step):
  function get_global_step (line 143) | def get_global_step():
  function rms_norm_fused (line 148) | def rms_norm_fused(x_in, w, eps, use_fast_ln=False):
  function cast_if_needed (line 156) | def cast_if_needed(x, dtype):
  function fusion_rms_norm (line 163) | def fusion_rms_norm(hidden_states, weight, variance_epsilon, use_fast_ln...
  class LMHeadFunction (line 186) | class LMHeadFunction(paddle.autograd.PyLayer):
    method forward (line 188) | def forward(ctx, x, weight, transpose_y):
    method backward (line 195) | def backward(ctx, dout):
  function parallel_matmul (line 225) | def parallel_matmul(x: Tensor, y: Tensor, transpose_y=False, tensor_para...
  class DeepseekV2MLP (line 255) | class DeepseekV2MLP(nn.Layer):
    method __init__ (line 256) | def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, int...
    method forward (line 302) | def forward(self, x):
  class MoEGate (line 308) | class MoEGate(PretrainedMoEGate):
    method __init__ (line 309) | def __init__(
    method forward (line 355) | def forward(self, hidden_states):
  class DeepseekV2MoE (line 407) | class DeepseekV2MoE(MoELayer):
    method __init__ (line 412) | def __init__(self, config: DeepseekV2FastConfig, norm_weight=None, nor...
    method fp8_quant_weight (line 491) | def fp8_quant_weight(self, batch_mode=False, quant_transpose=None):
    method forward (line 554) | def forward(self, hidden_states):
    method post_process (line 579) | def post_process(self, hidden_states, final_hidden_states, l_aux):
  class DeepseekV2RotaryEmbedding (line 590) | class DeepseekV2RotaryEmbedding(nn.Layer):
    method __init__ (line 591) | def __init__(self, dim, max_position_embeddings=2048, base=10000):
    method _set_cos_sin_cache (line 606) | def _set_cos_sin_cache(self, seq_len):
    method forward (line 619) | def forward(self, x, seq_len=None):
  class DeepseekV2Attention (line 632) | class DeepseekV2Attention(nn.Layer):
    method __init__ (line 635) | def __init__(self, config: DeepseekV2FastConfig, layerwise_recompute: ...
    method fp8_quant_weight (line 745) | def fp8_quant_weight(self, quant_transpose=None):
    method _init_rope (line 752) | def _init_rope(self):
    method _shape (line 784) | def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int):
    method forward (line 787) | def forward(
  class DeepseekV2DecoderLayer (line 932) | class DeepseekV2DecoderLayer(nn.Layer):
    method __init__ (line 933) | def __init__(
    method fp8_quant_weight (line 974) | def fp8_quant_weight(self, batch_mode=False, quant_transpose=None):
    method forward (line 983) | def forward(
    method self_attn_compute (line 1081) | def self_attn_compute(self, hidden_states, **kwargs):
    method pre_dispatch_compute (line 1131) | def pre_dispatch_compute(self, hidden_states):
    method expert_forward_compute (line 1138) | def expert_forward_compute(self, intermediate_hidden_states, dispatche...
    method post_combine_compute (line 1151) | def post_combine_compute(self, residual, hidden_states, final_hidden_s...
  class DeepseekV2MTPLayer (line 1166) | class DeepseekV2MTPLayer(DeepseekV2DecoderLayer):
    method __init__ (line 1167) | def __init__(
    method forward (line 1179) | def forward(
  class DeepseekV2PretrainedModelFast (line 1216) | class DeepseekV2PretrainedModelFast(PretrainedModel):
    method _get_model_flops (line 1221) | def _get_model_flops(self, batch_size=1, seq_length=None, **kwargs):
    method _get_hardware_flops (line 1234) | def _get_hardware_flops(self, *args, **kwargs):
    method _get_name_mappings (line 1238) | def _get_name_mappings(cls, config: DeepseekV2FastConfig) -> list[Stat...
    method _get_tensor_parallel_mappings (line 1298) | def _get_tensor_parallel_mappings(cls, config: DeepseekV2FastConfig, i...
    method _init_weights (line 1398) | def _init_weights(self, layer):
    method step_flex_token (line 1459) | def step_flex_token(self, cur_step):
  class DeepseekV2ModelFast (line 1464) | class DeepseekV2ModelFast(DeepseekV2PretrainedModelFast):
    method __init__ (line 1472) | def __init__(self, config: DeepseekV2FastConfig):
    method get_input_embeddings (line 1502) | def get_input_embeddings(self):
    method set_input_embeddings (line 1505) | def set_input_embeddings(self, value):
    method _prepare_decoder_attention_mask (line 1509) | def _prepare_decoder_attention_mask(attention_mask, input_shape, past_...
    method recompute_training_full (line 1544) | def recompute_training_full(
    method forward (line 1575) | def forward(
  class DeepseekV2PretrainingCriterionFast (line 1786) | class DeepseekV2PretrainingCriterionFast(nn.Layer):
    method __init__ (line 1792) | def __init__(self, config: DeepseekV2FastConfig):
    method forward (line 1803) | def forward(self, prediction_scores, masked_lm_labels, router_loss=Non...
  function yarn_find_correction_dim (line 1853) | def yarn_find_correction_dim(num_rotations, dim, base=10000, max_positio...
  function yarn_find_correction_range (line 1858) | def yarn_find_correction_range(low_rot, high_rot, dim, base=10000, max_p...
  function yarn_linear_ramp_mask (line 1864) | def yarn_linear_ramp_mask(min, max, dim):
  class DeepseekV2YarnRotaryEmbedding (line 1873) | class DeepseekV2YarnRotaryEmbedding(DeepseekV2RotaryEmbedding):
    method __init__ (line 1874) | def __init__(
    method _set_cos_sin_cache (line 1894) | def _set_cos_sin_cache(self, seq_len):
  class RmsNormFunction (line 1928) | class RmsNormFunction(paddle.autograd.PyLayer):
    method forward (line 1930) | def forward(ctx, x, scale, epsilon):
    method backward (line 1939) | def backward(ctx, grad_output):
  class DeepseekV2RMSNorm (line 1953) | class DeepseekV2RMSNorm(nn.Layer):
    method __init__ (line 1954) | def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, eps...
    method forward (line 1978) | def forward(self, hidden_states):
    method extra_repr (line 1991) | def extra_repr(self):
  function apply_rotary_pos_emb (line 1995) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids, apply_rope_fusion...
  class FusedNormGateFunc (line 2049) | class FusedNormGateFunc(paddle.autograd.PyLayer):
    method set_temporary_vars (line 2056) | def set_temporary_vars(cls, norm_output, invar):
    method clear_temporary_vars (line 2061) | def clear_temporary_vars(cls):
    method forward (line 2066) | def forward(ctx, x, rms_norm_weight, moe_gate_weight, eps):
    method backward (line 2076) | def backward(ctx, d_gate_logits, d_norm_output):
  class TemporaryVarContext (line 2100) | class TemporaryVarContext:
    method __init__ (line 2101) | def __init__(self, norm_output, invar):
    method __enter__ (line 2105) | def __enter__(self):
    method __exit__ (line 2108) | def __exit__(self, exc_type, exc_val, exc_tb):
  function balance_expert_assignment (line 2112) | def balance_expert_assignment(n, m, k):
  class FakeGate (line 2123) | class FakeGate(paddle.autograd.PyLayer):
    method forward (line 2125) | def forward(ctx, hidden_states, weight, fakse_gate_restrict_balance=Fa...
    method backward (line 2141) | def backward(ctx, grad_output):
  class AddAuxiliaryLoss (line 2145) | class AddAuxiliaryLoss(paddle.autograd.PyLayer):
    method forward (line 2152) | def forward(ctx, x, loss):
    method backward (line 2158) | def backward(ctx, grad_output):
  function qkv_pre_process_no_fuse (line 2166) | def qkv_pre_process_no_fuse(
  function rearrange_kv (line 2203) | def rearrange_kv(kv, k_pe, qk_nope_head_dim, num_heads):
  function enable_to_static (line 2214) | def enable_to_static(value):
  function qkv_pre_process (line 2223) | def qkv_pre_process(
  function manul_fwd (line 2266) | def manul_fwd(
  class MemroyRecomputeAttnFunc (line 2318) | class MemroyRecomputeAttnFunc(paddle.autograd.PyLayer):
    method forward (line 2320) | def forward(
    method backward (line 2515) | def backward(ctx, dout):
  class MemroyRecomputeAttn (line 2828) | class MemroyRecomputeAttn(paddle.nn.Layer):
    method __init__ (line 2829) | def __init__(
    method fp8_quant_weight (line 2907) | def fp8_quant_weight(self, quant_transpose=None):
    method forward (line 2911) | def forward(self, q_init, kv_init, position_ids):
  class FusedRMSLinearFunc (line 2941) | class FusedRMSLinearFunc(paddle.autograd.PyLayer):
    method forward (line 2943) | def forward(ctx, x, rms_norm_weight, q_down_weight, kv_down_weight, eps):
    method backward (line 2964) | def backward(ctx, d_q, d_kv):
  class FusedRMSLinear (line 3014) | class FusedRMSLinear(paddle.nn.Layer):
    method __init__ (line 3015) | def __init__(self, hidden_size, q_out_dim, kv_outdim, eps=1e-6) -> None:
    method fp8_quant_weight (line 3039) | def fp8_quant_weight(self, quant_transpose=None):
    method forward (line 3042) | def forward(self, x):
  class FusedRMSLinearSingleFunc (line 3047) | class FusedRMSLinearSingleFunc(paddle.autograd.PyLayer):
    method forward (line 3049) | def forward(ctx, x, rms_norm_weight, linear_weight, eps):
    method backward (line 3058) | def backward(ctx, d_q, d_kv):
  class FusedRMSLinearSingle (line 3069) | class FusedRMSLinearSingle(paddle.nn.Layer):
    method __init__ (line 3070) | def __init__(self, hidden_size, q_out_dim, kv_outdim, eps=1e-6) -> None:
    method forward (line 3087) | def forward(self, x):
  class FastCrossEntropyFunction (line 3092) | class FastCrossEntropyFunction(paddle.autograd.PyLayer):
    method forward (line 3094) | def forward(ctx, preds, labels):
    method backward (line 3102) | def backward(ctx, dout):
  class DeepseekV2LMHead (line 3112) | class DeepseekV2LMHead(nn.Layer):
    method __init__ (line 3113) | def __init__(self, config: DeepseekV2FastConfig, embedding_weight=None):
    method forward (line 3149) | def forward(self, hidden_states, tensor_parallel_output=None):
    method extra_repr (line 3171) | def extra_repr(self):

FILE: examples/experiments/deepseek_v3_pretrain/modeling_pp.py
  function check_accept_none_grad (line 81) | def check_accept_none_grad():
  function parse_args (line 97) | def parse_args(args):
  function return_args (line 127) | def return_args(hidden_states, attention_mask=None, attn_mask_startend_r...
  function get_attr (line 142) | def get_attr(layer, name):
  function calc_stream_wait (line 149) | def calc_stream_wait(group_id):
  class TensorMeta (line 154) | class TensorMeta:
    method __init__ (line 157) | def __init__(self, tensor):
  class PostProcessNode (line 162) | class PostProcessNode(ScheduleNode):
    method __init__ (line 163) | def __init__(
    method forward_without_residual (line 187) | def forward_without_residual(self, inputs):
    method forward (line 231) | def forward(self, inputs):
    method backward (line 277) | def backward(self, output_grad):
  class DecoderLayerNode (line 333) | class DecoderLayerNode(ScheduleNode):
    method __init__ (line 334) | def __init__(
    method dispatch_forward (line 363) | def dispatch_forward(self, inputs, previous_event=None, allocate_on_co...
    method combine_forward (line 409) | def combine_forward(self, inputs, previous_event=None):
    method dispatch_backward (line 425) | def dispatch_backward(self, output_grad):
    method combine_backward (line 464) | def combine_backward(self, output_grad):
    method forward (line 491) | def forward(self, inputs):
    method backward (line 511) | def backward(self, output_grad=None, scaler=None):
  class OverlapedScheduleChunk (line 534) | class OverlapedScheduleChunk:
    method __init__ (line 535) | def __init__(self, forward_nodes, backward_nodes, use_fuion=True):
    method forward_backward (line 546) | def forward_backward(self, inputs, output_grad, combine_bw_event_to_wa...
  class DecoderBackwardScheduleChunk (line 559) | class DecoderBackwardScheduleChunk:
    method __init__ (line 560) | def __init__(self, nodes):
    method backward (line 563) | def backward(self, output_grad, combine_bw_event_to_wait=None, pp_stre...
  class OverlapedScheduleNode (line 573) | class OverlapedScheduleNode:
    method __init__ (line 574) | def __init__(self, forward_node, backward_node, name=""):
    method forward_backward (line 580) | def forward_backward(self, inputs, output_grad, event_to_wait=None):
  class FusionFp8DecoderLayerNode (line 608) | class FusionFp8DecoderLayerNode(ScheduleNode):
    method __init__ (line 609) | def __init__(
    method attn_forward (line 633) | def attn_forward(self, inputs):
    method dispatch_forward (line 669) | def dispatch_forward(self, inputs, previous_event=None, async_finish=F...
    method mlp_forward (line 697) | def mlp_forward(self, inputs):
    method combine_forward (line 736) | def combine_forward(self, inputs, async_finish=False, previous_event=N...
    method post_process_forward (line 762) | def post_process_forward(self, inputs, with_residual=True):
    method post_process_backward (line 785) | def post_process_backward(self, output_grad, event_to_wait=None):
    method combine_backward (line 819) | def combine_backward(self, output_grad, previous_event=None, async_fin...
    method mlp_backward (line 877) | def mlp_backward(self, output_grad):
    method dispatch_backward (line 909) | def dispatch_backward(self, output_grad, async_finish=False, previous_...
    method attn_backward (line 958) | def attn_backward(self, output_grad):
    method backward_for_fusion (line 1016) | def backward_for_fusion(self, output_grad, combine_bw_event_to_wait=No...
    method forward (line 1087) | def forward(self, inputs):
    method backward (line 1097) | def backward(self, output_grad=None, scaler=None):
  class DenseDecoderLayerNode (line 1108) | class DenseDecoderLayerNode(ScheduleNode):
    method __init__ (line 1109) | def __init__(
    method forward (line 1119) | def forward(self, inputs):
    method backward (line 1124) | def backward(self, output_grad=None, scaler=None):
  class OverlapedFUsionScheduleNode (line 1131) | class OverlapedFUsionScheduleNode:
    method __init__ (line 1132) | def __init__(self, forward_node, backward_node, name=""):
    method forward_backward (line 1140) | def forward_backward(self, inputs, output_grad, combine_bw_event_to_wa...
  class OverlapedDenseFusionScheduleNode (line 1276) | class OverlapedDenseFusionScheduleNode:
    method __init__ (line 1277) | def __init__(self, forward_node, backward_node, name=""):
    method forward_backward (line 1286) | def forward_backward(self, inputs, output_grad, combine_bw_event_to_wa...
  function build_overlapped_nodes (line 1372) | def build_overlapped_nodes(config: DeepseekV2FastConfig, forward_chunk, ...
  class EmbeddingFunction (line 1436) | class EmbeddingFunction(paddle.autograd.PyLayer):
    method forward (line 1438) | def forward(ctx, x, weight):
    method backward (line 1447) | def backward(ctx, dout):
  class DeepseekV2EmbeddingPipe (line 1458) | class DeepseekV2EmbeddingPipe(nn.Layer):
    method __init__ (line 1459) | def __init__(self, config: DeepseekV2FastConfig):
    method embedding_weight (line 1474) | def embedding_weight(self):
    method forward (line 1477) | def forward(self, args):
    method build_schedule_node (line 1557) | def build_schedule_node(self):
  class DeepseekV2DecoderLayerPipe (line 1561) | class DeepseekV2DecoderLayerPipe(DeepseekV2DecoderLayer):
    method forward (line 1562) | def forward(self, args):
    method attn_compute (line 1621) | def attn_compute(self, args):
    method attn_compute_for_fusion (line 1656) | def attn_compute_for_fusion(self, args):
    method mlp_compute (line 1694) | def mlp_compute(self, inputs):
    method post_process_compute (line 1741) | def post_process_compute(self, inputs):
    method post_process_compute_for_fusion (line 1777) | def post_process_compute_for_fusion(self, inputs):
    method attn_compute_dense (line 1802) | def attn_compute_dense(self, args):
    method mlp_compute_dense (line 1820) | def mlp_compute_dense(self, inputs):
    method build_schedule_node (line 1834) | def build_schedule_node(self):
  class DeepseekV2MTPLayerPipe (line 1900) | class DeepseekV2MTPLayerPipe(DeepseekV2MTPLayer):
    method forward (line 1901) | def forward(self, args):
    method attn_compute_for_fusion (line 1969) | def attn_compute_for_fusion(self, args):
    method build_schedule_node (line 2016) | def build_schedule_node(self):
  class DeepseekV2RMSNormPipe (line 2035) | class DeepseekV2RMSNormPipe(nn.Layer):
    method __init__ (line 2036) | def __init__(self, config):
    method forward (line 2041) | def forward(self, args):
    method build_schedule_node (line 2056) | def build_schedule_node(self):
  class DeepseekV2LMHeadPipe (line 2060) | class DeepseekV2LMHeadPipe(DeepseekV2LMHead):
    method __init__ (line 2061) | def __init__(self, config, embedding_weight=None):
    method embedding_weight (line 2065) | def embedding_weight(self):
    method forward (line 2068) | def forward(self, args: Union[Tuple, paddle.Tensor]):
    method build_schedule_node (line 2078) | def build_schedule_node(self):
  class DeepseekV2PretrainingCriterionPipe (line 2082) | class DeepseekV2PretrainingCriterionPipe(DeepseekV2PretrainingCriterionF...
    method forward (line 2083) | def forward(self, logits, labels):
    method build_schedule_node (line 2094) | def build_schedule_node(self):
  class DeepseekV2ForCausalLMPipe (line 2098) | class DeepseekV2ForCausalLMPipe(PipelinePretrainedModel, PipelineLayer):
    method step_flex_token (line 2117) | def step_flex_token(self, cur_step):
    method _prepare_pipeline_inputs_func (line 2121) | def _prepare_pipeline_inputs_func(cls, inputs):
    method __init__ (line 2144) | def __init__(self, config: DeepseekV2FastConfig):
    method fp8_quant_weight (line 2309) | def fp8_quant_weight(self, batch_mode=False, quant_transpose=True):
    method get_loss_fn (line 2322) | def get_loss_fn(self, config):
    method overlapped_forward_backward (line 2325) | def overlapped_forward_backward(

FILE: examples/experiments/deepseek_v3_pretrain/moe_gate.py
  class PretrainedMoEGate (line 29) | class PretrainedMoEGate(nn.Layer, MoEGateMixin):
    method __init__ (line 30) | def __init__(self, config, num_experts, expert_hidden_size, **kwargs):
    method _priority (line 69) | def _priority(self, topk_idx: paddle.Tensor, capacity: int) -> paddle....
    method _topk_greedy (line 91) | def _topk_greedy(self, scores: paddle.Tensor, k: int) -> Tuple[paddle....
    method _topk_group_limited_greedy (line 106) | def _topk_group_limited_greedy(
    method _topk_noaux_tc (line 138) | def _topk_noaux_tc(
    method top1gating (line 175) | def top1gating(
    method top2gating (line 245) | def top2gating(
    method _cal_seq_aux_loss (line 321) | def _cal_seq_aux_loss(self, gates, top_k, topk_idx) -> paddle.Tensor:
    method topkgating (line 359) | def topkgating(
    method topkgating_nodrop (line 438) | def topkgating_nodrop(self, gates: paddle.Tensor):

FILE: examples/experiments/deepseek_v3_pretrain/moe_layer.py
  function record_stream_for_multi_input (line 58) | def record_stream_for_multi_input(x):
  function stop_gradient_for_multi_input (line 66) | def stop_gradient_for_multi_input(x):
  class MoELayer (line 73) | class MoELayer(nn.Layer):
    method __init__ (line 74) | def __init__(
    method update_flex_token (line 149) | def update_flex_token(self):
    method _parse_moe_expert_parallel (line 165) | def _parse_moe_expert_parallel(self, n_routed_experts, expert_model_pa...
    method _post_init (line 175) | def _post_init(self):
    method forward (line 186) | def forward(
    method forward_drop_token (line 207) | def forward_drop_token(
    method expert_forward (line 326) | def expert_forward(self, dispatched_input):
    method forward_flex_token (line 337) | def forward_flex_token(self, hidden_states: paddle.Tensor, probs=None,...
    method get_tokens_per_expert (line 380) | def get_tokens_per_expert(self):
    method set_tokens_per_expert (line 383) | def set_tokens_per_expert(self, tokens_per_expert_list):
    method pre_dispatch_compute (line 386) | def pre_dispatch_compute(self, hidden_states):
    method post_dispatch_compute (line 394) | def post_dispatch_compute(self, hidden_states, dispatched_indices, dis...
    method pre_combine_compute (line 400) | def pre_combine_compute(self, hidden_states, token_permuted_indices, p...
    method post_combine_compute (line 406) | def post_combine_compute(self, hidden_states):
  class MoEFlexTokenLayer (line 411) | class MoEFlexTokenLayer(nn.Layer):
    method __init__ (line 412) | def __init__(self, config, n_routed_experts, expert_class, expert_kwar...
    method expert_forward (line 428) | def expert_forward(self, dispatched_input, tokens_per_expert):
    method forward (line 440) | def forward(self, hidden_states: paddle.Tensor):
    method forward_flex_token (line 451) | def forward_flex_token(self, hidden_states: paddle.Tensor, probs=None,...
    method get_tokens_per_expert (line 494) | def get_tokens_per_expert(self):
    method set_tokens_per_expert (line 497) | def set_tokens_per_expert(self, tokens_per_expert_list):
    method pre_dispatch_compute (line 500) | def pre_dispatch_compute(self, hidden_states):
    method post_dispatch_compute (line 508) | def post_dispatch_compute(self, hidden_states, dispatched_indices, dis...
    method pre_combine_compute (line 514) | def pre_combine_compute(self, hidden_states, token_permuted_indices, p...
    method post_combine_compute (line 520) | def post_combine_compute(self, hidden_states):
  class Fp8DispatchQuantNode (line 525) | class Fp8DispatchQuantNode:
    method __init__ (line 526) | def __init__(self, token_dispatcher, dsv3_use_fp8_dispatch, name="fp8_...
    method forward (line 533) | def forward(self, hidden_states, probs, routing_map):
    method backward (line 561) | def backward(self, hs_grad, token_probs_grad):
  class Fp8DispatchNode (line 573) | class Fp8DispatchNode:
    method __init__ (line 574) | def __init__(self, token_dispatcher, name="fp8_dispatch_node"):
    method forward (line 580) | def forward(
    method backward (line 609) | def backward(
  class Fp8CombineNode (line 628) | class Fp8CombineNode:
    method __init__ (line 629) | def __init__(self, token_dispatcher, name="fp8_combine_node"):
    method forward (line 635) | def forward(self, hidden_states_out, previous_event=None, async_finish...
    method backward (line 650) | def backward(self, output_combine_grad, previous_event=None, async_fin...
  class Fp8CombineQuantNode (line 661) | class Fp8CombineQuantNode:
    method __init__ (line 662) | def __init__(self, token_dispatcher, dsv3_use_fp8_dispatch, moe_group=...
    method forward (line 669) | def forward(self, output_combine):
    method backward (line 678) | def backward(self, output_grad, event_to_wait=None):
  class FusionMlpNode (line 704) | class FusionMlpNode:
    method __init__ (line 709) | def __init__(
    method set_recompute_fwd_gate_up (line 745) | def set_recompute_fwd_gate_up(self, recompute_fwd_gate_up):
    method reset_statue (line 748) | def reset_statue(self):
    method prepare_env_subbatch (line 773) | def prepare_env_subbatch(self, unzipped_tokens=None, unzipped_tokens_s...
    method gemm_forward_subbatch (line 790) | def gemm_forward_subbatch(
    method gemm_backward_subbatch (line 825) | def gemm_backward_subbatch(
    method forward (line 886) | def forward(self, hs_2d_dispatched, dispatched_indices, dispatched_pro...
    method backward (line 1024) | def backward(self, hidden_states_out_grad):
  class FusionMoeNode (line 1127) | class FusionMoeNode:
    method __init__ (line 1128) | def __init__(
    method forward (line 1161) | def forward(self, hidden_states, probs, routing_map):
    method backward (line 1189) | def backward(self, output_grad):
  class FusionMoe (line 1204) | class FusionMoe(paddle.autograd.PyLayer):
    method forward (line 1206) | def forward(
    method backward (line 1225) | def backward(ctx, output_grad):

FILE: examples/experiments/deepseek_v3_pretrain/moe_utils.py
  function _clear_to_zero_allocation (line 29) | def _clear_to_zero_allocation(self):
  function _holder_size (line 45) | def _holder_size(self):
  function topk_to_permuted_indices (line 57) | def topk_to_permuted_indices(x, num_tokens_per_expert_list, topk):
  function permute_fast (line 69) | def permute_fast(
  function unpermute_fast (line 90) | def unpermute_fast(
  class UnZipNode (line 132) | class UnZipNode:
    method __init__ (line 133) | def __init__(self, name="unzip"):
    method reset_statue (line 138) | def reset_statue(self):
    method forward (line 143) | def forward(
    method backward (line 189) | def backward(self, dx, total_zipped_tokens, probs_grad, dispatched_ind...
  class ZipNode (line 203) | class ZipNode:
    method __init__ (line 204) | def __init__(self, name="zip"):
    method forward (line 208) | def forward(
    method backward (line 218) | def backward(
  class PermuteNode (line 264) | class PermuteNode:
    method __init__ (line 265) | def __init__(self, token_dispatcher, name="permute"):
    method reset_status (line 269) | def reset_status(self):
    method forward (line 273) | def forward(self, hidden_states, hidden_states_scale, dispatched_indic...
    method backward (line 287) | def backward(self, out_grad, dispatched_probs):
  class UnPermuteNode (line 300) | class UnPermuteNode:
    method __init__ (line 301) | def __init__(self, token_dispatcher, name="unpermute"):
    method reset_status (line 305) | def reset_status(self):
    method forward (line 314) | def forward(
    method backward (line 352) | def backward(self, out_grad, out_grad_scale):
  function tokens_zip_unique_add_with_subbatch (line 383) | def tokens_zip_unique_add_with_subbatch(zipped, unzipped, index_unzipped...
  function merge_subbatch_cast (line 409) | def merge_subbatch_cast(x, dtype):
  function get_env_device (line 420) | def get_env_device():

FILE: examples/experiments/deepseek_v3_pretrain/run_pretrain.py
  class PreTrainingArguments (line 65) | class PreTrainingArguments(TrainingArguments):
    method __post_init__ (line 122) | def __post_init__(self):
  class DataArguments (line 180) | class DataArguments:
  class ModelArguments (line 213) | class ModelArguments:
  function create_pretrained_dataset (line 244) | def create_pretrained_dataset(
  function get_train_data_file (line 318) | def get_train_data_file(args):
  class PretrainingTrainer (line 343) | class PretrainingTrainer(Trainer):
    method __init__ (line 344) | def __init__(self, *args, **kwargs):
    method evaluate (line 348) | def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_pre...
    method _get_eval_sampler (line 388) | def _get_eval_sampler(self, eval_dataset) -> Optional[paddle.io.Sampler]:
    method _get_train_sampler (line 398) | def _get_train_sampler(self) -> Optional[paddle.io.Sampler]:
  function main (line 409) | def main():

FILE: examples/experiments/deepseek_v3_pretrain/token_dispatcher.py
  class _DeepepManager (line 30) | class _DeepepManager(_DispatchManager):
    method __init__ (line 52) | def __init__(
    method setup_metadata (line 73) | def setup_metadata(self, routing_map: paddle.Tensor, probs: paddle.Ten...
    method dispatch (line 81) | def dispatch(
    method _indices_to_multihot (line 93) | def _indices_to_multihot(self, indices, probs):
    method get_dispatched_metadata (line 118) | def get_dispatched_metadata(self) -> paddle.Tensor:
    method get_number_of_tokens_per_expert (line 121) | def get_number_of_tokens_per_expert(self) -> paddle.Tensor:
    method combine (line 127) | def combine(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
    method get_permuted_hidden_states_by_experts (line 133) | def get_permuted_hidden_states_by_experts(self, hidden_states: paddle....
    method get_permuted_hidden_states_by_experts_fast (line 145) | def get_permuted_hidden_states_by_experts_fast(
    method get_restored_hidden_states_by_experts (line 155) | def get_restored_hidden_states_by_experts(self, hidden_states: paddle....
    method get_restored_hidden_states_by_experts_fast (line 167) | def get_restored_hidden_states_by_experts_fast(
  class MoETokenDispatcher (line 186) | class MoETokenDispatcher:
    method __init__ (line 191) | def __init__(self, ep_group) -> None:
    method ep_group (line 198) | def ep_group(self):
    method ep_size (line 203) | def ep_size(self):
    method token_permutation (line 208) | def token_permutation(self, tokens: paddle.Tensor, probs: paddle.Tenso...
    method token_unpermutation (line 222) | def token_unpermutation(self, expert_output: paddle.Tensor, bias: padd...
  class MoEFlexTokenDispatcher (line 235) | class MoEFlexTokenDispatcher(MoETokenDispatcher):
    method __init__ (line 240) | def __init__(self, num_local_experts: int, moe_router_topk: int, num_m...
    method token_permutation (line 252) | def token_permutation(
    method token_unpermutation (line 265) | def token_unpermutation(
  class MoEFlexTokenDispatcherFast (line 276) | class MoEFlexTokenDispatcherFast:
    method __init__ (line 281) | def __init__(self, num_local_experts: int, moe_router_topk: int, num_m...
    method ep_group (line 294) | def ep_group(self):
    method ep_size (line 299) | def ep_size(self):
    method pre_dispatch (line 303) | def pre_dispatch(self, hidden_states, probs, routing_map):
    method post_dispatch (line 313) | def post_dispatch(self, hidden_states, dispatched_indices):
    method pre_combine (line 321) | def pre_combine(self, hidden_states, token_permuted_indices, prob_perm...
    method post_combine (line 327) | def post_combine(self, hidden_states):
    method token_permutation (line 331) | def token_permutation(
    method token_unpermutation (line 349) | def token_unpermutation(
  class PreDispatchNode (line 367) | class PreDispatchNode:
    method __init__ (line 368) | def __init__(self, token_dispatcher):
    method reset_status (line 372) | def reset_status(self):
    method forward (line 378) | def forward(self, routing_map, probs):
    method backward (line 393) | def backward(self, token_probs_g):

FILE: examples/experiments/ernie_pretrain/ernie/config.py
  function get_config (line 24) | def get_config(verbose=False):

FILE: examples/experiments/ernie_pretrain/ernie/model_config.py
  class ModelConfig (line 22) | class ModelConfig:

FILE: examples/experiments/ernie_pretrain/ernie/pretrain.py
  function log_trainer_start (line 76) | def log_trainer_start():
  function load_huggingface_checkpoint (line 86) | def load_huggingface_checkpoint(model, args):
  function get_expected_state_dict (line 181) | def get_expected_state_dict(model, **kwargs):
  function update_model_config_from_args (line 249) | def update_model_config_from_args(config: ErnieMoEConfig, model_args: di...
  function get_tp_split_ckpt (line 259) | def get_tp_split_ckpt(args, path):
  class AllArguments (line 271) | class AllArguments(PreTrainingArguments):
    method __post_init__ (line 272) | def __post_init__(self):
  class ExpConfig (line 277) | class ExpConfig:
  function create_pretrained_dataset (line 283) | def create_pretrained_dataset(args):
  function main (line 329) | def main():

FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/fp8_quant_weight_callback.py
  function enable_in_dict_config (line 25) | def enable_in_dict_config(config, key):
  class FP8QuantWeightCallback (line 32) | class FP8QuantWeightCallback(TrainerCallback):
    method on_step_begin (line 33) | def on_step_begin(self, args, state, control, **kwargs):

FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/gc_callback.py
  class GCCallback (line 20) | class GCCallback(TrainerCallback):
    method on_train_begin (line 21) | def on_train_begin(self, args, state, control, **kwargs):
    method on_step_end (line 25) | def on_step_end(self, args, state, control, **kwargs):

FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/logging_callback.py
  class LoggingCallback (line 22) | class LoggingCallback(TrainerCallback):
    method __init__ (line 23) | def __init__(
    method on_log (line 28) | def on_log(self, args, state, control, logs=None, **kwargs):

FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/moe_correction_bias_adjust_callback.py
  class MoECorrectionBiasAdjustCallback (line 28) | class MoECorrectionBiasAdjustCallback(TrainerCallback):
    method __init__ (line 29) | def __init__(self, lr, use_sp):
    method on_optimizer_end (line 34) | def on_optimizer_end(self, args, state, control, **kwargs):

FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/moe_logging_callback.py
  function tensor_md5 (line 45) | def tensor_md5(tensor):
  class GlobalRNGCallback (line 51) | class GlobalRNGCallback(TrainerCallback):
    method on_step_end (line 52) | def on_step_end(self, args, state, control, model, **kwargs):
  class MoeLoggingCallback (line 57) | class MoeLoggingCallback(TrainerCallback):
    method __init__ (line 58) | def __init__(self, optimizer):
    method on_log (line 69) | def on_log(self, args, state, control, logs=None, **kwargs):
    method on_step_end (line 75) | def on_step_end(self, args, state, control, model, **kwargs):
    method on_save (line 114) | def on_save(self, args, state, control, model, **kwargs):

FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/ortho_loss_callback.py
  class OrthogonalCallback (line 22) | class OrthogonalCallback(TrainerCallback):
    method __init__ (line 23) | def __init__(self, ortho_loss_lambda):
    method on_optimizer_end (line 26) | def on_optimizer_end(self, args, state, control, **kwargs):

FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/sp_grad_sync_callback.py
  class SPGradSyncCallback (line 28) | class SPGradSyncCallback(TrainerCallback):
    method __init__ (line 29) | def __init__(self, model):
    method on_optimizer_begin (line 41) | def on_optimizer_begin(self, args, state, control, **kwargs):

FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/tensorboard_callback.py
  function is_tensorboard_available (line 29) | def is_tensorboard_available():
  function rewrite_logs (line 33) | def rewrite_logs(d):
  class TensorBoardCallback (line 49) | class TensorBoardCallback(TrainerCallback):
    method __init__ (line 50) | def __init__(
    method _init_summary_writer (line 91) | def _init_summary_writer(self, args, log_dir=None):
    method on_train_begin (line 96) | def on_train_begin(self, args, state, control, **kwargs):
    method on_log (line 120) | def on_log(self, args, state, control, logs=None, **kwargs):
    method on_train_end (line 183) | def on_train_end(self, args, state, control, **kwargs):

FILE: examples/experiments/ernie_pretrain/ernie/src/clip/moe_clip.py
  class ClipGradForMOEByGlobalNorm (line 28) | class ClipGradForMOEByGlobalNorm(ClipGradBase):
    method __init__ (line 29) | def __init__(
    method __str__ (line 49) | def __str__(self):
    method get_l2_norm_pow (line 53) | def get_l2_norm_pow(params_grads, sum_dtype=None):
    method _dygraph_clip (line 101) | def _dygraph_clip(self, params_grads):

FILE: examples/experiments/ernie_pretrain/ernie/src/lr_schedulers/cosine_lr.py
  function get_cosine_schedule_with_warmup (line 24) | def get_cosine_schedule_with_warmup(

FILE: examples/experiments/ernie_pretrain/ernie/src/lr_schedulers/wsd_lr.py
  function get_wsd_schedule_with_warmup (line 20) | def get_wsd_schedule_with_warmup(

FILE: examples/experiments/ernie_pretrain/ernie/src/tokenizers/tokenization_eb_v2.py
  class ErnieBotTokenizer (line 29) | class ErnieBotTokenizer(PretrainedTokenizer):
    method __init__ (line 40) | def __init__(
    method space_token (line 72) | def space_token(self):
    method space_token_id (line 76) | def space_token_id(self):
    method gend_token (line 80) | def gend_token(self):
    method gend_token_id (line 84) | def gend_token_id(self):
    method im_start_id (line 88) | def im_start_id(self):
    method im_end_id (line 92) | def im_end_id(self):
    method vocab_size (line 96) | def vocab_size(self):
    method get_vocab (line 99) | def get_vocab(self):
    method _tokenize (line 104) | def _tokenize(self, text):
    method _convert_token_to_id (line 107) | def _convert_token_to_id(self, token):
    method _convert_id_to_token (line 110) | def _convert_id_to_token(self, id):
    method convert_tokens_to_string (line 113) | def convert_tokens_to_string(self, tokens):
    method prepare_for_model (line 126) | def prepare_for_model(self, *args, **kwargs):
    method save_vocabulary (line 131) | def save_vocabulary(self, save_directory, filename_prefix: Optional[st...
    method tokenize (line 147) | def tokenize(self, text: TextInput, **kwargs) -> List[str]:
    method _decode (line 169) | def _decode(self, *args, **kwargs):
    method _pad (line 179) | def _pad(
  function add_special_tokens (line 239) | def add_special_tokens(

FILE: examples/experiments/ernie_pretrain/ernie/src/trainers/data_parallel.py
  class DataParallel (line 22) | class DataParallel(paddle.DataParallel):
    method init_reducer (line 23) | def init_reducer(self):
  function sync_dp_moe_params_across_sharding (line 74) | def sync_dp_moe_params_across_sharding(model: paddle.nn.Layer) -> None:

FILE: examples/experiments/ernie_pretrain/ernie/src/trainers/dygraph_optimizer/hybrid_parallel_optimizer.py
  class HybridParallelClipGrad (line 37) | class HybridParallelClipGrad:
    method __init__ (line 38) | def __init__(self, clip, hcg, timers=None):
    method _global_norm (line 53) | def _global_norm(
    method _dygraph_clip (line 142) | def _dygraph_clip(self, params_grads):
    method _comm_and_clip (line 277) | def _comm_and_clip(
    method __getattr__ (line 330) | def __getattr__(self, item):
    method __call__ (line 333) | def __call__(self, params_grads):
  class HybridParallelOptimizer (line 337) | class HybridParallelOptimizer(HPBase):
    method __init__ (line 338) | def __init__(self, optimizer, hcg, strategy):

FILE: examples/experiments/ernie_pretrain/ernie/src/trainers/pretraining_trainer.py
  function distributed_optimizer_maybe_overwrite (line 97) | def distributed_optimizer_maybe_overwrite(
  class PreTrainingArguments (line 122) | class PreTrainingArguments(TrainingArguments):
    method use_moe (line 274) | def use_moe(self):  # noqa: F811
    method use_moe (line 278) | def use_moe(self, value):
    method need_data (line 283) | def need_data(self):
    method combine_batch (line 287) | def combine_batch(self):
    method reeao_dataset_rank (line 291) | def reeao_dataset_rank(self):
    method reeao_dataset_world_size (line 295) | def reeao_dataset_world_size(self):
    method __post_init__ (line 298) | def __post_init__(self):
  class WeightedDistributedSampler (line 412) | class WeightedDistributedSampler(PaddleNLPDistributedBatchSampler):
    method __init__ (line 413) | def __init__(
    method set_epoch (line 459) | def set_epoch(self, epoch=0, consumed_samples=0):
    method gen_data_seq (line 464) | def gen_data_seq(self):
    method load_data_seq_from_cache (line 477) | def load_data_seq_from_cache(self):
    method gen_data_seq_weighted (line 490) | def gen_data_seq_weighted(self, num_examples, data_type=None):
    method roundup_and_shard (line 568) | def roundup_and_shard(self, indices):
    method __len__ (line 590) | def __len__(self):
    method __iter__ (line 593) | def __iter__(self):
  class DummySampler (line 661) | class DummySampler(PaddleNLPDistributedBatchSampler):
    method __init__ (line 662) | def __init__(self, dataset, batch_size=1, **kwargs):
    method __len__ (line 665) | def __len__(self):
    method __iter__ (line 668) | def __iter__(self):
  class PretrainingTrainer (line 673) | class PretrainingTrainer(Trainer):
    method __init__ (line 674) | def __init__(self, args=None, model=None, callbacks=[], **kwargs):
    method autocast_smart_context_manager (line 695) | def autocast_smart_context_manager(self):
    method _load_optimizer_state (line 727) | def _load_optimizer_state(self, checkpoint):
    method _save_moe_weights (line 776) | def _save_moe_weights(self, output_dir):
    method _wrap_model (line 823) | def _wrap_model(self, model, training=True):
    method _new_gradclip (line 990) | def _new_gradclip(self):
    method evaluate (line 1036) | def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_pre...
    method prediction_pipeline_step (line 1067) | def prediction_pipeline_step(self, model, inputs, prediction_loss_only...
    method restore_dataloader_status (line 1073) | def restore_dataloader_status(self):
    method _get_eval_sampler (line 1118) | def _get_eval_sampler(self, eval_dataset) -> Optional[paddle.io.Sampler]:
    method _get_train_sampler (line 1128) | def _get_train_sampler(self) -> Optional[paddle.io.Sampler]:
    method _maybe_log_save_evaluate (line 1138) | def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_...
    method create_scheduler (line 1302) | def create_scheduler(self, num_training_steps):
    method create_optimizer (line 1326) | def create_optimizer(self, lr_scheduler=None):
    method save_model (line 1388) | def save_model(self, output_dir=None):
    method _load_rng_state (line 1394) | def _load_rng_state(self, checkpoint):

FILE: examples/experiments/ernie_pretrain/ernie/src/utils/logging.py
  function setup_logger_output_file (line 41) | def setup_logger_output_file(outputpath, local_rank):

FILE: examples/experiments/ernie_pretrain/ernie/src/utils/misc.py
  class SmoothedValue (line 39) | class SmoothedValue:
    method __init__ (line 40) | def __init__(
    method update (line 49) | def update(self, value):
    method global_avg (line 60) | def global_avg(self):
    method reset (line 63) | def reset(self):
  class TrainingLogs (line 68) | class TrainingLogs:
    method __new__ (line 71) | def __new__(cls, *args, **kw):
    method __init__ (line 76) | def __init__(self):
    method set_trainer_interval (line 84) | def set_trainer_interval(self, trainer, logging_interval):
    method global_meters_keys (line 89) | def global_meters_keys(self):
    method global_meters_keys (line 93) | def global_meters_keys(self, lst):
    method enable_skip_zero (line 96) | def enable_skip_zero(self, keys=[]):
    method update (line 104) | def update(self, **kwargs):
    method is_enabled (line 108) | def is_enabled(self):
    method __setitem__ (line 111) | def __setitem__(self, k, v):
    method __getitem__ (line 119) | def __getitem__(self, v):
    method __getattr__ (line 122) | def __getattr__(self, attr):
    method dict (line 129) | def dict(self, use_async=False):
    method reset (line 183) | def reset(self):
    method take_snapshot (line 188) | def take_snapshot(self):
    method restore_snapshot (line 191) | def restore_snapshot(self):

FILE: examples/experiments/ernie_pretrain/ernie/src/utils/seed_utils.py
  function set_seed (line 26) | def set_seed(seed):

FILE: examples/experiments/ernie_pretrain/ernie/src/utils/training_utils.py
  function reset_per_device_batch_size (line 20) | def reset_per_device_batch_size(global_batch_size, per_device_train_batc...

FILE: examples/experiments/ernie_pretrain/models/comm_utils.py
  function scatter (line 33) | def scatter(input, group=None, axis=0):
  function mp_slice (line 51) | def mp_slice(x, indices=None, group=None, axis=0):
  function all_gather_varlen (line 68) | def all_gather_varlen(input, indices, group=None, axis=0, sync_op=True):
  function scatter_varlen (line 90) | def scatter_varlen(x, recv_tensor, indices, src_rank, group, sync_op=True):
  function all_gather (line 112) | def all_gather(input, group=None, axis=0):
  function reduce_scatter (line 131) | def reduce_scatter(input, group=None):
  function subbatch (line 148) | def subbatch(f, arg_idx, axis, bs, out_idx, use_recompute=False, same_ar...
  function gather_varlen (line 193) | def gather_varlen(input, dst, group, offload_pp_data_chunk_size=0, all_s...
  function profile (line 293) | def profile(name, use_event=True):

FILE: examples/experiments/ernie_pretrain/models/ernie/configuration.py
  class ErnieMoEConfig (line 60) | class ErnieMoEConfig(PretrainedConfig):
    method __init__ (line 72) | def __init__(
    method __setattr__ (line 402) | def __setattr__(self, name: str, value):
    method register_nonsaveable_keys (line 413) | def register_nonsaveable_keys(self, keys):
    method use_moe (line 422) | def use_moe(self) -> bool:
    method to_json_string (line 425) | def to_json_string(self, use_diff: bool = True) -> str:

FILE: examples/experiments/ernie_pretrain/models/ernie/modeling.py
  function get_triangle_upper_mask (line 127) | def get_triangle_upper_mask(x, mask=None):
  function gqa_qkv_split_func (line 139) | def gqa_qkv_split_func(
  function gqa_qkv_merge_func (line 169) | def gqa_qkv_merge_func(weight_list, num_attention_heads, num_key_value_h...
  function parallel_matmul (line 190) | def parallel_matmul(
  function calc_lm_head_logits (line 231) | def calc_lm_head_logits(config, hidden_states, weight, bias, tensor_para...
  function finfo (line 261) | def finfo(dtype: paddle.dtype = None):
  function masked_fill (line 279) | def masked_fill(x, mask, value):
  function mem_eff_attn (line 284) | def mem_eff_attn(query, key, value, pack_offset, drop_prob=0.0, dtype=pa...
  function inbatch_pack_offset_to_attn_mask_start_row_indices (line 321) | def inbatch_pack_offset_to_attn_mask_start_row_indices(inbatch_pack_offs...
  function scaled_dot_product_attention (line 336) | def scaled_dot_product_attention(
  function _make_causal_mask (line 454) | def _make_causal_mask(input_ids_shape, past_key_values_length, dtype):
  function _expand_mask (line 468) | def _expand_mask(mask, dtype, tgt_length):
  class FusedDropoutImpl (line 483) | class FusedDropoutImpl(nn.Layer):
    method __init__ (line 484) | def __init__(self, prob, mode):
    method forward (line 491) | def forward(self, x, y):
  class RMSNorm (line 499) | class RMSNorm(nn.Layer):
    method __init__ (line 500) | def __init__(self, config):
    method forward (line 514) | def forward(self, hidden_states):
  class RotaryEmbedding (line 530) | class RotaryEmbedding(nn.Layer):
    method __init__ (line 531) | def __init__(self, dim, max_position_embeddings=4096, base=10000):
    method forward (line 547) | def forward(self, x, seq_len=None):
    method rotate_half (line 555) | def rotate_half(cls, x):
    method apply_rotary_pos_emb (line 562) | def apply_rotary_pos_emb(cls, q, k, cos, sin, offset: int = 0, positio...
  class RopeEmbeddingLegacy (line 580) | class RopeEmbeddingLegacy(nn.Layer):
    method __init__ (line 581) | def __init__(self, head_dim, compression_ratio=1.0, base=10000, freq_a...
    method forward (line 588) | def forward(self, seq_length, position_ids=None):
    method apply_rotary (line 604) | def apply_rotary(self, rp, q, k):
    method apply_rotary_3d (line 626) | def apply_rotary_3d(self, rp, q, k, position_ids):
    method forward_single (line 694) | def forward_single(self, position_ids):
    method apply_rotary_single (line 709) | def apply_rotary_single(x, rope_emb):
  class ErnieMLP (line 717) | class ErnieMLP(nn.Layer):
    method __init__ (line 718) | def __init__(self, config):
    method forward (line 798) | def forward(self, x):
  class ErnieAttention (line 835) | class ErnieAttention(nn.Layer):
    method __init__ (line 836) | def __init__(self, config, layer_idx=0):
    method forward (line 997) | def forward(
    method rope_attn (line 1095) | def rope_attn(
  class ErnieDecoderLayer (line 1195) | class ErnieDecoderLayer(nn.Layer):
    method __init__ (line 1196) | def __init__(self, config, layer_idx=0):
    method forward (line 1210) | def forward(
  class ErniePretrainedModel (line 1265) | class ErniePretrainedModel(PretrainedModel):
    method _get_name_mappings (line 1270) | def _get_name_mappings(cls, config: ErnieMoEConfig) -> StateDictNameMa...
    method _get_tensor_parallel_mappings (line 1341) | def _get_tensor_parallel_mappings(cls, config, is_split=True):
    method _init_weights (line 1426) | def _init_weights(self, layer):
  class ErnieModel (line 1474) | class ErnieModel(ErniePretrainedModel):
    method __init__ (line 1475) | def __init__(self, config: ErnieMoEConfig):
    method get_input_embeddings (line 1502) | def get_input_embeddings(self):
    method set_input_embeddings (line 1505) | def set_input_embeddings(self, value):
    method _prepare_decoder_attention_mask (line 1509) | def _prepare_decoder_attention_mask(cls, attention_mask, input_shape, ...
    method recompute_training (line 1528) | def recompute_training(
    method forward (line 1558) | def forward(
  class FusedHeadParallelCrossEntropy (line 1692) | class FusedHeadParallelCrossEntropy(PyLayer):
    method forward (line 1694) | def forward(
    method backward (line 1812) | def backward(ctx, loss_all_grad, labels_all_grad):
  class ErniePretrainingCriterion (line 1930) | class ErniePretrainingCriterion(paddle.nn.Layer):
    method __init__ (line 1931) | def __init__(self, config, return_tuple=True):
    method forward (line 1946) | def forward(self, prediction_scores, masked_lm_labels):
    method forward_impl_with_fused_head_loss_fn (line 2002) | def forward_impl_with_fused_head_loss_fn(self, masked_lm_labels, hidde...
    method forward_impl_with_calc_logits (line 2037) | def forward_impl_with_calc_logits(self, masked_lm_labels, hidden_state...
    method loss_impl (line 2049) | def loss_impl(self, prediction_scores, masked_lm_labels):
    method forward_impl (line 2055) | def forward_impl(self, prediction_scores, masked_lm_labels):
  class ErnieLMHead (line 2110) | class ErnieLMHead(nn.Layer):
    method __init__ (line 2111) | def __init__(self, config):
    method forward (line 2150) | def forward(self, hidden_states, tensor_parallel_output=None):
    method sharded_state_dict (line 2169) | def sharded_state_dict(
  class ErnieForCausalLM (line 2178) | class ErnieForCausalLM(ErniePretrainedModel):
    method __init__ (line 2181) | def __init__(self, config):
    method _post_init (line 2213) | def _post_init(self, original_init, *args, **kwargs):
    method get_input_embeddings (line 2222) | def get_input_embeddings(self):
    method set_input_embeddings (line 2225) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 2228) | def get_output_embeddings(self):
    method set_output_embeddings (line 2231) | def set_output_embeddings(self, new_embeddings):
    method set_decoder (line 2234) | def set_decoder(self, decoder):
    method get_decoder (line 2237) | def get_decoder(self):
    method prepare_attention_mask_for_generation (line 2241) | def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos...
    method prepare_inputs_for_generation (line 2254) | def prepare_inputs_for_generation(
    method update_model_kwargs_for_generation (line 2285) | def update_model_kwargs_for_generation(self, outputs, model_kwargs, is...
    method forward (line 2323) | def forward(
    method sharded_state_dict (line 2382) | def sharded_state_dict(self, *args, **kwargs):

FILE: examples/experiments/ernie_pretrain/models/ernie/modeling_moe.py
  class BaseModelOutputWithPastAndCrossAttentions (line 96) | class BaseModelOutputWithPastAndCrossAttentions(_BaseModelOutput):
  class CausalLMOutputWithCrossAttentions (line 103) | class CausalLMOutputWithCrossAttentions(_CausalLMOutput):
  function get_gate (line 123) | def get_gate(
  function build_mpdp_group (line 177) | def build_mpdp_group():
  function _parse_moe_group (line 198) | def _parse_moe_group(
  function moe_ep2mp (line 245) | def moe_ep2mp(state_dict: Dict[str, paddle.Tensor], config: ErnieMoEConf...
  function moe_statedict_cherry_pick (line 294) | def moe_statedict_cherry_pick(state_dict: Dict[str, paddle.Tensor], conf...
  function moe_statedict_upcycle (line 319) | def moe_statedict_upcycle(
  class ErnieMoeMLP (line 491) | class ErnieMoeMLP(ErnieMLP):
    method __init__ (line 492) | def __init__(self, config, is_shared_expert=False):
    method forward (line 504) | def forward(self, x, use_comm=True):
  class ErnieMoeDenseExpert (line 566) | class ErnieMoeDenseExpert(nn.Layer):
    method __init__ (line 567) | def __init__(self, config):
    method forward (line 615) | def forward(self, x):
  class BMMLinear (line 642) | class BMMLinear(nn.Layer):
    method __init__ (line 643) | def __init__(self, experts, d_in, d_out, use_bias=False):
    method forward (line 651) | def forward(self, x):
  class ErnieMoeMLPFused (line 657) | class ErnieMoeMLPFused(nn.Layer):
    method __init__ (line 658) | def __init__(self, config):
    method __len__ (line 676) | def __len__(self):
    method __iter__ (line 679) | def __iter__(self):
    method forward (line 682) | def forward(self, x):
  class FusedLinearAddNormFunc (line 692) | class FusedLinearAddNormFunc(paddle.autograd.PyLayer):
    method forward (line 694) | def forward(ctx, x, residual, linear_weight, rms_norm_weight, eps):
    method backward (line 704) | def backward(ctx, d_rms_norm_out, d_residual_out):
  class FusedLinearAddNorm (line 723) | class FusedLinearAddNorm(paddle.nn.Layer):
    method __init__ (line 724) | def __init__(self, hidden_size, eps=1e-6) -> None:
    method forward (line 742) | def forward(self, x, residual):
  class FusedRMSLinearFunc (line 746) | class FusedRMSLinearFunc(paddle.autograd.PyLayer):
    method forward (line 748) | def forward(ctx, x, rms_norm_weight, linear_weight, eps):
    method backward (line 756) | def backward(ctx, d_qkv):
  class FusedRMSLinear (line 766) | class FusedRMSLinear(paddle.nn.Layer):
    method __init__ (line 767) | def __init__(self, hidden_size, eps=1e-6, num_heads=1, num_key_value_h...
    method forward (line 786) | def forward(self, x):
  class ErnieMoEAttention (line 790) | class ErnieMoEAttention(ErnieAttention):
    method __init__ (line 791) | def __init__(self, config, layer_idx):
    method forward (line 814) | def forward(
  class FakeMoERouterLoss (line 934) | class FakeMoERouterLoss(PyLayer):
    method forward (line 936) | def forward(ctx, x, router_loss, num_acc_steps, enable_delay_scale_loss):
    method backward (line 944) | def backward(ctx, out_grad):
  class ErnieDecoderLayer (line 953) | class ErnieDecoderLayer(nn.Layer):
    method __init__ (line 954) | def __init__(self, config, layer_idx):
    method training (line 1040) | def training(self):
    method training (line 1044) | def training(self, new):
    method fp8_quant_weight (line 1050) | def fp8_quant_weight(self):
    method _init_gate_and_experts (line 1055) | def _init_gate_and_experts(self, layer_idx):
    method _init_shared_experts (line 1106) | def _init_shared_experts(self):
    method _init_dense_experts (line 1124) | def _init_dense_experts(self, layer_idx):
    method forward (line 1147) | def forward(
    method model_parallel_dropout (line 1231) | def model_parallel_dropout(self):
  class ErniePretrainedModel (line 1238) | class ErniePretrainedModel(PretrainedModel):
    method _get_name_mappings (line 1243) | def _get_name_mappings(cls, config: ErnieMoEConfig) -> StateDictNameMa...
    method _get_tensor_parallel_mappings (line 1313) | def _get_tensor_parallel_mappings(cls, config, is_split=True):
    method _init_weights (line 1431) | def _init_weights(self, layer):
  class ErnieModel (line 1510) | class ErnieModel(ErniePretrainedModel):
    method __init__ (line 1511) | def __init__(self, config: ErnieMoEConfig):
    method get_input_embeddings (line 1576) | def get_input_embeddings(self):
    method set_input_embeddings (line 1579) | def set_input_embeddings(self, value):
    method _prepare_decoder_attention_mask (line 1583) | def _prepare_decoder_attention_mask(cls, attention_mask, input_shape, ...
    method recompute_training (line 1602) | def recompute_training(
    method forward (line 1657) | def forward(
  class ErniePretrainingCriterion (line 1890) | class ErniePretrainingCriterion(ErniePretrainingCriterionBase):
    method __init__ (line 1891) | def __init__(self, config, return_tuple=True):
    method forward (line 1906) | def forward(self, prediction_scores, masked_lm_labels, router_loss=Non...
  class ErnieMoEForCausalLM (line 1959) | class ErnieMoEForCausalLM(ErniePretrainedModel):
    method __init__ (line 1962) | def __init__(self, config):
    method _post_init (line 1987) | def _post_init(self, original_init, *args, **kwargs):
    method set_state_dict (line 2019) | def set_state_dict(self, state_dict, *args, **kwargs):
    method get_input_embeddings (line 2037) | def get_input_embeddings(self):
    method set_input_embeddings (line 2040) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 2043) | def get_output_embeddings(self):
    method set_output_embeddings (line 2046) | def set_output_embeddings(self, new_embeddings):
    method set_decoder (line 2049) | def set_decoder(self, decoder):
    method get_decoder (line 2052) | def get_decoder(self):
    method prepare_attention_mask_for_generation (line 2056) | def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos...
    method prepare_inputs_for_generation (line 2069) | def prepare_inputs_for_generation(
    method update_model_kwargs_for_generation (line 2101) | def update_model_kwargs_for_generation(self, outputs, model_kwargs, is...
    method forward (line 2140) | def forward(
    method sharded_state_dict (line 2209) | def sharded_state_dict(self, *args, **kwargs):

FILE: examples/experiments/ernie_pretrain/models/ernie/modeling_pp.py
  class ErnieEmbeddingPipe (line 90) | class ErnieEmbeddingPipe(nn.Layer):
    method __init__ (line 91) | def __init__(self, config):
    method embedding_weight (line 107) | def embedding_weight(self):
    method forward (line 110) | def forward(self, args):
  class MTPEmbeddingPipe (line 204) | class MTPEmbeddingPipe(ErnieEmbeddingPipe):
    method __init__ (line 205) | def __init__(self, config):
    method embedding_weight (line 209) | def embedding_weight(self):
    method forward (line 212) | def forward(self, args):
  class EmptyLayer (line 231) | class EmptyLayer(nn.Layer):
    method __init__ (line 232) | def __init__(self):
    method forward (line 235) | def forward(self, x):
  class ErnieDecoderLayerPipe (line 239) | class ErnieDecoderLayerPipe(ErnieDecoderLayer):
    method __init__ (line 240) | def __init__(self, config, layer_idx, use_full_recompute=False):
    method forward (line 247) | def forward(self, args):
  class RMSNormPipe (line 351) | class RMSNormPipe(RMSNorm):
    method __init__ (line 352) | def __init__(self, config):
    method forward (line 357) | def forward(self, args):
  class ErnieMoELMHeadPipe (line 385) | class ErnieMoELMHeadPipe(ErnieMoELMHead):
    method forward (line 386) | def forward(self, args):
  class MTPLayer (line 397) | class MTPLayer(nn.Layer):
    method __init__ (line 398) | def __init__(self, config):
    method forward (line 432) | def forward(self, args):
    method forward_impl (line 441) | def forward_impl(self, *args):
  class ErniePretrainingCriterionPipe (line 518) | class ErniePretrainingCriterionPipe(ErniePretrainingCriterion):
    method __init__ (line 519) | def __init__(self, config):
    method forward (line 522) | def forward(self, logits, labels):
  class PipelinePretrainedModel (line 536) | class PipelinePretrainedModel(PretrainedModel):
    method __init__ (line 537) | def __init__(self, config, *args, **kwargs):
    method init (line 541) | def init(self, config, *args, **kwargs):
    method add_sequential_layer (line 546) | def add_sequential_layer(self, layer_desc, name_prefix=""):
    method get_sequential_layers (line 549) | def get_sequential_layers(self):
    method get_sequential_name_prefixs (line 552) | def get_sequential_name_prefixs(self):
    method get_shardlayer_prefix (line 555) | def get_shardlayer_prefix(self, name_splited):
    method _set_pipeline_name_mapping (line 566) | def _set_pipeline_name_mapping(self, mappings=None):
    method _check_shared_model_state (line 626) | def _check_shared_model_state(self):
    method state_dict (line 647) | def state_dict(self, *args, **kwargs):
    method _init_weights (line 659) | def _init_weights(self, layer):
    method sharded_state_dict (line 731) | def sharded_state_dict(self, *args, **kwargs):
  function get_pp_vp_split_layers (line 765) | def get_pp_vp_split_layers(config):
  class ErnieMoEForCausalLMPipe (line 797) | class ErnieMoEForCausalLMPipe(PipelinePretrainedModel, PipelineLayer):
    method _prepare_pipeline_inputs_func (line 810) | def _prepare_pipeline_inputs_func(cls, data):
    method __init__ (line 844) | def __init__(
    method get_loss_fn (line 995) | def get_loss_fn(self, config):
    method rename_model_params (line 998) | def rename_model_params(self, func):
    method fp8_quant_weight (line 1005) | def fp8_quant_weight(self):
    method _post_init (line 1011) | def _post_init(self, original_init, *args, **kwargs):
    method set_state_dict (line 1028) | def set_state_dict(self, state_dict, *args, **kwargs):

FILE: examples/experiments/ernie_pretrain/models/fp8_linear.py
  function fp8_gemm (line 44) | def fp8_gemm(
  function padding (line 94) | def padding(x, axis):
  class Fp8FusedMlpFunc (line 118) | class Fp8FusedMlpFunc(paddle.autograd.PyLayer):
    method forward (line 128) | def forward(ctx, x, w1, w2):
    method backward (line 208) | def backward(ctx, do3):
  class MemEfficientFp8FusedMlpFunc (line 320) | class MemEfficientFp8FusedMlpFunc(paddle.autograd.PyLayer):
    method forward (line 333) | def forward(ctx, x, w1, w2):
    method backward (line 393) | def backward(ctx, do3):
  class Fp8FusedMlp (line 515) | class Fp8FusedMlp(paddle.nn.Layer):
    method __init__ (line 526) | def __init__(self, config):
    method forward (line 557) | def forward(self, x):

FILE: examples/experiments/ernie_pretrain/models/moe/moe_layer.py
  function set_grad_in_dtype_non_consistent (line 65) | def set_grad_in_dtype_non_consistent(ctx):
  class Fp8MoeGateDispatchAndQuant (line 71) | class Fp8MoeGateDispatchAndQuant(paddle.autograd.PyLayer):
    method forward (line 75) | def forward(
    method backward (line 127) | def backward(ctx, *grads):
  function recompute_fwd_gate_up_func (line 146) | def recompute_fwd_gate_up_func(config, layer_idx):
  class MoEStatics (line 156) | class MoEStatics(nn.Layer):
    method __init__ (line 157) | def __init__(self, config, layer_idx):
  class GateCombine (line 188) | class GateCombine(PyLayer):
    method forward (line 190) | def forward(ctx, x, combine_weights, scatter_index):
    method backward (line 198) | def backward(ctx, grad_y, *_):
  class FusionFP8Expert (line 207) | class FusionFP8Expert(paddle.autograd.PyLayer):
    method forward (line 209) | def forward(ctx, hidden_states, custom_map):
    method backward (line 226) | def backward(ctx, output_grad):
  class AlltoAll (line 237) | class AlltoAll(PyLayer):
    method forward (line 239) | def forward(ctx, x, group, sync_op=True):
    method backward (line 252) | def backward(ctx, *dx):
  class AlltoAllExpertOverlap (line 256) | class AlltoAllExpertOverlap(PyLayer):
    method forward (line 258) | def forward(ctx, input, group, num_local_experts, forward_func_dict, i...
    method backward (line 294) | def backward(ctx, out_grad):
  class AlltoAllAsync (line 313) | class AlltoAllAsync(PyLayer):
    method forward (line 315) | def forward(ctx, x, *fn_args, group=None, fn=None, is_first_fwd=False):
    method backward (line 336) | def backward(ctx, dx_out, *fn_out_grads):
  function dispatching (line 356) | def dispatching(x, dispatch_mask, scatter_index, num_experts, capacity):
  function combining_fused (line 383) | def combining_fused(x, combine_weights, scatter_index, hard_gate=False):
  class ReshapeKeepGradDtype (line 392) | class ReshapeKeepGradDtype(PyLayer):
    method forward (line 394) | def forward(ctx, x, shape):
    method backward (line 400) | def backward(ctx, grad):
  class MOELayer (line 404) | class MOELayer(nn.Layer):
    method __init__ (line 412) | def __init__(
    method forward_experts (line 515) | def forward_experts(self, dispatched_input):
    method fp8_quant_weight (line 551) | def fp8_quant_weight(self):
    method fused_gate_logits_process (line 582) | def fused_gate_logits_process(self, gate_logits, token_type_ids, offlo...
    method gate_distpach_and_quant (line 595) | def gate_distpach_and_quant(self, input, token_type_ids):
    method gate_and_distpach (line 680) | def gate_and_distpach(self, input, token_type_ids):
    method _calc_router_loss (line 792) | def _calc_router_loss(
    method calc_router_loss_and_logging (line 823) | def calc_router_loss_and_logging(
    method combine_expert_output (line 846) | def combine_expert_output(self, expert_output, combine_weights, scatte...
    method forward_single_stage (line 854) | def forward_single_stage(self, dispatched_input, stage_id):
    method all2all_expert_overlap (line 858) | def all2all_expert_overlap(self, x, group):
    method forward (line 885) | def forward(
    method sharded_state_dict (line 1059) | def sharded_state_dict(
  class FP8FusedWLCHFunc (line 1071) | class FP8FusedWLCHFunc(paddle.autograd.PyLayer):
    method forward (line 1073) | def forward(
    method backward (line 1126) | def backward(ctx, output_grad):
  class MlpNode (line 1141) | class MlpNode:
    method __init__ (line 1142) | def __init__(self, custom_map, max_topk, recompute_fwd_gate_up=False, ...
    method reset_status (line 1157) | def reset_status(self):
    method release_mem (line 1165) | def release_mem(self):
    method forward (line 1170) | def forward(self, hs_2d_dispatched, dispatched_indices, dispatched_pro...
    method backward (line 1211) | def backward(self, hidden_states_out_grad):
  class Fp8FusedMoeFunc (line 1235) | class Fp8FusedMoeFunc(paddle.autograd.PyLayer):
    method forward (line 1237) | def forward(
    method backward (line 1261) | def backward(ctx, output_grad):

FILE: examples/experiments/ernie_pretrain/models/moe/token_dispatcher/fp8_utils.py
  function _get_fp8_weight_and_scale (line 43) | def _get_fp8_weight_and_scale(weight, stacked=False, transpose=False):
  function fused_stack_transpose_quant (line 63) | def fused_stack_transpose_quant(weight_list, transpose=False):
  function split_group_gemm (line 81) | def split_group_gemm(x_fp8, x_scale, w_fp8, w_scale, tokens_per_expert, ...
  function has_config (line 119) | def has_config(config_map, key):
  class ExpertsGroupGemmNode (line 123) | class ExpertsGroupGemmNode:
    method __init__ (line 136) | def __init__(self, experts, custom_map, name="moe_experts_node"):
    method reset_status (line 160) | def reset_status(self):
    method fwd_gate_up (line 166) | def fwd_gate_up(self, x_bf16, expert_w1, expert_w_count, tokens_per_ex...
    method fwd_swiglu (line 223) | def fwd_swiglu(self, o1):
    method fwd_down (line 239) | def fwd_down(self, o1, unzipped_probs, expert_w_count, tokens_per_expe...
    method fwd_down_no_probs (line 300) | def fwd_down_no_probs(self, o1, expert_w2, expert_w_count, tokens_per_...
    method bwd_down_input (line 358) | def bwd_down_input(self, expert_w2, unzipped_grad, tokens_per_expert, ...
    method bwd_down_input_no_prob (line 428) | def bwd_down_input_no_prob(self, expert_w2, unzipped_grad, tokens_per_...
    method bwd_swiglu (line 468) | def bwd_swiglu(self, o1, do2):
    method bwd_gate_up_input (line 485) | def bwd_gate_up_input(self, do1, expert_w1, tokens_per_expert, expecte...
    method bwd_down_weight (line 543) | def bwd_down_weight(self, out_grad, o2, expert_w2):
    method bwd_gate_up_weight (line 627) | def bwd_gate_up_weight(self, do1, input_x, expert_w1):
    method forward (line 706) | def forward(self, hs_out, unzipped_probs, tokens_per_expert):
    method backward (line 725) | def backward(self, out_grad, tokens_per_expert, dispatched_indices, ex...
    method forward_no_prob (line 740) | def forward_no_prob(self, hs_out, tokens_per_expert):
    method backward_no_prob (line 752) | def backward_no_prob(self, out_grad, tokens_per_expert):
  class ExpertsGroupGemmContiguousNode (line 774) | class ExpertsGroupGemmContiguousNode:
    method __init__ (line 787) | def __init__(
    method reset_status (line 832) | def reset_status(self):
    method gen_m_indices (line 841) | def gen_m_indices(self, tokens_per_expert):
    method fwd_gate_up (line 862) | def fwd_gate_up(self, x, expert_w1, num_expert, tokens_per_expert, sca...
    method fwd_swiglu (line 936) | def fwd_swiglu(self, o1):
    method fwd_down (line 940) | def fwd_down(self, o1, unzipped_probs, expert_w2, num_expert):
    method bwd_down_input (line 1006) | def bwd_down_input(self, expert_w2, unzipped_grad, o1):
    method bwd_swiglu (line 1083) | def bwd_swiglu(self, o1, do2):
    method bwd_gate_up_input (line 1087) | def bwd_gate_up_input(self, do1, expert_w1):
    method fused_transpose_split_quant (line 1144) | def fused_transpose_split_quant(self, x, tokens_per_expert, pow_2_scal...
    method bwd_down_weight (line 1169) | def bwd_down_weight(self, do3, o2, expert_w2):
    method bwd_gate_up_weight (line 1245) | def bwd_gate_up_weight(self, do1, input_x, expert_w1):
    method forward (line 1309) | def forward(
    method backward (line 1333) | def backward(self, out_grad, a2a_async_fn=None):
  class ExpertsGroupGemmWLCHNode (line 1426) | class ExpertsGroupGemmWLCHNode(ExpertsGroupGemmContiguousNode):
    method __init__ (line 1442) | def __init__(
    method gen_m_indices (line 1478) | def gen_m_indices(self, tokens_per_expert):
    method fused_transpose_split_quant (line 1498) | def fused_transpose_split_quant(self, x, tokens_per_expert, pow_2_scal...

FILE: examples/experiments/ernie_pretrain/models/moe/token_dispatcher/moe_utils.py
  function inplace_offload (line 24) | def inplace_offload(x):
  function inplace_offload_if_needed (line 41) | def inplace_offload_if_needed(x, threshold=2 * 1024 * 1024 * 1024):
  function topk_to_permuted_indices_single (line 61) | def topk_to_permuted_indices_single(x, num_tokens, expert_id, topk):
  function topk_to_permuted_indices (line 81) | def topk_to_permuted_indices(x, num_tokens_per_expert_list, topk):
  function permute (line 105) | def permute(
  function unpermute (line 128) | def unpermute(
  class UnZipNode (line 163) | class UnZipNode:
    method __init__ (line 178) | def __init__(self, token_dispatcher, name="unzip"):
    method reset_status (line 190) | def reset_status(self):
    method forward (line 196) | def forward(
    method backward (line 240) | def backward(self, dx, hidden_states_out_grad, probs_grad, dispatched_...
  class ZipNode (line 268) | class ZipNode:
    method __init__ (line 281) | def __init__(self, token_dispatcher, name="zip"):
    method forward (line 292) | def forward(
    method backward (line 326) | def backward(

FILE: examples/experiments/ernie_pretrain/models/moe/top2_gate.py
  function cal_aux_loss_func (line 35) | def cal_aux_loss_func(
  function masked_fill (line 85) | def masked_fill(x, mask, value):
  class CalAuxLossFunctor (line 90) | class CalAuxLossFunctor(paddle.autograd.PyLayer):
    method forward (line 92) | def forward(
    method backward (line 122) | def backward(ctx, out_grad):
  function cast_if_needed (line 130) | def cast_if_needed(x, dtype):
  class FusedGateDetachMatmul (line 134) | class FusedGateDetachMatmul(paddle.autograd.PyLayer):
    method forward (line 136) | def forward(ctx, x, w):
    method backward (line 142) | def backward(ctx, y_grad):
  function gate_detach_matmul (line 155) | def gate_detach_matmul(x, weight, use_fuse):
  function compute_optimal_transport (line 164) | def compute_optimal_transport(M, r, c, lam=1.0, epsilon=1e-8, max_iters:...
  class Top2Gate (line 178) | class Top2Gate(nn.Layer):
    method __init__ (line 191) | def __init__(self, config, layer_idx: int, group, gate_weight=None) ->...
    method _create_gate_parameter (line 259) | def _create_gate_parameter(self):
    method forward (line 269) | def forward(
    method get_capacity (line 301) | def get_capacity(self, num_tokens, cap_factor=None):
    method top2_gating (line 316) | def top2_gating(self, logits, cap=None, correction_bias=None):
    method _cal_aux_loss (line 388) | def _cal_aux_loss(
    method _cal_orthogonal_loss (line 433) | def _cal_orthogonal_loss(self, weight_id=None, use_group=None):
    method _cal_orthogonal_loss_opt_each_weight (line 448) | def _cal_orthogonal_loss_opt_each_weight(self, weight, use_group):
  function cal_orthogonal_loss_opt_each_weight_func (line 455) | def cal_orthogonal_loss_opt_each_weight_func(weight, moe_k, use_group, e...
  class TopKGateFused (line 473) | class TopKGateFused(Top2Gate):
    method forward (line 474) | def forward(

FILE: examples/experiments/ernie_pretrain/models/sequence_parallel_utils.py
  function get_hcg (line 44) | def get_hcg():
  function get_async_loader (line 51) | def get_async_loader():
  function hack_offload_wait (line 64) | def hack_offload_wait(task):
  function hack_reload_wait (line 68) | def hack_reload_wait(task):
  class ScatterOp (line 72) | class ScatterOp(PyLayer):
    method forward (line 74) | def forward(ctx, input, axis=0, group=None):
    method backward (line 80) | def backward(ctx, grad):
  class GatherOp (line 84) | class GatherOp(PyLayer):
    method forward (line 86) | def forward(ctx, input, axis=0, group=None):
    method backward (line 92) | def backward(ctx, grad):
  class AllGatherOp (line 96) | class AllGatherOp(PyLayer):
    method forward (line 98) | def forward(ctx, input, group=None):
    method backward (line 103) | def backward(ctx, grad):
  class ReduceScatterOp (line 107) | class ReduceScatterOp(PyLayer):
    method forward (line 109) | def forward(ctx, input, group=None):
    method backward (line 115) | def backward(ctx, grad):
  class AllGatherVarlenOp (line 119) | class AllGatherVarlenOp(PyLayer):
    method forward (line 121) | def forward(ctx, input, group=None):
    method backward (line 160) | def backward(ctx, grad):
  class GemmReduceScatterOp (line 174) | class GemmReduceScatterOp(PyLayer):
    method forward (line 176) | def forward(ctx, input, weight, group):
    method backward (line 183) | def backward(ctx, grad):
  class AllGatherGemmOp (line 204) | class AllGatherGemmOp(PyLayer):
    method forward (line 206) | def forward(ctx, input, weight, group):
    method backward (line 214) | def backward(ctx, grad):
  function sequence_parallel_sparse_mask_labels (line 231) | def sequence_parallel_sparse_mask_labels(labels, ignore_label=-100):
  function mark_as_sequence_parallel_parameter (line 247) | def mark_as_sequence_parallel_parameter(parameter):
  function is_sequence_parallel_parameter (line 251) | def is_sequence_parallel_parameter(parameter):
  function create_fused_allreduce_gradient_hook (line 255) | def create_fused_allreduce_gradient_hook(parameter_list, accumulation_st...
  function create_non_fused_allreduce_gradient_hook (line 272) | def create_non_fused_allreduce_gradient_hook(param, model, verbose=False):
  function register_sequence_parallel_allreduce_hooks (line 295) | def register_sequence_parallel_allreduce_hooks(model, fuse_sequence_para...
  function is_fused_matmul_bias_supported (line 318) | def is_fused_matmul_bias_supported():
  class ColumnSequenceParallelLinear (line 334) | class ColumnSequenceParallelLinear(Layer):
    method __init__ (line 335) | def __init__(
    method forward (line 427) | def forward(self, x, use_comm=True):
    method sharded_state_dict (line 447) | def sharded_state_dict(
  class MPScale (line 455) | class MPScale(PyLayer):
    method forward (line 457) | def forward(ctx, x, mp_degree):
    method backward (line 462) | def backward(ctx, dout):
  class RowSequenceParallelLinear (line 466) | class RowSequenceParallelLinear(Layer):
    method __init__ (line 467) | def __init__(
    method forward (line 563) | def forward(self, x):
    method sharded_state_dict (line 594) | def sharded_state_dict(

FILE: examples/experiments/ernie_pretrain/models/utils.py
  function get_global_training_logs (line 31) | def get_global_training_logs():
  function global_training_logs_enabled (line 47) | def global_training_logs_enabled():
  function inplace_offload (line 52) | def inplace_offload(tensor):
  function detach_and_requires_grad_ (line 57) | def detach_and_requires_grad_(*args):
  class FakeClone (line 65) | class FakeClone(paddle.autograd.PyLayer):
    method forward (line 67) | def forward(ctx, input):
    method backward (line 76) | def backward(ctx, grad_output):
  function manual_backward (line 80) | def manual_backward(f: Callable, is_first_fwd: bool, *args: List[Any]):
  class FakeGather (line 118) | class FakeGather(paddle.autograd.PyLayer):
    method forward (line 120) | def forward(ctx, input, indices):
    method backward (line 130) | def backward(ctx, grad_output):
  class FusedUnpermutation (line 139) | class FusedUnpermutation(paddle.autograd.PyLayer):
    method forward (line 141) | def forward(
    method backward (line 178) | def backward(ctx, output_tokens_grad):

FILE: examples/experiments/ernie_pretrain/tools/sharded_to_uc/convert_sharded_to_uc.py
  function parse_args (line 35) | def parse_args():
  function convert_ckpt (line 53) | def convert_ckpt(args):

FILE: examples/experiments/ernie_pretrain/tools/sharded_to_uc/gather_all_ckpt.py
  function parse_args (line 20) | def parse_args():
  function parse_path (line 39) | def parse_path(args):
  function get_ip_list (line 50) | def get_ip_list(args):
  function gather_ckpt (line 63) | def gather_ckpt(org_path, tgt_path, hostnames, local_host):

FILE: examples/experiments/ernie_pretrain/tools/sharded_to_uc/merge_sharding_ep.py
  class Timer (line 53) | class Timer:
    method __init__ (line 54) | def __init__(self, name="name"):
    method __enter__ (line 57) | def __enter__(self):
    method __exit__ (line 61) | def __exit__(self, exc_type, exc_val, exc_tb):
  function strtobool (line 66) | def strtobool(s):
  function execute_cmd (line 76) | def execute_cmd(cmd, ignore_error=False):
  function parse_args (line 85) | def parse_args():
  function save_ckpt (line 100) | def save_ckpt(ckpt, save_dir, rank_info, mp_degree, pp_degree=0, ep_degr...
  class Client (line 127) | class Client:
    method __init__ (line 128) | def __init__(self, args, base_path, nproc_per_node=8, nnodes=1, node_r...
    method _get_expert_param_shape (line 174) | def _get_expert_param_shape(self, meta):
    method _expert_id (line 183) | def _expert_id(self, s_name):
    method _global_expert_id (line 191) | def _global_expert_id(self, local_id, ep_rank):
    method _get_num_experts_per_rank (line 194) | def _get_num_experts_per_rank(self):
    method _gen_node_id_map (line 205) | def _gen_node_id_map(self):
    method _modify_expert_id (line 233) | def _modify_expert_id(self, s_name, new_id):
    method merge_and_save (line 243) | def merge_and_save(
    method _merge_sharding_for_dense_params (line 311) | def _merge_sharding_for_dense_params(self, parallel_2_ckpt_map, ignore...
    method _replicate_fused_param (line 334) | def _replicate_fused_param(self, local_params, indices_or_sections, co...
    method _replicate_dense_params (line 346) | def _replicate_dense_params(self, dense_params):
    method _merge_sharding_for_expert_params (line 359) | def _merge_sharding_for_expert_params(self, parallel_2_ckpt_map, ignor...
    method _extend_ep_degree_for_expert_params (line 383) | def _extend_ep_degree_for_expert_params(self, expert_params, dst_ep_de...
    method _get_final_ckpts (line 404) | def _get_final_ckpts(
    method _read_ckpts (line 467) | def _read_ckpts(self, args):
    method _read_ckpt (line 480) | def _read_ckpt(self, mp, pp, sd, include_opt_state):
    method _read_all_ckpts_by_pp_stage (line 483) | def _read_all_ckpts_by_pp_stage(self, pp_stage, include_opt_state=False):
    method _merge_and_save (line 504) | def _merge_and_save(self, mp_rank, save_dir, include_opt_state, ignore...
    method _merge_pp_ckpts (line 549) | def _merge_pp_ckpts(self, rank_info, ckpts, is_opt):
    method _get_param_meta (line 564) | def _get_param_meta(self, mp_rank, ep_rank=None):
    method _merge_sharding_param_ckpts (line 584) | def _merge_sharding_param_ckpts(
    method _concat_crop_reshape (line 620) | def _concat_crop_reshape(self, arrs, shape, name, ignore_sharding_padd...
    method _get_opt_state_key_and_type (line 639) | def _get_opt_state_key_and_type(self, name):
    method _merge_sharding_opt_ckpts (line 664) | def _merge_sharding_opt_ckpts(self, mp_rank, ckpts, ignore_sharding_pa...
    method _cal_ep_rank (line 759) | def _cal_ep_rank(self, sd_rank, mp_rank):
    method load_ckpt (line 764) | def load_ckpt(self, mp_rank, pp_rank, sharding_rank, include_opt_state):
    method weight_suffix (line 821) | def weight_suffix(self, mp_rank, pp_rank, sharding_rank):
    method load_model_meta (line 834) | def load_model_meta(self):
    method move_useful_file (line 841) | def move_useful_file(self, save_dir):
  function merge_and_save (line 851) | def merge_and_save(args):

FILE: examples/experiments/ernie_pretrain/tools/uc_to_sharded/convert_uc_to_sharded.py
  function parse_args (line 31) | def parse_args():
  function find_files (line 40) | def find_files(path, suffixes):
  class Checkpoint (line 56) | class Checkpoint:
    method __init__ (line 57) | def __init__(self, args):
    method map_to_org_model (line 95) | def map_to_org_model(self, layer_name):
    method load_from_org_model (line 104) | def load_from_org_model(self, layer_name):
    method process_one_pdparam (line 122) | def process_one_pdparam(self, pdparam_path):
    method process_pdparams (line 134) | def process_pdparams(self):
    method load_from_org_model_with_tensor_name (line 138) | def load_from_org_model_with_tensor_name(self, tensor_name, structure_...
    method process_one_pdopt (line 184) | def process_one_pdopt(self, pdopt_path):
    method process_pdopts (line 246) | def process_pdopts(self):
  function convert_ckpt (line 251) | def convert_ckpt(args):

FILE: examples/experiments/paddlefleet/glm45_provider.py
  class GLMMoEModelProvider (line 31) | class GLMMoEModelProvider(GPTModelProvider):
  class GLM45ModelProvider355B (line 88) | class GLM45ModelProvider355B(GLMMoEModelProvider):
  class GLM45AirModelProvider106B (line 107) | class GLM45AirModelProvider106B(GLMMoEModelProvider):
  class GLM45AirModelDebugProvider (line 127) | class GLM45AirModelDebugProvider(GLM45AirModelProvider106B):
  class GLM45AirModelDebugProviderFP8 (line 148) | class GLM45AirModelDebugProviderFP8(GLM45AirModelDebugProvider):
  class GLM45AirModelSingleCardDebugProvider (line 154) | class GLM45AirModelSingleCardDebugProvider(GLMMoEModelProvider):

FILE: examples/experiments/paddlefleet/qwen_provider.py
  class Qwen3MoEModelProvider (line 31) | class Qwen3MoEModelProvider(GPTModelProvider):
  class Qwen3MoEModelProvider30B_A3B (line 75) | class Qwen3MoEModelProvider30B_A3B(Qwen3MoEModelProvider):
  class Qwen3MoEModelSingleCardProvider (line 89) | class Qwen3MoEModelSingleCardProvider(Qwen3MoEModelProvider):

FILE: examples/experiments/paddlefleet/run_pretrain.py
  class PreTrainingArguments (line 65) | class PreTrainingArguments(TrainingArguments):
    method __post_init__ (line 118) | def __post_init__(self):
  class DataArguments (line 175) | class DataArguments:
  class ModelArguments (line 208) | class ModelArguments:
  function create_pretrained_dataset (line 246) | def create_pretrained_dataset(
  function get_train_data_file (line 357) | def get_train_data_file(args):
  class PretrainingTrainer (line 382) | class PretrainingTrainer(Trainer):
    method __init__ (line 383) | def __init__(self, *args, **kwargs):
    method evaluate (line 387) | def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_pre...
    method _get_eval_sampler (line 427) | def _get_eval_sampler(self, eval_dataset) -> Optional[paddle.io.Sampler]:
    method _get_train_sampler (line 437) | def _get_train_sampler(self) -> Optional[paddle.io.Sampler]:
  function _set_random_seed (line 448) | def _set_random_seed(
  function main (line 474) | def main():

FILE: examples/tools/create_pretraining_data.py
  function print_datetime (line 40) | def print_datetime(string):
  function get_args (line 45) | def get_args():
  function lexical_analysis_fn (line 100) | def lexical_analysis_fn():
  function chinese_segmentation_fn (line 112) | def chinese_segmentation_fn():
  function jieba_segmentation_fn (line 124) | def jieba_segmentation_fn():
  function get_whole_word_mask_tokens (line 134) | def get_whole_word_mask_tokens(tokens, words, max_word_length=6):
  class IdentitySplitter (line 199) | class IdentitySplitter(object):
    method tokenize (line 200) | def tokenize(self, *text):
  class NewlineSplitter (line 204) | class NewlineSplitter:
    method tokenize (line 205) | def tokenize(self, text):
  class Converter (line 209) | class Converter(object):
    method __init__ (line 210) | def __init__(self, args):
    method initializer (line 213) | def initializer(self):
    method remove_repeated_chars (line 269) | def remove_repeated_chars(text, max_repeated_len=100):
    method encode (line 284) | def encode(self, json_line):
  function main (line 306) | def main():

FILE: examples/tools/gpt-oss_weight_change/change_weight_dtype.py
  function find_safetensors_files (line 46) | def find_safetensors_files(directory):
  function endswith (line 55) | def endswith(key, prefix_list):
  function save_single_safetenors (line 62) | def save_single_safetenors(save_path, state_dict, rank, total_files_size...
  function fp4_to_bf16 (line 73) | def fp4_to_bf16(load_path, save_path):
  function bf16_to_fp4 (line 96) | def bf16_to_fp4(load_path, save_path):

FILE: examples/tools/merge.py
  function print_datetime (line 25) | def print_datetime(string):
  function merge_sft_datasets (line 30) | def merge_sft_datasets(input_dirs, output_dir):
  function main (line 120) | def main(args):

FILE: examples/tools/trans_paddlenlp2hf.py
  function parse_arguments (line 28) | def parse_arguments():
  function load_safetensors_state_dict (line 44) | def load_safetensors_state_dict(input_dir):
  function trans_paddlenlp2hf (line 61) | def trans_paddlenlp2hf():

FILE: paddleformers/__init__.py
  function compare_version (line 32) | def compare_version(v1, v2):
  function _check_dependency_versions (line 42) | def _check_dependency_versions():

FILE: paddleformers/cli/cli.py
  function main (line 57) | def main():

FILE: paddleformers/cli/export/export.py
  function check_download_repo (line 33) | def check_download_repo(model_name_or_path, download_hub=None):
  function logger_merge_config (line 52) | def logger_merge_config(merge_config, lora_merge):
  function run_export (line 83) | def run_export(args: Optional[dict[str, Any]] = None) -> None:

FILE: paddleformers/cli/hparams/data_args.py
  class DataArguments (line 19) | class DataArguments:

FILE: paddleformers/cli/hparams/export_args.py
  class ExportArguments (line 19) | class ExportArguments:

FILE: paddleformers/cli/hparams/finetuning_args.py
  class PreTrainingArguments (line 28) | class PreTrainingArguments(TrainingArguments):
    method need_data (line 102) | def need_data(self):
    method reeao_dataset_rank (line 120) | def reeao_dataset_rank(self):
    method reeao_dataset_world_size (line 138) | def reeao_dataset_world_size(self):
  class VLSFTTrainingArguments (line 148) | class VLSFTTrainingArguments(PreTrainingArguments):
  class SFTTrainingArguments (line 156) | class SFTTrainingArguments(TrainingArguments):
  class DPOTrainingArguments (line 169) | class DPOTrainingArguments(TrainingArguments):
  class FinetuningArguments (line 232) | class FinetuningArguments(
    method __post_init__ (line 300) | def __post_init__(self):

FILE: paddleformers/cli/hparams/generating_args.py
  class StreamOptions (line 18) | class StreamOptions:
    method __init__ (line 25) | def __init__(self, max_count: int = 100):
  class GeneratingArguments (line 35) | class GeneratingArguments:

FILE: paddleformers/cli/hparams/model_args.py
  class VisionArguments (line 20) | class VisionArguments:
  class FP8MemConfigs (line 38) | class FP8MemConfigs:
  class FP8FusedOpsConfigs (line 47) | class FP8FusedOpsConfigs:
  class ErniePretrainArgument (line 56) | class ErniePretrainArgument:
  class ModelArguments (line 80) | class ModelArguments:
    method __post_init__ (line 234) | def __post_init__(self):

FILE: paddleformers/cli/hparams/parser.py
  function _load_custom_template (line 84) | def _load_custom_template(custom_path):
  function read_args (line 94) | def read_args(args: Optional[Union[dict[str, Any], list[str]]] = None) -...
  function _parse_args (line 115) | def _parse_args(
  function _parse_train_args (line 152) | def _parse_train_args(args: Optional[Union[dict[str, Any], list[str]]] =...
  function _parse_eval_args (line 166) | def _parse_eval_args(args: Optional[Union[dict[str, Any], list[str]]] = ...
  function _parse_server_args (line 180) | def _parse_server_args(args: Optional[Union[dict[str, Any], list[str]]] ...
  function _parse_export_args (line 194) | def _parse_export_args(args: Optional[Union[dict[str, Any], list[str]]] ...
  function get_train_args (line 208) | def get_train_args(args: Optional[Union[dict[str, Any], list[str]]] = No...
  function get_eval_args (line 260) | def get_eval_args(args: Optional[Union[dict[str, Any], list[str]]] = Non...
  function get_server_args (line 273) | def get_server_args(args: Optional[Union[dict[str, Any], list[str]]] = N...
  function get_export_args (line 286) | def get_export_args(args: Optional[Union[dict[str, Any], list[str]]] = N...

FILE: paddleformers/cli/hparams/preprocess_args.py
  class BasePreprocessArguments (line 25) | class BasePreprocessArguments:
    method __post_init__ (line 26) | def __post_init__(self):
  class UtteranceProcessorArguments (line 31) | class UtteranceProcessorArguments(BasePreprocessArguments):
    method __post_init__ (line 39) | def __post_init__(self):
  class CoarseProcessorArguments (line 46) | class CoarseProcessorArguments(BasePreprocessArguments):
    method __post_init__ (line 57) | def __post_init__(self):
  class InputIdsMassageArguments (line 64) | class InputIdsMassageArguments(BasePreprocessArguments):
    method __post_init__ (line 92) | def __post_init__(self):
  class ImageModificationProcessorArguments (line 102) | class ImageModificationProcessorArguments(BasePreprocessArguments):
    method __post_init__ (line 112) | def __post_init__(self):
  class End2EndProcessorArgumentsHelper (line 117) | class End2EndProcessorArgumentsHelper(BasePreprocessArguments):
    method __post_init__ (line 124) | def __post_init__(self):
  class End2EndProcessorArguments (line 129) | class End2EndProcessorArguments(
    method __post_init__ (line 136) | def __post_init__(self):

FILE: paddleformers/cli/hparams/server_args.py
  class ServerArguments (line 19) | class ServerArguments:

FILE: paddleformers/cli/launcher.py
  function launch (line 21) | def launch():

FILE: paddleformers/cli/train/auto_parallel/workflow.py
  function create_pretrained_dataset (line 42) | def create_pretrained_dataset(
  function get_train_data_file (line 116) | def get_train_data_file(args):
  class PretrainingTrainer (line 141) | class PretrainingTrainer(Trainer):
    method __init__ (line 142) | def __init__(self, *args, **kwargs):
  function run_auto_parallel (line 147) | def run_auto_parallel(model_args, data_args, generating_args, training_a...

FILE: paddleformers/cli/train/deepseek_v3_pretrain/configuration.py
  class DeepseekV2FastConfig (line 22) | class DeepseekV2FastConfig(PretrainedConfig):
    method __init__ (line 131) | def __init__(

FILE: paddleformers/cli/train/deepseek_v3_pretrain/fp8_linear.py
  function fp8_linear (line 54) | def fp8_linear(
  function register_scale (line 95) | def register_scale(self):
  class Linear (line 109) | class Linear(PD_Linear):
    method __init__ (line 110) | def __init__(self, *args, **kwargs):
  class ColumnParallelLinear (line 116) | class ColumnParallelLinear(PD_ColumnParallelLinear):
    method __init__ (line 117) | def __init__(self, *args, **kwargs):
  class RowParallelLinear (line 123) | class RowParallelLinear(PD_RowParallelLinear):
    method __init__ (line 124) | def __init__(self, *args, **kwargs):
  class ColumnSequenceParallelLinear (line 130) | class ColumnSequenceParallelLinear(PD_ColumnSequenceParallelLinear):
    method __init__ (line 131) | def __init__(self, *args, **kwargs):
  class RowSequenceParallelLinear (line 137) | class RowSequenceParallelLinear(PD_RowSequenceParallelLinear):
    method __init__ (line 138) | def __init__(self, *args, **kwargs):

FILE: paddleformers/cli/train/deepseek_v3_pretrain/kernel.py
  function act_quant_kernel (line 30) | def act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr):
  function act_quant (line 51) | def act_quant(x: paddle.Tensor, block_size: int = 128) -> Tuple[paddle.T...
  function weight_dequant_kernel (line 74) | def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.cons...
  function weight_dequant (line 100) | def weight_dequant(x: paddle.Tensor, s: paddle.Tensor, block_size: int =...
  function fp8_gemm_kernel (line 130) | def fp8_gemm_kernel(
  function fp8_gemm (line 190) | def fp8_gemm(a: paddle.Tensor, a_s: paddle.Tensor, b: paddle.Tensor, b_s...

FILE: paddleformers/cli/train/deepseek_v3_pretrain/modeling.py
  function swiglu (line 110) | def swiglu(x, y=None):
  function get_use_casual_mask (line 134) | def get_use_casual_mask():
  function set_global_step (line 139) | def set_global_step(cur_step):
  function get_global_step (line 144) | def get_global_step():
  function rms_norm_fused (line 149) | def rms_norm_fused(x_in, w, eps, use_fast_ln=False):
  function cast_if_needed (line 157) | def cast_if_needed(x, dtype):
  function fusion_rms_norm (line 164) | def fusion_rms_norm(hidden_states, weight, variance_epsilon, use_fast_ln...
  class LMHeadFunction (line 187) | class LMHeadFunction(paddle.autograd.PyLayer):
    method forward (line 189) | def forward(ctx, x, weight, transpose_y):
    method backward (line 196) | def backward(ctx, dout):
  function parallel_matmul (line 226) | def parallel_matmul(x: Tensor, y: Tensor, transpose_y=False, tensor_para...
  class DeepseekV2MLP (line 256) | class DeepseekV2MLP(nn.Layer):
    method __init__ (line 257) | def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, int...
    method forward (line 303) | def forward(self, x):
  class MoEGate (line 309) | class MoEGate(PretrainedMoEGate):
    method __init__ (line 310) | def __init__(
    method forward (line 356) | def forward(self, hidden_states):
  class DeepseekV2MoE (line 408) | class DeepseekV2MoE(MoELayer):
    method __init__ (line 413) | def __init__(self, config: DeepseekV2FastConfig, norm_weight=None, nor...
    method fp8_quant_weight (line 492) | def fp8_quant_weight(self, batch_mode=False, quant_transpose=None):
    method forward (line 555) | def forward(self, hidden_states):
    method post_process (line 580) | def post_process(self, hidden_states, final_hidden_states, l_aux):
  class DeepseekV2RotaryEmbedding (line 591) | class DeepseekV2RotaryEmbedding(nn.Layer):
    method __init__ (line 592) | def __init__(self, dim, max_position_embeddings=2048, base=10000):
    method _set_cos_sin_cache (line 607) | def _set_cos_sin_cache(self, seq_len):
    method forward (line 620) | def forward(self, x, seq_len=None):
  class DeepseekV2Attention (line 633) | class DeepseekV2Attention(nn.Layer):
    method __init__ (line 636) | def __init__(self, config: DeepseekV2FastConfig, layerwise_recompute: ...
    method fp8_quant_weight (line 746) | def fp8_quant_weight(self, quant_transpose=None):
    method _init_rope (line 753) | def _init_rope(self):
    method _shape (line 785) | def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int):
    method forward (line 788) | def forward(
  class DeepseekV2DecoderLayer (line 933) | class DeepseekV2DecoderLayer(nn.Layer):
    method __init__ (line 934) | def __init__(
    method fp8_quant_weight (line 975) | def fp8_quant_weight(self, batch_mode=False, quant_transpose=None):
    method forward (line 984) | def forward(
    method self_attn_compute (line 1082) | def self_attn_compute(self, hidden_states, **kwargs):
    method pre_dispatch_compute (line 1132) | def pre_dispatch_compute(self, hidden_states):
    method expert_forward_compute (line 1139) | def expert_forward_compute(self, intermediate_hidden_states, dispatche...
    method post_combine_compute (line 1152) | def post_combine_compute(self, residual, hidden_states, final_hidden_s...
  class DeepseekV2MTPLayer (line 1167) | class DeepseekV2MTPLayer(DeepseekV2DecoderLayer):
    method __init__ (line 1168) | def __init__(
    method forward (line 1180) | def forward(
  class DeepseekV2PretrainedModelFast (line 1217) | class DeepseekV2PretrainedModelFast(PretrainedModel):
    method _get_model_flops (line 1222) | def _get_model_flops(self, batch_size=1, seq_length=None, **kwargs):
    method _get_hardware_flops (line 1235) | def _get_hardware_flops(self, *args, **kwargs):
    method _get_name_mappings (line 1239) | def _get_name_mappings(cls, config: DeepseekV2FastConfig) -> list[Stat...
    method _get_tensor_parallel_mappings (line 1299) | def _get_tensor_parallel_mappings(cls, config: DeepseekV2FastConfig, i...
    method _init_weights (line 1399) | def _init_weights(self, layer):
    method step_flex_token (line 1460) | def step_flex_token(self, cur_step):
  class DeepseekV2ModelFast (line 1465) | class DeepseekV2ModelFast(DeepseekV2PretrainedModelFast):
    method __init__ (line 1473) | def __init__(self, config: DeepseekV2FastConfig):
    method get_input_embeddings (line 1503) | def get_input_embeddings(self):
    method set_input_embeddings (line 1506) | def set_input_embeddings(self, value):
    method _prepare_decoder_attention_mask (line 1510) | def _prepare_decoder_attention_mask(attention_mask, input_shape, past_...
    method recompute_training_full (line 1545) | def recompute_training_full(
    method forward (line 1576) | def forward(
  class DeepseekV2PretrainingCriterionFast (line 1787) | class DeepseekV2PretrainingCriterionFast(nn.Layer):
    method __init__ (line 1793) | def __init__(self, config: DeepseekV2FastConfig):
    method forward (line 1804) | def forward(self, prediction_scores, masked_lm_labels, router_loss=Non...
  function yarn_find_correction_dim (line 1854) | def yarn_find_correction_dim(num_rotations, dim, base=10000, max_positio...
  function yarn_find_correction_range (line 1859) | def yarn_find_correction_range(low_rot, high_rot, dim, base=10000, max_p...
  function yarn_linear_ramp_mask (line 1865) | def yarn_linear_ramp_mask(min, max, dim):
  class DeepseekV2YarnRotaryEmbedding (line 1874) | class DeepseekV2YarnRotaryEmbedding(DeepseekV2RotaryEmbedding):
    method __init__ (line 1875) | def __init__(
    method _set_cos_sin_cache (line 1895) | def _set_cos_sin_cache(self, seq_len):
  class RmsNormFunction (line 1929) | class RmsNormFunction(paddle.autograd.PyLayer):
    method forward (line 1931) | def forward(ctx, x, scale, epsilon):
    method backward (line 1940) | def backward(ctx, grad_output):
  class DeepseekV2RMSNorm (line 1954) | class DeepseekV2RMSNorm(nn.Layer):
    method __init__ (line 1955) | def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, eps...
    method forward (line 1979) | def forward(self, hidden_states):
    method extra_repr (line 1992) | def extra_repr(self):
  function apply_rotary_pos_emb (line 1996) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids, apply_rope_fusion...
  class FusedNormGateFunc (line 2050) | class FusedNormGateFunc(paddle.autograd.PyLayer):
    method set_temporary_vars (line 2057) | def set_temporary_vars(cls, norm_output, invar):
    method clear_temporary_vars (line 2062) | def clear_temporary_vars(cls):
    method forward (line 2067) | def forward(ctx, x, rms_norm_weight, moe_gate_weight, eps):
    method backward (line 2077) | def backward(ctx, d_gate_logits, d_norm_output):
  class TemporaryVarContext (line 2101) | class TemporaryVarContext:
    method __init__ (line 2102) | def __init__(self, norm_output, invar):
    method __enter__ (line 2106) | def __enter__(self):
    method __exit__ (line 2109) | def __exit__(self, exc_type, exc_val, exc_tb):
  function balance_expert_assignment (line 2113) | def balance_expert_assignment(n, m, k):
  class FakeGate (line 2124) | class FakeGate(paddle.autograd.PyLayer):
    method forward (line 2126) | def forward(ctx, hidden_states, weight, fakse_gate_restrict_balance=Fa...
    method backward (line 2142) | def backward(ctx, grad_output):
  class AddAuxiliaryLoss (line 2146) | class AddAuxiliaryLoss(paddle.autograd.PyLayer):
    method forward (line 2153) | def forward(ctx, x, loss):
    method backward (line 2159) | def backward(ctx, grad_output):
  function qkv_pre_process_no_fuse (line 2167) | def qkv_pre_process_no_fuse(
  function rearrange_kv (line 2204) | def rearrange_kv(kv, k_pe, qk_nope_head_dim, num_heads):
  function enable_to_static (line 2215) | def enable_to_static(value):
  function qkv_pre_process (line 2224) | def qkv_pre_process(
  function manul_fwd (line 2267) | def manul_fwd(
  class MemroyRecomputeAttnFunc (line 2319) | class MemroyRecomputeAttnFunc(paddle.autograd.PyLayer):
    method forward (line 2321) | def forward(
    method backward (line 2516) | def backward(ctx, dout):
  class MemroyRecomputeAttn (line 2829) | class MemroyRecomputeAttn(paddle.nn.Layer):
    method __init__ (line 2830) | def __init__(
    method fp8_quant_weight (line 2908) | def fp8_quant_weight(self, quant_transpose=None):
    method forward (line 2912) | def forward(self, q_init, kv_init, position_ids):
  class FusedRMSLinearFunc (line 2942) | class FusedRMSLinearFunc(paddle.autograd.PyLayer):
    method forward (line 2944) | def forward(ctx, x, rms_norm_weight, q_down_weight, kv_down_weight, eps):
    method backward (line 2965) | def backward(ctx, d_q, d_kv):
  class FusedRMSLinear (line 3015) | class FusedRMSLinear(paddle.nn.Layer):
    method __init__ (line 3016) | def __init__(self, hidden_size, q_out_dim, kv_outdim, eps=1e-6) -> None:
    method fp8_quant_weight (line 3040) | def fp8_quant_weight(self, quant_transpose=None):
    method forward (line 3043) | def forward(self, x):
  class FusedRMSLinearSingleFunc (line 3048) | class FusedRMSLinearSingleFunc(paddle.autograd.PyLayer):
    method forward (line 3050) | def forward(ctx, x, rms_norm_weight, linear_weight, eps):
    method backward (line 3059) | def backward(ctx, d_q, d_kv):
  class FusedRMSLinearSingle (line 3070) | class FusedRMSLinearSingle(paddle.nn.Layer):
    method __init__ (line 3071) | def __init__(self, hidden_size, q_out_dim, kv_outdim, eps=1e-6) -> None:
    method forward (line 3088) | def forward(self, x):
  class FastCrossEntropyFunction (line 3093) | class FastCrossEntropyFunction(paddle.autograd.PyLayer):
    method forward (line 3095) | def forward(ctx, preds, labels):
    method backward (line 3103) | def backward(ctx, dout):
  class DeepseekV2LMHead (line 3113) | class DeepseekV2LMHead(nn.Layer):
    method __init__ (line 3114) | def __init__(self, config: DeepseekV2FastConfig, embedding_weight=None):
    method forward (line 3150) | def forward(self, hidden_states, tensor_parallel_output=None):
    method extra_repr (line 3172) | def extra_repr(self):

FILE: paddleformers/cli/train/deepseek_v3_pretrain/modeling_pp.py
  function check_accept_none_grad (line 82) | def check_accept_none_grad():
  function parse_args (line 98) | def parse_args(args):
  function return_args (line 128) | def return_args(hidden_states, attention_mask=None, attn_mask_startend_r...
  function get_attr (line 143) | def get_attr(layer, name):
  function calc_stream_wait (line 150) | def calc_stream_wait(group_id):
  class TensorMeta (line 155) | class TensorMeta:
    method __init__ (line 158) | def __init__(self, tensor):
  class PostProcessNode (line 163) | class PostProcessNode(ScheduleNode):
    method __init__ (line 164) | def __init__(
    method forward_without_residual (line 188) | def forward_without_residual(self, inputs):
    method forward (line 232) | def forward(self, inputs):
    method backward (line 278) | def backward(self, output_grad):
  class DecoderLayerNode (line 334) | class DecoderLayerNode(ScheduleNode):
    method __init__ (line 335) | def __init__(
    method dispatch_forward (line 364) | def dispatch_forward(self, inputs, previous_event=None, allocate_on_co...
    method combine_forward (line 410) | def combine_forward(self, inputs, previous_event=None):
    method dispatch_backward (line 426) | def dispatch_backward(self, output_grad):
    method combine_backward (line 465) | def combine_backward(self, output_grad):
    method forward (line 492) | def forward(self, inputs):
    method backward (line 512) | def backward(self, output_grad=None, scaler=None):
  class OverlapedScheduleChunk (line 535) | class OverlapedScheduleChunk:
    method __init__ (line 536) | def __init__(self, forward_nodes, backward_nodes, use_fuion=True):
    method forward_backward (line 547) | def forward_backward(self, inputs, output_grad, combine_bw_event_to_wa...
  class DecoderBackwardScheduleChunk (line 560) | class DecoderBackwardScheduleChunk:
    method __init__ (line 561) | def __init__(self, nodes):
    method backward (line 564) | def backward(self, output_grad, combine_bw_event_to_wait=None, pp_stre...
  class OverlapedScheduleNode (line 574) | class OverlapedScheduleNode:
    method __init__ (line 575) | def __init__(self, forward_node, backward_node, name=""):
    method forward_backward (line 581) | def forward_backward(self, inputs, output_grad, event_to_wait=None):
  class FusionFp8DecoderLayerNode (line 609) | class FusionFp8DecoderLayerNode(ScheduleNode):
    method __init__ (line 610) | def __init__(
    method attn_forward (line 634) | def attn_forward(self, inputs):
    method dispatch_forward (line 670) | def dispatch_forward(self, inputs, previous_event=None, async_finish=F...
    method mlp_forward (line 698) | def mlp_forward(self, inputs):
    method combine_forward (line 737) | def combine_forward(self, inputs, async_finish=False, previous_event=N...
    method post_process_forward (line 763) | def post_process_forward(self, inputs, with_residual=True):
    method post_process_backward (line 786) | def post_process_backward(self, output_grad, event_to_wait=None):
    method combine_backward (line 820) | def combine_backward(self, output_grad, previous_event=None, async_fin...
    method mlp_backward (line 878) | def mlp_backward(self, output_grad):
    method dispatch_backward (line 910) | def dispatch_backward(self, output_grad, async_finish=False, previous_...
    method attn_backward (line 959) | def attn_backward(self, output_grad):
    method backward_for_fusion (line 1017) | def backward_for_fusion(self, output_grad, combine_bw_event_to_wait=No...
    method forward (line 1088) | def forward(self, inputs):
    method backward (line 1098) | def backward(self, output_grad=None, scaler=None):
  class DenseDecoderLayerNode (line 1109) | class DenseDecoderLayerNode(ScheduleNode):
    method __init__ (line 1110) | def __init__(
    method forward (line 1120) | def forward(self, inputs):
    method backward (line 1125) | def backward(self, output_grad=None, scaler=None):
  class OverlapedFUsionScheduleNode (line 1132) | class OverlapedFUsionScheduleNode:
    method __init__ (line 1133) | def __init__(self, forward_node, backward_node, name=""):
    method forward_backward (line 1141) | def forward_backward(self, inputs, output_grad, combine_bw_event_to_wa...
  class OverlapedDenseFusionScheduleNode (line 1277) | class OverlapedDenseFusionScheduleNode:
    method __init__ (line 1278) | def __init__(self, forward_node, backward_node, name=""):
    method forward_backward (line 1287) | def forward_backward(self, inputs, output_grad, combine_bw_event_to_wa...
  function build_overlapped_nodes (line 1373) | def build_overlapped_nodes(config: DeepseekV2FastConfig, forward_chunk, ...
  class EmbeddingFunction (line 1437) | class EmbeddingFunction(paddle.autograd.PyLayer):
    method forward (line 1439) | def forward(ctx, x, weight):
    method backward (line 1448) | def backward(ctx, dout):
  class DeepseekV2EmbeddingPipe (line 1459) | class DeepseekV2EmbeddingPipe(nn.Layer):
    method __init__ (line 1460) | def __init__(self, config: DeepseekV2FastConfig):
    method embedding_weight (line 1475) | def embedding_weight(self):
    method forward (line 1478) | def forward(self, args):
    method build_schedule_node (line 1558) | def build_schedule_node(self):
  class DeepseekV2DecoderLayerPipe (line 1562) | class DeepseekV2DecoderLayerPipe(DeepseekV2DecoderLayer):
    method forward (line 1563) | def forward(self, args):
    method attn_compute (line 1622) | def attn_compute(self, args):
    method attn_compute_for_fusion (line 1657) | def attn_compute_for_fusion(self, args):
    method mlp_compute (line 1695) | def mlp_compute(self, inputs):
    method post_process_compute (line 1742) | def post_process_compute(self, inputs):
    method post_process_compute_for_fusion (line 1778) | def post_process_compute_for_fusion(self, inputs):
    method attn_compute_dense (line 1803) | def attn_compute_dense(self, args):
    method mlp_compute_dense (line 1821) | def mlp_compute_dense(self, inputs):
    method build_schedule_node (line 1835) | def build_schedule_node(self):
  class DeepseekV2MTPLayerPipe (line 1901) | class DeepseekV2MTPLayerPipe(DeepseekV2MTPLayer):
    method forward (line 1902) | def forward(self, args):
    method attn_compute_for_fusion (line 1970) | def attn_compute_for_fusion(self, args):
    method build_schedule_node (line 2017) | def build_schedule_node(self):
  class DeepseekV2RMSNormPipe (line 2036) | class DeepseekV2RMSNormPipe(nn.Layer):
    method __init__ (line 2037) | def __init__(self, config):
    method forward (line 2042) | def forward(self, args):
    method build_schedule_node (line 2057) | def build_schedule_node(self):
  class DeepseekV2LMHeadPipe (line 2061) | class DeepseekV2LMHeadPipe(DeepseekV2LMHead):
    method __init__ (line 2062) | def __init__(self, config, embedding_weight=None):
    method embedding_weight (line 2066) | def embedding_weight(self):
    method forward (line 2069) | def forward(self, args: Union[Tuple, paddle.Tensor]):
    method build_schedule_node (line 2079) | def build_schedule_node(self):
  class DeepseekV2PretrainingCriterionPipe (line 2083) | class DeepseekV2PretrainingCriterionPipe(DeepseekV2PretrainingCriterionF...
    method forward (line 2084) | def forward(self, logits, labels):
    method build_schedule_node (line 2095) | def build_schedule_node(self):
  class DeepseekV2ForCausalLMPipe (line 2099) | class DeepseekV2ForCausalLMPipe(PipelinePretrainedModel, PipelineLayer):
    method step_flex_token (line 2118) | def step_flex_token(self, cur_step):
    method _prepare_pipeline_inputs_func (line 2122) | def _prepare_pipeline_inputs_func(cls, inputs):
    method __init__ (line 2145) | def __init__(self, config: DeepseekV2FastConfig):
    method fp8_quant_weight (line 2310) | def fp8_quant_weight(self, batch_mode=False, quant_transpose=True):
    method get_loss_fn (line 2323) | def get_loss_fn(self, config):
    method overlapped_forward_backward (line 2326) | def overlapped_forward_backward(

FILE: paddleformers/cli/train/deepseek_v3_pretrain/moe_gate.py
  class PretrainedMoEGate (line 29) | class PretrainedMoEGate(nn.Layer, MoEGateMixin):
    method __init__ (line 30) | def __init__(self, config, num_experts, expert_hidden_size, **kwargs):
    method _priority (line 69) | def _priority(self, topk_idx: paddle.Tensor, capacity: int) -> paddle....
    method _topk_greedy (line 91) | def _topk_greedy(self, scores: paddle.Tensor, k: int) -> Tuple[paddle....
    method _topk_group_limited_greedy (line 106) | def _topk_group_limited_greedy(
    method _topk_noaux_tc (line 138) | def _topk_noaux_tc(
    method top1gating (line 175) | def top1gating(
    method top2gating (line 245) | def top2gating(
    method _cal_seq_aux_loss (line 321) | def _cal_seq_aux_loss(self, gates, top_k, topk_idx) -> paddle.Tensor:
    method topkgating (line 359) | def topkgating(
    method topkgating_nodrop (line 438) | def topkgating_nodrop(self, gates: paddle.Tensor):

FILE: paddleformers/cli/train/deepseek_v3_pretrain/moe_layer.py
  function record_stream_for_multi_input (line 59) | def record_stream_for_multi_input(x):
  function stop_gradient_for_multi_input (line 67) | def stop_gradient_for_multi_input(x):
  class MoELayer (line 74) | class MoELayer(nn.Layer):
    method __init__ (line 75) | def __init__(
    method update_flex_token (line 150) | def update_flex_token(self):
    method _parse_moe_expert_parallel (line 166) | def _parse_moe_expert_parallel(self, n_routed_experts, expert_model_pa...
    method _post_init (line 176) | def _post_init(self):
    method forward (line 187) | def forward(
    method forward_drop_token (line 208) | def forward_drop_token(
    method expert_forward (line 327) | def expert_forward(self, dispatched_input):
    method forward_flex_token (line 338) | def forward_flex_token(self, hidden_states: paddle.Tensor, probs=None,...
    method get_tokens_per_expert (line 381) | def get_tokens_per_expert(self):
    method set_tokens_per_expert (line 384) | def set_tokens_per_expert(self, tokens_per_expert_list):
    method pre_dispatch_compute (line 387) | def pre_dispatch_compute(self, hidden_states):
    method post_dispatch_compute (line 395) | def post_dispatch_compute(self, hidden_states, dispatched_indices, dis...
    method pre_combine_compute (line 401) | def pre_combine_compute(self, hidden_states, token_permuted_indices, p...
    method post_combine_compute (line 407) | def post_combine_compute(self, hidden_states):
  class MoEFlexTokenLayer (line 412) | class MoEFlexTokenLayer(nn.Layer):
    method __init__ (line 413) | def __init__(self, config, n_routed_experts, expert_class, expert_kwar...
    method expert_forward (line 429) | def expert_forward(self, dispatched_input, tokens_per_expert):
    method forward (line 441) | def forward(self, hidden_states: paddle.Tensor):
    method forward_flex_token (line 452) | def forward_flex_token(self, hidden_states: paddle.Tensor, probs=None,...
    method get_tokens_per_expert (line 495) | def get_tokens_per_expert(self):
    method set_tokens_per_expert (line 498) | def set_tokens_per_expert(self, tokens_per_expert_list):
    method pre_dispatch_compute (line 501) | def pre_dispatch_compute(self, hidden_states):
    method post_dispatch_compute (line 509) | def post_dispatch_compute(self, hidden_states, dispatched_indices, dis...
    method pre_combine_compute (line 515) | def pre_combine_compute(self, hidden_states, token_permuted_indices, p...
    method post_combine_compute (line 521) | def post_combine_compute(self, hidden_states):
  class Fp8DispatchQuantNode (line 526) | class Fp8DispatchQuantNode:
    method __init__ (line 527) | def __init__(self, token_dispatcher, dsv3_use_fp8_dispatch, name="fp8_...
    method forward (line 534) | def forward(self, hidden_states, probs, routing_map):
    method backward (line 562) | def backward(self, hs_grad, token_probs_grad):
  class Fp8DispatchNode (line 574) | class Fp8DispatchNode:
    method __init__ (line 575) | def __init__(self, token_dispatcher, name="fp8_dispatch_node"):
    method forward (line 581) | def forward(
    method backward (line 610) | def backward(
  class Fp8CombineNode (line 629) | class Fp8CombineNode:
    method __init__ (line 630) | def __init__(self, token_dispatcher, name="fp8_combine_node"):
    method forward (line 636) | def forward(self, hidden_states_out, previous_event=None, async_finish...
    method backward (line 651) | def backward(self, output_combine_grad, previous_event=None, async_fin...
  class Fp8CombineQuantNode (line 662) | class Fp8CombineQuantNode:
    method __init__ (line 663) | def __init__(self, token_dispatcher, dsv3_use_fp8_dispatch, moe_group=...
    method forward (line 670) | def forward(self, output_combine):
    method backward (line 679) | def backward(self, output_grad, event_to_wait=None):
  class FusionMlpNode (line 705) | class FusionMlpNode:
    method __init__ (line 710) | def __init__(
    method set_recompute_fwd_gate_up (line 746) | def set_recompute_fwd_gate_up(self, recompute_fwd_gate_up):
    method reset_statue (line 749) | def reset_statue(self):
    method prepare_env_subbatch (line 774) | def prepare_env_subbatch(self, unzipped_tokens=None, unzipped_tokens_s...
    method gemm_forward_subbatch (line 791) | def gemm_forward_subbatch(
    method gemm_backward_subbatch (line 826) | def gemm_backward_subbatch(
    method forward (line 887) | def forward(self, hs_2d_dispatched, dispatched_indices, dispatched_pro...
    method backward (line 1025) | def backward(self, hidden_states_out_grad):
  class FusionMoeNode (line 1128) | class FusionMoeNode:
    method __init__ (line 1129) | def __init__(
    method forward (line 1162) | def forward(self, hidden_states, probs, routing_map):
    method backward (line 1190) | def backward(self, output_grad):
  class FusionMoe (line 1205) | class FusionMoe(paddle.autograd.PyLayer):
    method forward (line 1207) | def forward(
    method backward (line 1226) | def backward(ctx, output_grad):

FILE: paddleformers/cli/train/deepseek_v3_pretrain/moe_utils.py
  function _clear_to_zero_allocation (line 29) | def _clear_to_zero_allocation(self):
  function _holder_size (line 45) | def _holder_size(self):
  function topk_to_permuted_indices (line 57) | def topk_to_permuted_indices(x, num_tokens_per_expert_list, topk):
  function permute_fast (line 69) | def permute_fast(
  function unpermute_fast (line 90) | def unpermute_fast(
  class UnZipNode (line 132) | class UnZipNode:
    method __init__ (line 133) | def __init__(self, name="unzip"):
    method reset_statue (line 138) | def reset_statue(self):
    method forward (line 143) | def forward(
    method backward (line 189) | def backward(self, dx, total_zipped_tokens, probs_grad, dispatched_ind...
  class ZipNode (line 203) | class ZipNode:
    method __init__ (line 204) | def __init__(self, name="zip"):
    method forward (line 208) | def forward(
    method backward (line 218) | def backward(
  class PermuteNode (line 264) | class PermuteNode:
    method __init__ (line 265) | def __init__(self, token_dispatcher, name="permute"):
    method reset_status (line 269) | def reset_status(self):
    method forward (line 273) | def forward(self, hidden_states, hidden_states_scale, dispatched_indic...
    method backward (line 287) | def backward(self, out_grad, dispatched_probs):
  class UnPermuteNode (line 300) | class UnPermuteNode:
    method __init__ (line 301) | def __init__(self, token_dispatcher, name="unpermute"):
    method reset_status (line 305) | def reset_status(self):
    method forward (line 314) | def forward(
    method backward (line 352) | def backward(self, out_grad, out_grad_scale):
  function tokens_zip_unique_add_with_subbatch (line 383) | def tokens_zip_unique_add_with_subbatch(zipped, unzipped, index_unzipped...
  function merge_subbatch_cast (line 409) | def merge_subbatch_cast(x, dtype):
  function get_env_device (line 420) | def get_env_device():

FILE: paddleformers/cli/train/deepseek_v3_pretrain/token_dispatcher.py
  class _DeepepManager (line 31) | class _DeepepManager(_DispatchManager):
    method __init__ (line 53) | def __init__(
    method setup_metadata (line 74) | def setup_metadata(self, routing_map: paddle.Tensor, probs: paddle.Ten...
    method dispatch (line 82) | def dispatch(
    method _indices_to_multihot (line 94) | def _indices_to_multihot(self, indices, probs):
    method get_dispatched_metadata (line 119) | def get_dispatched_metadata(self) -> paddle.Tensor:
    method get_number_of_tokens_per_expert (line 122) | def get_number_of_tokens_per_expert(self) -> paddle.Tensor:
    method combine (line 128) | def combine(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
    method get_permuted_hidden_states_by_experts (line 134) | def get_permuted_hidden_states_by_experts(self, hidden_states: paddle....
    method get_permuted_hidden_states_by_experts_fast (line 146) | def get_permuted_hidden_states_by_experts_fast(
    method get_restored_hidden_states_by_experts (line 156) | def get_restored_hidden_states_by_experts(self, hidden_states: paddle....
    method get_restored_hidden_states_by_experts_fast (line 168) | def get_restored_hidden_states_by_experts_fast(
  class MoETokenDispatcher (line 187) | class MoETokenDispatcher:
    method __init__ (line 192) | def __init__(self, ep_group) -> None:
    method ep_group (line 199) | def ep_group(self):
    method ep_size (line 204) | def ep_size(self):
    method token_permutation (line 209) | def token_permutation(self, tokens: paddle.Tensor, probs: paddle.Tenso...
    method token_unpermutation (line 223) | def token_unpermutation(self, expert_output: paddle.Tensor, bias: padd...
  class MoEFlexTokenDispatcher (line 236) | class MoEFlexTokenDispatcher(MoETokenDispatcher):
    method __init__ (line 241) | def __init__(self, num_local_experts: int, moe_router_topk: int, num_m...
    method token_permutation (line 253) | def token_permutation(
    method token_unpermutation (line 266) | def token_unpermutation(
  class MoEFlexTokenDispatcherFast (line 277) | class MoEFlexTokenDispatcherFast:
    method __init__ (line 282) | def __init__(self, num_local_experts: int, moe_router_topk: int, num_m...
    method ep_group (line 295) | def ep_group(self):
    method ep_size (line 300) | def ep_size(self):
    method pre_dispatch (line 304) | def pre_dispatch(self, hidden_states, probs, routing_map):
    method post_dispatch (line 314) | def post_dispatch(self, hidden_states, dispatched_indices):
    method pre_combine (line 322) | def pre_combine(self, hidden_states, token_permuted_indices, prob_perm...
    method post_combine (line 328) | def post_combine(self, hidden_states):
    method token_permutation (line 332) | def token_permutation(
    method token_unpermutation (line 350) | def token_unpermutation(
  class PreDispatchNode (line 368) | class PreDispatchNode:
    method __init__ (line 369) | def __init__(self, token_dispatcher):
    method reset_status (line 373) | def reset_status(self):
    method forward (line 379) | def forward(self, routing_map, probs):
    method backward (line 394) | def backward(self, token_probs_g):

FILE: paddleformers/cli/train/deepseek_v3_pretrain/utils/convert_ckpt_to_sft.py
  function paddle_name_to_hf_names (line 45) | def paddle_name_to_hf_names(paddle_name: str) -> List[str]:
  function _handle_expert_weights (line 134) | def _handle_expert_weights(hf_prefix: str, rest: str) -> Optional[List[s...
  function _handle_shared_expert_weights (line 149) | def _handle_shared_expert_weights(hf_prefix: str, rest: str) -> Optional...
  function _handle_mlp_weights (line 162) | def _handle_mlp_weights(hf_prefix: str, rest: str) -> Optional[List[str]]:
  function _is_need_transpose (line 172) | def _is_need_transpose(key):
  function prepare_tensor (line 191) | def prepare_tensor(key, value):
  function load_pretrained_ckpt (line 218) | def load_pretrained_ckpt(ckpt_path, output_path):

FILE: paddleformers/cli/train/deepseek_v3_pretrain/utils/load_hf_ckpt.py
  function paddle_name_to_hf_names_ds_v2 (line 57) | def paddle_name_to_hf_names_ds_v2(paddle_name: str) -> List[str]:
  function paddle_name_to_hf_names (line 132) | def paddle_name_to_hf_names(paddle_name: str) -> List[str]:
  function _get_hf_prefix (line 200) | def _get_hf_prefix(segment_id: int, id_in_segment: int) -> str:
  function _handle_expert_weights (line 210) | def _handle_expert_weights(hf_prefix: str, rest: str) -> Optional[List[s...
  function _handle_shared_expert_weights (line 225) | def _handle_shared_expert_weights(hf_prefix: str, rest: str) -> Optional...
  function _handle_mlp_weights (line 238) | def _handle_mlp_weights(hf_prefix: str, rest: str) -> Optional[List[str]]:
  function prepare_tensor (line 248) | def prepare_tensor(tensor, dst_shape, *, force_transpose=False):
  function load_huggingface_ckpt (line 278) | def load_huggingface_ckpt(model, huggingface_ckpt_path):

FILE: paddleformers/cli/train/deepseek_v3_pretrain/workflow.py
  class PreTrainingArguments (line 64) | class PreTrainingArguments(TrainingArguments):
    method __post_init__ (line 91) | def __post_init__(self):
  class DataArguments (line 111) | class DataArguments:
  class ModelArguments (line 144) | class ModelArguments:
  function create_pretrained_dataset (line 175) | def create_pretrained_dataset(
  function get_train_data_file (line 249) | def get_train_data_file(args):
  class PretrainingTrainer (line 274) | class PretrainingTrainer(Trainer):
    method __init__ (line 275) | def __init__(self, *args, **kwargs):
    method evaluate (line 279) | def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_pre...
    method _get_eval_sampler (line 319) | def _get_eval_sampler(self, eval_dataset) -> Optional[paddle.io.Sampler]:
    method _get_train_sampler (line 329) | def _get_train_sampler(self) -> Optional[paddle.io.Sampler]:
  function run_dsv3_pretrain (line 340) | def run_dsv3_pretrain(model_args, data_args, generating_args, training_a...

FILE: paddleformers/cli/train/dpo/data_config.py
  class DataConfig (line 21) | class DataConfig:

FILE: paddleformers/cli/train/dpo/dpo_argument.py
  class DPOTrainingArguments (line 30) | class DPOTrainingArguments(TrainingArguments):
    method __post_init__ (line 59) | def __post_init__(self):
  class DPOConfig (line 93) | class DPOConfig:
  class DPODataArgument (line 112) | class DPODataArgument(DataConfig):
  class DPOModelArgument (line 122) | class DPOModelArgument:

FILE: paddleformers/cli/train/dpo/dpo_estimate_training.py
  function calculate_acc_steps (line 31) | def calculate_acc_steps(num_samples, train_batch, dataset_world_size, pe...
  function dpo_estimate_training (line 59) | def dpo_estimate_training(tokenizer, data_args, training_args, dataset_c...

FILE: paddleformers/cli/train/dpo/dpo_trainer.py
  function disable_dropout_in_model (line 44) | def disable_dropout_in_model(model: paddle.nn.Layer) -> None:
  class DPOTrainer (line 51) | class DPOTrainer(Trainer):
    method __init__ (line 56) | def __init__(
    method get_batch_metrics (line 124) | def get_batch_metrics(self, ref_model, model, batch, train_eval="train"):
    method compute_loss (line 193) | def compute_loss(self, model, inputs):
    method _wrap_ref_model (line 206) | def _wrap_ref_model(self, model):
    method _wrap_model (line 227) | def _wrap_model(self, model, training=True):
    method evaluate (line 245) | def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_pre...
    method prediction_step (line 252) | def prediction_step(self, model, inputs, prediction_loss_only=False, i...
    method store_metrics (line 278) | def store_metrics(self, metrics, train_eval="train"):
    method log (line 283) | def log(self, logs, **kwargs):
    method fleet_prediction_pipeline_step (line 301) | def fleet_prediction_pipeline_step(
    method prediction_pipeline_step (line 407) | def prediction_pipeline_step(
    method log_metric (line 492) | def log_metric(
    method training_pipeline_step (line 542) | def training_pipeline_step(self, model, inputs):
    method disable_lora (line 624) | def disable_lora(self, model):
    method enable_lora (line 630) | def enable_lora(self, model):
    method reset_dpo_infohub (line 636) | def reset_dpo_infohub(self):
    method broadcast_last_stage_infohub_tensor (line 641) | def broadcast_last_stage_infohub_tensor(self):
  function prepare_pipeline_dpo_inputs_func (line 688) | def prepare_pipeline_dpo_inputs_func(inputs):
  function _prepare_pipeline_dpo_inputs_func_fleet (line 732) | def _prepare_pipeline_dpo_inputs_func_fleet(inputs):
  function fleet_merge_dpo_labels (line 771) | def fleet_merge_dpo_labels(labels, logprobs):

FILE: paddleformers/cli/train/dpo/workflow.py
  function run_dpo (line 65) | def run_dpo(

FILE: paddleformers/cli/train/ernie_pretrain/model_config.py
  class ModelConfig (line 22) | class ModelConfig:

FILE: paddleformers/cli/train/ernie_pretrain/models/comm_utils.py
  function scatter (line 33) | def scatter(input, group=None, axis=0):
  function mp_slice (line 51) | def mp_slice(x, indices=None, group=None, axis=0):
  function all_gather_varlen (line 68) | def all_gather_varlen(input, indices, group=None, axis=0, sync_op=True):
  function scatter_varlen (line 90) | def scatter_varlen(x, recv_tensor, indices, src_rank, group, sync_op=True):
  function all_gather (line 112) | def all_gather(input, group=None, axis=0):
  function reduce_scatter (line 131) | def reduce_scatter(input, group=None):
  function subbatch (line 148) | def subbatch(f, arg_idx, axis, bs, out_idx, use_recompute=False, same_ar...
  function gather_varlen (line 193) | def gather_varlen(input, dst, group, offload_pp_data_chunk_size=0, all_s...
  function profile (line 293) | def profile(name, use_event=True):

FILE: paddleformers/cli/train/ernie_pretrain/models/ernie/configuration.py
  class ErnieMoEConfig (line 60) | class ErnieMoEConfig(PretrainedConfig):
    method __init__ (line 72) | def __init__(
    method __setattr__ (line 398) | def __setattr__(self, name: str, value):
    method register_nonsaveable_keys (line 409) | def register_nonsaveable_keys(self, keys):
    method use_moe (line 418) | def use_moe(self) -> bool:
    method to_json_string (line 421) | def to_json_string(self, use_diff: bool = True) -> str:

FILE: paddleformers/cli/train/ernie_pretrain/models/ernie/modeling.py
  function get_triangle_upper_mask (line 131) | def get_triangle_upper_mask(x, mask=None):
  function gqa_qkv_split_func (line 143) | def gqa_qkv_split_func(
  function gqa_qkv_merge_func (line 173) | def gqa_qkv_merge_func(weight_list, num_attention_heads, num_key_value_h...
  function parallel_matmul (line 194) | def parallel_matmul(
  function calc_lm_head_logits (line 235) | def calc_lm_head_logits(config, hidden_states, weight, bias, tensor_para...
  function finfo (line 265) | def finfo(dtype: paddle.dtype = None):
  function masked_fill (line 283) | def masked_fill(x, mask, value):
  function mem_eff_attn (line 288) | def mem_eff_attn(query, key, value, pack_offset, drop_prob=0.0, dtype=pa...
  function inbatch_pack_offset_to_attn_mask_start_row_indices (line 325) | def inbatch_pack_offset_to_attn_mask_start_row_indices(inbatch_pack_offs...
  function scaled_dot_product_attention (line 340) | def scaled_dot_product_attention(
  function _make_causal_mask (line 458) | def _make_causal_mask(input_ids_shape, past_key_values_length, dtype):
  function _expand_mask (line 472) | def _expand_mask(mask, dtype, tgt_length):
  class FusedDropoutImpl (line 487) | class FusedDropoutImpl(nn.Layer):
    method __init__ (line 488) | def __init__(self, prob, mode):
    method forward (line 495) | def forward(self, x, y):
  class RMSNorm (line 503) | class RMSNorm(nn.Layer):
    method __init__ (line 504) | def __init__(self, config):
    method forward (line 518) | def forward(self, hidden_states):
  class RotaryEmbedding (line 534) | class RotaryEmbedding(nn.Layer):
    method __init__ (line 535) | def __init__(self, dim, max_position_embeddings=4096, base=10000):
    method forward (line 551) | def forward(self, x, seq_len=None):
    method rotate_half (line 559) | def rotate_half(cls, x):
    method apply_rotary_pos_emb (line 566) | def apply_rotary_pos_emb(cls, q, k, cos, sin, offset: int = 0, positio...
  class RopeEmbeddingLegacy (line 584) | class RopeEmbeddingLegacy(nn.Layer):
    method __init__ (line 585) | def __init__(self, head_dim, compression_ratio=1.0, base=10000, freq_a...
    method forward (line 592) | def forward(self, seq_length, position_ids=None):
    method apply_rotary (line 608) | def apply_rotary(self, rp, q, k):
    method apply_rotary_3d (line 630) | def apply_rotary_3d(self, rp, q, k, position_ids):
    method forward_single (line 698) | def forward_single(self, position_ids):
    method apply_rotary_single (line 713) | def apply_rotary_single(x, rope_emb):
  class ErnieMLP (line 721) | class ErnieMLP(nn.Layer):
    method __init__ (line 722) | def __init__(self, config):
    method forward (line 802) | def forward(self, x):
  class ErnieAttention (line 839) | class ErnieAttention(nn.Layer):
    method __init__ (line 840) | def __init__(self, config, layer_idx=0):
    method forward (line 1001) | def forward(
    method rope_attn (line 1099) | def rope_attn(
  class ErnieDecoderLayer (line 1199) | class ErnieDecoderLayer(nn.Layer):
    method __init__ (line 1200) | def __init__(self, config, layer_idx=0):
    method forward (line 1214) | def forward(
  class ErniePretrainedModel (line 1269) | class ErniePretrainedModel(PretrainedModel):
    method _get_name_mappings (line 1274) | def _get_name_mappings(cls, config: ErnieMoEConfig) -> StateDictNameMa...
    method _get_tensor_parallel_mappings (line 1345) | def _get_tensor_parallel_mappings(cls, config, is_split=True):
    method _init_weights (line 1430) | def _init_weights(self, layer):
  class ErnieModel (line 1478) | class ErnieModel(ErniePretrainedModel):
    method __init__ (line 1479) | def __init__(self, config: ErnieMoEConfig):
    method get_input_embeddings (line 1506) | def get_input_embeddings(self):
    method set_input_embeddings (line 1509) | def set_input_embeddings(self, value):
    method _prepare_decoder_attention_mask (line 1513) | def _prepare_decoder_attention_mask(cls, attention_mask, input_shape, ...
    method recompute_training (line 1532) | def recompute_training(
    method forward (line 1562) | def forward(
  class FusedHeadParallelCrossEntropy (line 1696) | class FusedHeadParallelCrossEntropy(PyLayer):
    method forward (line 1698) | def forward(
    method backward (line 1816) | def backward(ctx, loss_all_grad, labels_all_grad):
  class ErniePretrainingCriterion (line 1934) | class ErniePretrainingCriterion(paddle.nn.Layer):
    method __init__ (line 1935) | def __init__(self, config, return_tuple=True):
    method forward (line 1950) | def forward(self, prediction_scores, masked_lm_labels):
    method forward_impl_with_fused_head_loss_fn (line 2006) | def forward_impl_with_fused_head_loss_fn(self, masked_lm_labels, hidde...
    method forward_impl_with_calc_logits (line 2041) | def forward_impl_with_calc_logits(self, masked_lm_labels, hidden_state...
    method loss_impl (line 2053) | def loss_impl(self, prediction_scores, masked_lm_labels):
    method forward_impl (line 2059) | def forward_impl(self, prediction_scores, masked_lm_labels):
  class ErnieLMHead (line 2114) | class ErnieLMHead(nn.Layer):
    method __init__ (line 2115) | def __init__(self, config):
    method forward (line 2154) | def forward(self, hidden_states, tensor_parallel_output=None):
    method sharded_state_dict (line 2173) | def sharded_state_dict(
  class ErnieForCausalLM (line 2182) | class ErnieForCausalLM(ErniePretrainedModel):
    method __init__ (line 2185) | def __init__(self, config):
    method _post_init (line 2217) | def _post_init(self, original_init, *args, **kwargs):
    method get_input_embeddings (line 2226) | def get_input_embeddings(self):
    method set_input_embeddings (line 2229) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 2232) | def get_output_embeddings(self):
    method set_output_embeddings (line 2235) | def set_output_embeddings(self, new_embeddings):
    method set_decoder (line 2238) | def set_decoder(self, decoder):
    method get_decoder (line 2241) | def get_decoder(self):
    method prepare_attention_mask_for_generation (line 2245) | def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos...
    method prepare_inputs_for_generation (line 2258) | def prepare_inputs_for_generation(
    method update_model_kwargs_for_generation (line 2289) | def update_model_kwargs_for_generation(self, outputs, model_kwargs, is...
    method forward (line 2327) | def forward(
    method sharded_state_dict (line 2386) | def sharded_state_dict(self, *args, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/models/ernie/modeling_moe.py
  class BaseModelOutputWithPastAndCrossAttentions (line 109) | class BaseModelOutputWithPastAndCrossAttentions(_BaseModelOutput):
  class CausalLMOutputWithCrossAttentions (line 116) | class CausalLMOutputWithCrossAttentions(_CausalLMOutput):
  function get_gate (line 136) | def get_gate(
  function build_mpdp_group (line 190) | def build_mpdp_group():
  function _parse_moe_group (line 211) | def _parse_moe_group(
  function moe_ep2mp (line 258) | def moe_ep2mp(state_dict: Dict[str, paddle.Tensor], config: ErnieMoEConf...
  function moe_statedict_cherry_pick (line 307) | def moe_statedict_cherry_pick(state_dict: Dict[str, paddle.Tensor], conf...
  function moe_statedict_upcycle (line 332) | def moe_statedict_upcycle(
  class ErnieMoeMLP (line 504) | class ErnieMoeMLP(ErnieMLP):
    method __init__ (line 505) | def __init__(self, config, is_shared_expert=False):
    method forward (line 517) | def forward(self, x, use_comm=True):
  class ErnieMoeDenseExpert (line 579) | class ErnieMoeDenseExpert(nn.Layer):
    method __init__ (line 580) | def __init__(self, config):
    method forward (line 628) | def forward(self, x):
  class BMMLinear (line 655) | class BMMLinear(nn.Layer):
    method __init__ (line 656) | def __init__(self, experts, d_in, d_out, use_bias=False):
    method forward (line 664) | def forward(self, x):
  class ErnieMoeMLPFused (line 670) | class ErnieMoeMLPFused(nn.Layer):
    method __init__ (line 671) | def __init__(self, config):
    method __len__ (line 689) | def __len__(self):
    method __iter__ (line 692) | def __iter__(self):
    method forward (line 695) | def forward(self, x):
  class FusedLinearAddNormFunc (line 705) | class FusedLinearAddNormFunc(paddle.autograd.PyLayer):
    method forward (line 707) | def forward(ctx, x, residual, linear_weight, rms_norm_weight, eps):
    method backward (line 717) | def backward(ctx, d_rms_norm_out, d_residual_out):
  class FusedLinearAddNorm (line 736) | class FusedLinearAddNorm(paddle.nn.Layer):
    method __init__ (line 737) | def __init__(self, hidden_size, eps=1e-6) -> None:
    method forward (line 755) | def forward(self, x, residual):
  class FusedRMSLinearFunc (line 759) | class FusedRMSLinearFunc(paddle.autograd.PyLayer):
    method forward (line 761) | def forward(ctx, x, rms_norm_weight, linear_weight, eps):
    method backward (line 769) | def backward(ctx, d_qkv):
  class FusedRMSLinear (line 779) | class FusedRMSLinear(paddle.nn.Layer):
    method __init__ (line 780) | def __init__(self, hidden_size, eps=1e-6, num_heads=1, num_key_value_h...
    method forward (line 799) | def forward(self, x):
  class ErnieMoEAttention (line 803) | class ErnieMoEAttention(ErnieAttention):
    method __init__ (line 804) | def __init__(self, config, layer_idx):
    method forward (line 827) | def forward(
  class FakeMoERouterLoss (line 947) | class FakeMoERouterLoss(PyLayer):
    method forward (line 949) | def forward(ctx, x, router_loss, num_acc_steps, enable_delay_scale_loss):
    method backward (line 957) | def backward(ctx, out_grad):
  class ErnieDecoderLayer (line 966) | class ErnieDecoderLayer(nn.Layer):
    method __init__ (line 967) | def __init__(self, config, layer_idx):
    method training (line 1053) | def training(self):
    method training (line 1057) | def training(self, new):
    method fp8_quant_weight (line 1063) | def fp8_quant_weight(self):
    method _init_gate_and_experts (line 1068) | def _init_gate_and_experts(self, layer_idx):
    method _init_shared_experts (line 1119) | def _init_shared_experts(self):
    method _init_dense_experts (line 1137) | def _init_dense_experts(self, layer_idx):
    method forward (line 1160) | def forward(
    method model_parallel_dropout (line 1244) | def model_parallel_dropout(self):
  class ErniePretrainedModel (line 1251) | class ErniePretrainedModel(PretrainedModel):
    method _get_name_mappings (line 1256) | def _get_name_mappings(cls, config: ErnieMoEConfig) -> StateDictNameMa...
    method _get_tensor_parallel_mappings (line 1326) | def _get_tensor_parallel_mappings(cls, config, is_split=True):
    method _init_weights (line 1446) | def _init_weights(self, layer):
  class ErnieModel (line 1525) | class ErnieModel(ErniePretrainedModel):
    method __init__ (line 1526) | def __init__(self, config: ErnieMoEConfig):
    method get_input_embeddings (line 1591) | def get_input_embeddings(self):
    method set_input_embeddings (line 1594) | def set_input_embeddings(self, value):
    method _prepare_decoder_attention_mask (line 1598) | def _prepare_decoder_attention_mask(cls, attention_mask, input_shape, ...
    method recompute_training (line 1617) | def recompute_training(
    method forward (line 1672) | def forward(
  class ErniePretrainingCriterion (line 1905) | class ErniePretrainingCriterion(ErniePretrainingCriterionBase):
    method __init__ (line 1906) | def __init__(self, config, return_tuple=True):
    method forward (line 1921) | def forward(self, prediction_scores, masked_lm_labels, router_loss=Non...
  class ErnieMoEForCausalLM (line 1974) | class ErnieMoEForCausalLM(ErniePretrainedModel):
    method __init__ (line 1977) | def __init__(self, config):
    method _post_init (line 2002) | def _post_init(self, original_init, *args, **kwargs):
    method set_state_dict (line 2034) | def set_state_dict(self, state_dict, *args, **kwargs):
    method get_input_embeddings (line 2052) | def get_input_embeddings(self):
    method set_input_embeddings (line 2055) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 2058) | def get_output_embeddings(self):
    method set_output_embeddings (line 2061) | def set_output_embeddings(self, new_embeddings):
    method set_decoder (line 2064) | def set_decoder(self, decoder):
    method get_decoder (line 2067) | def get_decoder(self):
    method prepare_attention_mask_for_generation (line 2071) | def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos...
    method prepare_inputs_for_generation (line 2084) | def prepare_inputs_for_generation(
    method update_model_kwargs_for_generation (line 2116) | def update_model_kwargs_for_generation(self, outputs, model_kwargs, is...
    method forward (line 2155) | def forward(
    method sharded_state_dict (line 2224) | def sharded_state_dict(self, *args, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/models/ernie/modeling_pp.py
  class ErnieEmbeddingPipe (line 95) | class ErnieEmbeddingPipe(nn.Layer):
    method __init__ (line 96) | def __init__(self, config):
    method embedding_weight (line 112) | def embedding_weight(self):
    method forward (line 115) | def forward(self, args):
  class MTPEmbeddingPipe (line 209) | class MTPEmbeddingPipe(ErnieEmbeddingPipe):
    method __init__ (line 210) | def __init__(self, config):
    method embedding_weight (line 214) | def embedding_weight(self):
    method forward (line 217) | def forward(self, args):
  class EmptyLayer (line 236) | class EmptyLayer(nn.Layer):
    method __init__ (line 237) | def __init__(self):
    method forward (line 240) | def forward(self, x):
  class ErnieDecoderLayerPipe (line 244) | class ErnieDecoderLayerPipe(ErnieDecoderLayer):
    method __init__ (line 245) | def __init__(self, config, layer_idx, use_full_recompute=False):
    method forward (line 252) | def forward(self, args):
  class RMSNormPipe (line 356) | class RMSNormPipe(RMSNorm):
    method __init__ (line 357) | def __init__(self, config):
    method forward (line 362) | def forward(self, args):
  class ErnieMoELMHeadPipe (line 390) | class ErnieMoELMHeadPipe(ErnieMoELMHead):
    method forward (line 391) | def forward(self, args):
  class MTPLayer (line 402) | class MTPLayer(nn.Layer):
    method __init__ (line 403) | def __init__(self, config):
    method forward (line 437) | def forward(self, args):
    method forward_impl (line 446) | def forward_impl(self, *args):
  class ErniePretrainingCriterionPipe (line 523) | class ErniePretrainingCriterionPipe(ErniePretrainingCriterion):
    method __init__ (line 524) | def __init__(self, config):
    method forward (line 527) | def forward(self, logits, labels):
  class PipelinePretrainedModel (line 541) | class PipelinePretrainedModel(PretrainedModel):
    method __init__ (line 542) | def __init__(self, config, *args, **kwargs):
    method init (line 546) | def init(self, config, *args, **kwargs):
    method add_sequential_layer (line 551) | def add_sequential_layer(self, layer_desc, name_prefix=""):
    method get_sequential_layers (line 554) | def get_sequential_layers(self):
    method get_sequential_name_prefixs (line 557) | def get_sequential_name_prefixs(self):
    method get_shardlayer_prefix (line 560) | def get_shardlayer_prefix(self, name_splited):
    method _set_pipeline_name_mapping (line 571) | def _set_pipeline_name_mapping(self, mappings=None):
    method _check_shared_model_state (line 631) | def _check_shared_model_state(self):
    method state_dict (line 652) | def state_dict(self, *args, **kwargs):
    method _init_weights (line 664) | def _init_weights(self, layer):
    method sharded_state_dict (line 736) | def sharded_state_dict(self, *args, **kwargs):
  function get_pp_vp_split_layers (line 770) | def get_pp_vp_split_layers(config):
  class ErnieMoEForCausalLMPipe (line 802) | class ErnieMoEForCausalLMPipe(PipelinePretrainedModel, PipelineLayer):
    method _prepare_pipeline_inputs_func (line 815) | def _prepare_pipeline_inputs_func(cls, data):
    method __init__ (line 849) | def __init__(
    method get_loss_fn (line 1000) | def get_loss_fn(self, config):
    method rename_model_params (line 1003) | def rename_model_params(self, func):
    method fp8_quant_weight (line 1010) | def fp8_quant_weight(self):
    method _post_init (line 1016) | def _post_init(self, original_init, *args, **kwargs):
    method set_state_dict (line 1033) | def set_state_dict(self, state_dict, *args, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/models/fp8_linear.py
  function fp8_gemm (line 44) | def fp8_gemm(
  function padding (line 94) | def padding(x, axis):
  class Fp8FusedMlpFunc (line 118) | class Fp8FusedMlpFunc(paddle.autograd.PyLayer):
    method forward (line 128) | def forward(ctx, x, w1, w2):
    method backward (line 208) | def backward(ctx, do3):
  class MemEfficientFp8FusedMlpFunc (line 320) | class MemEfficientFp8FusedMlpFunc(paddle.autograd.PyLayer):
    method forward (line 333) | def forward(ctx, x, w1, w2):
    method backward (line 393) | def backward(ctx, do3):
  class Fp8FusedMlp (line 515) | class Fp8FusedMlp(paddle.nn.Layer):
    method __init__ (line 526) | def __init__(self, config):
    method forward (line 557) | def forward(self, x):

FILE: paddleformers/cli/train/ernie_pretrain/models/moe/moe_layer.py
  function set_grad_in_dtype_non_consistent (line 73) | def set_grad_in_dtype_non_consistent(ctx):
  class Fp8MoeGateDispatchAndQuant (line 79) | class Fp8MoeGateDispatchAndQuant(paddle.autograd.PyLayer):
    method forward (line 83) | def forward(
    method backward (line 135) | def backward(ctx, *grads):
  function recompute_fwd_gate_up_func (line 154) | def recompute_fwd_gate_up_func(config, layer_idx):
  class MoEStatics (line 164) | class MoEStatics(nn.Layer):
    method __init__ (line 165) | def __init__(self, config, layer_idx):
  class GateCombine (line 196) | class GateCombine(PyLayer):
    method forward (line 198) | def forward(ctx, x, combine_weights, scatter_index):
    method backward (line 206) | def backward(ctx, grad_y, *_):
  class FusionFP8Expert (line 215) | class FusionFP8Expert(paddle.autograd.PyLayer):
    method forward (line 217) | def forward(ctx, hidden_states, custom_map):
    method backward (line 234) | def backward(ctx, output_grad):
  class AlltoAll (line 245) | class AlltoAll(PyLayer):
    method forward (line 247) | def forward(ctx, x, group, sync_op=True):
    method backward (line 260) | def backward(ctx, *dx):
  class AlltoAllExpertOverlap (line 264) | class AlltoAllExpertOverlap(PyLayer):
    method forward (line 266) | def forward(ctx, input, group, num_local_experts, forward_func_dict, i...
    method backward (line 302) | def backward(ctx, out_grad):
  class AlltoAllAsync (line 321) | class AlltoAllAsync(PyLayer):
    method forward (line 323) | def forward(ctx, x, *fn_args, group=None, fn=None, is_first_fwd=False):
    method backward (line 344) | def backward(ctx, dx_out, *fn_out_grads):
  function dispatching (line 364) | def dispatching(x, dispatch_mask, scatter_index, num_experts, capacity):
  function combining_fused (line 391) | def combining_fused(x, combine_weights, scatter_index, hard_gate=False):
  class ReshapeKeepGradDtype (line 400) | class ReshapeKeepGradDtype(PyLayer):
    method forward (line 402) | def forward(ctx, x, shape):
    method backward (line 408) | def backward(ctx, grad):
  class MOELayer (line 412) | class MOELayer(nn.Layer):
    method __init__ (line 420) | def __init__(
    method forward_experts (line 523) | def forward_experts(self, dispatched_input):
    method fp8_quant_weight (line 559) | def fp8_quant_weight(self):
    method fused_gate_logits_process (line 590) | def fused_gate_logits_process(self, gate_logits, token_type_ids, offlo...
    method gate_distpach_and_quant (line 603) | def gate_distpach_and_quant(self, input, token_type_ids):
    method gate_and_distpach (line 688) | def gate_and_distpach(self, input, token_type_ids):
    method _calc_router_loss (line 800) | def _calc_router_loss(
    method calc_router_loss_and_logging (line 831) | def calc_router_loss_and_logging(
    method combine_expert_output (line 854) | def combine_expert_output(self, expert_output, combine_weights, scatte...
    method forward_single_stage (line 862) | def forward_single_stage(self, dispatched_input, stage_id):
    method all2all_expert_overlap (line 866) | def all2all_expert_overlap(self, x, group):
    method forward (line 893) | def forward(
    method sharded_state_dict (line 1067) | def sharded_state_dict(
  class FP8FusedWLCHFunc (line 1079) | class FP8FusedWLCHFunc(paddle.autograd.PyLayer):
    method forward (line 1081) | def forward(
    method backward (line 1134) | def backward(ctx, output_grad):
  class MlpNode (line 1149) | class MlpNode:
    method __init__ (line 1150) | def __init__(self, custom_map, max_topk, recompute_fwd_gate_up=False, ...
    method reset_status (line 1165) | def reset_status(self):
    method release_mem (line 1173) | def release_mem(self):
    method forward (line 1178) | def forward(self, hs_2d_dispatched, dispatched_indices, dispatched_pro...
    method backward (line 1219) | def backward(self, hidden_states_out_grad):
  class Fp8FusedMoeFunc (line 1243) | class Fp8FusedMoeFunc(paddle.autograd.PyLayer):
    method forward (line 1245) | def forward(
    method backward (line 1269) | def backward(ctx, output_grad):

FILE: paddleformers/cli/train/ernie_pretrain/models/moe/token_dispatcher/fp8_utils.py
  function _get_fp8_weight_and_scale (line 44) | def _get_fp8_weight_and_scale(weight, stacked=False, transpose=False):
  function fused_stack_transpose_quant (line 64) | def fused_stack_transpose_quant(weight_list, transpose=False):
  function split_group_gemm (line 82) | def split_group_gemm(x_fp8, x_scale, w_fp8, w_scale, tokens_per_expert, ...
  function has_config (line 120) | def has_config(config_map, key):
  class ExpertsGroupGemmNode (line 124) | class ExpertsGroupGemmNode:
    method __init__ (line 137) | def __init__(self, experts, custom_map, name="moe_experts_node"):
    method reset_status (line 161) | def reset_status(self):
    method fwd_gate_up (line 167) | def fwd_gate_up(self, x_bf16, expert_w1, expert_w_count, tokens_per_ex...
    method fwd_swiglu (line 224) | def fwd_swiglu(self, o1):
    method fwd_down (line 240) | def fwd_down(self, o1, unzipped_probs, expert_w_count, tokens_per_expe...
    method fwd_down_no_probs (line 301) | def fwd_down_no_probs(self, o1, expert_w2, expert_w_count, tokens_per_...
    method bwd_down_input (line 359) | def bwd_down_input(self, expert_w2, unzipped_grad, tokens_per_expert, ...
    method bwd_down_input_no_prob (line 429) | def bwd_down_input_no_prob(self, expert_w2, unzipped_grad, tokens_per_...
    method bwd_swiglu (line 469) | def bwd_swiglu(self, o1, do2):
    method bwd_gate_up_input (line 486) | def bwd_gate_up_input(self, do1, expert_w1, tokens_per_expert, expecte...
    method bwd_down_weight (line 544) | def bwd_down_weight(self, out_grad, o2, expert_w2):
    method bwd_gate_up_weight (line 628) | def bwd_gate_up_weight(self, do1, input_x, expert_w1):
    method forward (line 707) | def forward(self, hs_out, unzipped_probs, tokens_per_expert):
    method backward (line 726) | def backward(self, out_grad, tokens_per_expert, dispatched_indices, ex...
    method forward_no_prob (line 741) | def forward_no_prob(self, hs_out, tokens_per_expert):
    method backward_no_prob (line 753) | def backward_no_prob(self, out_grad, tokens_per_expert):
  class ExpertsGroupGemmContiguousNode (line 775) | class ExpertsGroupGemmContiguousNode:
    method __init__ (line 788) | def __init__(
    method reset_status (line 833) | def reset_status(self):
    method gen_m_indices (line 842) | def gen_m_indices(self, tokens_per_expert):
    method fwd_gate_up (line 863) | def fwd_gate_up(self, x, expert_w1, num_expert, tokens_per_expert, sca...
    method fwd_swiglu (line 937) | def fwd_swiglu(self, o1):
    method fwd_down (line 941) | def fwd_down(self, o1, unzipped_probs, expert_w2, num_expert):
    method bwd_down_input (line 1007) | def bwd_down_input(self, expert_w2, unzipped_grad, o1):
    method bwd_swiglu (line 1084) | def bwd_swiglu(self, o1, do2):
    method bwd_gate_up_input (line 1088) | def bwd_gate_up_input(self, do1, expert_w1):
    method fused_transpose_split_quant (line 1145) | def fused_transpose_split_quant(self, x, tokens_per_expert, pow_2_scal...
    method bwd_down_weight (line 1170) | def bwd_down_weight(self, do3, o2, expert_w2):
    method bwd_gate_up_weight (line 1246) | def bwd_gate_up_weight(self, do1, input_x, expert_w1):
    method forward (line 1310) | def forward(
    method backward (line 1334) | def backward(self, out_grad, a2a_async_fn=None):
  class ExpertsGroupGemmWLCHNode (line 1427) | class ExpertsGroupGemmWLCHNode(ExpertsGroupGemmContiguousNode):
    method __init__ (line 1443) | def __init__(
    method gen_m_indices (line 1479) | def gen_m_indices(self, tokens_per_expert):
    method fused_transpose_split_quant (line 1499) | def fused_transpose_split_quant(self, x, tokens_per_expert, pow_2_scal...

FILE: paddleformers/cli/train/ernie_pretrain/models/moe/token_dispatcher/moe_utils.py
  function inplace_offload (line 24) | def inplace_offload(x):
  function inplace_offload_if_needed (line 41) | def inplace_offload_if_needed(x, threshold=2 * 1024 * 1024 * 1024):
  function topk_to_permuted_indices_single (line 61) | def topk_to_permuted_indices_single(x, num_tokens, expert_id, topk):
  function topk_to_permuted_indices (line 81) | def topk_to_permuted_indices(x, num_tokens_per_expert_list, topk):
  function permute (line 105) | def permute(
  function unpermute (line 128) | def unpermute(
  class UnZipNode (line 163) | class UnZipNode:
    method __init__ (line 178) | def __init__(self, token_dispatcher, name="unzip"):
    method reset_status (line 190) | def reset_status(self):
    method forward (line 196) | def forward(
    method backward (line 240) | def backward(self, dx, hidden_states_out_grad, probs_grad, dispatched_...
  class ZipNode (line 268) | class ZipNode:
    method __init__ (line 281) | def __init__(self, token_dispatcher, name="zip"):
    method forward (line 292) | def forward(
    method backward (line 326) | def backward(

FILE: paddleformers/cli/train/ernie_pretrain/models/moe/top2_gate.py
  function cal_aux_loss_func (line 37) | def cal_aux_loss_func(
  function masked_fill (line 87) | def masked_fill(x, mask, value):
  class CalAuxLossFunctor (line 92) | class CalAuxLossFunctor(paddle.autograd.PyLayer):
    method forward (line 94) | def forward(
    method backward (line 124) | def backward(ctx, out_grad):
  function cast_if_needed (line 132) | def cast_if_needed(x, dtype):
  class FusedGateDetachMatmul (line 136) | class FusedGateDetachMatmul(paddle.autograd.PyLayer):
    method forward (line 138) | def forward(ctx, x, w):
    method backward (line 144) | def backward(ctx, y_grad):
  function gate_detach_matmul (line 157) | def gate_detach_matmul(x, weight, use_fuse):
  function compute_optimal_transport (line 166) | def compute_optimal_transport(M, r, c, lam=1.0, epsilon=1e-8, max_iters:...
  class Top2Gate (line 180) | class Top2Gate(nn.Layer):
    method __init__ (line 193) | def __init__(self, config, layer_idx: int, group, gate_weight=None) ->...
    method _create_gate_parameter (line 261) | def _create_gate_parameter(self):
    method forward (line 271) | def forward(
    method get_capacity (line 303) | def get_capacity(self, num_tokens, cap_factor=None):
    method top2_gating (line 318) | def top2_gating(self, logits, cap=None, correction_bias=None):
    method _cal_aux_loss (line 390) | def _cal_aux_loss(
    method _cal_orthogonal_loss (line 435) | def _cal_orthogonal_loss(self, weight_id=None, use_group=None):
    method _cal_orthogonal_loss_opt_each_weight (line 450) | def _cal_orthogonal_loss_opt_each_weight(self, weight, use_group):
  function cal_orthogonal_loss_opt_each_weight_func (line 457) | def cal_orthogonal_loss_opt_each_weight_func(weight, moe_k, use_group, e...
  class TopKGateFused (line 475) | class TopKGateFused(Top2Gate):
    method forward (line 476) | def forward(

FILE: paddleformers/cli/train/ernie_pretrain/models/sequence_parallel_utils.py
  function get_hcg (line 49) | def get_hcg():
  function get_async_loader (line 56) | def get_async_loader():
  function hack_offload_wait (line 69) | def hack_offload_wait(task):
  function hack_reload_wait (line 73) | def hack_reload_wait(task):
  class ScatterOp (line 77) | class ScatterOp(PyLayer):
    method forward (line 79) | def forward(ctx, input, axis=0, group=None):
    method backward (line 85) | def backward(ctx, grad):
  class GatherOp (line 89) | class GatherOp(PyLayer):
    method forward (line 91) | def forward(ctx, input, axis=0, group=None):
    method backward (line 97) | def backward(ctx, grad):
  class AllGatherOp (line 101) | class AllGatherOp(PyLayer):
    method forward (line 103) | def forward(ctx, input, group=None):
    method backward (line 108) | def backward(ctx, grad):
  class ReduceScatterOp (line 112) | class ReduceScatterOp(PyLayer):
    method forward (line 114) | def forward(ctx, input, group=None):
    method backward (line 120) | def backward(ctx, grad):
  class AllGatherVarlenOp (line 124) | class AllGatherVarlenOp(PyLayer):
    method forward (line 126) | def forward(ctx, input, group=None):
    method backward (line 165) | def backward(ctx, grad):
  class GemmReduceScatterOp (line 179) | class GemmReduceScatterOp(PyLayer):
    method forward (line 181) | def forward(ctx, input, weight, group):
    method backward (line 188) | def backward(ctx, grad):
  class AllGatherGemmOp (line 209) | class AllGatherGemmOp(PyLayer):
    method forward (line 211) | def forward(ctx, input, weight, group):
    method backward (line 219) | def backward(ctx, grad):
  function sequence_parallel_sparse_mask_labels (line 236) | def sequence_parallel_sparse_mask_labels(labels, ignore_label=-100):
  function mark_as_sequence_parallel_parameter (line 252) | def mark_as_sequence_parallel_parameter(parameter):
  function is_sequence_parallel_parameter (line 256) | def is_sequence_parallel_parameter(parameter):
  function create_fused_allreduce_gradient_hook (line 260) | def create_fused_allreduce_gradient_hook(parameter_list, accumulation_st...
  function create_non_fused_allreduce_gradient_hook (line 277) | def create_non_fused_allreduce_gradient_hook(param, model, verbose=False):
  function register_sequence_parallel_allreduce_hooks (line 300) | def register_sequence_parallel_allreduce_hooks(model, fuse_sequence_para...
  function is_fused_matmul_bias_supported (line 323) | def is_fused_matmul_bias_supported():
  class ColumnSequenceParallelLinear (line 339) | class ColumnSequenceParallelLinear(Layer):
    method __init__ (line 340) | def __init__(
    method forward (line 432) | def forward(self, x, use_comm=True):
    method sharded_state_dict (line 452) | def sharded_state_dict(
  class MPScale (line 460) | class MPScale(PyLayer):
    method forward (line 462) | def forward(ctx, x, mp_degree):
    method backward (line 467) | def backward(ctx, dout):
  class RowSequenceParallelLinear (line 471) | class RowSequenceParallelLinear(Layer):
    method __init__ (line 472) | def __init__(
    method forward (line 568) | def forward(self, x):
    method sharded_state_dict (line 599) | def sharded_state_dict(

FILE: paddleformers/cli/train/ernie_pretrain/models/utils.py
  function get_global_training_logs (line 31) | def get_global_training_logs():
  function global_training_logs_enabled (line 49) | def global_training_logs_enabled():
  function inplace_offload (line 54) | def inplace_offload(tensor):
  function detach_and_requires_grad_ (line 59) | def detach_and_requires_grad_(*args):
  class FakeClone (line 67) | class FakeClone(paddle.autograd.PyLayer):
    method forward (line 69) | def forward(ctx, input):
    method backward (line 78) | def backward(ctx, grad_output):
  function manual_backward (line 82) | def manual_backward(f: Callable, is_first_fwd: bool, *args: List[Any]):
  class FakeGather (line 120) | class FakeGather(paddle.autograd.PyLayer):
    method forward (line 122) | def forward(ctx, input, indices):
    method backward (line 132) | def backward(ctx, grad_output):
  class FusedUnpermutation (line 141) | class FusedUnpermutation(paddle.autograd.PyLayer):
    method forward (line 143) | def forward(
    method backward (line 180) | def backward(ctx, output_tokens_grad):

FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/fp8_quant_weight_callback.py
  function enable_in_dict_config (line 25) | def enable_in_dict_config(config, key):
  class FP8QuantWeightCallback (line 32) | class FP8QuantWeightCallback(TrainerCallback):
    method on_step_begin (line 33) | def on_step_begin(self, args, state, control, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/gc_callback.py
  class GCCallback (line 20) | class GCCallback(TrainerCallback):
    method on_train_begin (line 21) | def on_train_begin(self, args, state, control, **kwargs):
    method on_step_end (line 25) | def on_step_end(self, args, state, control, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/logging_callback.py
  class LoggingCallback (line 22) | class LoggingCallback(TrainerCallback):
    method __init__ (line 23) | def __init__(
    method on_log (line 28) | def on_log(self, args, state, control, logs=None, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/moe_correction_bias_adjust_callback.py
  class MoECorrectionBiasAdjustCallback (line 30) | class MoECorrectionBiasAdjustCallback(TrainerCallback):
    method __init__ (line 31) | def __init__(self, lr, use_sp):
    method on_optimizer_end (line 36) | def on_optimizer_end(self, args, state, control, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/moe_logging_callback.py
  function tensor_md5 (line 46) | def tensor_md5(tensor):
  class GlobalRNGCallback (line 52) | class GlobalRNGCallback(TrainerCallback):
    method on_step_end (line 53) | def on_step_end(self, args, state, control, model, **kwargs):
  class MoeLoggingCallback (line 58) | class MoeLoggingCallback(TrainerCallback):
    method __init__ (line 59) | def __init__(self, optimizer):
    method on_log (line 70) | def on_log(self, args, state, control, logs=None, **kwargs):
    method on_step_end (line 76) | def on_step_end(self, args, state, control, model, **kwargs):
    method on_save (line 115) | def on_save(self, args, state, control, model, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/ortho_loss_callback.py
  class OrthogonalCallback (line 24) | class OrthogonalCallback(TrainerCallback):
    method __init__ (line 25) | def __init__(self, ortho_loss_lambda):
    method on_optimizer_end (line 28) | def on_optimizer_end(self, args, state, control, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/sp_grad_sync_callback.py
  class SPGradSyncCallback (line 30) | class SPGradSyncCallback(TrainerCallback):
    method __init__ (line 31) | def __init__(self, model):
    method on_optimizer_begin (line 43) | def on_optimizer_begin(self, args, state, control, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/tensorboard_callback.py
  function is_tensorboard_available (line 29) | def is_tensorboard_available():
  function rewrite_logs (line 33) | def rewrite_logs(d):
  class TensorBoardCallback (line 49) | class TensorBoardCallback(TrainerCallback):
    method __init__ (line 50) | def __init__(
    method _init_summary_writer (line 91) | def _init_summary_writer(self, args, log_dir=None):
    method on_train_begin (line 96) | def on_train_begin(self, args, state, control, **kwargs):
    method on_log (line 120) | def on_log(self, args, state, control, logs=None, **kwargs):
    method on_train_end (line 183) | def on_train_end(self, args, state, control, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/src/clip/moe_clip.py
  class ClipGradForMOEByGlobalNorm (line 28) | class ClipGradForMOEByGlobalNorm(ClipGradBase):
    method __init__ (line 29) | def __init__(
    method __str__ (line 49) | def __str__(self):
    method get_l2_norm_pow (line 53) | def get_l2_norm_pow(params_grads, sum_dtype=None):
    method _dygraph_clip (line 101) | def _dygraph_clip(self, params_grads):

FILE: paddleformers/cli/train/ernie_pretrain/src/lr_schedulers/cosine_lr.py
  function get_cosine_schedule_with_warmup (line 24) | def get_cosine_schedule_with_warmup(

FILE: paddleformers/cli/train/ernie_pretrain/src/lr_schedulers/wsd_lr.py
  function get_wsd_schedule_with_warmup (line 20) | def get_wsd_schedule_with_warmup(

FILE: paddleformers/cli/train/ernie_pretrain/src/tokenizers/tokenization_eb_v2.py
  class ErnieBotTokenizer (line 29) | class ErnieBotTokenizer(PretrainedTokenizer):
    method __init__ (line 40) | def __init__(
    method space_token (line 72) | def space_token(self):
    method space_token_id (line 76) | def space_token_id(self):
    method gend_token (line 80) | def gend_token(self):
    method gend_token_id (line 84) | def gend_token_id(self):
    method im_start_id (line 88) | def im_start_id(self):
    method im_end_id (line 92) | def im_end_id(self):
    method vocab_size (line 96) | def vocab_size(self):
    method get_vocab (line 99) | def get_vocab(self):
    method _tokenize (line 104) | def _tokenize(self, text):
    method _convert_token_to_id (line 107) | def _convert_token_to_id(self, token):
    method _convert_id_to_token (line 110) | def _convert_id_to_token(self, id):
    method convert_tokens_to_string (line 113) | def convert_tokens_to_string(self, tokens):
    method prepare_for_model (line 126) | def prepare_for_model(self, *args, **kwargs):
    method save_vocabulary (line 131) | def save_vocabulary(self, save_directory, filename_prefix: Optional[st...
    method tokenize (line 147) | def tokenize(self, text: TextInput, **kwargs) -> List[str]:
    method _decode (line 169) | def _decode(self, *args, **kwargs):
    method _pad (line 179) | def _pad(
  function add_special_tokens (line 239) | def add_special_tokens(

FILE: paddleformers/cli/train/ernie_pretrain/src/trainers/data_parallel.py
  class DataParallel (line 22) | class DataParallel(paddle.DataParallel):
    method init_reducer (line 23) | def init_reducer(self):
  function sync_dp_moe_params_across_sharding (line 74) | def sync_dp_moe_params_across_sharding(model: paddle.nn.Layer) -> None:

FILE: paddleformers/cli/train/ernie_pretrain/src/trainers/dygraph_optimizer/hybrid_parallel_optimizer.py
  class HybridParallelClipGrad (line 37) | class HybridParallelClipGrad:
    method __init__ (line 38) | def __init__(self, clip, hcg, timers=None):
    method _global_norm (line 53) | def _global_norm(
    method _dygraph_clip (line 142) | def _dygraph_clip(self, params_grads):
    method _comm_and_clip (line 277) | def _comm_and_clip(
    method __getattr__ (line 330) | def __getattr__(self, item):
    method __call__ (line 333) | def __call__(self, params_grads):
  class HybridParallelOptimizer (line 337) | class HybridParallelOptimizer(HPBase):
    method __init__ (line 338) | def __init__(self, optimizer, hcg, strategy):

FILE: paddleformers/cli/train/ernie_pretrain/src/trainers/pretraining_trainer.py
  function distributed_optimizer_maybe_overwrite (line 109) | def distributed_optimizer_maybe_overwrite(
  class PreTrainingArguments (line 134) | class PreTrainingArguments(TrainingArguments):
    method use_moe (line 286) | def use_moe(self):  # noqa: F811
    method use_moe (line 290) | def use_moe(self, value):
    method need_data (line 295) | def need_data(self):
    method combine_batch (line 299) | def combine_batch(self):
    method reeao_dataset_rank (line 303) | def reeao_dataset_rank(self):
    method reeao_dataset_world_size (line 307) | def reeao_dataset_world_size(self):
    method __post_init__ (line 310) | def __post_init__(self):
  class WeightedDistributedSampler (line 424) | class WeightedDistributedSampler(PaddleNLPDistributedBatchSampler):
    method __init__ (line 425) | def __init__(
    method set_epoch (line 471) | def set_epoch(self, epoch=0, consumed_samples=0):
    method gen_data_seq (line 476) | def gen_data_seq(self):
    method load_data_seq_from_cache (line 489) | def load_data_seq_from_cache(self):
    method gen_data_seq_weighted (line 502) | def gen_data_seq_weighted(self, num_examples, data_type=None):
    method roundup_and_shard (line 580) | def roundup_and_shard(self, indices):
    method __len__ (line 602) | def __len__(self):
    method __iter__ (line 605) | def __iter__(self):
  class DummySampler (line 673) | class DummySampler(PaddleNLPDistributedBatchSampler):
    method __init__ (line 674) | def __init__(self, dataset, batch_size=1, **kwargs):
    method __len__ (line 677) | def __len__(self):
    method __iter__ (line 680) | def __iter__(self):
  class PretrainingTrainer (line 685) | class PretrainingTrainer(Trainer):
    method __init__ (line 686) | def __init__(self, args=None, model=None, callbacks=[], **kwargs):
    method autocast_smart_context_manager (line 707) | def autocast_smart_context_manager(self):
    method _load_optimizer_state (line 739) | def _load_optimizer_state(self, checkpoint):
    method _save_moe_weights (line 788) | def _save_moe_weights(self, output_dir):
    method _wrap_model (line 835) | def _wrap_model(self, model, training=True):
    method _new_gradclip (line 1004) | def _new_gradclip(self):
    method evaluate (line 1050) | def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_pre...
    method prediction_pipeline_step (line 1081) | def prediction_pipeline_step(self, model, inputs, prediction_loss_only...
    method restore_dataloader_status (line 1087) | def restore_dataloader_status(self):
    method _get_eval_sampler (line 1132) | def _get_eval_sampler(self, eval_dataset) -> Optional[paddle.io.Sampler]:
    method _get_train_sampler (line 1142) | def _get_train_sampler(self) -> Optional[paddle.io.Sampler]:
    method _maybe_log_save_evaluate (line 1152) | def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_...
    method create_scheduler (line 1316) | def create_scheduler(self, num_training_steps):
    method create_optimizer (line 1340) | def create_optimizer(self, lr_scheduler=None):
    method save_model (line 1404) | def save_model(self, output_dir=None):
    method _load_rng_state (line 1410) | def _load_rng_state(self, checkpoint):

FILE: paddleformers/cli/train/ernie_pretrain/src/utils/logging.py
  function setup_logger_output_file (line 41) | def setup_logger_output_file(outputpath, local_rank):

FILE: paddleformers/cli/train/ernie_pretrain/src/utils/misc.py
  class SmoothedValue (line 42) | class SmoothedValue:
    method __init__ (line 43) | def __init__(
    method update (line 52) | def update(self, value):
    method global_avg (line 63) | def global_avg(self):
    method reset (line 66) | def reset(self):
  class TrainingLogs (line 71) | class TrainingLogs:
    method __new__ (line 74) | def __new__(cls, *args, **kw):
    method __init__ (line 79) | def __init__(self):
    method set_trainer_interval (line 87) | def set_trainer_interval(self, trainer, logging_interval):
    method global_meters_keys (line 92) | def global_meters_keys(self):
    method global_meters_keys (line 96) | def global_meters_keys(self, lst):
    method enable_skip_zero (line 99) | def enable_skip_zero(self, keys=[]):
    method update (line 107) | def update(self, **kwargs):
    method is_enabled (line 111) | def is_enabled(self):
    method __setitem__ (line 114) | def __setitem__(self, k, v):
    method __getitem__ (line 122) | def __getitem__(self, v):
    method __getattr__ (line 125) | def __getattr__(self, attr):
    method dict (line 132) | def dict(self, use_async=False):
    method reset (line 186) | def reset(self):
    method take_snapshot (line 191) | def take_snapshot(self):
    method restore_snapshot (line 194) | def restore_snapshot(self):

FILE: paddleformers/cli/train/ernie_pretrain/src/utils/seed_utils.py
  function set_seed (line 26) | def set_seed(seed):

FILE: paddleformers/cli/train/ernie_pretrain/src/utils/training_utils.py
  function reset_per_device_batch_size (line 20) | def reset_per_device_batch_size(global_batch_size, per_device_train_batc...

FILE: paddleformers/cli/train/ernie_pretrain/workflow.py
  function log_trainer_start (line 84) | def log_trainer_start():
  function load_huggingface_checkpoint (line 94) | def load_huggingface_checkpoint(model, args):
  function get_expected_state_dict (line 189) | def get_expected_state_dict(model, **kwargs):
  function update_model_config_from_args (line 257) | def update_model_config_from_args(config: ErnieMoEConfig, model_args: di...
  function get_tp_split_ckpt (line 267) | def get_tp_split_ckpt(args, path):
  class AllArguments (line 279) | class AllArguments(PreTrainingArguments):
    method __post_init__ (line 280) | def __post_init__(self):
  class ExpConfig (line 285) | class ExpConfig:
  function create_pretrained_dataset (line 291) | def create_pretrained_dataset(args):
  function run_ernie_pretrain (line 337) | def run_ernie_pretrain(model_args, data_args, generating_args, training_...

FILE: paddleformers/cli/train/sft/dataset_formatting.py
  function conversations_formatting_function (line 31) | def conversations_formatting_function(tokenizer: AutoTokenizer, messages...
  function instructions_formatting_function (line 49) | def instructions_formatting_function(tokenizer: AutoTokenizer):
  function paddleformers_instructions_formatting_function (line 75) | def paddleformers_instructions_formatting_function(tokenizer: AutoTokeni...
  function get_formatting_func_from_dataset (line 101) | def get_formatting_func_from_dataset(dataset: Union[Dataset], tokenizer:...

FILE: paddleformers/cli/train/sft/make_data_utils.py
  class DataGenerator (line 18) | class DataGenerator:
    method __init__ (line 21) | def __init__(self, data_source):
    method __iter__ (line 34) | def __iter__(self):
    method __next__ (line 41) | def __next__(self):

FILE: paddleformers/cli/train/sft/sft_config.py
  class SFTConfig (line 30) | class SFTConfig(TrainingArguments):
    method __post_init__ (line 87) | def __post_init__(self):

FILE: paddleformers/cli/train/sft/sft_trainer.py
  class SFTTrainer (line 58) | class SFTTrainer(Trainer):
    method __init__ (line 59) | def __init__(
    method _prepare_dataset (line 197) | def _prepare_dataset(
    method _prepare_non_packed_dataloader (line 248) | def _prepare_non_packed_dataloader(
    method prediction_step (line 319) | def prediction_step(
    method log (line 378) | def log(self, logs: Dict[str, float], **kwargs) -> None:
    method get_ptq_dataloader (line 386) | def get_ptq_dataloader(self, ptq_ds):
    method ptq_loop (line 411) | def ptq_loop(

FILE: paddleformers/cli/train/sft/workflow.py
  function create_pretrained_dataset (line 89) | def create_pretrained_dataset(training_args, data_args, model_args):
  function run_sft (line 167) | def run_sft(
  function create_peft_model (line 735) | def create_peft_model(model_args, training_args, dtype, model):

FILE: paddleformers/cli/train/tuner.py
  function check_path (line 25) | def check_path(path):
  function _training_function (line 33) | def _training_function(config: dict[str, Any]) -> None:
  function run_tuner (line 71) | def run_tuner(args: Optional[dict[str, Any]] = None) -> None:

FILE: paddleformers/cli/utils/llm_utils.py
  function compute_metrics (line 44) | def compute_metrics(eval_preds):
  function get_lora_target_modules (line 55) | def get_lora_target_modules(model):
  function get_infer_model_path (line 424) | def get_infer_model_path(input_dir, model_prefix):
  function deserialize_from_file (line 432) | def deserialize_from_file(fp):
  function get_alibi_slopes (line 462) | def get_alibi_slopes(num_heads):
  function pad_batch_data (line 477) | def pad_batch_data(insts, masks=None, pad_id=0, return_seq_len=False, pa...
  function dybatch_preprocess (line 505) | def dybatch_preprocess(
  function load_real_time_tokens (line 735) | def load_real_time_tokens():
  function init_chat_template (line 752) | def init_chat_template(
  function get_model_max_position_embeddings (line 799) | def get_model_max_position_embeddings(config: PretrainedConfig) -> Optio...
  function read_res (line 812) | def read_res(
  function read_res_dynamic_insert (line 850) | def read_res_dynamic_insert(
  function speculate_read_res (line 899) | def speculate_read_res(
  function get_rotary_position_embedding (line 950) | def get_rotary_position_embedding(position_ids, head_dim, rope_theta=100...
  function init_dist_env (line 1000) | def init_dist_env():
  function get_eos_token_id (line 1044) | def get_eos_token_id(
  function set_triton_cache (line 1066) | def set_triton_cache(model_name_or_path, mode):

FILE: paddleformers/cli/utils/mllm_utils.py
  class MLLMModelMapping (line 27) | class MLLMModelMapping:
  class ModelKeys (line 37) | class ModelKeys:
  class MultiModelKeys (line 52) | class MultiModelKeys(ModelKeys):
    method __post_init__ (line 57) | def __post_init__(self):
  function register_multimodel_keys (line 66) | def register_multimodel_keys(multimodel_key: ModelKeys, *, exist_ok: boo...
  function get_multimodel_target_modules (line 73) | def get_multimodel_target_modules(model_type: Optional[str]) -> Optional...
  function get_multimodel_lora_target_modules (line 79) | def get_multimodel_lora_target_modules(model, target_modules, freeze_con...
  function freeze_model_parameters (line 131) | def freeze_model_parameters(model, freeze_config):

FILE: paddleformers/cli/utils/process.py
  function terminate_process_tree (line 26) | def terminate_process_tree(pid: int) -> None:
  function is_env_enabled (line 61) | def is_env_enabled(env_var: str, default: str = "0") -> bool:
  function is_valid_model_dir (line 66) | def is_valid_model_dir(directory: str) -> bool:
  function detect_device (line 75) | def detect_device() -> str:
  function set_ascend_environment (line 98) | def set_ascend_environment():
  function remove_paddle_shm_files (line 163) | def remove_paddle_shm_files():
  function set_cuda_environment (line 174) | def set_cuda_environment():
  function set_env_if_empty (line 197) | def set_env_if_empty(key, value):
  function add_new_special_tokens (line 212) | def add_new_special_tokens(tokenizer, path):

FILE: paddleformers/data/blendable_dataset.py
  function print_rank_0 (line 26) | def print_rank_0(*args, **kwargs):
  class BlendableDataset (line 31) | class BlendableDataset(paddle.io.Dataset):
    method __init__ (line 32) | def __init__(self, datasets, weights, size, share_folder, *, data_cach...
    method __len__ (line 175) | def __len__(self):
    method __getitem__ (line 178) | def __getitem__(self, idx):

FILE: paddleformers/data/causal_dataset.py
  function get_logits (line 35) | def get_logits(batch_ids, max_retries=1, timeout=1200, retry_delay=1, pr...
  function check_data_split (line 82) | def check_data_split(splits_string, do_train, do_eval, do_predict):
  function get_train_valid_test_split_ (line 102) | def get_train_valid_test_split_(splits_string, size):
  function get_datasets_weights_and_num_samples (line 129) | def get_datasets_weights_and_num_samples(data_prefix, train_val_test_num...
  function print_rank_0 (line 160) | def print_rank_0(*args, **kwargs):
  function build_train_valid_test_datasets (line 165) | def build_train_valid_test_datasets(
  function _build_train_valid_test_datasets (line 261) | def _build_train_valid_test_datasets(
  function get_indexed_dataset_ (line 338) | def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
  class GPTDataset (line 350) | class GPTDataset(paddle.io.Dataset):
    method __init__ (line 351) | def __init__(
    method __len__ (line 425) | def __len__(self):
    method __getitem__ (line 430) | def __getitem__(self, idx):
  function _build_index_mappings (line 523) | def _build_index_mappings(
  function _num_tokens (line 713) | def _num_tokens(documents, sizes):
  function _num_epochs (line 718) | def _num_epochs(tokens_per_epoch, seq_length, num_samples):
  function _build_doc_idx (line 733) | def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch):
  function _build_sample_idx (line 749) | def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per...
  function _build_shuffle_idx (line 797) | def _build_shuffle_idx(num_samples, total_size, np_rng):

FILE: paddleformers/data/collate.py
  class Stack (line 26) | class Stack(object):
    method __init__ (line 38) | def __init__(self, axis=0, dtype=None):
    method __call__ (line 42) | def __call__(self, data):
  class Pad (line 72) | class Pad(object):
    method __init__ (line 95) | def __init__(self, pad_val=0, axis=0, ret_length=None, dtype=None, pad...
    method __call__ (line 102) | def __call__(self, data):
  class Tuple (line 169) | class Tuple(object):
    method __init__ (line 187) | def __init__(self, fn, *args):
    method __call__ (line 200) | def __call__(self, data):
  class Dict (line 247) | class Dict(object):
    method __init__ (line 266) | def __init__(self, fn):
    method __call__ (line 280) | def __call__(self, data):

FILE: paddleformers/data/data_collator.py
  class DataCollatorMixin (line 61) | class DataCollatorMixin:
    method __call__ (line 62) | def __call__(self, features, return_tensors=None):
  function default_data_collator (line 73) | def default_data_collator(features: List[InputDataClass], return_tensors...
  function paddle_default_data_collator (line 96) | def paddle_default_data_collator(features: List[InputDataClass]) -> Dict...
  function numpy_default_data_collator (line 128) | def numpy_default_data_collator(features: List[InputDataClass]) -> Dict[...
  class DefaultDataCollator (line 162) | class DefaultDataCollator(DataCollatorMixin):
    method __call__ (line 179) | def __call__(self, features: List[Dict[str, Any]], return_tensors=None...
  class DataCollatorWithPadding (line 186) | class DataCollatorWithPadding:
    method __call__ (line 202) | def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
  class DataCollatorForTokenClassification (line 225) | class DataCollatorForTokenClassification(DataCollatorMixin):
    method paddle_call (line 262) | def paddle_call(self, features):
    method numpy_call (line 299) | def numpy_call(self, features):
  class DataCollatorForSeq2Seq (line 330) | class DataCollatorForSeq2Seq:
    method __call__ (line 376) | def __call__(self, features, return_tensors=None):
  class DataCollatorForEmbedding (line 434) | class DataCollatorForEmbedding:
    method __call__ (line 448) | def __call__(self, batch, return_tensors=None) -> Any:
    method process_data (line 507) | def process_data(self, data, pad_idx, max_len):
    method pad_batch_data (line 516) | def pad_batch_data(insts, pad_id=0, max_seq_len=None, return_seq_len=F...
    method gen_self_attn_mask (line 531) | def gen_self_attn_mask(batch_token_ids: List[List[int]], max_seq_len: ...
    method gen_attn_mask_start_row_indices (line 543) | def gen_attn_mask_start_row_indices(batch_token_ids: List[List[int]], ...
  function _paddle_collate_batch (line 561) | def _paddle_collate_batch(examples, tokenizer, pad_to_multiple_of: Optio...
  function _numpy_collate_batch (line 599) | def _numpy_collate_batch(examples, tokenizer, pad_to_multiple_of: Option...
  function tolist (line 633) | def tolist(x):
  class DataCollatorForLanguageModeling (line 642) | class DataCollatorForLanguageModeling(DataCollatorMixin):
    method paddle_call (line 671) | def paddle_call(self, examples: List[Union[List[int], Any, Dict[str, A...
    method paddle_mask_tokens (line 695) | def paddle_mask_tokens(self, inputs: Any, special_tokens_mask: Optiona...
    method numpy_call (line 736) | def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, An...
    method numpy_mask_tokens (line 758) | def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional...

FILE: paddleformers/data/dist_dataloader.py
  class DummyDataset (line 27) | class DummyDataset(paddle.io.Dataset):
    method __len__ (line 32) | def __len__(self):
  class IterableDummyDataset (line 36) | class IterableDummyDataset(paddle.io.IterableDataset):
    method __iter__ (line 37) | def __iter__(self):
  class DistDataLoader (line 41) | class DistDataLoader(paddle.io.DataLoader):
    method __init__ (line 46) | def __init__(
    method _dataloader_iter (line 132) | def _dataloader_iter(self):
    method __len__ (line 137) | def __len__(self):
    method __iter__ (line 143) | def __iter__(self):
    method _broadcast_data (line 146) | def _broadcast_data(self, data):
    method __next__ (line 201) | def __next__(self):
  function init_dataloader_comm_group (line 213) | def init_dataloader_comm_group():

FILE: paddleformers/data/indexed_dataset.py
  function print_rank_0 (line 40) | def print_rank_0(*args, **kwargs):
  function __best_fitting_dtype (line 45) | def __best_fitting_dtype(vocab_size=None):
  function get_available_dataset_impl (line 52) | def get_available_dataset_impl():
  function make_dataset (line 56) | def make_dataset(path, impl, skip_warmup=False):
  function make_sft_dataset (line 72) | def make_sft_dataset(path, dataclass, skip_warmup=False, impl="mmap"):
  function dataset_exists (line 85) | def dataset_exists(path, impl):
  function read_longs (line 92) | def read_longs(f, n):
  function write_longs (line 98) | def write_longs(f, a):
  function read_shorts (line 102) | def read_shorts(f, n):
  function write_shorts (line 108) | def write_shorts(f, a):
  function code (line 126) | def code(dtype):
  function index_file_path (line 133) | def index_file_path(prefix_path):
  function sft_index_file_path (line 137) | def sft_index_file_path(prefix_path):
  function sft_data_file_path (line 141) | def sft_data_file_path(prefix_path, dataclass):
  function data_file_path (line 149) | def data_file_path(prefix_path):
  function loss_mask_file_path (line 153) | def loss_mask_file_path(prefix_path):
  function create_doc_idx (line 157) | def create_doc_idx(sizes):
  class IndexedDataset (line 165) | class IndexedDataset(paddle.io.Dataset):
    method __init__ (line 170) | def __init__(self, path):
    method read_index (line 176) | def read_index(self, path):
    method read_data (line 193) | def read_data(self, path):
    method check_index (line 196) | def check_index(self, i):
    method __del__ (line 200) | def __del__(self):
    method __getitem__ (line 205) | def __getitem__(self, idx):
    method get (line 229) | def get(self, idx, offset=0, length=None):
    method __len__ (line 247) | def __len__(self):
    method num_tokens (line 250) | def num_tokens(self, index):
    method size (line 253) | def size(self, index):
    method exists (line 257) | def exists(path):
    method supports_prefetch (line 261) | def supports_prefetch(self):
    method doc_idx (line 265) | def doc_idx(self):
    method get_doc_idx (line 268) | def get_doc_idx(self):
    method set_doc_idx (line 271) | def set_doc_idx(self, doc_idx_):
  class IndexedDatasetBuilder (line 275) | class IndexedDatasetBuilder(object):
    method __init__ (line 287) | def __init__(self, out_file, dtype=np.int32):
    method add_item (line 296) | def add_item(self, tensor):
    method end_document (line 305) | def end_document(self):
    method merge_file_ (line 308) | def merge_file_(self, another_file):
    method finalize (line 333) | def finalize(self, index_file):
  function _warmup_mmap_file (line 354) | def _warmup_mmap_file(path):
  class MMapIndexedDataset (line 360) | class MMapIndexedDataset(paddle.io.Dataset):
    class Index (line 361) | class Index(object):
      method writer (line 365) | def writer(cls, path, dtype):
      method __init__ (line 410) | def __init__(self, path, skip_warmup=False):
      method __del__ (line 448) | def __del__(self):
      method dtype (line 453) | def dtype(self):
      method sizes (line 457) | def sizes(self):
      method doc_idx (line 461) | def doc_idx(self):
      method __getitem__ (line 465) | def __getitem__(self, i):
      method __len__ (line 468) | def __len__(self):
    method __init__ (line 471) | def __init__(self, path, skip_warmup=False):
    method __getstate__ (line 481) | def __getstate__(self):
    method __setstate__ (line 484) | def __setstate__(self, state):
    method _do_init (line 487) | def _do_init(self, path, skip_warmup):
    method __del__ (line 506) | def __del__(self):
    method __len__ (line 514) | def __len__(self):
    method __getitem__ (line 518) | def __getitem__(self, idx):
    method get (line 537) | def get(self, idx, offset=0, length=None):
    method sizes (line 555) | def sizes(self):
    method doc_idx (line 559) | def doc_idx(self):
    method get_doc_idx (line 562) | def get_doc_idx(self):
    method set_doc_idx (line 565) | def set_doc_idx(self, doc_idx_):
    method supports_prefetch (line 569) | def supports_prefetch(self):
    method exists (line 573) | def exists(path):
  class SFTMMapIndexedDataset (line 577) | class SFTMMapIndexedDataset(paddle.io.Dataset):
    class Index (line 578) | class Index(object):
      method writer (line 582) | def writer(cls, path, dtype):
      method __init__ (line 624) | def __init__(self, path, skip_warmup=False):
      method __del__ (line 662) | def __del__(self):
      method dtype (line 667) | def dtype(self):
      method sizes (line 671) | def sizes(self):
      method doc_idx (line 675) | def doc_idx(self):
      method __getitem__ (line 679) | def __getitem__(self, i):
      method __len__ (line 682) | def __len__(self):
    method __init__ (line 685) | def __init__(self, path, dataclass, skip_warmup=False):
    method __getstate__ (line 694) | def __getstate__(self):
    method __setstate__ (line 697) | def __setstate__(self, state):
    method _do_init (line 700) | def _do_init(self, path, skip_warmup):
    method __del__ (line 719) | def __del__(self):
    method __len__ (line 726) | def __len__(self):
    method __getitem__ (line 729) | def __getitem__(self, idx):
    method sizes (line 767) | def sizes(self):
    method doc_idx (line 771) | def doc_idx(self):
    method get_doc_idx (line 774) | def get_doc_idx(self):
    method set_doc_idx (line 777) | def set_doc_idx(self, doc_idx_):
    method supports_prefetch (line 781) | def supports_prefetch(self):
    method exists (line 785) | def exists(path, dataclass):
  function make_builder (line 794) | def make_builder(out_file, impl, save_dtype, loss_mask_file=None):
  class SFTMMapIndexedDatasetBuilder (line 801) | class SFTMMapIndexedDatasetBuilder(object):
    method __init__ (line 802) | def __init__(self, output_file_dict, dtype, index_file=None):
    method add_item (line 818) | def add_item(self, sequence):
    method add_item_bytes (line 827) | def add_item_bytes(self, serialized):
    method end_document (line 835) | def end_document(self):
    method finalize (line 842) | def finalize(self, index_file):
  class MMapIndexedDatasetBuilder (line 849) | class MMapIndexedDatasetBuilder(object):
    method __init__ (line 850) | def __init__(self, out_file, dtype, loss_mask_file=None):
    method flush_loss_mask_item (line 859) | def flush_loss_mask_item(self, loss_mask_lst):
    method add_item (line 864) | def add_item(self, tensor):
    method add_doc (line 869) | def add_doc(self, tensor, sizes):
    method end_document (line 875) | def end_document(self):
    method merge_file_ (line 878) | def merge_file_(self, another_file):
    method finalize (line 891) | def finalize(self, index_file):
  function get_indexed_dataset_ (line 903) | def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
  class CompatibleIndexedDataset (line 919) | class CompatibleIndexedDataset(paddle.io.Dataset):
    method __init__ (line 920) | def __init__(self, path):
    method __getstate__ (line 934) | def __getstate__(self):
    method __len__ (line 937) | def __len__(self):
    method __getitem__ (line 941) | def __getitem__(self, idx):
    method get (line 960) | def get(self, idx, offset=0, length=None):
    method sizes (line 976) | def sizes(self):
    method doc_idx (line 980) | def doc_idx(self):
    method get_doc_idx (line 983) | def get_doc_idx(self):
    method set_doc_idx (line 986) | def set_doc_idx(self, doc_idx_):
    method exists (line 990) | def exists(path):

FILE: paddleformers/data/sampler.py
  class SamplerHelper (line 22) | class SamplerHelper(object):
    method __init__ (line 46) | def __init__(self, dataset, iterable=None):
    method __iter__ (line 53) | def __iter__(self):
    method __len__ (line 63) | def __len__(self):
    method length (line 72) | def length(self):
    method length (line 86) | def length(self, length):
    method apply (line 89) | def apply(self, fn):
    method shuffle (line 105) | def shuffle(self, buffer_size=-1, seed=None):
    method sort (line 171) | def sort(self, cmp=None, key=None, reverse=False, buffer_size=-1):
    method batch (line 247) | def batch(self, batch_size, drop_last=False, batch_size_fn=None, key=N...
    method shard (line 335) | def shard(self, num_replicas=None, rank=None):
    method list (line 406) | def list(self):

FILE: paddleforme

Copy disabled (too large) Download .json

Condensed preview — 1045 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (16,981K chars).

[
  {
    "path": ".copyright.hook",
    "chars": 4281,
    "preview": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
  },
  {
    "path": ".flake8",
    "chars": 184,
    "preview": "[flake8]\nignore = E203, E402, E501, E731, E741, W503, W605, E722\nmax-line-length = 119\n\n# E402: module level import not "
  },
  {
    "path": ".github/CODE_OF_CONDUCT.md",
    "chars": 2023,
    "preview": "**简体中文**🀄 | [English🌎](./CODE_OF_CONDUCT_en.md)\n\n# 贡献者公约\n\n## 我们的承诺\n\n身为社区成员、贡献者和领袖，我们承诺使社区参与者不受骚扰，无论其年龄、体型、可见或不可见的缺陷、族裔、性"
  },
  {
    "path": ".github/CODE_OF_CONDUCT_en.md",
    "chars": 5531,
    "preview": "[简体中文🀄](./CODE_OF_CONDUCT.md) |  **English**🌎\n\n# Contributor Covenant Code of Conduct\n\n## Our Pledge\n\nWe as members, con"
  },
  {
    "path": ".github/CONTRIBUTING_en.md",
    "chars": 7954,
    "preview": "[简体中文🀄](../CONTRIBUTING.md) |  **English**🌎\n\n# Contributing to PaddleFormers\n\nWe highly welcome and value your contribut"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/ask-question.yml",
    "chars": 638,
    "preview": "name: 🐛 Ask Question\ndescription: 请描述您使用PaddleFormers时遇到的问题\ntitle: \"[Question]: \"\nlabels: \n  - question\nbody:\n- type: ma"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/bug-report.yml",
    "chars": 1002,
    "preview": "name: 🐛 Bug Report\ndescription: PaddleFormers问题反馈\ntitle: \"[Bug]: \"\nlabels: bug\nbody: \n  - type: textarea\n    id: environ"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/docs-report.yml",
    "chars": 633,
    "preview": "name: 🐛 Docs Report\ndescription: PaddleFormers文档反馈\ntitle: \"[Docs]: \"\nlabels: \n  - documentation\n\nbody: \n  - type: textar"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/feature-request.yml",
    "chars": 790,
    "preview": "name: \"\\U0001F680 Feature request\"\ndescription: 请详细描述您所需功能\nlabels: [ \"feature\" ]\nbody:\n  - type: textarea\n    id: featur"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/new-model.yaml",
    "chars": 522,
    "preview": "name: \"\\U0001F31F 添加新模型\"\ndescription: 请为新模型提交一份说明\nlabels: [ \"New model\" ]\n\nbody:\n  - type: textarea\n    id: description-"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/others.yml",
    "chars": 343,
    "preview": "name: 🧩 其他 Others\ndescription: 提出其他问题。\nlabels: [others]\n\nbody:\n- type: markdown\n  attributes:\n    value: >\n      #### 你可"
  },
  {
    "path": ".github/PULL_REQUEST_TEMPLATE.md",
    "chars": 712,
    "preview": "<!-- Demo: https://github.com/PaddlePaddle/PaddleFormers/pull/ -->\n#### Before submitting\n\n- [ ] Lint code. If there are"
  },
  {
    "path": ".github/actions/rerun-workflow/action.yml",
    "chars": 758,
    "preview": "name: 'Rerun Workflow'\ndescription: 'Re-run GitHub Actions workflow for a given Pull Request'\ninputs:\n  GITHUB_TOKEN:\n  "
  },
  {
    "path": ".github/actions/rerun-workflow/rerun.sh",
    "chars": 2955,
    "preview": "# Copyright (c) 2025 PaddleFormers Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": ".github/codecov.yml",
    "chars": 322,
    "preview": "codecov:\n  notify:\n    wait_for_ci: false\n\ncoverage:\n  status:\n    project:\n      default: \n        target: 30% # overal"
  },
  {
    "path": ".github/workflows/_clone_linux.yml",
    "chars": 3382,
    "preview": "name: PaddleFormers Code Clone\ndescription: \"PaddleFormers clone and upload\"\n\non:\n  workflow_call:\n    inputs:\n        b"
  },
  {
    "path": ".github/workflows/_xpu_ci_test.yml",
    "chars": 5430,
    "preview": "name: xpu_ci_test\n\non:\n  workflow_call:\n    inputs:\n      DOCKER_IMAGE:\n        description: \"Build Images\"\n        requ"
  },
  {
    "path": ".github/workflows/ce-build-ci-workflow.yml",
    "chars": 1655,
    "preview": "name: Build CI Images\n\non:\n  schedule:\n    - cron: \"0 22 * * *\"     # every day at 06:00 Beijing time (UTC+8)\n  workflow"
  },
  {
    "path": ".github/workflows/ce-build-images.yml",
    "chars": 5528,
    "preview": "name: Build CI Images For Test\n\non:\n  workflow_call:\n    inputs:\n      flag_build:       # test||update\n        required"
  },
  {
    "path": ".github/workflows/ce-build-whl.yml",
    "chars": 4072,
    "preview": "name: Build Whl CE\n\non:\n  push:\n    branches:\n      - develop\n      - release/*\n\nenv:\n  BRANCH: ${{ github.ref_name }}\n "
  },
  {
    "path": ".github/workflows/ce-deadlink.yml",
    "chars": 4922,
    "preview": "name: Deadlink CE\n\non:\n  schedule:\n    - cron: \"0 8 * * 6\"     # every Saturday at 16:00\n  workflow_dispatch:        # a"
  },
  {
    "path": ".github/workflows/ce-unittest-gpu.yml",
    "chars": 7598,
    "preview": "name: Unittest GPU CE\n\non:\n  schedule:\n    - cron: \"0 1 * * *\"     # every day at 09:00 Beijing time (UTC+8)\n  workflow_"
  },
  {
    "path": ".github/workflows/check-release-pr.yaml",
    "chars": 1371,
    "preview": "name: Check Release PR\n\non:\n  pull_request:\n    branches:\n      - 'release/*'\n    types:\n      - opened\n      - edited\n "
  },
  {
    "path": ".github/workflows/cherry-pick.yml",
    "chars": 7772,
    "preview": "name: Cherry Pick\n\non:\n  pull_request_target:\n    branches: [develop]\n    types: [closed, labeled]\n\npermissions:\n  conte"
  },
  {
    "path": ".github/workflows/ci_iluvatar.yml",
    "chars": 4388,
    "preview": "name: CI_ILUVATAR\n\non:\n  pull_request:\n    types: [opened, synchronize]\n    branches: [develop, release/**]\npermissions:"
  },
  {
    "path": ".github/workflows/ci_xpu.yml",
    "chars": 1534,
    "preview": "name: CI_XPU\n\non:\n  pull_request:\n    types: [opened, synchronize]\n    branches: [develop, release/**]\npermissions: read"
  },
  {
    "path": ".github/workflows/debug-unittest-gpu.yml",
    "chars": 4422,
    "preview": "name: Debug Unittest GPU with SSH\n\non:\n  workflow_dispatch:\n\nenv:\n  PR_ID: ${{ github.event.pull_request.number }}\n  COM"
  },
  {
    "path": ".github/workflows/fleet-model-test.yml",
    "chars": 33007,
    "preview": "name: Fleet Model Test\n\non:\n  pull_request:\n    branches:\n      - develop\n      - release/**\n\npermissions: read-all\n\ncon"
  },
  {
    "path": ".github/workflows/lint.yml",
    "chars": 3924,
    "preview": "name: Codestyle Check\n\non: [push, pull_request]\n\nconcurrency:\n  group: ${{ github.workflow }}-${{ github.event.pull_requ"
  },
  {
    "path": ".github/workflows/model-unittest-gpu.yml",
    "chars": 7984,
    "preview": "name: Model Unittest GPU CI\n\non:\n  pull_request:\n  schedule:\n    - cron: \"0 18 * * *\"\n  workflow_call:\n    inputs:\n     "
  },
  {
    "path": ".github/workflows/requirements-review.yml",
    "chars": 2028,
    "preview": "name: Check Requirements Need Approval\n\non:\n  pull_request:\n    types: [opened, synchronize, reopened, ready_for_review]"
  },
  {
    "path": ".github/workflows/rerun.yml",
    "chars": 2007,
    "preview": "name: Re-run\n\non:\n  issue_comment:\n    types: [created]\n\njobs:\n  re-run:\n    if: ${{ github.event.issue.pull_request && "
  },
  {
    "path": ".github/workflows/stale.yml",
    "chars": 1099,
    "preview": "name: Stale\n\non:\n  # Allow manual run via GitHub web or CLI\n  workflow_dispatch:\n  schedule:\n    # Run daily at midnight"
  },
  {
    "path": ".github/workflows/unittest-gpu.yml",
    "chars": 12937,
    "preview": "name: Unittest GPU CI\n\non:\n  pull_request:\n  schedule:\n    - cron: \"0 18 * * *\"\n  workflow_call:\n    inputs:\n      runne"
  },
  {
    "path": ".github/workflows/update-precision.yml",
    "chars": 2211,
    "preview": "name: update precision\n\non:\n  push:\n    branches:\n      - develop\n\njobs:\n  determine-whether-update:\n    name: Determine"
  },
  {
    "path": ".gitignore",
    "chars": 2096,
    "preview": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packagi"
  },
  {
    "path": ".pre-commit-config.yaml",
    "chars": 1703,
    "preview": "repos:\n# For Python files\n-   repo: https://github.com/psf/black.git\n    rev: 22.8.0\n    hooks:\n    -   id: black\n      "
  },
  {
    "path": ".readthedocs.yaml",
    "chars": 586,
    "preview": "# .readthedocs.yaml\n# Read the Docs configuration file\n# See https://docs.readthedocs.io/en/stable/config-file/v2.html f"
  },
  {
    "path": "CONTRIBUTING.md",
    "chars": 5959,
    "preview": "**简体中文**🀄 | [English🌎](.github/CONTRIBUTING_en.md)\n\n# Contributing to PaddleFormers\n\n我们非常欢迎并希望您对`PaddleFormers`做出开源贡献。在您"
  },
  {
    "path": "LICENSE",
    "chars": 11438,
    "preview": "Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved\n\n                                 Apache License\n          "
  },
  {
    "path": "Makefile",
    "chars": 2595,
    "preview": "# Makefile for PaddleFormers\n#\n# \tGitHb: https://github.com/PaddlePaddle/PaddleFormers\n# \tAuthor: Paddle Team https://gi"
  },
  {
    "path": "README.md",
    "chars": 12033,
    "preview": "<p align=\"center\">\n  <img src=\"https://github.com/user-attachments/assets/9d1c1937-7fac-48f8-9d61-f7ac67b61b18\" align=\"m"
  },
  {
    "path": "docs/en/cli_usage.md",
    "chars": 3130,
    "preview": "# CLI\n\n## Overview\n\nCLI (Command Line Interface) provides terminal-based interaction with the program, enabling efficien"
  },
  {
    "path": "docs/en/datasets.md",
    "chars": 5280,
    "preview": "# Data Format Specification\n\n## Pre-training offline dataset\n\n- **CLI**: Modify the following fields in the YAML configu"
  },
  {
    "path": "docs/en/datasets_format.md",
    "chars": 18115,
    "preview": "# Data Stream Format Documentation\n\n## Data Stream File Format Support\n\nCurrently, pre-training and post-training data s"
  },
  {
    "path": "docs/en/image_processors.md",
    "chars": 2139,
    "preview": "### 🏞️ Image Processor\n\n`Image Processor` is an image preprocessing tool responsible for preparing input features for vi"
  },
  {
    "path": "docs/en/processors.md",
    "chars": 6044,
    "preview": "### ⚙️ Processors\n\n`Processor` is a multimodal preprocessing tool responsible for preparing inputs that combine more tha"
  },
  {
    "path": "docs/en/video_processors.md",
    "chars": 2387,
    "preview": "### 🎬 Video Processor\n\n`Video Processor` is a video preprocessing tool responsible for preparing input features for mult"
  },
  {
    "path": "docs/zh/ILUVATAR-GPU_installation_guide.md",
    "chars": 7579,
    "preview": "# 1. 安装\n\n**环境依赖**\n\n|Chip type|Driver version|\n|-|-|\n|BI150|4.3.8|\n\n* **机器：** BI150/BI150s 64GB 8-card machine\n* **镜像：** "
  },
  {
    "path": "docs/zh/ILUVATAR-GPU_usage_guide.md",
    "chars": 869,
    "preview": "# 模型列表和使用说明\n\n|Model Name|Training Method|Context Length|Quantization|ILUVATAR-GPUs Required|Deployment Commands|Applicab"
  },
  {
    "path": "docs/zh/Metax-GPU_installation_guide.md",
    "chars": 5595,
    "preview": "# 1. 安装\n\n**环境依赖**\n\n|Chip type|Driver version|\n|-|-|\n|MetaX C550|2.15.9|\n\n* **机器：** MetaX C550 64GB 8-card machine\n* **镜像"
  },
  {
    "path": "docs/zh/Metax-GPU_usage_guide.md",
    "chars": 555,
    "preview": "# 模型列表和使用说明\n\n|Model Name|Training Method|Context Length|Quantization|Metax-GPUs Required|Deployment Commands|Applicable "
  },
  {
    "path": "docs/zh/XPU_installation_guide.md",
    "chars": 6045,
    "preview": "# 1. 安装\n\n**环境依赖**\n\n|Chip type|Driver version|\n|-|-|\n|KunlunxinP800|5.0.21.26|\n\n* **机器：** KunlunxinP800 96GB 8-card machi"
  },
  {
    "path": "docs/zh/XPU_usage_guide.md",
    "chars": 885,
    "preview": "# 模型列表和使用说明\n\n|Model Name|Training Method|Context Length|Quantization|XPUs Required|Deployment Commands|Applicable Versio"
  },
  {
    "path": "docs/zh/chat_template_guide.md",
    "chars": 5737,
    "preview": "# 1. 背景说明\n\n大语言模型依托对话交互能力，能够输出符合人类语境的智能回复。而这一能力需依托 **Chat Template **结构化标注角色与上下文，定义多轮对话数据如何被转换为模型可训练的 token 序列，从而确保交互逻辑精准"
  },
  {
    "path": "docs/zh/cli_usage.md",
    "chars": 2624,
    "preview": "# 1. 命令行界面\n\n## 1.1. 概述\n\nPaddleFormers CLI（Command Line Interface）提供了基于终端的程序交互，通过配置文件来管理各类参数，高效灵活地执行模型训练、推理和评估任务。\n\n## 1.2"
  },
  {
    "path": "docs/zh/custom_datasets_format_zh.md",
    "chars": 1443,
    "preview": "# 当前文件格式支持\n\n当前支持 json、jsonl、parquet 三种格式，需保证文件名后缀和文件内容保持一致\n\n# 新增文件格式支持\n\n在 paddleformers/datasets/reader/io.py 里面实现各种类型文件"
  },
  {
    "path": "docs/zh/data_processing_guide.md",
    "chars": 5981,
    "preview": "# 1. 数据流基础参数说明\n\n## 1.1. 参数说明\n\n* 参数\n\n|参数名|参数说明|\n|-|-|\n|train_dataset_path|训练数据集路径：允许指定多个路径，通过`,`分隔不同的数据集。|\n|eval_dataset_"
  },
  {
    "path": "docs/zh/dataset_format.md",
    "chars": 23594,
    "preview": "# 1. 文件格式说明\n\n当前 PaddleFormers 的预训练、后训练数据流支持 `jsonl` 、 `json` 等格式的数据，训练时需确保文件名后缀和文件内容格式保持一致。\n\n## 1.1. 新增文件格式支持\n\n如果您有格外的文件"
  },
  {
    "path": "docs/zh/deployment_guide.md",
    "chars": 4427,
    "preview": "# 1. 引言\n\n当模型训练完成，用于推理时，需要基于高效的推理引擎进行部署，以满足 低时延 / 高吞吐 等需求。基于 PaddleFormers 训练完成的模型可以直接使用 vLLM 和 FastDeploy 等工具推理。本文档介绍如何使"
  },
  {
    "path": "docs/zh/dpo_and_lora_guide.md",
    "chars": 5924,
    "preview": "# 1. 背景说明\n\n大模型的训练流程通常包含四个关键阶段：\n\n1. 预训练（Pre-training）：通过海量无标注数据进行预训练，学习语言能力和世界知识，构建通用基座；\n2. 后预训练（Post-Pre-training）：通过注入特"
  },
  {
    "path": "docs/zh/ernie4.5_pretraining.md",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "docs/zh/how_to_download_model.md",
    "chars": 2731,
    "preview": "# 1. PaddleFormers 自动下载\n\n当使用 PaddleFormers 训练时，无论是使用 API 接口，还是通过命令行工具中的配置文件，在指定了正确的`repo_id/model_id`后，都会自动下载模型文件到本地并缓存。"
  },
  {
    "path": "docs/zh/image_processors_zh.md",
    "chars": 1421,
    "preview": "### 🏞️ Image Processor\n\n`Image Processor`是一个图像预处理工具，负责为视觉或多模态模型准备输入特征。它提供多种变换操作，例如调整大小和归一化，并支持输出 Paddle 张量。\n\nImage Proce"
  },
  {
    "path": "docs/zh/model_capability.md",
    "chars": 861,
    "preview": "# 训练能力支持\n|模型|PT / CPT|SFT|SFT-LoRA|DPO|DPO-LoRA|\n|-|-|-|-|-|-|\n|DeepSeekv3|✓|✓|✓|✓|✓|\n|🏛️ERNIE-4.5|✓|✓|✓|✓|✓|\n|Gemma3|✓|"
  },
  {
    "path": "docs/zh/processors_zh.md",
    "chars": 3747,
    "preview": "### ⚙️ Processors\n\n`Processor`是一种多模态预处理工具，用于为包含多种模态（如文本、图像等）的模型准备输入。它提供了一个统一的接口来执行不同类型的转换操作，例如文本分词、图像的调整大小与归一化，同时支持输出 Pa"
  },
  {
    "path": "docs/zh/pt_and_cpt_guide.md",
    "chars": 4541,
    "preview": "# 1. 背景说明\n\n大模型的训练流程通常包含四个关键阶段：\n\n1. **预训练（Pre-training）：通过海量无标注数据进行预训练，学习语言能力和世界知识，构建通用基座；（本文讲解）**\n2. 后预训练（Post-Pre-train"
  },
  {
    "path": "docs/zh/sft_and_lora_guide.md",
    "chars": 6864,
    "preview": "# 1. 背景说明\n\n大模型的训练流程通常包含四个关键阶段：\n\n1. 预训练（Pre-training）：通过海量无标注数据进行预训练，学习语言能力和世界知识，构建通用基座；\n2. 后预训练（Post-Pre-training）：通过注入特"
  },
  {
    "path": "docs/zh/template.md",
    "chars": 1113,
    "preview": "## 指定训练使用的 template\n\n| 参数 | 类型 | 描述 |\n| --- | --- | --- |\n| `template_backend` | str | 指定为`custom`表示使用自定义的 template，`jin"
  },
  {
    "path": "docs/zh/template_zh.md",
    "chars": 3412,
    "preview": "# 1. 注册 template\n\n## 1.1. 注册方法\n\n### 1.1.1 源代码修改（适用于 git clone 安装用户）\n在 `paddleformers/datasets/template/template.py` 文件中实"
  },
  {
    "path": "docs/zh/training_arguments.md",
    "chars": 20727,
    "preview": "# 1. 基础配置与训练控制\n\n```shell\n  --output_dir\n                        模型预测结果和检查点的输出目录。(`str`, 必须)\n\n  --overwrite_output_dir\n  "
  },
  {
    "path": "docs/zh/video_processors_zh.md",
    "chars": 1521,
    "preview": "### 🎬 Video Processor\n\n`Video Processor`是一个视频预处理工具，负责为多模模型准备输入特征，并处理其输出。它提供各种转换，例如调整大小、归一化等功能，同时支持输出 Paddle 张量。\n\nVideo P"
  },
  {
    "path": "examples/FAQ.md",
    "chars": 1216,
    "preview": "# 常见问题\n## 1. 多卡并行训练时出现通信问题\n### 问题描述\n在使用 examples 中的多卡训练指令时，出现类似以下错误信息：\n```\nLAUNCH INFO 2025-10-29 19: 08: 08, 155 Waitin"
  },
  {
    "path": "examples/README.md",
    "chars": 2775,
    "preview": "## 0. 环境变量\n\n在运行前，可以通过设置环境变量 `DOWNLOAD_SOURCE` 来指定模型的下载源，默认使用 **huggingface**。\n\n目前支持的下载源包括：\n- [huggingface](https://huggi"
  },
  {
    "path": "examples/best_practices/DeepSeek-V3/README.md",
    "chars": 3263,
    "preview": "# 1. 背景说明\n\nPaddleFormers 提供了 DeepSeek-V3 的预训练加速版本模型。基于 PaddlePaddle 框架在 DeepSeek-V3 上的实战经验，我们将整套高效训练优化能力做成了 **“开箱即用”** 的"
  },
  {
    "path": "examples/best_practices/DeepSeek-V3/SFT-Practice.md",
    "chars": 4804,
    "preview": "# DeepSeek-V3 全参数微调实践\n\n近期，我们成功组织并完成了 DeepSeek-V3（671B）模型的全参数微调实验。本次实践旨在验证超大规模模型在特定业务场景下的可控性与实际落地能力，同时系统探索全参数微调在性能优化、训练效率"
  },
  {
    "path": "examples/best_practices/DeepSeek-V3/dsv3_128k_config.yaml",
    "chars": 1879,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: \"/root/train.json\" # 数据集存放路径\ntrain"
  },
  {
    "path": "examples/best_practices/DeepSeek-V3/dsv3_32k_config.yaml",
    "chars": 1875,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: \"/root/train.json\" # 数据集存放路径\ntrain"
  },
  {
    "path": "examples/best_practices/DeepSeek-V3/dsv3_4k_config.yaml",
    "chars": 1884,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: \"/root/train.json\" # 数据集存放路径\ntrain"
  },
  {
    "path": "examples/best_practices/DeepSeek-V3/pretrain/config/config.json",
    "chars": 2331,
    "preview": "{\n    \"architectures\": [\n      \"DeepseekV2ForCausalLM\"\n    ],\n    \"attention_bias\": false,\n    \"attention_dropout\": 0.0,"
  },
  {
    "path": "examples/best_practices/DeepSeek-V3/pretrain/config/pretrain_argument.yaml",
    "chars": 1412,
    "preview": "stage: dsv3_pretrain\nmodel_name_or_path: \"./config\"\ntokenizer_name_or_path: \"./config\"\ninput_dir: \"./data\"\noutput_dir: \""
  },
  {
    "path": "examples/best_practices/DeepSeek-V3/pretrain/config/tokenizer.json",
    "chars": 7011306,
    "preview": "{\n    \"version\": \"1.0\",\n    \"truncation\": null,\n    \"padding\": null,\n    \"added_tokens\": [\n        {\n            \"id\": 0"
  },
  {
    "path": "examples/best_practices/DeepSeek-V3/pretrain/config/tokenizer_config.json",
    "chars": 2948,
    "preview": "{\n  \"add_bos_token\": true,\n  \"add_eos_token\": false,\n  \"bos_token\": {\n    \"__type\": \"AddedToken\",\n    \"content\": \"<｜begi"
  },
  {
    "path": "examples/best_practices/DeepSeek-V3/pretrain/run.sh",
    "chars": 919,
    "preview": "# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/best_practices/DeepSeek-V3/pretrain/train_gpu.sh",
    "chars": 1481,
    "preview": "#!/bin/bash\n\n# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Ver"
  },
  {
    "path": "examples/best_practices/DeepSeek-V3/run_dsv3_128k.sh",
    "chars": 1117,
    "preview": "# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/best_practices/DeepSeek-V3/run_dsv3_32k.sh",
    "chars": 1116,
    "preview": "# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/best_practices/DeepSeek-V3/run_dsv3_4k.sh",
    "chars": 1115,
    "preview": "# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/best_practices/ERNIE-4.5/README.md",
    "chars": 6029,
    "preview": "# 1. 背景说明\n\nPaddleFormers 提供了 ERNIE-4.5 的预训练加速版本模型，当前支持 [ERNIE-4.5-21B-A3B](https://huggingface.co/baidu/ERNIE-4.5-21B-A3"
  },
  {
    "path": "examples/best_practices/ERNIE-4.5-VL/README.md",
    "chars": 3786,
    "preview": "# ERNIE-4.5-VL-28B-A3B-Thinking 微调指南\n本文旨在指导用户如何微调 ERNIE-4.5-VL-28B-A3B-Thinking 模型。ERNIE-4.5-VL-28B-A3B-Thinking 是一个强大的多"
  },
  {
    "path": "examples/best_practices/ERNIE-4.5-VL/ernie45vl_32k_config.yaml",
    "chars": 1618,
    "preview": "### data\ntrain_dataset_type: messages\ntrain_dataset_path: tests/fixtures/dummy/sft-vl/thinking_safety_demo.jsonl\ntrain_d"
  },
  {
    "path": "examples/best_practices/ERNIE-4.5-VL/ernie45vl_8k_config.yaml",
    "chars": 1619,
    "preview": "### data\ntrain_dataset_type: messages\ntrain_dataset_path: tests/fixtures/dummy/sft-vl/thinking_safety_demo.jsonl\ntrain_d"
  },
  {
    "path": "examples/best_practices/ERNIE-4.5-VL/ernie45vl_8k_lora_config.yaml",
    "chars": 1644,
    "preview": "### data\ntrain_dataset_type: messages\ntrain_dataset_path: tests/fixtures/dummy/sft-vl/thinking_safety_demo.jsonl\ntrain_d"
  },
  {
    "path": "examples/best_practices/PaddleOCR-VL/README.md",
    "chars": 24816,
    "preview": "# 1. 任务简介\n\nPaddleOCR-VL 是一款为文档解析任务量身打造的、性能顶尖 (SOTA) 且轻量高效的模型。它的核心是 PaddleOCR-VL-0.9B——一个紧凑而强大的视觉语言模型 (VLM)。该模型创新地集成了 NaV"
  },
  {
    "path": "examples/best_practices/PaddleOCR-VL/paddleocr-vl_full_16k_config.yaml",
    "chars": 1620,
    "preview": "### data\ntrain_dataset_type: messages\neval_dataset_type: messages\ntrain_dataset_path: ./ocr_vl_sft-train_Bengali.jsonl\nt"
  },
  {
    "path": "examples/best_practices/PaddleOCR-VL/paddleocr-vl_lora_16k_config.yaml",
    "chars": 1654,
    "preview": "### data\ntrain_dataset_type: messages\neval_dataset_type: messages\ntrain_dataset_path: ./ocr_vl_sft-train_Bengali.jsonl\nt"
  },
  {
    "path": "examples/best_practices/PaddleOCR-VL/paddleocr-vl_lora_export.yaml",
    "chars": 257,
    "preview": "### model\nfine_tuning: LoRA\nmodel_name_or_path: PaddlePaddle/PaddleOCR-VL\noutput_dir: ./PaddleOCR-VL-SFT-Bengali-lora\nco"
  },
  {
    "path": "examples/best_practices/PaddleOCR-VL/run_paddleocr-vl_full_16k.sh",
    "chars": 909,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/best_practices/PaddleOCR-VL/run_paddleocr-vl_full_16k_4090D.sh",
    "chars": 1123,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/best_practices/PaddleOCR-VL/run_paddleocr-vl_lora_16k.sh",
    "chars": 909,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/best_practices/PaddleOCR-VL/run_paddleocr-vl_lora_16k_4090D.sh",
    "chars": 1123,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/best_practices/PaddleOCR-VL/run_paddleocr-vl_lora_export.sh",
    "chars": 705,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/best_practices/PaddleOCR-VL-1.5/README.md",
    "chars": 1075,
    "preview": "# PaddleOCR-VL-1.5 微调最佳实践\n\n本目录提供了基于 **PaddleOCR-VL-1.5** 模型进行微调的最佳实践教程。\n\n## 模型简介\n\nPaddleOCR-VL-1.5 是 PaddleOCR-VL 的全新升级版"
  },
  {
    "path": "examples/best_practices/PaddleOCR-VL-1.5/paddleocr-vl-v15_full_16k_region_config.yaml",
    "chars": 1707,
    "preview": "### data\ntrain_dataset_type: messages\neval_dataset_type: messages\ntrain_dataset_path: ./region_visual/region_visual_trai"
  },
  {
    "path": "examples/best_practices/PaddleOCR-VL-1.5/paddleocr-vl-v15_full_16k_table_config.yaml",
    "chars": 1681,
    "preview": "### data\ntrain_dataset_type: messages\neval_dataset_type: messages\ntrain_dataset_path: ./complex_table/complex_table_trai"
  },
  {
    "path": "examples/best_practices/PaddleOCR-VL-1.5/paddleocr-vl-v15_lora_16k_region_config.yaml",
    "chars": 1741,
    "preview": "### data\ntrain_dataset_type: messages\neval_dataset_type: messages\ntrain_dataset_path: ./region_visual/region_visual_trai"
  },
  {
    "path": "examples/best_practices/PaddleOCR-VL-1.5/paddleocr-vl-v15_lora_16k_table_config.yaml",
    "chars": 1715,
    "preview": "### data\ntrain_dataset_type: messages\neval_dataset_type: messages\ntrain_dataset_path: ./complex_table/complex_table_trai"
  },
  {
    "path": "examples/best_practices/PaddleOCR-VL-1.5/region_ocr.md",
    "chars": 34830,
    "preview": "# 基于 PaddleOCR-VL-1.5微调实现区域识别能力\n\n## 任务简介\nPaddleOCR-VL-1.5 是 PaddleOCR-VL 的全新升级版本，作为一款 0.9B 参数量的超轻量级视觉语言模型 (VLM)，它在 OmniD"
  },
  {
    "path": "examples/best_practices/PaddleOCR-VL-1.5/table_ocr.md",
    "chars": 31602,
    "preview": "# 基于 PaddleOCR-VL-1.5微调表格数据\n\n## 任务简介\nPaddleOCR-VL-1.5 是 PaddleOCR-VL 的全新升级版本，作为一款 0.9B 参数量的超轻量级视觉语言模型 (VLM)，它在 OmniDocBe"
  },
  {
    "path": "examples/best_practices/function_call.md",
    "chars": 4489,
    "preview": "# Function Call Support\n\n## Data Format\n\n### SFT Data Format\n\nDemo data for function call training:\n\n```json\n[\n    {\n   "
  },
  {
    "path": "examples/best_practices/tutorials/how_to_train_a_function_call_model.md",
    "chars": 17357,
    "preview": "# 1. 任务简介\n\nFunction Calling 是一种让大模型能够调用外部函数的机制，当模型遇到自身知识外的内容，需要通过工具查询时，会输出结构化的调用信息，引导使用者调用工具进行上下文的补充。这种机制使得大模型不再局限于自身的知识"
  },
  {
    "path": "examples/best_practices/tutorials/how_to_train_a_reasoning_model.md",
    "chars": 13357,
    "preview": "# 1. 任务简介\n\n大模型的“思考模式”是指其在生成答案前，通过多步逻辑推理、信息检索与自我验证等过程，提升复杂任务的准确性与可解释性。这种模式让模型在面对复杂问题时，能够像人类一样进行逐步分析、推理，从而得出更为准确和合理的答案。\n\n大"
  },
  {
    "path": "examples/best_practices/tutorials/how_to_train_a_visual_grounding_model.md",
    "chars": 27615,
    "preview": "# 1. 任务简介\n\nVisual Grounding（视觉定位）是一种让多模态大模型能够将自然语言描述精确映射到图像具体区域（Bounding Box）的机制，通过文本指令与像素坐标的语义对齐，提升模型对物理世界的感知与交互能力。这种机制"
  },
  {
    "path": "examples/best_practices/tutorials/how_to_train_an_emoji_model.md",
    "chars": 15307,
    "preview": "# 1. 任务简介\n\n在大模型训练过程中，**监督微调（Supervised Fine-Tuning，SFT）** 与 **偏好对齐优化（Direct Preference Optimization，DPO）** 是提升模型指令遵循能力与输"
  },
  {
    "path": "examples/config/dpo/full.yaml",
    "chars": 1470,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/dpo/train.j"
  },
  {
    "path": "examples/config/dpo/full_function_call.yaml",
    "chars": 1595,
    "preview": "### data\ntrain_dataset_type: messages\neval_dataset_type: messages\ntrain_dataset_path: ./tests/fixtures/dummy/dpo/functio"
  },
  {
    "path": "examples/config/dpo/full_tp_pp.yaml",
    "chars": 1614,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/dpo/train.j"
  },
  {
    "path": "examples/config/dpo/full_tp_pp_ep.yaml",
    "chars": 1846,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/dpo/train.j"
  },
  {
    "path": "examples/config/dpo/lora.yaml",
    "chars": 1329,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/dpo/train.j"
  },
  {
    "path": "examples/config/dpo/lora_tp_pp.yaml",
    "chars": 1441,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/dpo/train.j"
  },
  {
    "path": "examples/config/dpo/lora_tp_pp_ep.yaml",
    "chars": 1495,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/dpo/train.j"
  },
  {
    "path": "examples/config/dpo-vl/full.yaml",
    "chars": 1498,
    "preview": "### data\ntrain_dataset_type: messages\neval_dataset_type: messages\ntrain_dataset_path: ./tests/fixtures/dummy/dpo-vl/trai"
  },
  {
    "path": "examples/config/dpo-vl/full_fsdp.yaml",
    "chars": 1502,
    "preview": "### data\ntrain_dataset_type: messages\neval_dataset_type: messages\ntrain_dataset_path: ./tests/fixtures/dummy/dpo-vl/trai"
  },
  {
    "path": "examples/config/dpo-vl/full_tp.yaml",
    "chars": 1524,
    "preview": "### data\ntrain_dataset_type: messages\neval_dataset_type: messages\ntrain_dataset_path: ./tests/fixtures/dummy/dpo-vl/trai"
  },
  {
    "path": "examples/config/dpo-vl/lora.yaml",
    "chars": 1357,
    "preview": "### data\ntrain_dataset_type: messages\neval_dataset_type: messages\ntrain_dataset_path: ./tests/fixtures/dummy/dpo-vl/trai"
  },
  {
    "path": "examples/config/dpo-vl/lora_fsdp.yaml",
    "chars": 1362,
    "preview": "### data\ntrain_dataset_type: messages\neval_dataset_type: messages\ntrain_dataset_path: ./tests/fixtures/dummy/dpo-vl/trai"
  },
  {
    "path": "examples/config/dpo-vl/lora_tp.yaml",
    "chars": 1384,
    "preview": "### data\ntrain_dataset_type: messages\neval_dataset_type: messages\ntrain_dataset_path: ./tests/fixtures/dummy/dpo-vl/trai"
  },
  {
    "path": "examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/full_8k.yaml",
    "chars": 1252,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft/train.j"
  },
  {
    "path": "examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/lora_8k.yaml",
    "chars": 1276,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft/train.j"
  },
  {
    "path": "examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/lora_export.yaml",
    "chars": 136,
    "preview": "### model\nfine_tuning: LoRA\nmodel_name_or_path: baidu/ERNIE-4.5-0.3B-PT\noutput_dir: checkpoints/ernie-0.3B-sft-lora\ndevi"
  },
  {
    "path": "examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/run_full_8k.sh",
    "chars": 800,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/run_lora_8k.sh",
    "chars": 799,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/run_lora_export.sh",
    "chars": 702,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/full_8k.yaml",
    "chars": 1224,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft/train.j"
  },
  {
    "path": "examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/lora_8k.yaml",
    "chars": 1308,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft/train.j"
  },
  {
    "path": "examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/lora_export.yaml",
    "chars": 140,
    "preview": "### model\nfine_tuning: LoRA\nmodel_name_or_path: baidu/ERNIE-4.5-21B-A3B-PT\noutput_dir: checkpoints/ernie-sft-lora-tp-pp\n"
  },
  {
    "path": "examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/run_full_8k.sh",
    "chars": 777,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/run_lora_8k.sh",
    "chars": 848,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/run_lora_export.sh",
    "chars": 705,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml",
    "chars": 1647,
    "preview": "### data\ntrain_dataset_type: messages\neval_dataset_type: messages\ntrain_dataset_path: ./ocr_vl_sft-train_Bengali.jsonl\nt"
  },
  {
    "path": "examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml",
    "chars": 1681,
    "preview": "### data\ntrain_dataset_type: messages\neval_dataset_type: messages\ntrain_dataset_path: ./ocr_vl_sft-train_Bengali.jsonl\nt"
  },
  {
    "path": "examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_lora_export.yaml",
    "chars": 278,
    "preview": "### model\nfine_tuning: LoRA\nmodel_name_or_path: PaddlePaddle/PaddleOCR-VL\noutput_dir: ./PaddleOCR-VL-SFT-Bengali-lora\nco"
  },
  {
    "path": "examples/config/iluvatar/PaddleOCR-VL/sft/run_paddleocr-vl_full_16k.sh",
    "chars": 939,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/config/iluvatar/PaddleOCR-VL/sft/run_paddleocr-vl_lora_16k.sh",
    "chars": 914,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/config/iluvatar/PaddleOCR-VL/sft/run_paddleocr-vl_lora_export.sh",
    "chars": 710,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/config/metax/ERNIE-4.5-0.3B/sft/lora.yaml",
    "chars": 1253,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft/train.j"
  },
  {
    "path": "examples/config/metax/ERNIE-4.5-0.3B/sft/run_lora.sh",
    "chars": 711,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/config/metax/ERNIE-4.5-0.3B/sft/run_sft.sh",
    "chars": 710,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/config/metax/ERNIE-4.5-0.3B/sft/sft.yaml",
    "chars": 1229,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft/train.j"
  },
  {
    "path": "examples/config/metax/ERNIE-4.5-21B-A3B/sft/lora.yaml",
    "chars": 1288,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft/train.j"
  },
  {
    "path": "examples/config/metax/ERNIE-4.5-21B-A3B/sft/run_lora.sh",
    "chars": 720,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/config/metax/ERNIE-4.5-21B-A3B/sft/run_sft.sh",
    "chars": 727,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/config/metax/ERNIE-4.5-21B-A3B/sft/sft.yaml",
    "chars": 1264,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft/train.j"
  },
  {
    "path": "examples/config/pt/eb45_pretrain/21b_8_gpus.yaml",
    "chars": 2416,
    "preview": "# stage\nstage: ernie_pretrain\n\n# data\ndataset_type: pretrain\ninput_dir: \"0.4 ./demo_data/data-1-part0 0.6 ./demo_data/da"
  },
  {
    "path": "examples/config/pt/eb45_pretrain/300b_2016_gpus.yaml",
    "chars": 2761,
    "preview": "# stage\nstage: ernie_pretrain\n\n# data\ndataset_type: pretrain\ninput_dir: \"0.4 ./demo_data/data-1-part0 0.6 ./demo_data/da"
  },
  {
    "path": "examples/config/pt/eb45_pretrain/300b_4_nodes_ce.yaml",
    "chars": 3031,
    "preview": "# stage\nstage: ernie_pretrain\n\n# data\ndataset_type: pretrain\ninput_dir: \"0.4 ./demo_data/data-1-part0 0.6 ./demo_data/da"
  },
  {
    "path": "examples/config/pt/eb45_pretrain/300b_8_gpus_ci.yaml",
    "chars": 3000,
    "preview": "# stage\nstage: ernie_pretrain\n\n# data\ndataset_type: pretrain\ninput_dir: \"0.4 ./demo_data/data-1-part0 0.6 ./demo_data/da"
  },
  {
    "path": "examples/config/pt/eb45_pretrain/300b_96gpus.yaml",
    "chars": 2781,
    "preview": "# stage\nstage: ernie_pretrain\n\n# data\ndataset_type: pretrain\ninput_dir: \"0.4 ./demo_data/data-1-part0 0.6 ./demo_data/da"
  },
  {
    "path": "examples/config/pt/eb45_pretrain/300b_96gpus_small_acc.yaml",
    "chars": 2920,
    "preview": "# stage\nstage: ernie_pretrain\n\n# data\ndataset_type: pretrain\ninput_dir: \"0.4 ./demo_data/data-1-part0 0.6 ./demo_data/da"
  },
  {
    "path": "examples/config/pt/full.yaml",
    "chars": 1146,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/pt/train.js"
  },
  {
    "path": "examples/config/pt/full_offline_data.yaml",
    "chars": 1008,
    "preview": "### data\ndataset_type: \"pretrain\"\ninput_dir: \"1.0 ./data/pre-training/demo_data/data-1-part0\"\nsplit: \"998,2\"\nmax_seq_len"
  },
  {
    "path": "examples/config/pt/full_tp_pp.yaml",
    "chars": 1176,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/pt/train.js"
  },
  {
    "path": "examples/config/pt/full_tp_pp_ep.yaml",
    "chars": 1522,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/pt/train.js"
  },
  {
    "path": "examples/config/pt/lora.yaml",
    "chars": 1170,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/pt/train.js"
  },
  {
    "path": "examples/config/pt/lora_tp_pp.yaml",
    "chars": 1200,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/pt/train.js"
  },
  {
    "path": "examples/config/pt/lora_tp_pp_ep.yaml",
    "chars": 1522,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/pt/train.js"
  },
  {
    "path": "examples/config/run_export.yaml",
    "chars": 112,
    "preview": "### model\nfine_tuning: LoRA\nmodel_name_or_path: Qwen3-0.6B-base\noutput_dir: checkpoints/qwen3_hf_0p6b_lora_ckpts"
  },
  {
    "path": "examples/config/sft/full.yaml",
    "chars": 1192,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft/train.j"
  },
  {
    "path": "examples/config/sft/full_function_call.yaml",
    "chars": 1317,
    "preview": "### data\ntrain_dataset_type: messages\neval_dataset_type: messages\ntrain_dataset_path: ./tests/fixtures/dummy/sft/functio"
  },
  {
    "path": "examples/config/sft/full_tp_pp.yaml",
    "chars": 1221,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft/train.j"
  },
  {
    "path": "examples/config/sft/full_tp_pp_ep.yaml",
    "chars": 1568,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft/train.j"
  },
  {
    "path": "examples/config/sft/lora.yaml",
    "chars": 1216,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft/train.j"
  },
  {
    "path": "examples/config/sft/lora_tp_pp.yaml",
    "chars": 1241,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft/train.j"
  },
  {
    "path": "examples/config/sft/lora_tp_pp_ep.yaml",
    "chars": 1299,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft/train.j"
  },
  {
    "path": "examples/config/sft-vl/full.yaml",
    "chars": 1265,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft-vl/trai"
  },
  {
    "path": "examples/config/sft-vl/full_fsdp.yaml",
    "chars": 1268,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft-vl/trai"
  },
  {
    "path": "examples/config/sft-vl/full_tp.yaml",
    "chars": 1290,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft-vl/trai"
  },
  {
    "path": "examples/config/sft-vl/lora.yaml",
    "chars": 1289,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft-vl/trai"
  },
  {
    "path": "examples/config/sft-vl/lora_fsdp.yaml",
    "chars": 1293,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft-vl/trai"
  },
  {
    "path": "examples/config/sft-vl/lora_tp.yaml",
    "chars": 1315,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft-vl/trai"
  },
  {
    "path": "examples/config/xpu/DeepseekV3/sft/full_32k_config.yaml",
    "chars": 1827,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: \"/root/train.json\" # 数据集存放路径\ntrain"
  },
  {
    "path": "examples/config/xpu/DeepseekV3/sft/full_4k_config.yaml",
    "chars": 1855,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: \"/root/train.json\" # 数据集存放路径\ntrain"
  },
  {
    "path": "examples/config/xpu/DeepseekV3/sft/run_full_32k.sh",
    "chars": 1029,
    "preview": "#!/bin/bash\nexport XPU_BLACK_LIST=\"index_elementwise_put,index_elementwise_put_with_tensor,index_elementwise_put_with_te"
  },
  {
    "path": "examples/config/xpu/DeepseekV3/sft/run_full_4k.sh",
    "chars": 1028,
    "preview": "#!/bin/bash\nexport XPU_BLACK_LIST=\"index_elementwise_put,index_elementwise_put_with_tensor,index_elementwise_put_with_te"
  },
  {
    "path": "examples/config/xpu/ERNIE-4.5-0.3B/sft/full_8k.yaml",
    "chars": 1193,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft/train.j"
  },
  {
    "path": "examples/config/xpu/ERNIE-4.5-0.3B/sft/lora_8k.yaml",
    "chars": 1222,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft/train.j"
  },
  {
    "path": "examples/config/xpu/ERNIE-4.5-0.3B/sft/lora_8k_export.yaml",
    "chars": 131,
    "preview": "### model\nfine_tuning: LoRA\nmodel_name_or_path: baidu/ERNIE-4.5-0.3B-PT\noutput_dir: checkpoints/ernie-0.3b-sft-lora-8k\n\n"
  },
  {
    "path": "examples/config/xpu/ERNIE-4.5-21B-A3B/sft/full_32k.yaml",
    "chars": 1275,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft/train.j"
  },
  {
    "path": "examples/config/xpu/ERNIE-4.5-21B-A3B/sft/lora_32k.yaml",
    "chars": 1305,
    "preview": "### data\ntrain_dataset_type: erniekit\neval_dataset_type: erniekit\ntrain_dataset_path: ./tests/fixtures/dummy/sft/train.j"
  },
  {
    "path": "examples/config/xpu/ERNIE-4.5-21B-A3B/sft/lora_32k_export.yaml",
    "chars": 134,
    "preview": "### model\nfine_tuning: LoRA\nmodel_name_or_path: baidu/ERNIE-4.5-21B-A3B-PT\noutput_dir: checkpoints/ernie-21b-sft-lora-32"
  },
  {
    "path": "examples/config/xpu/ERNIE-4.5-21B-A3B/sft/run_lora_32k.sh",
    "chars": 730,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/config/xpu/ERNIE-4.5-21B-A3B-Thinking/sft/full_8k.yaml",
    "chars": 1026,
    "preview": "### data\ntrain_dataset_type: messages\ntrain_dataset_path: tests/fixtures/dummy/sft/thinksafe_converted.jsonl\ntrain_datas"
  },
  {
    "path": "examples/config/xpu/ERNIE-4.5-VL-28B-A3B-Thinking/sft/full_32k.yaml",
    "chars": 1697,
    "preview": "### data\ntrain_dataset_type: messages\ntrain_dataset_path: tests/fixtures/dummy/sft-vl/thinking_safety_demo.jsonl\ntrain_d"
  },
  {
    "path": "examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml",
    "chars": 1642,
    "preview": "### data\ntrain_dataset_type: messages\neval_dataset_type: messages\ntrain_dataset_path: ./ocr_vl_sft-train_Bengali.jsonl\nt"
  },
  {
    "path": "examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml",
    "chars": 1676,
    "preview": "### data\ntrain_dataset_type: messages\neval_dataset_type: messages\ntrain_dataset_path: ./ocr_vl_sft-train_Bengali.jsonl\nt"
  },
  {
    "path": "examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_lora_export.yaml",
    "chars": 269,
    "preview": "### model\nfine_tuning: LoRA\nmodel_name_or_path: PaddlePaddle/PaddleOCR-VL\noutput_dir: ./PaddleOCR-VL-SFT-Bengali-lora\nco"
  },
  {
    "path": "examples/config/xpu/PaddleOCR-VL/sft/run_paddleocr-vl_full_16k.sh",
    "chars": 971,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/config/xpu/PaddleOCR-VL/sft/run_paddleocr-vl_lora_16k.sh",
    "chars": 946,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/config/xpu/PaddleOCR-VL/sft/run_paddleocr-vl_lora_export.sh",
    "chars": 705,
    "preview": "# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "examples/experiments/deepseek_v3_pretrain/README.md",
    "chars": 3051,
    "preview": "# DeepSeek-V3 预训练模型使用指南\n\n## 1. 硬件资源要求\n\n### 最低配置\n\nGPU: NVIDIA H100 80GB (推荐) 或 H800、H20等\n\n数量: 可根据配置调整 GPU 数量，一般需8卡以上, 多机多"
  }
]

// ... and 845 more files (download for full content)

About this extraction

This page contains the full source code of the PaddlePaddle/PaddleFormers GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 1045 files (23.2 MB), approximately 4.0M tokens, and a symbol index with 6547 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo